[PATCH] powerpc: Optimized conversion of IBM long double to int128/int64
Rajalakshmi Srinivasaraghavan
raji@linux.vnet.ibm.com
Tue Oct 23 16:31:00 GMT 2018
This new implementation of fixunstfdi and fixunstfti
gives a 16X performance improvement.
The design is focused on:
- Making sure the end result is a pure leaf function that
only needs builtins or inline functions.
- Assuming POWER8 direct register transfer and accessing the IBM
long double as an integer bit-field structure.
- Understanding the quirks of IBM long double and decomposing the
code into a set of optimized sub cases.
Tested on powerpc64le.
2018-10-20 Steven Munroe <munroesj52@gmail.com>
Rajalakshmi Srinivasaraghavan <raji@linux.vnet.ibm.com>
* libgcc/config/rs6000/t-ppc64-fp (LIB2ADD): Add
$(srcdir)/config/rs6000/fixunstfti.c.
* libgcc/config/rs6000/ppc64-fp.c (__fixunstfdi): Remove definition.
* libgcc/config/rs6000/fixunstfti.c: New file.
* libgcc/config/rs6000/fixunstfdi.c: Likewise.
* libgcc/config/rs6000/ibm-ldouble.h: Likewise.
---
libgcc/config/rs6000/fixunstfdi.c | 124 ++++++++++++++++++++++++++++
libgcc/config/rs6000/fixunstfti.c | 125 +++++++++++++++++++++++++++++
libgcc/config/rs6000/ibm-ldouble.h | 121 ++++++++++++++++++++++++++++
libgcc/config/rs6000/ppc64-fp.c | 24 ------
libgcc/config/rs6000/t-ppc64-fp | 5 +-
5 files changed, 374 insertions(+), 25 deletions(-)
create mode 100755 libgcc/config/rs6000/fixunstfdi.c
create mode 100755 libgcc/config/rs6000/fixunstfti.c
create mode 100755 libgcc/config/rs6000/ibm-ldouble.h
diff --git a/libgcc/config/rs6000/fixunstfdi.c b/libgcc/config/rs6000/fixunstfdi.c
new file mode 100755
index 00000000000..1b1a4f280bd
--- /dev/null
+++ b/libgcc/config/rs6000/fixunstfdi.c
@@ -0,0 +1,124 @@
+/* Convert IBM long double to 64bit unsigned integer.
+
+ Copyright (C) 2018 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ In addition to the permissions in the GNU Lesser General Public
+ License, the Free Software Foundation gives you unlimited
+ permission to link the compiled version of this file into
+ combinations with other programs, and to distribute those
+ combinations without any restriction coming from the use of this
+ file. (The Lesser General Public License restrictions do apply in
+ other respects; for example, they cover modification of the file,
+ and distribution when not linked into a combine executable.)
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#if defined(__powerpc64__) || defined (__64BIT__) || defined(__ppc64__)
+#include <stdint.h>
+#include "ibm-ldouble.h"
+
+typedef unsigned int UDItype __attribute__ ((mode (DI)));
+typedef float TFtype __attribute__ ((mode (TF)));
+extern UDItype __fixunstfdi (TFtype);
+
+#define TWO53 9007199254740992.0L
+#define TWO64 18446744073709551616.0L
+
+/* Convert the IBM long double A to a 64-bit unsigned integer, truncating
+   toward zero.  A is the sum of a high and a low double, and the low
+   double may be negative or fractional even when the high double is an
+   integer, so both parts take part in the truncation.  For negative,
+   NaN and too-large A the result is whatever converting the high double
+   alone to an unsigned doubleword yields.  */
+UDItype
+__fixunstfdi (TFtype a)
+{
+  unsigned long result;
+  unsigned long qi0, qi1, inexact;
+  union ibm_extended_long_double ld;
+  uint64_t l0, l1;
+  long exp0, shift;
+  const uint64_t two52 = 0x10000000000000;
+  const double hi = __builtin_unpack_longdouble (a, 0);
+  const double lo = __builtin_unpack_longdouble (a, 1);
+  if (hi < TWO53)
+    {
+      /* The high double holds the whole integer portion, so start from
+         the hardware convert to integer doubleword.  */
+      l1 = hi;
+      /* If the high double is itself a positive integer and the low
+         double is negative, A lies just below that integer and
+         truncation must step down by one.  L1 is below 2^53 here, so
+         converting it back to double is exact.  */
+      if (lo < 0.0 && l1 != 0 && (double) (int64_t) l1 == hi)
+        l1--;
+      result = l1;
+    }
+  else if (a < TWO64)
+    {
+      ld.ld = a;
+      /* The high double is at least 2^53 here, hence an integer with a
+         positive shift count.  If it is exactly 2^64 the shift wraps to
+         zero and the negative low double brings the sum back in range.  */
+      l0 = two52 | ((uint64_t) ld.d[0].ieee.mantissa0 << 32)
+           | ld.d[0].ieee.mantissa1;
+      exp0 = ld.d[0].ieee.exponent - IEEE754_DOUBLE_BIAS;
+      qi0 = l0 << (exp0 - 52);
+      /* Magnitude of the low double.  A zero or denormal low double
+         has no implicit leading one.  SHIFT counts its fraction bits.  */
+      l1 = ((uint64_t) ld.d[1].ieee.mantissa0 << 32)
+           | ld.d[1].ieee.mantissa1;
+      if (ld.d[1].ieee.exponent != 0)
+        l1 |= two52;
+      shift = 52 - ((long) ld.d[1].ieee.exponent - IEEE754_DOUBLE_BIAS);
+      if (shift <= 0)
+        {
+          /* The low double is an integer: shift left.  */
+          qi1 = l1 << (-shift);
+          inexact = 0;
+        }
+      else if (shift < 64)
+        {
+          /* Shift right to truncate and note whether any nonzero
+             fraction bit was dropped.  */
+          qi1 = l1 >> shift;
+          inexact = (qi1 << shift) != l1;
+        }
+      else
+        {
+          /* Every bit is a fraction bit.  A shift count of 64 or
+             more is undefined in C, so do not shift at all.  */
+          qi1 = 0;
+          inexact = l1 != 0;
+        }
+      /* The high double is positive.  A negative low double is
+         subtracted, and a dropped fraction then borrows one more
+         because truncation is toward zero.  */
+      if (ld.d[1].ieee.negative)
+        result = qi0 - qi1 - inexact;
+      else
+        result = qi0 + qi1;
+    }
+  else
+    {
+      /* Overflow (or NaN) case.  Convert the high double, which
+         generates the overflow value and sets CVI.  */
+      l0 = hi;
+      result = l0;
+    }
+  return result;
+}
+#endif
diff --git a/libgcc/config/rs6000/fixunstfti.c b/libgcc/config/rs6000/fixunstfti.c
new file mode 100755
index 00000000000..68a8da9c91b
--- /dev/null
+++ b/libgcc/config/rs6000/fixunstfti.c
@@ -0,0 +1,125 @@
+/* Convert IBM long double to 128bit unsigned integer.
+
+ Copyright (C) 2018 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ In addition to the permissions in the GNU Lesser General Public
+ License, the Free Software Foundation gives you unlimited
+ permission to link the compiled version of this file into
+ combinations with other programs, and to distribute those
+ combinations without any restriction coming from the use of this
+ file. (The Lesser General Public License restrictions do apply in
+ other respects; for example, they cover modification of the file,
+ and distribution when not linked into a combine executable.)
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#if defined(__powerpc64__) || defined (__64BIT__) || defined(__ppc64__)
+#include <stdint.h>
+#include "ibm-ldouble.h"
+
+typedef unsigned int UTItype __attribute__ ((mode (TI)));
+typedef float TFtype __attribute__ ((mode (TF)));
+extern UTItype __fixunstfti (TFtype);
+
+#define TWO53 9007199254740992.0L
+#define TWO128 340282366920938463463374607431768211456.0L
+
+/* Convert the IBM long double A to a 128-bit unsigned integer, truncating
+   toward zero.  A is the sum of a high and a low double, and the low
+   double may be negative or fractional even when the high double is an
+   integer, so both parts take part in the truncation.  For negative,
+   NaN and too-large A the result is built from converting the high
+   double alone to an unsigned doubleword.  */
+UTItype
+__fixunstfti (TFtype a)
+{
+  unsigned __int128 result;
+  unsigned __int128 qi0, qi1;
+  union ibm_extended_long_double ld;
+  uint64_t l0, l1, inexact;
+  long exp0, shift;
+  const uint64_t two52 = 0x10000000000000;
+  const double hi = __builtin_unpack_longdouble (a, 0);
+  const double lo = __builtin_unpack_longdouble (a, 1);
+  if (hi < TWO53)
+    {
+      /* The high double holds the whole integer portion, so start from
+         the hardware convert to integer doubleword.  */
+      l1 = hi;
+      /* If the high double is itself a positive integer and the low
+         double is negative, A lies just below that integer and
+         truncation must step down by one.  L1 is below 2^53 here, so
+         converting it back to double is exact.  */
+      if (lo < 0.0 && l1 != 0 && (double) (int64_t) l1 == hi)
+        l1--;
+      result = l1;
+    }
+  else if (a < TWO128)
+    {
+      ld.ld = a;
+      /* The high double is at least 2^53 here, hence an integer with a
+         positive shift count.  If it is exactly 2^128 the shift wraps to
+         zero and the negative low double brings the sum back in range.  */
+      l0 = two52 | ((uint64_t) ld.d[0].ieee.mantissa0 << 32)
+           | ld.d[0].ieee.mantissa1;
+      exp0 = ld.d[0].ieee.exponent - IEEE754_DOUBLE_BIAS;
+      qi0 = (unsigned __int128) l0 << (exp0 - 52);
+      /* Magnitude of the low double.  A zero or denormal low double
+         has no implicit leading one.  SHIFT counts its fraction bits.  */
+      l1 = ((uint64_t) ld.d[1].ieee.mantissa0 << 32)
+           | ld.d[1].ieee.mantissa1;
+      if (ld.d[1].ieee.exponent != 0)
+        l1 |= two52;
+      shift = 52 - ((long) ld.d[1].ieee.exponent - IEEE754_DOUBLE_BIAS);
+      if (shift <= 0)
+        {
+          /* The low double is an integer: widen, then shift left.  */
+          qi1 = (unsigned __int128) l1 << (-shift);
+          inexact = 0;
+        }
+      else if (shift < 64)
+        {
+          /* Shift right to truncate and note whether any nonzero
+             fraction bit was dropped.  */
+          qi1 = l1 >> shift;
+          inexact = (qi1 << shift) != l1;
+        }
+      else
+        {
+          /* Every bit is a fraction bit.  A shift count of 64 or
+             more is undefined in C, so do not shift at all.  */
+          qi1 = 0;
+          inexact = l1 != 0;
+        }
+      /* The high double is positive.  A negative low double is
+         subtracted, and a dropped fraction then borrows one more
+         because truncation is toward zero.  */
+      if (ld.d[1].ieee.negative)
+        result = qi0 - qi1 - inexact;
+      else
+        result = qi0 + qi1;
+    }
+  else
+    {
+      /* Overflow (or NaN) case.  Convert the high double, then
+         replicate it into both halves of the result.  */
+      l0 = hi;
+      result = l0;
+      result = (result << 64) + l0;
+    }
+  return result;
+}
+#endif
diff --git a/libgcc/config/rs6000/ibm-ldouble.h b/libgcc/config/rs6000/ibm-ldouble.h
new file mode 100755
index 00000000000..2fb89cd252a
--- /dev/null
+++ b/libgcc/config/rs6000/ibm-ldouble.h
@@ -0,0 +1,121 @@
+/* Copyright (C) 2018 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <endian.h>
+union ieee754_float
+ {
+ float f; /* The value viewed as a float. */
+
+ /* This is the IEEE 754 single-precision format. */
+ struct
+ {
+#if __BYTE_ORDER == __BIG_ENDIAN
+ unsigned int negative:1; /* Sign bit. */
+ unsigned int exponent:8; /* Exponent, biased by IEEE754_FLOAT_BIAS. */
+ unsigned int mantissa:23; /* Stored mantissa bits. */
+#endif /* Big endian. */
+#if __BYTE_ORDER == __LITTLE_ENDIAN
+ unsigned int mantissa:23; /* Same fields, declared in reverse order. */
+ unsigned int exponent:8;
+ unsigned int negative:1;
+#endif /* Little endian. */
+ } ieee;
+
+ /* This format makes it easier to see if a NaN is a signalling NaN. */
+ struct
+ {
+#if __BYTE_ORDER == __BIG_ENDIAN
+ unsigned int negative:1; /* Sign bit. */
+ unsigned int exponent:8; /* Exponent, biased by IEEE754_FLOAT_BIAS. */
+ unsigned int quiet_nan:1; /* Top mantissa bit, split out on its own. */
+ unsigned int mantissa:22; /* Remaining 22 mantissa bits. */
+#endif /* Big endian. */
+#if __BYTE_ORDER == __LITTLE_ENDIAN
+ unsigned int mantissa:22; /* Same fields, declared in reverse order. */
+ unsigned int quiet_nan:1;
+ unsigned int exponent:8;
+ unsigned int negative:1;
+#endif /* Little endian. */
+ } ieee_nan;
+ };
+
+#define IEEE754_FLOAT_BIAS 0x7f /* Added to exponent. */
+
+
+union ieee754_double
+ {
+ double d; /* The value viewed as a double. */
+
+ /* This is the IEEE 754 double-precision format. */
+ struct
+ {
+#if __BYTE_ORDER == __BIG_ENDIAN
+ unsigned int negative:1; /* Sign bit. */
+ unsigned int exponent:11; /* Exponent, biased by IEEE754_DOUBLE_BIAS. */
+ /* Together these comprise the mantissa. */
+ unsigned int mantissa0:20; /* Upper 20 mantissa bits. */
+ unsigned int mantissa1:32; /* Lower 32 mantissa bits. */
+#endif /* Big endian. */
+#if __BYTE_ORDER == __LITTLE_ENDIAN
+ /* Together these comprise the mantissa. */
+ unsigned int mantissa1:32; /* Same fields, declared in reverse order. */
+ unsigned int mantissa0:20;
+ unsigned int exponent:11;
+ unsigned int negative:1;
+#endif /* Little endian. */
+ } ieee;
+
+ /* This format makes it easier to see if a NaN is a signalling NaN. */
+ struct
+ {
+#if __BYTE_ORDER == __BIG_ENDIAN
+ unsigned int negative:1; /* Sign bit. */
+ unsigned int exponent:11; /* Exponent, biased by IEEE754_DOUBLE_BIAS. */
+ unsigned int quiet_nan:1; /* Top mantissa bit, split out on its own. */
+ /* Together these comprise the mantissa. */
+ unsigned int mantissa0:19; /* Upper 19 of the remaining mantissa bits. */
+ unsigned int mantissa1:32; /* Lower 32 mantissa bits. */
+#else
+ /* Together these comprise the mantissa. */
+ unsigned int mantissa1:32; /* Same fields, declared in reverse order. */
+ unsigned int mantissa0:19;
+ unsigned int quiet_nan:1;
+ unsigned int exponent:11;
+ unsigned int negative:1;
+#endif
+ } ieee_nan;
+ };
+
+#define IEEE754_DOUBLE_BIAS 0x3ff /* Added to exponent. */
+
+
+/* IBM extended format for long double.
+
+ Each long double is made up of two IEEE doubles. The value of the
+ long double is the sum of the values of the two parts. The most
+ significant part is required to be the value of the long double
+ rounded to the nearest double, as specified by IEEE. For Inf
+ values, the least significant part is required to be one of +0.0 or
+ -0.0. No other requirements are made; so, for example, 1.0 may be
+ represented as (1.0, +0.0) or (1.0, -0.0), and the low part of a
+ NaN is don't-care. */
+
+union ibm_extended_long_double
+ {
+ long double ld; /* The value viewed as one long double. */
+ union ieee754_double d[2]; /* The same storage viewed as its two doubles. */
+ };
diff --git a/libgcc/config/rs6000/ppc64-fp.c b/libgcc/config/rs6000/ppc64-fp.c
index faffc82eeda..97921632dcb 100644
--- a/libgcc/config/rs6000/ppc64-fp.c
+++ b/libgcc/config/rs6000/ppc64-fp.c
@@ -183,30 +183,6 @@ __floatundisf (UDItype u)
return (SFtype) f;
}
-DItype
-__fixunstfdi (TFtype a)
-{
- if (a < 0)
- return 0;
-
- /* Compute high word of result, as a flonum. */
- const TFtype b = (a / (((UDItype) 1) << (sizeof (SItype) * 8)));
- /* Convert that to fixed (but not to DItype!),
- and shift it into the high word. */
- UDItype v = (USItype) b;
- v <<= (sizeof (SItype) * 8);
- /* Remove high part from the TFtype, leaving the low part as flonum. */
- a -= (TFtype) v;
- /* Convert that to fixed (but not to DItype!) and add it in.
- Sometimes A comes out negative. This is significant, since
- A has more bits than a long int does. */
- if (a < 0)
- v -= (USItype) (-a);
- else
- v += (USItype) a;
- return v;
-}
-
/* This version is needed to prevent recursion; fixunsdfdi in libgcc
calls fixdfdi, which in turn calls calls fixunsdfdi. */
diff --git a/libgcc/config/rs6000/t-ppc64-fp b/libgcc/config/rs6000/t-ppc64-fp
index 26d1730bcdb..37b75a931ff 100644
--- a/libgcc/config/rs6000/t-ppc64-fp
+++ b/libgcc/config/rs6000/t-ppc64-fp
@@ -1,2 +1,5 @@
# Can be used unconditionally, wrapped in __powerpc64__ || __64BIT__ __ppc64__.
-LIB2ADD += $(srcdir)/config/rs6000/ppc64-fp.c
+LIB2FUNCS_EXCLUDE = _fixunstfdi
+LIB2ADD += $(srcdir)/config/rs6000/ppc64-fp.c \
+ $(srcdir)/config/rs6000/fixunstfti.c \
+ $(srcdir)/config/rs6000/fixunstfdi.c
--
2.18.0
More information about the Gcc-patches
mailing list