[PATCH] powerpc: Optimized conversion of IBM long double to int128/int64

Tue Oct 23 16:31:00 GMT 2018

This new implementation of fixunstfdi and fixunstfti
gives a 16X performance improvement.
The design is focused on:
- Making sure the end result is a pure leaf function that
  only needs builtins or inline functions.
- Assuming POWER8 direct register transfers and accessing the IBM
  long double as an integer bit-field structure.
- Understanding the quirks of IBM long double and decomposing the
  code into a set of optimized sub-cases.
Tested on powerpc64le.

2018-10-20  Steven Munroe  <munroesj52@gmail.com>
            Rajalakshmi Srinivasaraghavan  <raji@linux.vnet.ibm.com>

	* libgcc/config/rs6000/t-ppc64-fp (LIB2FUNCS_EXCLUDE): Add _fixunstfdi.
	(LIB2ADD): Add $(srcdir)/config/rs6000/fixunstfti.c and
	$(srcdir)/config/rs6000/fixunstfdi.c.
	* libgcc/config/rs6000/ppc64-fp.c (__fixunstfdi): Remove definition.
	* libgcc/config/rs6000/fixunstfti.c: New file.
	* libgcc/config/rs6000/fixunstfdi.c: Likewise.
	* libgcc/config/rs6000/ibm-ldouble.h: Likewise.
---
 libgcc/config/rs6000/fixunstfdi.c  | 124 ++++++++++++++++++++++++++++
 libgcc/config/rs6000/fixunstfti.c  | 125 +++++++++++++++++++++++++++++
 libgcc/config/rs6000/ibm-ldouble.h | 121 ++++++++++++++++++++++++++++
 libgcc/config/rs6000/ppc64-fp.c    |  24 ------
 libgcc/config/rs6000/t-ppc64-fp    |   5 +-
 5 files changed, 374 insertions(+), 25 deletions(-)
 create mode 100755 libgcc/config/rs6000/fixunstfdi.c
 create mode 100755 libgcc/config/rs6000/fixunstfti.c
 create mode 100755 libgcc/config/rs6000/ibm-ldouble.h

diff --git a/libgcc/config/rs6000/fixunstfdi.c b/libgcc/config/rs6000/fixunstfdi.c
new file mode 100755
index 00000000000..1b1a4f280bd
--- /dev/null
+++ b/libgcc/config/rs6000/fixunstfdi.c
@@ -0,0 +1,124 @@
+/* Convert IBM long double to 64bit unsigned integer.
+
+   Copyright (C) 2018 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   In addition to the permissions in the GNU Lesser General Public
+   License, the Free Software Foundation gives you unlimited
+   permission to link the compiled version of this file into
+   combinations with other programs, and to distribute those
+   combinations without any restriction coming from the use of this
+   file.  (The Lesser General Public License restrictions do apply in
+   other respects; for example, they cover modification of the file,
+   and distribution when not linked into a combine executable.)
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if defined(__powerpc64__) || defined (__64BIT__) || defined(__ppc64__)
+#include <stdint.h>
+#include "ibm-ldouble.h"
+
+typedef unsigned int UDItype __attribute__ ((mode (DI)));
+typedef float TFtype __attribute__ ((mode (TF)));
+extern UDItype __fixunstfdi (TFtype);
+
+#define TWO53 9007199254740992.0L		/* 2^53 */
+#define TWO64 18446744073709551616.0L		/* 2^64 */
+
+/* Convert the IBM long double A to an unsigned 64-bit integer,
+   truncating toward zero.  A is the sum of a high and a low double;
+   the cases below are keyed on the magnitude of the high double.
+   Negative, too-large and NaN inputs are left to the hardware
+   conversion of the high double.  */
+UDItype
+__fixunstfdi (TFtype a)
+{
+  unsigned long result;
+  unsigned long qi0, qi1, frac;
+  union ibm_extended_long_double ld;
+  uint64_t l0, l1;
+  long exp0, exp1;
+  const uint64_t two52 = 0x10000000000000;
+  const double hi = __builtin_unpack_longdouble (a, 0);
+  const double lo = __builtin_unpack_longdouble (a, 1);
+
+  if (hi < TWO53)
+    {
+      /* The integer portion is contained within the high double, so
+	 use the hardware convert to integer doubleword.  */
+      result = (uint64_t) hi;
+      /* If the high double is a nonzero integer and the low double is
+	 negative, the value lies just below that integer, so
+	 truncation toward zero must give one less.  */
+      if (result != 0 && (double) result == hi && lo < 0.0)
+	result--;
+      return result;
+    }
+  if (!(a < TWO64))
+    {
+      /* Overflow case (also taken for NaN).  Converting the high
+	 double generates the overflow value and sets CVI.  */
+      return (uint64_t) hi;
+    }
+
+  /* Here the high double is at least 2^53 and A is below 2^64.
+     Rebuild each 53-bit significand (implicit leading one restored)
+     and unbiased exponent from the bit fields.  */
+  ld.ld = a;
+  l0 = two52 | ((uint64_t) ld.d[0].ieee.mantissa0 << 32)
+	     | ld.d[0].ieee.mantissa1;
+  l1 = two52 | ((uint64_t) ld.d[1].ieee.mantissa0 << 32)
+	     | ld.d[1].ieee.mantissa1;
+  exp0 = ld.d[0].ieee.exponent - IEEE754_DOUBLE_BIAS;
+  exp1 = ld.d[1].ieee.exponent - IEEE754_DOUBLE_BIAS;
+  /* The high double shift is positive because the high double is at
+     least 2^53.  If the high double is exactly 2^64 the shifted
+     value wraps to zero and the subtraction below still yields the
+     right answer modulo 2^64.  */
+  qi0 = l0;
+  qi0 = qi0 << (exp0 - 52);
+  /* Split the magnitude of the low double into its integer part and
+     a flag recording whether a nonzero fraction was discarded.  The
+     exponent can be hugely negative, so a right shift is only done
+     when its count is known to be in 1..52.  */
+  if (exp1 >= 52)
+    {
+      /* Already an integer; shift left.  */
+      qi1 = l1;
+      qi1 = qi1 << (exp1 - 52);
+      frac = 0;
+    }
+  else if (exp1 >= 0)
+    {
+      /* Shift right to truncate, remembering any bits lost.  */
+      frac = (l1 & (((uint64_t) 1 << (52 - exp1)) - 1)) != 0;
+      qi1 = l1 >> (52 - exp1);
+    }
+  else
+    {
+      /* Magnitude below one: zero, denormal or a pure fraction.  */
+      qi1 = 0;
+      frac = lo != 0.0;
+    }
+  /* The high double is positive here, so only the sign of the low
+     double matters.  When it is negative, subtract its magnitude
+     rounded up so that the total is truncated toward zero.  */
+  if (ld.d[1].ieee.negative)
+    result = qi0 - qi1 - frac;
+  else
+    result = qi0 + qi1;
+  return result;
+}
+#endif
diff --git a/libgcc/config/rs6000/fixunstfti.c b/libgcc/config/rs6000/fixunstfti.c
new file mode 100755
index 00000000000..68a8da9c91b
--- /dev/null
+++ b/libgcc/config/rs6000/fixunstfti.c
@@ -0,0 +1,125 @@
+/* Convert IBM long double to 128bit unsigned integer.
+
+   Copyright (C) 2018 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   In addition to the permissions in the GNU Lesser General Public
+   License, the Free Software Foundation gives you unlimited
+   permission to link the compiled version of this file into
+   combinations with other programs, and to distribute those
+   combinations without any restriction coming from the use of this
+   file.  (The Lesser General Public License restrictions do apply in
+   other respects; for example, they cover modification of the file,
+   and distribution when not linked into a combine executable.)
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if defined(__powerpc64__) || defined (__64BIT__) || defined(__ppc64__)
+#include <stdint.h>
+#include "ibm-ldouble.h"
+
+typedef unsigned int UTItype __attribute__ ((mode (TI)));
+typedef float TFtype __attribute__ ((mode (TF)));
+extern UTItype __fixunstfti (TFtype);
+
+#define TWO53 9007199254740992.0L		/* 2^53 */
+#define TWO128 340282366920938463463374607431768211456.0L /* 2^128 */
+
+/* Convert the IBM long double A to an unsigned 128-bit integer,
+   truncating toward zero.  A is the sum of a high and a low double;
+   the cases below are keyed on the magnitude of the high double.
+   Negative, too-large and NaN inputs are left to the hardware
+   conversion of the high double.  */
+UTItype
+__fixunstfti (TFtype a)
+{
+  unsigned __int128 result;
+  unsigned __int128 qi0, qi1;
+  union ibm_extended_long_double ld;
+  uint64_t l0, l1, frac;
+  long exp0, exp1;
+  const uint64_t two52 = 0x10000000000000;
+  const double hi = __builtin_unpack_longdouble (a, 0);
+  const double lo = __builtin_unpack_longdouble (a, 1);
+  if (hi < TWO53)
+    {
+      /* The integer portion is contained within the high double, so
+	 use the hardware convert to integer doubleword, and then
+	 extend to __int128.  */
+      l0 = (uint64_t) hi;
+      /* If the high double is a nonzero integer and the low double is
+	 negative, the value lies just below that integer, so
+	 truncation toward zero must give one less.  */
+      if (l0 != 0 && (double) l0 == hi && lo < 0.0)
+	l0--;
+      return l0;
+    }
+  if (!(a < TWO128))
+    {
+      /* Overflow (or NaN): replicate the converted high double into
+	 both halves; this gives the overflow value and sets CVI.  */
+      l0 = (uint64_t) hi;
+      result = l0;
+      return (result << 64) + l0;
+    }
+  /* Here the high double is at least 2^53 and A is below 2^128.
+     Rebuild each 53-bit significand (implicit leading one restored)
+     and unbiased exponent from the bit fields.  */
+  ld.ld = a;
+  l0 = two52 | ((uint64_t) ld.d[0].ieee.mantissa0 << 32)
+	     | ld.d[0].ieee.mantissa1;
+  l1 = two52 | ((uint64_t) ld.d[1].ieee.mantissa0 << 32)
+	     | ld.d[1].ieee.mantissa1;
+  exp0 = ld.d[0].ieee.exponent - IEEE754_DOUBLE_BIAS;
+  exp1 = ld.d[1].ieee.exponent - IEEE754_DOUBLE_BIAS;
+  /* The high double shift is positive because the high double is at
+     least 2^53.  If the high double is exactly 2^128 the shifted
+     value wraps to zero and the subtraction below still yields the
+     right answer modulo 2^128.  */
+  qi0 = l0;
+  qi0 = qi0 << (exp0 - 52);
+  /* Split the magnitude of the low double into its integer part and
+     a flag recording whether a nonzero fraction was discarded.  The
+     exponent can be hugely negative, so a right shift is only done
+     when its count is known to be in 1..52.  */
+  if (exp1 >= 52)
+    {
+      /* Already an integer; shift left as a quadword.  */
+      qi1 = l1;
+      qi1 = qi1 << (exp1 - 52);
+      frac = 0;
+    }
+  else if (exp1 >= 0)
+    {
+      /* Shift right to truncate, remembering any bits lost.  */
+      frac = (l1 & (((uint64_t) 1 << (52 - exp1)) - 1)) != 0;
+      qi1 = l1 >> (52 - exp1);
+    }
+  else
+    {
+      /* Magnitude below one: zero, denormal or a pure fraction.  */
+      qi1 = 0;
+      frac = lo != 0.0;
+    }
+  /* The high double is positive here, so only the sign of the low
+     double matters.  When it is negative, subtract its magnitude
+     rounded up so that the total is truncated toward zero.  */
+  if (ld.d[1].ieee.negative)
+    result = qi0 - qi1 - frac;
+  else
+    result = qi0 + qi1;
+  return result;
+}
+#endif
diff --git a/libgcc/config/rs6000/ibm-ldouble.h b/libgcc/config/rs6000/ibm-ldouble.h
new file mode 100755
index 00000000000..2fb89cd252a
--- /dev/null
+++ b/libgcc/config/rs6000/ibm-ldouble.h
@@ -0,0 +1,121 @@
+/* Copyright (C) 2018 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <endian.h>
+union ieee754_float
+  {
+    float f;			/* The value viewed as a float.  */
+
+    /* IEEE 754 single-precision layout: sign, exponent, mantissa.  */
+    struct
+      {
+#if	__BYTE_ORDER == __BIG_ENDIAN
+	unsigned int negative:1;	/* Sign bit.  */
+	unsigned int exponent:8;	/* Biased exponent.  */
+	unsigned int mantissa:23;	/* Stored mantissa bits.  */
+#endif				/* Big endian.  */
+#if	__BYTE_ORDER == __LITTLE_ENDIAN
+	unsigned int mantissa:23;	/* Same fields, reverse order.  */
+	unsigned int exponent:8;
+	unsigned int negative:1;
+#endif				/* Little endian.  */
+      } ieee;
+
+    /* Same layout with the top mantissa bit split out as quiet_nan,
+       which makes it easier to see if a NaN is a signalling NaN.  */
+    struct
+      {
+#if	__BYTE_ORDER == __BIG_ENDIAN
+	unsigned int negative:1;
+	unsigned int exponent:8;
+	unsigned int quiet_nan:1;	/* Top bit of the mantissa.  */
+	unsigned int mantissa:22;	/* Remaining mantissa bits.  */
+#endif				/* Big endian.  */
+#if	__BYTE_ORDER == __LITTLE_ENDIAN
+	unsigned int mantissa:22;
+	unsigned int quiet_nan:1;
+	unsigned int exponent:8;
+	unsigned int negative:1;
+#endif				/* Little endian.  */
+      } ieee_nan;
+  };
+
+#define IEEE754_FLOAT_BIAS	0x7f /* Added to exponent.  */
+
+
+union ieee754_double
+  {
+    double d;			/* The value viewed as a double.  */
+
+    /* IEEE 754 double-precision layout: sign, exponent, mantissa.  */
+    struct
+      {
+#if	__BYTE_ORDER == __BIG_ENDIAN
+	unsigned int negative:1;	/* Sign bit.  */
+	unsigned int exponent:11;	/* Biased exponent.  */
+	/* Together these comprise the mantissa.  */
+	unsigned int mantissa0:20;	/* High 20 bits.  */
+	unsigned int mantissa1:32;	/* Low 32 bits.  */
+#endif				/* Big endian.  */
+#if	__BYTE_ORDER == __LITTLE_ENDIAN
+	/* Together these comprise the mantissa.  */
+	unsigned int mantissa1:32;
+	unsigned int mantissa0:20;
+	unsigned int exponent:11;
+	unsigned int negative:1;
+#endif				/* Little endian.  */
+      } ieee;
+
+    /* Same layout with the top mantissa bit split out as quiet_nan,
+       which makes it easier to see if a NaN is a signalling NaN.  */
+    struct
+      {
+#if	__BYTE_ORDER == __BIG_ENDIAN
+	unsigned int negative:1;
+	unsigned int exponent:11;
+	unsigned int quiet_nan:1;	/* Top bit of the mantissa.  */
+	/* Together these comprise the mantissa.  */
+	unsigned int mantissa0:19;
+	unsigned int mantissa1:32;
+#else
+	/* Together these comprise the mantissa.  */
+	unsigned int mantissa1:32;
+	unsigned int mantissa0:19;
+	unsigned int quiet_nan:1;
+	unsigned int exponent:11;
+	unsigned int negative:1;
+#endif
+      } ieee_nan;
+  };
+
+#define IEEE754_DOUBLE_BIAS	0x3ff /* Added to exponent.  */
+
+
+/* IBM extended format for long double.
+
+   Each long double is made up of two IEEE doubles.  The value of the
+   long double is the sum of the values of the two parts.  The most
+   significant part is required to be the value of the long double
+   rounded to the nearest double, as specified by IEEE.  For Inf
+   values, the least significant part is required to be one of +0.0 or
+   -0.0.  No other requirements are made; so, for example, 1.0 may be
+   represented as (1.0, +0.0) or (1.0, -0.0), and the low part of a
+   NaN is don't-care.  */
+
+union ibm_extended_long_double
+  {
+    long double ld;		/* The value as a long double.  */
+    union ieee754_double d[2];	/* The same storage as two doubles.  */
+   };
diff --git a/libgcc/config/rs6000/ppc64-fp.c b/libgcc/config/rs6000/ppc64-fp.c
index faffc82eeda..97921632dcb 100644
--- a/libgcc/config/rs6000/ppc64-fp.c
+++ b/libgcc/config/rs6000/ppc64-fp.c
@@ -183,30 +183,6 @@ __floatundisf (UDItype u)
   return (SFtype) f;
 }
 
-DItype
-__fixunstfdi (TFtype a)
-{
-  if (a < 0)
-    return 0;
-
-  /* Compute high word of result, as a flonum.  */
-  const TFtype b = (a / (((UDItype) 1) << (sizeof (SItype) * 8)));
-  /* Convert that to fixed (but not to DItype!),
-     and shift it into the high word.  */
-  UDItype v = (USItype) b;
-  v <<= (sizeof (SItype) * 8);
-  /* Remove high part from the TFtype, leaving the low part as flonum.  */
-  a -= (TFtype) v;
-  /* Convert that to fixed (but not to DItype!) and add it in.
-     Sometimes A comes out negative.  This is significant, since
-     A has more bits than a long int does.  */
-  if (a < 0)
-    v -= (USItype) (-a);
-  else
-    v += (USItype) a;
-  return v;
-}
-
 /* This version is needed to prevent recursion; fixunsdfdi in libgcc
    calls fixdfdi, which in turn calls calls fixunsdfdi.  */
 
diff --git a/libgcc/config/rs6000/t-ppc64-fp b/libgcc/config/rs6000/t-ppc64-fp
index 26d1730bcdb..37b75a931ff 100644
--- a/libgcc/config/rs6000/t-ppc64-fp
+++ b/libgcc/config/rs6000/t-ppc64-fp
@@ -1,2 +1,5 @@
 # Can be used unconditionally, wrapped in __powerpc64__ || __64BIT__ __ppc64__.
-LIB2ADD += $(srcdir)/config/rs6000/ppc64-fp.c
+LIB2FUNCS_EXCLUDE += _fixunstfdi
+LIB2ADD += $(srcdir)/config/rs6000/ppc64-fp.c \
+	   $(srcdir)/config/rs6000/fixunstfti.c \
+	   $(srcdir)/config/rs6000/fixunstfdi.c
-- 
2.18.0