wide-int more performance fixes for wide multiplication.


This patch is the last performance patch that I have for wide-int.

This patch changes the large multiply from taking (precision/hbpwi) * (precision/hbpwi) multiplies to taking (#significant_bits1/hbpwi) * (#significant_bits2/hbpwi) multiplies. That was a significant number of multiplies on machines with a large largest integer mode.
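To put rough numbers on it (mine, not from the patch): with a 32-bit HOST_HALF_WIDE_INT and a 512-bit largest mode, the old code always did 16 * 16 = 256 half-word multiplies, whereas two operands that each fit in a single HWI now take only 2 * 2 = 4.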

This patch was mostly tested before Richard Sandiford's patch that uses the longlong stuff. That patch catches many of the cases this one would, but if either of the operands does not fit into an HWI, things got really expensive without this patch.

This patch works a lot differently from the version on the branch. On the branch, the inputs were assumed to be unsigned numbers and the multiply happened with whatever came in. After the multiply, if a signed multiply had been requested, the product was adjusted.
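For reference, that after-the-fact adjustment is the usual identity for recovering a signed product from an unsigned one. Here is a minimal standalone sketch of it for a 64x64->128 multiply (my own toy function using GCC's __int128 extension, not the patch's types); the removed loops in the patch below do the same subtraction per half-word:

#include <stdint.h>

/* Multiply unsigned, then fix up the high half for a signed
   interpretation: for each negative operand, subtract the other
   operand from the high half.  */
static void
mul_full_signed_fixup (int64_t a, int64_t b, int64_t *hi, uint64_t *lo)
{
  uint64_t ua = (uint64_t) a, ub = (uint64_t) b;
  unsigned __int128 p = (unsigned __int128) ua * ub;
  uint64_t high = (uint64_t) (p >> 64);
  if (a < 0)
    high -= ub;		/* correct for op1's sign bit */
  if (b < 0)
    high -= ua;		/* correct for op2's sign bit */
  *hi = (int64_t) high;
  *lo = (uint64_t) p;
}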

In this version, the adjustment happens first if we are doing a signed multiply, so the operands that reach the actual multiply loops are always unsigned numbers. We may need an extra bit for this, but we have it. The advantage is that negative numbers have a lot of leading 1s, and compensating for them is complex. The easy solution is to convert to a positive number and then count how many HWIs are needed to represent it. Of course, if exactly one of the operands was negative, we negate the product at the end.
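A comparable sketch of this version's negate-first flow (again invented standalone C, not the patch's actual helpers):

#include <stdbool.h>
#include <stdint.h>

/* Reduce a signed multiply to an unsigned multiply of magnitudes,
   then negate the product if exactly one operand was negative.  */
static void
mul_full_negate_first (int64_t a, int64_t b, int64_t *hi, uint64_t *lo)
{
  bool neg = false;
  uint64_t ua = (uint64_t) a, ub = (uint64_t) b;
  if (a < 0)
    {
      ua = -ua;			/* take the magnitude */
      neg = !neg;
    }
  if (b < 0)
    {
      ub = -ub;
      neg = !neg;
    }
  unsigned __int128 p = (unsigned __int128) ua * ub;
  if (neg)
    p = -p;			/* exactly one operand was negative */
  *hi = (int64_t) (uint64_t) (p >> 64);
  *lo = (uint64_t) p;
}

The real code works on arrays of HOST_HALF_WIDE_INTs instead, so the win is that the lengths of the magnitudes (ulen/vlen) bound the inner loops rather than the full precision.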

This patch was tested on x86-64 with Richard's longlong shortcut commented out so that this code would be hit a lot of times, and in that setup it makes a big difference in the number of multiplies performed. Without commenting that out, the number of hits is small when one looks only at the bootstrap and the regression tests. However, this patch will be hit more often on ports that decide to revert to a 32-bit HWI, because any time one of the operands is larger than 32 bits, this patch takes control. It also hits whenever one of the operands really does not fit in an HWI.

This patch only works after the patch to change tree-vrp has been applied:

http://gcc.gnu.org/ml/gcc-patches/2013-12/msg00877.html

kenny
Index: gcc/wide-int.cc
===================================================================
--- gcc/wide-int.cc	(revision 205765)
+++ gcc/wide-int.cc	(working copy)
@@ -1275,23 +1275,31 @@ wi::mul_internal (HOST_WIDE_INT *val, co
   unsigned int j;
   unsigned int blocks_needed = BLOCKS_NEEDED (prec);
   unsigned int half_blocks_needed = blocks_needed * 2;
-  /* The sizes here are scaled to support a 2x largest mode by 2x
-     largest mode yielding a 4x largest mode result.  This is what is
-     needed by vpn.  */
 
+  /* The sizes here are scaled to support a 2*largest mode + a little
+     by 2*largest mode + a little yielding a 4*largest mode + a
+     little.  This is what is needed by VRP.  */
   unsigned HOST_HALF_WIDE_INT
-    u[4 * MAX_BITSIZE_MODE_ANY_INT / HOST_BITS_PER_HALF_WIDE_INT];
+    u[2 * WIDE_INT_MAX_PRECISION / HOST_BITS_PER_HALF_WIDE_INT];
+  unsigned int ulen;
   unsigned HOST_HALF_WIDE_INT
-    v[4 * MAX_BITSIZE_MODE_ANY_INT / HOST_BITS_PER_HALF_WIDE_INT];
-  /* The '2' in 'R' is because we are internally doing a full
+    v[2 * WIDE_INT_MAX_PRECISION / HOST_BITS_PER_HALF_WIDE_INT];
+  unsigned int vlen;
+  unsigned int uvlen;
+  unsigned int vallen;
+
+  /* The '4' in 'R' is because we are internally doing a wide
      multiply.  */
   unsigned HOST_HALF_WIDE_INT
-    r[2 * 4 * MAX_BITSIZE_MODE_ANY_INT / HOST_BITS_PER_HALF_WIDE_INT];
-  HOST_WIDE_INT mask = ((HOST_WIDE_INT)1 << HOST_BITS_PER_HALF_WIDE_INT) - 1;
+    r[4 * WIDE_INT_MAX_PRECISION / HOST_BITS_PER_HALF_WIDE_INT];
+
+  /* True if the result needs to be negated.  */
+  bool is_neg = false;
 
   /* If the top level routine did not really pass in an overflow, then
      just make sure that we never attempt to set it.  */
   bool needs_overflow = (overflow != 0);
+  const HOST_WIDE_INT zero = 0;
   if (needs_overflow)
     *overflow = false;
 
@@ -1382,61 +1392,96 @@ wi::mul_internal (HOST_WIDE_INT *val, co
       return 1;
     }
 
-  /* We do unsigned mul and then correct it.  */
-  wi_unpack (u, (const unsigned HOST_WIDE_INT *) op1val, op1len,
-	     half_blocks_needed, prec, SIGNED);
-  wi_unpack (v, (const unsigned HOST_WIDE_INT *) op2val, op2len,
-	     half_blocks_needed, prec, SIGNED);
-
-  /* The 2 is for a full mult.  */
-  memset (r, 0, half_blocks_needed * 2
-	  * HOST_BITS_PER_HALF_WIDE_INT / CHAR_BIT);
+  ulen = op1len * 2;
+  vlen = op2len * 2;
+
+  if (sgn == SIGNED)
+    {
+      if (wi::neg_p (op1))
+	{
+	  HOST_WIDE_INT nop1[2 * WIDE_INT_MAX_ELTS];
+
+	  int nop1len 
+	    = wi::sub_large (nop1, &zero, 1, op1val, op1len, prec + 1, SIGNED, 0);
+	  is_neg = !is_neg;
+	  ulen = nop1len * 2;
+	  wi_unpack (u, (const unsigned HOST_WIDE_INT*)nop1, nop1len,
+		     ulen, prec, SIGNED);
+	}
+      else
+	wi_unpack (u, (const unsigned HOST_WIDE_INT*)op1val, op1len,
+		   ulen, prec, SIGNED);
+
+      if (wi::neg_p (op2))
+	{
+	  HOST_WIDE_INT nop2[2 * WIDE_INT_MAX_ELTS];
+
+	  int nop2len 
+	    = wi::sub_large (nop2, &zero, 1, op2val, op2len, prec + 1, SIGNED, 0);
+	  is_neg = !is_neg;
+	  vlen = nop2len * 2;
+	  wi_unpack (v, (const unsigned HOST_WIDE_INT*)nop2, nop2len,
+		     vlen, prec, SIGNED);
+	}
+      else
+	wi_unpack (v, (const unsigned HOST_WIDE_INT*)op2val, op2len,
+		   vlen, prec, SIGNED);
+    }
+  else
+    {
+      /* UNSIGNED mul.  */
+      /* For large positive integers inside regular wide-ints, we must
+	 explicitly expand out the 1s to full width for the mul to be
+	 correct.  Note that the top bit will never be set for an
+	 unsigned number in offset_int or widest_int.  */
+      if (wi::neg_p (op1))
+	ulen = half_blocks_needed;
+      if (wi::neg_p (op2))
+	vlen = half_blocks_needed;
+
+      wi_unpack (u, (const unsigned HOST_WIDE_INT*)op1val, op1len,
+		 ulen, prec, SIGNED);
+
+      wi_unpack (v, (const unsigned HOST_WIDE_INT*)op2val, op2len,
+		 vlen, prec, SIGNED);
+    }
 
-  for (j = 0; j < half_blocks_needed; j++)
+  uvlen = ulen + vlen;
+  memset (r, 0, uvlen * HOST_BITS_PER_HALF_WIDE_INT / CHAR_BIT);
+  
+  /* Do the actual multiply.  */
+  for (j = 0; j < vlen; j++)
     {
       k = 0;
-      for (i = 0; i < half_blocks_needed; i++)
+      for (i = 0; i < ulen; i++)
 	{
 	  t = ((unsigned HOST_WIDE_INT)u[i] * (unsigned HOST_WIDE_INT)v[j]
 	       + r[i + j] + k);
 	  r[i + j] = t & HALF_INT_MASK;
 	  k = t >> HOST_BITS_PER_HALF_WIDE_INT;
 	}
-      r[j + half_blocks_needed] = k;
+      r[j + ulen] = k;
     }
 
-  /* We did unsigned math above.  For signed we must adjust the
-     product (assuming we need to see that).  */
-  if (sgn == SIGNED && (high || needs_overflow))
+  /* We did unsigned math above.  For signed we may need to adjust the
+     product if exactly one of the operands was negative.  */
+  if (is_neg)
     {
-      unsigned HOST_WIDE_INT b;
-      if (wi::neg_p (op1))
+      t = (unsigned HOST_WIDE_INT)(~r[0]) + 1;
+      r[0] = t & HALF_INT_MASK;
+      for (i = 1; i < uvlen; i++)
 	{
-	  b = 0;
-	  for (i = 0; i < half_blocks_needed; i++)
-	    {
-	      t = (unsigned HOST_WIDE_INT)r[i + half_blocks_needed]
-		- (unsigned HOST_WIDE_INT)v[i] - b;
-	      r[i + half_blocks_needed] = t & HALF_INT_MASK;
-	      b = t >> (HOST_BITS_PER_WIDE_INT - 1);
-	    }
-	}
-      if (wi::neg_p (op2))
-	{
-	  b = 0;
-	  for (i = 0; i < half_blocks_needed; i++)
-	    {
-	      t = (unsigned HOST_WIDE_INT)r[i + half_blocks_needed]
-		- (unsigned HOST_WIDE_INT)u[i] - b;
-	      r[i + half_blocks_needed] = t & HALF_INT_MASK;
-	      b = t >> (HOST_BITS_PER_WIDE_INT - 1);
-	    }
+	  t = (unsigned HOST_WIDE_INT)(~r[i]) + (t >> HOST_BITS_PER_HALF_WIDE_INT);
+	  r[i] = t & HALF_INT_MASK;
 	}
     }
 
-  if (needs_overflow)
+  /* We only need to do the expensive check for overflow if the value
+     really was big enough to overflow.  */
+  if (needs_overflow && (uvlen >= half_blocks_needed))
     {
       HOST_WIDE_INT top;
+      const HOST_WIDE_INT mask = ((HOST_WIDE_INT)1 << HOST_BITS_PER_HALF_WIDE_INT) - 1;
 
       /* For unsigned, overflow is true if any of the top bits are set.
 	 For signed, overflow is true if any of the top bits are not equal
@@ -1450,7 +1495,7 @@ wi::mul_internal (HOST_WIDE_INT *val, co
 	  top &= mask;
 	}
 
-      for (i = half_blocks_needed; i < half_blocks_needed * 2; i++)
+      for (i = half_blocks_needed; i < uvlen; i++)
 	if (((HOST_WIDE_INT)(r[i] & mask)) != top)
 	  *overflow = true;
     }
@@ -1459,15 +1504,42 @@ wi::mul_internal (HOST_WIDE_INT *val, co
     {
       /* compute [prec] <- ([prec] * [prec]) >> [prec] */
       wi_pack ((unsigned HOST_WIDE_INT *) val,
-	       &r[half_blocks_needed], half_blocks_needed);
-      return canonize (val, blocks_needed, prec);
+	       r, uvlen);
+      vallen = canonize (val, (uvlen + 1) >> 1, prec);
+
+      /* The shift is not always safe when it writes over one of its
+	 operands, so we must copy.  */
+      HOST_WIDE_INT tval[2 * WIDE_INT_MAX_ELTS];
+      memcpy (tval, val, vallen * HOST_BITS_PER_WIDE_INT / CHAR_BIT);
+      vallen = wi::lrshift_large (val, tval, vallen, prec * 2, prec, prec);
     }
   else
     {
+      if (uvlen > half_blocks_needed)
+	uvlen = half_blocks_needed;
       /* compute [prec] <- ([prec] * [prec]) && ((1 << [prec]) - 1) */
-      wi_pack ((unsigned HOST_WIDE_INT *) val, r, half_blocks_needed);
-      return canonize (val, blocks_needed, prec);
+      wi_pack ((unsigned HOST_WIDE_INT *) val, r, uvlen);
+      vallen = canonize (val, (uvlen + 1) >> 1, prec);
     }
+  
+#if 0
+  static int cnt = 0;
+  printf ("cnt=%d op1=", cnt++);
+  for (i = 0; i < op1len; i++)
+    printf ("0x%lx ", op1[i]);
+
+  printf ("  op2=");
+  for (i = 0; i < op2len; i++)
+    printf ("0x%lx ", op2[i]);
+  printf ("  result=");
+  for (i = 0; i < vallen; i++)
+    printf ("0x%lx ", val[i]);
+  if (needs_overflow && *overflow)
+    printf ("  OVERFLOW");
+  printf ("\n");
+#endif
+
+  return vallen;
 }
 
 /* Compute the population count of X.  */
Index: gcc/wide-int.h
===================================================================
--- gcc/wide-int.h	(revision 205765)
+++ gcc/wide-int.h	(working copy)
@@ -2406,7 +2408,7 @@ wi::mul (const T1 &x, const T2 &y)
     }
   else
     result.set_len (mul_internal (val, xi.val, xi.len, yi.val, yi.len,
-				  precision, UNSIGNED, 0, false));
+				  precision, SIGNED, 0, false));
   return result;
 }
 
