This is the mail archive of the
gcc-patches@gcc.gnu.org
mailing list for the GCC project.
wide-int more performance fixes for wide multiplication.
- From: Kenneth Zadeck <zadeck at naturalbridge dot com>
- To: Richard Sandiford <rdsandiford at googlemail dot com>, Richard Biener <rguenther at suse dot de>, Mike Stump <mikestump at comcast dot net>, gcc-patches <gcc-patches at gcc dot gnu dot org>
- Date: Mon, 09 Dec 2013 14:07:50 -0500
- Subject: wide-int more performance fixes for wide multiplication.
- Authentication-results: sourceware.org; auth=none
This patch is the last performance patch that I have for wide-int.
This patch changes large multiply from taking precision/hbpwi *
precision/hbpwi multiplies to taking
#significant_bits1/hbpwi * #significant_bits2/hbpwi multiplications.
That was a significant number of multiplies on machines whose largest
integer mode is large.
This patch was mostly tested before Richard Sandiford's patch which uses
the longlong stuff. That patch gets many of the cases that this one would
get, but if either of the operands does not fit into an HWI, then things
got really expensive without this patch.
This patch works a lot differently from the version on the branch. On
the branch, the inputs were assumed to be unsigned numbers and the multiply
happened with whatever came in. After the multiply happened, if a
signed multiply was requested, the product was adjusted.
In this version, the adjustment happens first if we are doing a signed
multiply. Thus the operands are always unsigned numbers by the time they
reach the actual multiply loops. We may need an extra bit for this, but we have it.
The advantage of this is that negative numbers have a lot of leading 1s,
and compensating for these is complex. The easy solution is to convert
the operand to a positive number and then count how many HWIs are needed
to represent it. Of course, if exactly one of the operands was
negative, then we end up negating the product at the end.
This patch was tested on x86-64 with Richard's longlong shortcut
commented out to get it to hit a lot of times. And for this it makes a
lot of difference in the number of multiplies that are performed.
Without commenting that out, the number of hits is small when one looks
only at the bootstrap and the regression tests. However, this patch
will also be hit more often on ports that decide to revert
to having an HWI be 32 bits, because any time one of the operands is
larger than 32 bits, this patch will take control. Also, this patch
hits if any of the operands really does not fit in an HWI.
This patch only works after that patch to change tree-vrp has been applied.
http://gcc.gnu.org/ml/gcc-patches/2013-12/msg00877.html
kenny
Index: gcc/wide-int.cc
===================================================================
--- gcc/wide-int.cc (revision 205765)
+++ gcc/wide-int.cc (working copy)
@@ -1275,23 +1275,31 @@ wi::mul_internal (HOST_WIDE_INT *val, co
unsigned int j;
unsigned int blocks_needed = BLOCKS_NEEDED (prec);
unsigned int half_blocks_needed = blocks_needed * 2;
- /* The sizes here are scaled to support a 2x largest mode by 2x
- largest mode yielding a 4x largest mode result. This is what is
- needed by vpn. */
+ /* The sizes here are scaled to support a 2*largest mode + a little
+ by 2*largest mode + a little yielding a 4*largest mode + a
+ little. This is what is needed by vpn. */
unsigned HOST_HALF_WIDE_INT
- u[4 * MAX_BITSIZE_MODE_ANY_INT / HOST_BITS_PER_HALF_WIDE_INT];
+ u[2 * WIDE_INT_MAX_PRECISION / HOST_BITS_PER_HALF_WIDE_INT];
+ unsigned int ulen;
unsigned HOST_HALF_WIDE_INT
- v[4 * MAX_BITSIZE_MODE_ANY_INT / HOST_BITS_PER_HALF_WIDE_INT];
- /* The '2' in 'R' is because we are internally doing a full
+ v[2 * WIDE_INT_MAX_PRECISION / HOST_BITS_PER_HALF_WIDE_INT];
+ unsigned int vlen;
+ unsigned int uvlen;
+ unsigned int vallen;
+
+ /* The '4' in 'R' is because we are internally doing a wide
multiply. */
unsigned HOST_HALF_WIDE_INT
- r[2 * 4 * MAX_BITSIZE_MODE_ANY_INT / HOST_BITS_PER_HALF_WIDE_INT];
- HOST_WIDE_INT mask = ((HOST_WIDE_INT)1 << HOST_BITS_PER_HALF_WIDE_INT) - 1;
+ r[4 * WIDE_INT_MAX_PRECISION / HOST_BITS_PER_HALF_WIDE_INT];
+
+ /* True if the result needs to be negated. */
+ bool is_neg = false;
/* If the top level routine did not really pass in an overflow, then
just make sure that we never attempt to set it. */
bool needs_overflow = (overflow != 0);
+ const HOST_WIDE_INT zero = 0;
if (needs_overflow)
*overflow = false;
@@ -1382,61 +1392,96 @@ wi::mul_internal (HOST_WIDE_INT *val, co
return 1;
}
- /* We do unsigned mul and then correct it. */
- wi_unpack (u, (const unsigned HOST_WIDE_INT *) op1val, op1len,
- half_blocks_needed, prec, SIGNED);
- wi_unpack (v, (const unsigned HOST_WIDE_INT *) op2val, op2len,
- half_blocks_needed, prec, SIGNED);
-
- /* The 2 is for a full mult. */
- memset (r, 0, half_blocks_needed * 2
- * HOST_BITS_PER_HALF_WIDE_INT / CHAR_BIT);
+ ulen = op1len * 2;
+ vlen = op2len * 2;
+
+ if (sgn == SIGNED)
+ {
+ if (wi::neg_p (op1))
+ {
+ HOST_WIDE_INT nop1[2 * WIDE_INT_MAX_ELTS];
+
+ int nop1len
+ = wi::sub_large (nop1, &zero, 1, op1val, op1len, prec + 1, SIGNED, 0);
+ is_neg = !is_neg;
+ ulen = nop1len * 2;
+ wi_unpack (u, (const unsigned HOST_WIDE_INT*)nop1, nop1len,
+ ulen, prec, SIGNED);
+ }
+ else
+ wi_unpack (u, (const unsigned HOST_WIDE_INT*)op1val, op1len,
+ ulen, prec, SIGNED);
+
+ if (wi::neg_p (op2))
+ {
+ HOST_WIDE_INT nop2[2 * WIDE_INT_MAX_ELTS];
+
+ int nop2len
+ = wi::sub_large (nop2, &zero, 1, op2val, op2len, prec + 1, SIGNED, 0);
+ is_neg = !is_neg;
+ vlen = nop2len * 2;
+ wi_unpack (v, (const unsigned HOST_WIDE_INT*)nop2, nop2len,
+ vlen, prec, SIGNED);
+ }
+ else
+ wi_unpack (v, (const unsigned HOST_WIDE_INT*)op2val, op2len,
+ vlen, prec, SIGNED);
+ }
+ else
+ {
+ /* UNSIGNED mul. */
+ /* For large positive integers inside regular wide-ints, we must
+ explicitly expand out the 1s to full width for the mul to be
+ correct. Note that the top bit will never be set for an
+ unsigned number in offset_int or widest-int. */
+ if (wi::neg_p (op1))
+ ulen = half_blocks_needed;
+ if (wi::neg_p (op2))
+ vlen = half_blocks_needed;
+
+ wi_unpack (u, (const unsigned HOST_WIDE_INT*)op1val, op1len,
+ ulen, prec, SIGNED);
+
+ wi_unpack (v, (const unsigned HOST_WIDE_INT*)op2val, op2len,
+ vlen, prec, SIGNED);
+ }
- for (j = 0; j < half_blocks_needed; j++)
+ uvlen = ulen + vlen;
+ memset (r, 0, uvlen * HOST_BITS_PER_HALF_WIDE_INT / CHAR_BIT);
+
+ /* Do the actual multiply. */
+ for (j = 0; j < vlen; j++)
{
k = 0;
- for (i = 0; i < half_blocks_needed; i++)
+ for (i = 0; i < ulen; i++)
{
t = ((unsigned HOST_WIDE_INT)u[i] * (unsigned HOST_WIDE_INT)v[j]
+ r[i + j] + k);
r[i + j] = t & HALF_INT_MASK;
k = t >> HOST_BITS_PER_HALF_WIDE_INT;
}
- r[j + half_blocks_needed] = k;
+ r[j + ulen] = k;
}
- /* We did unsigned math above. For signed we must adjust the
- product (assuming we need to see that). */
- if (sgn == SIGNED && (high || needs_overflow))
+ /* We did unsigned math above. For signed we may need to adjust the
+ product if exactly 1 of the operands was negative. */
+ if (is_neg)
{
- unsigned HOST_WIDE_INT b;
- if (wi::neg_p (op1))
+ t = (unsigned HOST_WIDE_INT)(~r[0]) + 1;
+ r[0] = t & HALF_INT_MASK;
+ for (i = 1; i < uvlen; i++)
{
- b = 0;
- for (i = 0; i < half_blocks_needed; i++)
- {
- t = (unsigned HOST_WIDE_INT)r[i + half_blocks_needed]
- - (unsigned HOST_WIDE_INT)v[i] - b;
- r[i + half_blocks_needed] = t & HALF_INT_MASK;
- b = t >> (HOST_BITS_PER_WIDE_INT - 1);
- }
- }
- if (wi::neg_p (op2))
- {
- b = 0;
- for (i = 0; i < half_blocks_needed; i++)
- {
- t = (unsigned HOST_WIDE_INT)r[i + half_blocks_needed]
- - (unsigned HOST_WIDE_INT)u[i] - b;
- r[i + half_blocks_needed] = t & HALF_INT_MASK;
- b = t >> (HOST_BITS_PER_WIDE_INT - 1);
- }
+ t = (unsigned HOST_WIDE_INT)(~r[i]) + (t >> HOST_BITS_PER_HALF_WIDE_INT);
+ r[i] = t & HALF_INT_MASK;
}
}
- if (needs_overflow)
+ /* We only need to do the expensive check for overflow if the value
+ really was big enough to overflow. */
+ if (needs_overflow && (uvlen >= half_blocks_needed))
{
HOST_WIDE_INT top;
+ const HOST_WIDE_INT mask = ((HOST_WIDE_INT)1 << HOST_BITS_PER_HALF_WIDE_INT) - 1;
/* For unsigned, overflow is true if any of the top bits are set.
For signed, overflow is true if any of the top bits are not equal
@@ -1450,7 +1495,7 @@ wi::mul_internal (HOST_WIDE_INT *val, co
top &= mask;
}
- for (i = half_blocks_needed; i < half_blocks_needed * 2; i++)
+ for (i = half_blocks_needed; i < uvlen; i++)
if (((HOST_WIDE_INT)(r[i] & mask)) != top)
*overflow = true;
}
@@ -1459,15 +1504,42 @@ wi::mul_internal (HOST_WIDE_INT *val, co
{
/* compute [prec] <- ([prec] * [prec]) >> [prec] */
wi_pack ((unsigned HOST_WIDE_INT *) val,
- &r[half_blocks_needed], half_blocks_needed);
- return canonize (val, blocks_needed, prec);
+ r, uvlen);
+ vallen = canonize (val, (uvlen + 1) >> 1, prec);
+
+ /* Shift is not always safe to write over one of the
+ operands, so we must copy. */
+ HOST_WIDE_INT tval[2 * WIDE_INT_MAX_ELTS];
+ memcpy (tval, val, vallen * CHAR_BIT / HOST_BITS_PER_WIDE_INT);
+ vallen = wi::lrshift_large (val, tval, vallen, prec*2, prec, prec);
}
else
{
+ if (uvlen > half_blocks_needed)
+ uvlen = half_blocks_needed;
/* compute [prec] <- ([prec] * [prec]) && ((1 << [prec]) - 1) */
- wi_pack ((unsigned HOST_WIDE_INT *) val, r, half_blocks_needed);
- return canonize (val, blocks_needed, prec);
+ wi_pack ((unsigned HOST_WIDE_INT *) val, r, uvlen);
+ vallen = canonize (val, (uvlen + 1) >> 1, prec);
}
+
+#if 0
+ static int cnt = 0;
+ printf ("cnt=%d op1=", cnt++);
+ for (i = 0; i < op1len; i++)
+ printf ("0x%lx ", op1[i]);
+
+ printf (" op2=");
+ for (i = 0; i < op2len; i++)
+ printf ("0x%lx ", op2[i]);
+ printf (" result=");
+ for (i = 0; i < vallen; i++)
+ printf ("0x%lx ", val[i]);
+ if (needs_overflow && *overflow)
+ printf (" OVERFLOW");
+ printf ("\n");
+#endif
+
+ return vallen;
}
/* Compute the population count of X. */
Index: gcc/wide-int.h
===================================================================
--- gcc/wide-int.h (revision 205765)
+++ gcc/wide-int.h (working copy)
@@ -2406,7 +2408,7 @@ wi::mul (const T1 &x, const T2 &y)
}
else
result.set_len (mul_internal (val, xi.val, xi.len, yi.val, yi.len,
- precision, UNSIGNED, 0, false));
+ precision, SIGNED, 0, false));
return result;
}