wide-int more performance fixes for wide multiplication.

Richard Sandiford rdsandiford@googlemail.com
Sat Dec 14 11:40:00 GMT 2013


Hi Kenny,

Sorry for the slow response.

Kenneth Zadeck <zadeck@naturalbridge.com> writes:
> Index: gcc/wide-int.cc
> ===================================================================
> --- gcc/wide-int.cc	(revision 205765)
> +++ gcc/wide-int.cc	(working copy)
> @@ -1275,23 +1275,31 @@ wi::mul_internal (HOST_WIDE_INT *val, co
>    unsigned int j;
>    unsigned int blocks_needed = BLOCKS_NEEDED (prec);
>    unsigned int half_blocks_needed = blocks_needed * 2;
> -  /* The sizes here are scaled to support a 2x largest mode by 2x
> -     largest mode yielding a 4x largest mode result.  This is what is
> -     needed by vpn.  */
>  
> +  /* The sizes here are scaled to support a 2*largest mode + a little
> +     by 2*largest mode + a little yielding a 4*largest mode + a
> +     little.  This is what is needed by vpn.  */
>    unsigned HOST_HALF_WIDE_INT
> -    u[4 * MAX_BITSIZE_MODE_ANY_INT / HOST_BITS_PER_HALF_WIDE_INT];
> +    u[2 * WIDE_INT_MAX_PRECISION / HOST_BITS_PER_HALF_WIDE_INT];
> +  unsigned int ulen;
>    unsigned HOST_HALF_WIDE_INT
> -    v[4 * MAX_BITSIZE_MODE_ANY_INT / HOST_BITS_PER_HALF_WIDE_INT];
> -  /* The '2' in 'R' is because we are internally doing a full
> +    v[2 * WIDE_INT_MAX_PRECISION / HOST_BITS_PER_HALF_WIDE_INT];
> +  unsigned int vlen;
> +  unsigned int uvlen;
> +  unsigned int vallen;
> +
> +  /* The '4' in 'R' is because we are internally doing a wide
>       multiply.  */
>    unsigned HOST_HALF_WIDE_INT
> -    r[2 * 4 * MAX_BITSIZE_MODE_ANY_INT / HOST_BITS_PER_HALF_WIDE_INT];
> -  HOST_WIDE_INT mask = ((HOST_WIDE_INT)1 << HOST_BITS_PER_HALF_WIDE_INT) - 1;
> +    r[4 * WIDE_INT_MAX_PRECISION / HOST_BITS_PER_HALF_WIDE_INT];

While we're here I think we should convert these arrays into allocas,
so that we're not hard-coding the specialness of vpn in wide-int.cc.
I.e. rather than having arrays of maximum size, use XALLOCAVEC to
allocate a specific number of stack HWIs, once we know how many
we actually need.  That includes the new arrays added in the patch.
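
E.g. (a sketch, using the XALLOCAVEC macro from libiberty.h; exact
placement would depend on where the lengths are computed):

	  unsigned HOST_HALF_WIDE_INT *u
	    = XALLOCAVEC (unsigned HOST_HALF_WIDE_INT, ulen);
	  unsigned HOST_HALF_WIDE_INT *v
	    = XALLOCAVEC (unsigned HOST_HALF_WIDE_INT, vlen);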

> +  /* True if the result needs to be negated.  */
> +  bool is_neg = false;
>  
>    /* If the top level routine did not really pass in an overflow, then
>       just make sure that we never attempt to set it.  */
>    bool needs_overflow = (overflow != 0);
> +  const HOST_WIDE_INT zero = 0; 
>    if (needs_overflow)
>      *overflow = false;
>  
> @@ -1382,61 +1392,96 @@ wi::mul_internal (HOST_WIDE_INT *val, co
>        return 1;
>      }
>  
> -  /* We do unsigned mul and then correct it.  */
> -  wi_unpack (u, (const unsigned HOST_WIDE_INT *) op1val, op1len,
> -	     half_blocks_needed, prec, SIGNED);
> -  wi_unpack (v, (const unsigned HOST_WIDE_INT *) op2val, op2len,
> -	     half_blocks_needed, prec, SIGNED);
> -
> -  /* The 2 is for a full mult.  */
> -  memset (r, 0, half_blocks_needed * 2
> -	  * HOST_BITS_PER_HALF_WIDE_INT / CHAR_BIT);
> +  ulen = op1len * 2;
> +  vlen = op2len * 2;
> +
> +  if (sgn == SIGNED)
> +    {
> +      if (wi::neg_p (op1))
> +	{
> +	  HOST_WIDE_INT nop1[2 * WIDE_INT_MAX_ELTS];
> +	  

E.g. following on from the above:

	  /* Add an extra HWI so that we can represent the negative of
	     the minimum.  */
	  HOST_WIDE_INT *nop1 = XALLOCAVEC (HOST_WIDE_INT, op1len + 1);

> +	  int nop1len 
> +	    = wi::sub_large (nop1, &zero, 1, op1val, op1len, prec + 1, SIGNED, 0);

Not sure about the "prec + 1" here, since it could cause nop1len to be
greater than the maximum length for "prec".  And you go on to unpack
nop1 in "prec" rather than "prec + 1".

AIUI, once you've taken the absolutes, you've got two unsigned numbers
of precision "prec" and are creating a "prec * 2" result, so maybe:
sub_large (..., prec, UNSIGNED, 0) instead?
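
I.e. something like (keeping the other arguments as in the patch):

	  int nop1len
	    = wi::sub_large (nop1, &zero, 1, op1val, op1len, prec, UNSIGNED, 0);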

> +	  is_neg = !is_neg;
> +	  ulen = nop1len * 2;
> +	  wi_unpack (u, (const unsigned HOST_WIDE_INT*)nop1, nop1len,
> +		     ulen, prec, SIGNED);

Back on the alloca theme, we'd have:

	  u = XALLOCAVEC (unsigned HOST_HALF_WIDE_INT, ulen);

before this and other unpacks.

Note that there's supposed to be a space after ")" in casts.  Lots of
instances in the patch :-)
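
E.g.:

	  (const unsigned HOST_WIDE_INT *) nop1

rather than:

	  (const unsigned HOST_WIDE_INT*)nop1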

> +      /* UNSIGNED mul. */
> +      /* For large positive integers inside regular wide-ints, we must
> +	 explictly expand out the 1s to full width for the mul to be
> +	 correct. Note that the top bit will never be set for a
> +	 unsigned number in offset_int of widest-int.  */

s/of/or/.  But I think the comment is confusing because the unsignedness
is a property of the multiplication rather than the inputs.  We should never
be doing an unsigned operation on widest_int to begin with.  And if
offset_ints are being treated as having a sign then the same is true there.

If we ever do have an unsigned multiplication on those types for some reason
then -1 will (rightly) be treated as the maximum unsigned value.

Maybe easiest just to drop the note?  Or if you don't like that then maybe:

   Note that this case is typically only used for variable-precision
   wide_ints.

> +      if (wi::neg_p (op1))
> +	ulen = half_blocks_needed;
> +      if (wi::neg_p (op2))
> +	vlen = half_blocks_needed;
> +
> +      wi_unpack (u, (const unsigned HOST_WIDE_INT*)op1val, op1len,
> +		 ulen, prec, SIGNED);
> +
> +      wi_unpack (v, (const unsigned HOST_WIDE_INT*)op2val, op2len,
> +		 vlen, prec, SIGNED);
> +    }
>  
> -  for (j = 0; j < half_blocks_needed; j++)
> +  uvlen = ulen + vlen;

The alloca change here would be:

  r = XALLOCAVEC (unsigned HOST_HALF_WIDE_INT, uvlen);

> +  memset (r, 0, uvlen * HOST_BITS_PER_HALF_WIDE_INT / CHAR_BIT);
> +  
> +  /* Do the actually multiply.  */
> +  for (j = 0; j < vlen; j++)
>      {
>        k = 0;
> -      for (i = 0; i < half_blocks_needed; i++)
> +      for (i = 0; i < ulen; i++)
>  	{
>  	  t = ((unsigned HOST_WIDE_INT)u[i] * (unsigned HOST_WIDE_INT)v[j]
>  	       + r[i + j] + k);
>  	  r[i + j] = t & HALF_INT_MASK;
>  	  k = t >> HOST_BITS_PER_HALF_WIDE_INT;
>  	}
> -      r[j + half_blocks_needed] = k;
> +      r[j + ulen] = k;
>      }
>  
> -  /* We did unsigned math above.  For signed we must adjust the
> -     product (assuming we need to see that).  */
> -  if (sgn == SIGNED && (high || needs_overflow))
> +  /* We did unsigned math above.  For signed we may need to adjust the
> +     product if exactly 1 of the operands was negative.  */
> +  if (is_neg)
>      {
> -      unsigned HOST_WIDE_INT b;
> -      if (wi::neg_p (op1))
> +      t = (unsigned HOST_WIDE_INT)(~r[0]) + 1;

r[0] ^ HALF_INT_MASK is more portable than ~r[0] here.  If HOST_WIDE_INT
is int then HOST_HALF_WIDE_INT is short, and the short will be promoted
to int before being inverted.  E.g. with ~, a half-int of 0x0000 would
become 0xffffffff rather than 0x0000ffff.

(Same for the ~ inside the loop of course.)
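
I.e. something like:

      t = (unsigned HOST_WIDE_INT) (r[0] ^ HALF_INT_MASK) + 1;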

> +      const HOST_WIDE_INT mask = ((HOST_WIDE_INT)1 << HOST_BITS_PER_HALF_WIDE_INT) - 1;

Long line.  But it looks like this is just HALF_INT_MASK.

> @@ -1459,15 +1504,42 @@ wi::mul_internal (HOST_WIDE_INT *val, co
>      {
>        /* compute [prec] <- ([prec] * [prec]) >> [prec] */
>        wi_pack ((unsigned HOST_WIDE_INT *) val,
> -	       &r[half_blocks_needed], half_blocks_needed);
> -      return canonize (val, blocks_needed, prec);
> +	       r, uvlen);

This wi_pack now fits more naturally on a single line.
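
I.e.:

      wi_pack ((unsigned HOST_WIDE_INT *) val, r, uvlen);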

> +      vallen = canonize (val, (uvlen + 1) >> 1, prec);
> +
> +      /* Shift is not always safe to write over one of the
> +	 operands, so we must copy.  */ 
> +      HOST_WIDE_INT tval[2 * WIDE_INT_MAX_ELTS];
> +      memcpy (tval, val, vallen * CHAR_BIT / HOST_BITS_PER_WIDE_INT); 

vallen * sizeof (HOST_WIDE_INT) would be more typical.
But why not unpack into tval directly and avoid the copy?
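
I.e.:

      memcpy (tval, val, vallen * sizeof (HOST_WIDE_INT));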

> +#if 0
> +  static int cnt = 0;
> +  printf ("cnt=%d op1=", cnt++);
> +  for (i = 0; i < op1len; i++)
> +    printf ("0x%lx ", op1[i]);
> +
> +  printf ("  op2=");
> +  for (i = 0; i < op2len; i++)
> +    printf ("0x%lx ", op2[i]);
> +  printf ("  result=");
> +  for (i = 0; i < vallen; i++)
> +    printf ("0x%lx ", val[i]);
> +  if (needs_overflow && *overflow)
> +    printf ("  OVERFLOW");
> +  printf ("\n");
> +#endif

I think this should either be removed or put under:

#define DEBUG_MUL 0

  if (DEBUG_MUL)
    {
    }

protection, so that at least it gets compile testing while disabled.
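
I.e. roughly, with the existing printf body moved inside the if:

#define DEBUG_MUL 0

  if (DEBUG_MUL)
    {
      static int cnt = 0;
      printf ("cnt=%d op1=", cnt++);
      /* ...rest of the dump as in the patch...  */
    }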

Thanks,
Richard


