PING^4: [RS6000] rotate and mask constants [PR94393]

will schmidt will_schmidt@vnet.ibm.com
Wed Nov 3 18:58:53 GMT 2021


On Mon, 2021-10-25 at 14:41 -0500, Pat Haugen via Gcc-patches wrote:
> Ping.
> 
> On 8/10/21 10:49 AM, Pat Haugen via Gcc-patches wrote:
> > On 7/27/21 1:35 PM, will schmidt wrote:
> > > On Fri, 2021-07-23 at 15:23 -0500, Pat Haugen via Gcc-patches wrote:
> > > > Ping 
> > > > https://gcc.gnu.org/pipermail/gcc-patches/2020-October/555760.html
> > > > 
> > > > I've done a current bootstrap/regtest on powerpc64/powerpc64le with
> > > > no regressions.
> > > > 
> > > > -Pat
> > > 
> > > That patch was previously posted by Alan Modra.
> > > Given the time lapse this may need to be re-posted entirely, pending
> > > what the maintainers suggest.. :-)
> > > 
> > > 
> > 
> > Including full patch.

A non-authoritative re-review.  :-)

> > 
> > 
> > diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c
> > index 60f406a4ff6..8e758a1f2dd 100644
> > --- a/gcc/config/rs6000/rs6000.c
> > +++ b/gcc/config/rs6000/rs6000.c
> > @@ -1131,6 +1131,8 @@ static tree rs6000_handle_altivec_attribute (tree *, tree, tree, int, bool *);
> >  static tree rs6000_handle_struct_attribute (tree *, tree, tree, int, bool *);
> >  static tree rs6000_builtin_vectorized_libmass (combined_fn, tree, tree);
> >  static void rs6000_emit_set_long_const (rtx, HOST_WIDE_INT);
> > +static bool rotate_and_mask_constant (unsigned HOST_WIDE_INT, HOST_WIDE_INT *,
> > +				      int *, unsigned HOST_WIDE_INT *);
> >  static int rs6000_memory_move_cost (machine_mode, reg_class_t, bool);
> >  static bool rs6000_debug_rtx_costs (rtx, machine_mode, int, int, int *, bool);
> >  static int rs6000_debug_address_cost (rtx, machine_mode, addr_space_t,

Prototype for rotate_and_mask_constant function.  OK.


> > @@ -5973,7 +5975,7 @@ num_insns_constant_gpr (HOST_WIDE_INT value)
> >  }
> >  
> >  /* Helper for num_insns_constant.  Allow constants formed by the
> > -   num_insns_constant_gpr sequences, plus li -1, rldicl/rldicr/rlwinm,
> > +   num_insns_constant_gpr sequences, and li/lis+rldicl/rldicr/rldic/rlwinm,
> >     and handle modes that require multiple gprs.  */

Ok.

> >  
> >  static int
> > @@ -5988,8 +5990,8 @@ num_insns_constant_multi (HOST_WIDE_INT value, machine_mode mode)
> >        if (insns > 2
> >  	  /* We won't get more than 2 from num_insns_constant_gpr
> >  	     except when TARGET_POWERPC64 and mode is DImode or
> > -	     wider, so the register mode must be DImode.  */
> > -	  && rs6000_is_valid_and_mask (GEN_INT (low), DImode))
> > +	     wider.  */
> > +	  && rotate_and_mask_constant (low, NULL, NULL, NULL))
> >  	insns = 2;
> >        total += insns;
> >        /* If BITS_PER_WORD is the number of bits in HOST_WIDE_INT, doing
> > @@ -10077,6 +10079,244 @@ rs6000_emit_set_const (rtx dest, rtx source)
> >    return true;
> >  }
> >  
> > +/* Rotate DImode word, being careful to handle the case where
> > +   HOST_WIDE_INT is larger than DImode.  */
> > +
> > +static inline unsigned HOST_WIDE_INT
> > +rotate_di (unsigned HOST_WIDE_INT x, unsigned int shift)
> > +{
> > +  unsigned HOST_WIDE_INT mask_hi, mask_lo;
> > +
> > +  mask_hi = (HOST_WIDE_INT_1U << 63 << 1) - (HOST_WIDE_INT_1U << shift);
> > +  mask_lo = (HOST_WIDE_INT_1U << shift) - 1;
> > +  x = ((x << shift) & mask_hi) | ((x >> (64 - shift)) & mask_lo);
> > +  x = (x ^ (HOST_WIDE_INT_1U << 63)) - (HOST_WIDE_INT_1U << 63);
> > +  return x;
> > +}

ok.

> > +
> > +/* Can C be formed by rotating a 16-bit positive value left by C16LSB?  */

Perhaps rephrase as "Attempt to form a constant by ...  C16LSB."

> > +
> > +static inline bool
> > +is_rotate_positive_constant (unsigned HOST_WIDE_INT c, int c16lsb,
> > +			     HOST_WIDE_INT *val, int *shift,
> > +			     unsigned HOST_WIDE_INT *mask)
> > +{
> > +  if ((c & ~(HOST_WIDE_INT_UC (0x7fff) << c16lsb)) == 0)
> > +    {
> > +      /* eg. c = 1100 0000 0000 ... 0000
> > +	 -> val = 0x3000, shift = 49, mask = -1ull.  */
> > +      if (val)
> > +	{
> > +	  c >>= c16lsb;
> > +	  /* Make the value and shift canonical in the sense of
> > +	     selecting the smallest value.  For the example above
> > +	     -> val = 3, shift = 61.  */

Comment seems reasonable.  I did not review or confirm the maths
involved. 

> > +	  int trail_zeros = ctz_hwi (c);
> > +	  c >>= trail_zeros;
> > +	  c16lsb += trail_zeros;
> > +	  *val = c;
> > +	  *shift = c16lsb;
> > +	  *mask = HOST_WIDE_INT_M1U;
> > +	}
> > +      return true;
> > +    }
> > +  return false;
> > +}



> > +
> > +/* Can C be formed by rotating a 16-bit negative value left by C16LSB?  */
> > +
> > +static inline bool
> > +is_rotate_negative_constant (unsigned HOST_WIDE_INT c, int c16lsb,
> > +			     HOST_WIDE_INT *val, int *shift,
> > +			     unsigned HOST_WIDE_INT *mask)
> > +{
> > +  if ((c | (HOST_WIDE_INT_UC (0x7fff) << c16lsb)) == HOST_WIDE_INT_M1U)
> > +    {
> > +      if (val)
> > +	{
> > +	  c >>= c16lsb;
> > +	  /* When there are no leading ones, C16LSB is 49 and we have
> > +	     an implicit 1 bit to make up our 16-bit negative value,
> > +	     which is why the shift here is 15 rather than 16.  */
> > +	  c |= HOST_WIDE_INT_M1U << 15;
> > +	  /* Canonicalize by removing trailing ones in the constant.
> > +	     eg. 0xbfff -> 0xfffe.  */

Comment seems reasonable.  I did not confirm the math involved.

> > +	  int trail_ones = ctz_hwi (~c);
> > +	  c = (HOST_WIDE_INT) c >> trail_ones;
> > +	  c16lsb += trail_ones;
> > +	  *val = c;
> > +	  *shift = c16lsb;
> > +	  *mask = HOST_WIDE_INT_M1U;
> > +	}
> > +      return true;
> > +    }
> > +  return false;
> > +}
> > +
> > +/* Detect cases where a constant can be formed by li; rldicl, li; rldicr,
> > +   lis; rldicl, lis; rldicr, or li; rldic.  */
> > +
> > +static bool
> > +rotate_and_mask_constant (unsigned HOST_WIDE_INT c,
> > +			  HOST_WIDE_INT *val, int *shift,
> > +			  unsigned HOST_WIDE_INT *mask)
> > +{
> > +  /* We know C is a DImode value properly sign extended to
> > +     HOST_WIDE_INT that can't be formed by lis; addi.  So if C is
> > +     positive there must be a 1 bit somewhere in bits 32 thru 62, and
> > +     if C is negative there must be a 0 bit in the same range.  That
> > +     puts constraints on the max leading zeros or ones.  Thus no need
> > +     to check the C16LSB (lsb of 16-bit constant that fits in li)
> > +     expressions below for negative values.  */

ok

> > +  int lead_zeros = clz_hwi (c);
> > +
> > +  /* First check for rotated constants.  */
> > +  int c16lsb = HOST_BITS_PER_WIDE_INT - 15 - lead_zeros;
> > +  if (is_rotate_positive_constant (c, c16lsb, val, shift, mask))
> > +    return true;
> > +
> > +  /* Handle wrap-around 16-bit constants too, by rotating the
> > +     potential wrapped constant so that it no longer wraps.  This also
> > +     has constraints on the max leading zeros or ones such that the
> > +     C16LSB expression below will never be negative.  */

ok

> > +  unsigned HOST_WIDE_INT rc = rotate_di (c, 32);
> > +  c16lsb = HOST_BITS_PER_WIDE_INT - 15 - clz_hwi (rc);
> > +  if (is_rotate_positive_constant (rc, c16lsb, val, shift, mask))
> > +    {
> > +      if (val)
> > +	*shift ^= 32;
> > +      return true;
> > +    }
> > +
> > +  int lead_ones = clz_hwi (~c);
> > +  c16lsb = HOST_BITS_PER_WIDE_INT - 15 - lead_ones;
> > +  if (is_rotate_negative_constant (c, c16lsb, val, shift, mask))
> > +    return true;
> > +
> > +  c16lsb = HOST_BITS_PER_WIDE_INT - 15 - clz_hwi (~rc);
> > +  if (is_rotate_negative_constant (rc, c16lsb, val, shift, mask))
> > +    {
> > +      if (val)
> > +	*shift ^= 32;
> > +      return true;
> > +    }
> > +
> > +  /* 00...01xxxxxxxxxxxxxxx1..11 (up to 15 x's, any number of leading
> > +     0's and trailing 1's) can be implemented as a li, rldicl.  */
> > +  c16lsb = HOST_BITS_PER_WIDE_INT - 16 - lead_zeros;
> > +  if ((c | (HOST_WIDE_INT_M1U << c16lsb)) == HOST_WIDE_INT_M1U)
> > +    {
> > +      /* eg. c = 0000 1011 1111 1111 ... 1111
> > +	 -> val = sext(0xbfff), shift = 44, mask = 0x0fffffffffffffff.  */
> > +      if (val)
> > +	{
> > +	  c = (c >> c16lsb) | (HOST_WIDE_INT_M1U << 16);
> > +	  *val = c;
> > +	  *shift = c == HOST_WIDE_INT_M1U ? 0 : c16lsb;
> > +	  *mask = HOST_WIDE_INT_M1U >> lead_zeros;
> > +	}
> > +      return true;
> > +    }
> > +  /* 00...01xxxxxxxxxxxxxxx00..01..11 (up to 15 x's followed by 16 0's,
> > +     any number of leading 0's and trailing 1's) can be implemented as
> > +     lis, rldicl.  */
> > +  c16lsb = HOST_BITS_PER_WIDE_INT - 32 - lead_zeros;
> > +  if (c16lsb >= 0
> > +      && (c & ((HOST_WIDE_INT_1U << (c16lsb + 16))
> > +	       - (HOST_WIDE_INT_1U << c16lsb))) == 0
> > +      && (c | (HOST_WIDE_INT_M1U << c16lsb)) == HOST_WIDE_INT_M1U)
> > +    {
> > +      if (val)
> > +	{
> > +	  c = (c >> c16lsb) | (HOST_WIDE_INT_M1U << 32);
> > +	  *val = c;
> > +	  *shift = c16lsb;
> > +	  *mask = HOST_WIDE_INT_M1U >> lead_zeros;
> > +	}
> > +      return true;
> > +    }

ok

> > +  /* TRAIL_ZEROS is less than 49 here since 49 or more trailing zeros
> > +     are handled above as a rotate of a positive 16-bit value.  */

I don't easily see a "rotate .. positive 16-bit value" comment above.
Possibly it refers to the "00...01xxxxxxxxxxxxxxx1..11 (up to 15 x's ..."
chunk of code.  It may be worth embellishing the earlier comments so
the reference is obvious.

> > +  int trail_zeros = ctz_hwi (c);
> > +  /* 1..1xxxxxxxxxxxxxxx1..10..0 (any number of leading 1's, up to 15
> > +     x's followed by any number of 1's followed by any number of 0's)
> > +     can be implemented as li, rldicr.  */
> > +  c16lsb = HOST_BITS_PER_WIDE_INT - 15 - lead_ones;
> > +  if ((c | (HOST_WIDE_INT_M1U << c16lsb)
> > +       | ~(HOST_WIDE_INT_M1U << trail_zeros)) == HOST_WIDE_INT_M1U)
> > +    {
> > +      if (val)
> > +	{
> > +	  c = (c >> c16lsb) | (HOST_WIDE_INT_M1U << 15);
> > +	  *val = c;
> > +	  *shift = c == HOST_WIDE_INT_M1U ? 0 : c16lsb;
> > +	  *mask = HOST_WIDE_INT_M1U << trail_zeros;
> > +	}
> > +      return true;
> > +    }
> > +  /* 1..1xxxxxxxxxxxxxxx0..01..10..0 (any number of leading 1's, up to
> > +     15 x's followed by 16 0's, any number of 1's followed by any
> > +     number of 0's) can be implemented as lis, rldicr.  */
> > +  c16lsb = HOST_BITS_PER_WIDE_INT - 31 - lead_ones;
> > +  if (c16lsb >= trail_zeros
> > +      && (c & ((HOST_WIDE_INT_1U << (c16lsb + 16))
> > +	       - (HOST_WIDE_INT_1U << c16lsb))) == 0
> > +      && (c | (HOST_WIDE_INT_M1U << c16lsb)
> > +	  | ~(HOST_WIDE_INT_M1U << trail_zeros)) == HOST_WIDE_INT_M1U)
> > +    {
> > +      if (val)
> > +	{
> > +	  c = (c >> c16lsb) | (HOST_WIDE_INT_M1U << 31);
> > +	  *val = c;
> > +	  *shift = c16lsb;
> > +	  *mask = HOST_WIDE_INT_M1U << trail_zeros;
> > +	}
> > +      return true;
> > +    }
> > +  /* 00..011..1xxxxxxxxxxxxxx0..0 (up to 15 x's, any number of leading
> > +     0's followed by any number of 1's and any number of trailing 0's)
> > +     can be implemented as li, rldic.  */
> > +  c16lsb = trail_zeros;
> > +  if ((c | ~(HOST_WIDE_INT_M1U << (c16lsb + 15))
> > +       | ~(HOST_WIDE_INT_M1U >> lead_zeros)) == HOST_WIDE_INT_M1U)
> > +    {
> > +      if (val)
> > +	{
> > +	  c = (c >> c16lsb) | (HOST_WIDE_INT_M1U << 16);
> > +	  *val = c;
> > +	  *shift = c16lsb;
> > +	  *mask = ((HOST_WIDE_INT_M1U << c16lsb)
> > +		   & (HOST_WIDE_INT_M1U >> lead_zeros));
> > +	}
> > +      return true;
> > +    }
> > +  /* 11..1xxxxxxxxxxxxxx0..01..1 (up to 15 x's, any number of leading
> > +     1's and any number of trailing 0's followed by any number of 1's)
> > +     can be implemented as li, rldic.  */
> > +  int trail_ones = ctz_hwi (~c);
> > +  c16lsb = HOST_BITS_PER_WIDE_INT - 15 - lead_ones;
> > +  if ((c & (HOST_WIDE_INT_M1U << trail_ones)
> > +       & ~(HOST_WIDE_INT_M1U << c16lsb)) == 0)
> > +    {
> > +      if (val)
> > +	{
> > +	  c = (c >> c16lsb) | (HOST_WIDE_INT_M1U << 15);
> > +	  /* Canonicalize by removing trailing zeros in the constant.
> > +	     eg. 0x8000 -> 0xffff.  */
> > +	  trail_zeros = ctz_hwi (c);
> > +	  c = (HOST_WIDE_INT) c >> trail_zeros;
> > +	  c16lsb += trail_zeros;
> > +	  *val = c;
> > +	  *shift = c16lsb;
> > +	  *mask = ((HOST_WIDE_INT_M1U << c16lsb)
> > +		   | ~(HOST_WIDE_INT_M1U << trail_ones));
> > +	}
> > +      return true;
> > +    }
> > +
> > +  return false;
> > +}

ok

> > +
> >  /* Subroutine of rs6000_emit_set_const, handling PowerPC64 DImode.
> >     Output insns to set DEST equal to the constant C as a series of
> >     lis, ori and shl instructions.  */
> > @@ -10086,6 +10326,9 @@ rs6000_emit_set_long_const (rtx dest, HOST_WIDE_INT c)
> >  {
> >    rtx temp;
> >    HOST_WIDE_INT ud1, ud2, ud3, ud4;
> > +  HOST_WIDE_INT val = c;
> > +  int shift;
> > +  unsigned HOST_WIDE_INT mask;
> >  
> >    ud1 = c & 0xffff;
> >    c = c >> 16;
> > @@ -10111,6 +10354,17 @@ rs6000_emit_set_long_const (rtx dest, HOST_WIDE_INT c)
> >  			gen_rtx_IOR (DImode, copy_rtx (temp),
> >  				     GEN_INT (ud1)));
> >      }
> > +  else if (rotate_and_mask_constant (val, &val, &shift, &mask))
> > +    {
> > +      temp = !can_create_pseudo_p () ? dest : gen_reg_rtx (DImode);
> > +      emit_move_insn (temp, GEN_INT (val));
> > +      rtx x = copy_rtx (temp);
> > +      if (shift != 0)
> > +	x = gen_rtx_ROTATE (DImode, x, GEN_INT (shift));
> > +      if (mask != HOST_WIDE_INT_M1U)
> > +	x = gen_rtx_AND (DImode, x, GEN_INT (mask));
> > +      emit_move_insn (dest, x);
> > +    }
> >    else if (ud3 == 0 && ud4 == 0)
> >      {
> >        temp = !can_create_pseudo_p () ? dest : gen_reg_rtx (DImode);
> > diff --git a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md
> > index a84438f8545..a76bb560ae7 100644
> > --- a/gcc/config/rs6000/rs6000.md
> > +++ b/gcc/config/rs6000/rs6000.md
> > @@ -9356,24 +9356,8 @@ (define_insn "*movdi_internal64"
> >  	   *,          *,          *,
> >  	   p8v,        p8v")])
> >  
> > -; Some DImode loads are best done as a load of -1 followed by a mask
> > -; instruction.
> > -(define_split
> > -  [(set (match_operand:DI 0 "int_reg_operand_not_pseudo")
> > -	(match_operand:DI 1 "const_int_operand"))]
> > -  "TARGET_POWERPC64
> > -   && num_insns_constant (operands[1], DImode) > 1
> > -   && !IN_RANGE (INTVAL (operands[1]), -0x80000000, 0xffffffff)
> > -   && rs6000_is_valid_and_mask (operands[1], DImode)"
> > -  [(set (match_dup 0)
> > -	(const_int -1))
> > -   (set (match_dup 0)
> > -	(and:DI (match_dup 0)
> > -		(match_dup 1)))]
> > -  "")
> > -
> > -;; Split a load of a large constant into the appropriate five-instruction
> > -;; sequence.  Handle anything in a constant number of insns.
> > +;; Split a load of a large constant into a sequence of two to five
> > +;; instructions.

Was/is the five-instruction sequence special aside from being the
resulting number of instructions?   I wonder if dropping the "two to
five" entirely is appropriate.


> >  ;; When non-easy constants can go in the TOC, this should use
> >  ;; easy_fp_constant predicate.
> >  (define_split
> > diff --git a/gcc/testsuite/gcc.target/powerpc/rot_cst.h b/gcc/testsuite/gcc.target/powerpc/rot_cst.h
> > new file mode 100644
> > index 00000000000..a57b5aca6cf
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/powerpc/rot_cst.h
> > @@ -0,0 +1,5270 @@
> > +unsigned long long __attribute__ ((__noinline__, __noclone__))
> > +c1 (void)
> > +{
> > +  return 0xc000000000000000ULL;
> > +}
> > 
<snip>

> > 
> > +}

Looks like an exhaustive test, seems good.  :-)

> > diff --git a/gcc/testsuite/gcc.target/powerpc/rot_cst1.c b/gcc/testsuite/gcc.target/powerpc/rot_cst1.c
> > new file mode 100644
> > index 00000000000..d7781cf8ac7
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/powerpc/rot_cst1.c
> > @@ -0,0 +1,1077 @@
> > +/* { dg-do run } */
> > +/* { dg-options "-O2" } */
> > +
> > +#include "rot_cst.h"
> > +
> > +struct fun {
> > +  unsigned long long (*f) (void);
> > +  unsigned long long val;
> > +};
> > +
> > +volatile struct fun values[] = {
> > +  { c1,  0xc000000000000000ULL },
> > +  { c2,  0x0c00000000000000ULL },
> > +  { c3,  0x00c0000000000000ULL },
> > +  { c4,  0x000c000000000000ULL },
> > +  { c5,  0x0000c00000000000ULL },
> > +  { c6,  0x00000c0000000000ULL },
<snip>
> > +  { c1995, 0xffffffff3ff63fffULL },
> > +  { c1996, 0xda1e000000000001ULL },
> > +  { c1997, 0xf3a3800000000001ULL },
> > +  { c1998, 0x1e0e000000000001ULL },
> > +  { c1999, 0xb7c6000000000001ULL },
> > +  { c2000, 0x36ba000000000001ULL }
> > +};
> > +
> > +int
> > +main (void)
> > +{
> > +  volatile struct fun *t;
> > +
> > +  for (t = values; t < values + sizeof (values) / sizeof (values[0]); ++t)
> > +    if (t->f () != t->val)
> > +      __builtin_abort ();
> > +  return 0;
> > +}
> > diff --git a/gcc/testsuite/gcc.target/powerpc/rot_cst2.c b/gcc/testsuite/gcc.target/powerpc/rot_cst2.c
> > new file mode 100644
> > index 00000000000..7faa9cccadc
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/powerpc/rot_cst2.c
> > @@ -0,0 +1,10 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O2" } */
> > +
> > +#include "rot_cst.h"
> > +
> > +/* c10, c11, c12, c14, c15, c16, c30, c31, c32, c39, c40, c43, c44 are
> > +   2 insn functions, the rest all 3 instructions.  */
> > +/* { dg-final { scan-assembler-times {(?n)^\s+[a-z]} 3149 { target { lp64 } } } } */
> > +
> > +/* { dg-final { scan-assembler-times {(?n)^\s+[a-z]} 3784 { target { ilp32 } } } } */


Generic concern about fragility given the large instruction counts in
the scan-assembler-times checks, but I don't have a better idea for
this situation.

lgtm
thanks
-Will


> > 
> 
> 



More information about the Gcc-patches mailing list