PING^4: [RS6000] rotate and mask constants [PR94393]
will schmidt
will_schmidt@vnet.ibm.com
Wed Nov 3 18:58:53 GMT 2021
On Mon, 2021-10-25 at 14:41 -0500, Pat Haugen via Gcc-patches wrote:
> Ping.
>
> On 8/10/21 10:49 AM, Pat Haugen via Gcc-patches wrote:
> > On 7/27/21 1:35 PM, will schmidt wrote:
> > > On Fri, 2021-07-23 at 15:23 -0500, Pat Haugen via Gcc-patches wrote:
> > > > Ping
> > > > https://gcc.gnu.org/pipermail/gcc-patches/2020-October/555760.html
> > > >
> > > > I've done a current bootstrap/regtest on powerpc64/powerpc64le with
> > > > no regressions.
> > > >
> > > > -Pat
> > >
> > > That patch was previously posted by Alan Modra.
> > > Given the time lapse this may need to be re-posted entirely, pending
> > > what the maintainers suggest.. :-)
> > >
> > >
> >
> > Including full patch.
A non-authorative re-review. :-)
> >
> >
> > diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c
> > index 60f406a4ff6..8e758a1f2dd 100644
> > --- a/gcc/config/rs6000/rs6000.c
> > +++ b/gcc/config/rs6000/rs6000.c
> > @@ -1131,6 +1131,8 @@ static tree rs6000_handle_altivec_attribute (tree *, tree, tree, int, bool *);
> > static tree rs6000_handle_struct_attribute (tree *, tree, tree, int, bool *);
> > static tree rs6000_builtin_vectorized_libmass (combined_fn, tree, tree);
> > static void rs6000_emit_set_long_const (rtx, HOST_WIDE_INT);
> > +static bool rotate_and_mask_constant (unsigned HOST_WIDE_INT, HOST_WIDE_INT *,
> > + int *, unsigned HOST_WIDE_INT *);
> > static int rs6000_memory_move_cost (machine_mode, reg_class_t, bool);
> > static bool rs6000_debug_rtx_costs (rtx, machine_mode, int, int, int *, bool);
> > static int rs6000_debug_address_cost (rtx, machine_mode, addr_space_t,
Prototype for rotate_and_mask_constant function. OK.
> > @@ -5973,7 +5975,7 @@ num_insns_constant_gpr (HOST_WIDE_INT value)
> > }
> >
> > /* Helper for num_insns_constant. Allow constants formed by the
> > - num_insns_constant_gpr sequences, plus li -1, rldicl/rldicr/rlwinm,
> > + num_insns_constant_gpr sequences, and li/lis+rldicl/rldicr/rldic/rlwinm,
> > and handle modes that require multiple gprs. */
Ok.
> >
> > static int
> > @@ -5988,8 +5990,8 @@ num_insns_constant_multi (HOST_WIDE_INT value, machine_mode mode)
> > if (insns > 2
> > /* We won't get more than 2 from num_insns_constant_gpr
> > except when TARGET_POWERPC64 and mode is DImode or
> > - wider, so the register mode must be DImode. */
> > - && rs6000_is_valid_and_mask (GEN_INT (low), DImode))
> > + wider. */
> > + && rotate_and_mask_constant (low, NULL, NULL, NULL))
> > insns = 2;
> > total += insns;
> > /* If BITS_PER_WORD is the number of bits in HOST_WIDE_INT, doing
> > @@ -10077,6 +10079,244 @@ rs6000_emit_set_const (rtx dest, rtx source)
> > return true;
> > }
> >
> > +/* Rotate DImode word, being careful to handle the case where
> > + HOST_WIDE_INT is larger than DImode. */
> > +
> > +static inline unsigned HOST_WIDE_INT
> > +rotate_di (unsigned HOST_WIDE_INT x, unsigned int shift)
> > +{
> > + unsigned HOST_WIDE_INT mask_hi, mask_lo;
> > +
> > + mask_hi = (HOST_WIDE_INT_1U << 63 << 1) - (HOST_WIDE_INT_1U << shift);
> > + mask_lo = (HOST_WIDE_INT_1U << shift) - 1;
> > + x = ((x << shift) & mask_hi) | ((x >> (64 - shift)) & mask_lo);
> > + x = (x ^ (HOST_WIDE_INT_1U << 63)) - (HOST_WIDE_INT_1U << 63);
> > + return x;
> > +}
ok.
> > +
> > +/* Can C be formed by rotating a 16-bit positive value left by C16LSB? */
Perhaps rephrase as "Attempt to form a constant by ... C16LSB."
> > +
> > +static inline bool
> > +is_rotate_positive_constant (unsigned HOST_WIDE_INT c, int c16lsb,
> > + HOST_WIDE_INT *val, int *shift,
> > + unsigned HOST_WIDE_INT *mask)
> > +{
> > + if ((c & ~(HOST_WIDE_INT_UC (0x7fff) << c16lsb)) == 0)
> > + {
> > + /* eg. c = 1100 0000 0000 ... 0000
> > + -> val = 0x3000, shift = 49, mask = -1ull. */
> > + if (val)
> > + {
> > + c >>= c16lsb;
> > + /* Make the value and shift canonical in the sense of
> > + selecting the smallest value. For the example above
> > + -> val = 3, shift = 61. */
Comment seems reasonable. I did not review or confirm the maths
involved.
> > + int trail_zeros = ctz_hwi (c);
> > + c >>= trail_zeros;
> > + c16lsb += trail_zeros;
> > + *val = c;
> > + *shift = c16lsb;
> > + *mask = HOST_WIDE_INT_M1U;
> > + }
> > + return true;
> > + }
> > + return false;
> > +}
> > +
> > +/* Can C be formed by rotating a 16-bit negative value left by C16LSB? */
> > +
> > +static inline bool
> > +is_rotate_negative_constant (unsigned HOST_WIDE_INT c, int c16lsb,
> > + HOST_WIDE_INT *val, int *shift,
> > + unsigned HOST_WIDE_INT *mask)
> > +{
> > + if ((c | (HOST_WIDE_INT_UC (0x7fff) << c16lsb)) == HOST_WIDE_INT_M1U)
> > + {
> > + if (val)
> > + {
> > + c >>= c16lsb;
> > + /* When there are no leading ones, C16LSB is 49 and we have
> > + an implicit 1 bit to make up our 16-bit negative value,
> > + which is why the shift here is 15 rather than 16. */
> > + c |= HOST_WIDE_INT_M1U << 15;
> > + /* Canonicalize by removing trailing ones in the constant.
> > + eg. 0xbfff -> 0xfffe. */
Comment seems reasonable. I did not confirm the math involved.
> > + int trail_ones = ctz_hwi (~c);
> > + c = (HOST_WIDE_INT) c >> trail_ones;
> > + c16lsb += trail_ones;
> > + *val = c;
> > + *shift = c16lsb;
> > + *mask = HOST_WIDE_INT_M1U;
> > + }
> > + return true;
> > + }
> > + return false;
> > +}
> > +
> > +/* Detect cases where a constant can be formed by li; rldicl, li; rldicr,
> > + lis; rldicl, lis; rldicr, or li; rldic. */
> > +
> > +static bool
> > +rotate_and_mask_constant (unsigned HOST_WIDE_INT c,
> > + HOST_WIDE_INT *val, int *shift,
> > + unsigned HOST_WIDE_INT *mask)
> > +{
> > + /* We know C is a DImode value properly sign extended to
> > + HOST_WIDE_INT that can't be formed by lis; addi. So if C is
> > + positive there must be a 1 bit somewhere in bits 32 thru 62, and
> > + if C is negative there must be a 0 bit in the same range. That
> > + puts constraints on the max leading zeros or ones. Thus no need
> > + to check the C16LSB (lsb of 16-bit constant that fits in li)
> > + expressions below for negative values. */
ok
> > + int lead_zeros = clz_hwi (c);
> > +
> > + /* First check for rotated constants. */
> > + int c16lsb = HOST_BITS_PER_WIDE_INT - 15 - lead_zeros;
> > + if (is_rotate_positive_constant (c, c16lsb, val, shift, mask))
> > + return true;
> > +
> > + /* Handle wrap-around 16-bit constants too, by rotating the
> > + potential wrapped constant so that it no longer wraps. This also
> > + has constraints on the max leading zeros or ones such that the
> > + C16LSB expression below will never be negative. */
ok
> > + unsigned HOST_WIDE_INT rc = rotate_di (c, 32);
> > + c16lsb = HOST_BITS_PER_WIDE_INT - 15 - clz_hwi (rc);
> > + if (is_rotate_positive_constant (rc, c16lsb, val, shift, mask))
> > + {
> > + if (val)
> > + *shift ^= 32;
> > + return true;
> > + }
> > +
> > + int lead_ones = clz_hwi (~c);
> > + c16lsb = HOST_BITS_PER_WIDE_INT - 15 - lead_ones;
> > + if (is_rotate_negative_constant (c, c16lsb, val, shift, mask))
> > + return true;
> > +
> > + c16lsb = HOST_BITS_PER_WIDE_INT - 15 - clz_hwi (~rc);
> > + if (is_rotate_negative_constant (rc, c16lsb, val, shift, mask))
> > + {
> > + if (val)
> > + *shift ^= 32;
> > + return true;
> > + }
> > +
> > + /* 00...01xxxxxxxxxxxxxxx1..11 (up to 15 x's, any number of leading
> > + 0's and trailing 1's) can be implemented as a li, rldicl. */
> > + c16lsb = HOST_BITS_PER_WIDE_INT - 16 - lead_zeros;
> > + if ((c | (HOST_WIDE_INT_M1U << c16lsb)) == HOST_WIDE_INT_M1U)
> > + {
> > + /* eg. c = 0000 1011 1111 1111 ... 1111
> > + -> val = sext(0xbfff), shift = 44, mask = 0x0fffffffffffffff. */
> > + if (val)
> > + {
> > + c = (c >> c16lsb) | (HOST_WIDE_INT_M1U << 16);
> > + *val = c;
> > + *shift = c == HOST_WIDE_INT_M1U ? 0 : c16lsb;
> > + *mask = HOST_WIDE_INT_M1U >> lead_zeros;
> > + }
> > + return true;
> > + }
> > + /* 00...01xxxxxxxxxxxxxxx00..01..11 (up to 15 x's followed by 16 0's,
> > + any number of leading 0's and trailing 1's) can be implemented as
> > + lis, rldicl. */
> > + c16lsb = HOST_BITS_PER_WIDE_INT - 32 - lead_zeros;
> > + if (c16lsb >= 0
> > + && (c & ((HOST_WIDE_INT_1U << (c16lsb + 16))
> > + - (HOST_WIDE_INT_1U << c16lsb))) == 0
> > + && (c | (HOST_WIDE_INT_M1U << c16lsb)) == HOST_WIDE_INT_M1U)
> > + {
> > + if (val)
> > + {
> > + c = (c >> c16lsb) | (HOST_WIDE_INT_M1U << 32);
> > + *val = c;
> > + *shift = c16lsb;
> > + *mask = HOST_WIDE_INT_M1U >> lead_zeros;
> > + }
> > + return true;
> > + }
ok
> > + /* TRAIL_ZEROS is less than 49 here since 49 or more trailing zeros
> > + are handled above as a rotate of a positive 16-bit value. */
I don't easily see a "rotate .. positive 16-bit value" comment above.
Possibly the "00....01xxxxx1..11 (up to 15 x's ..." chunk of code.
May be worth embellishing previous comments so the reference is
obvious.
> > + int trail_zeros = ctz_hwi (c);
> > + /* 1..1xxxxxxxxxxxxxxx1..10..0 (any number of leading 1's, up to 15
> > + x's followed by any number of 1's followed by any number of 0's)
> > + can be implemented as li, rldicr. */
> > + c16lsb = HOST_BITS_PER_WIDE_INT - 15 - lead_ones;
> > + if ((c | (HOST_WIDE_INT_M1U << c16lsb)
> > + | ~(HOST_WIDE_INT_M1U << trail_zeros)) == HOST_WIDE_INT_M1U)
> > + {
> > + if (val)
> > + {
> > + c = (c >> c16lsb) | (HOST_WIDE_INT_M1U << 15);
> > + *val = c;
> > + *shift = c == HOST_WIDE_INT_M1U ? 0 : c16lsb;
> > + *mask = HOST_WIDE_INT_M1U << trail_zeros;
> > + }
> > + return true;
> > + }
> > + /* 1..1xxxxxxxxxxxxxxx0..01..10..0 (any number of leading 1's, up to
> > + 15 x's followed by 16 0's, any number of 1's followed by any
> > + number of 0's) can be implemented as lis, rldicr. */
> > + c16lsb = HOST_BITS_PER_WIDE_INT - 31 - lead_ones;
> > + if (c16lsb >= trail_zeros
> > + && (c & ((HOST_WIDE_INT_1U << (c16lsb + 16))
> > + - (HOST_WIDE_INT_1U << c16lsb))) == 0
> > + && (c | (HOST_WIDE_INT_M1U << c16lsb)
> > + | ~(HOST_WIDE_INT_M1U << trail_zeros)) == HOST_WIDE_INT_M1U)
> > + {
> > + if (val)
> > + {
> > + c = (c >> c16lsb) | (HOST_WIDE_INT_M1U << 31);
> > + *val = c;
> > + *shift = c16lsb;
> > + *mask = HOST_WIDE_INT_M1U << trail_zeros;
> > + }
> > + return true;
> > + }
> > + /* 00..011..1xxxxxxxxxxxxxx0..0 (up to 15 x's, any number of leading
> > + 0's followed by any number of 1's and any number of trailing 0's)
> > + can be implemented as li, rldic. */
> > + c16lsb = trail_zeros;
> > + if ((c | ~(HOST_WIDE_INT_M1U << (c16lsb + 15))
> > + | ~(HOST_WIDE_INT_M1U >> lead_zeros)) == HOST_WIDE_INT_M1U)
> > + {
> > + if (val)
> > + {
> > + c = (c >> c16lsb) | (HOST_WIDE_INT_M1U << 16);
> > + *val = c;
> > + *shift = c16lsb;
> > + *mask = ((HOST_WIDE_INT_M1U << c16lsb)
> > + & (HOST_WIDE_INT_M1U >> lead_zeros));
> > + }
> > + return true;
> > + }
> > + /* 11..1xxxxxxxxxxxxxx0..01..1 (up to 15 x's, any number of leading
> > + 1's and any number of trailing 0's followed by any number of 1's)
> > + can be implemented as li, rldic. */
> > + int trail_ones = ctz_hwi (~c);
> > + c16lsb = HOST_BITS_PER_WIDE_INT - 15 - lead_ones;
> > + if ((c & (HOST_WIDE_INT_M1U << trail_ones)
> > + & ~(HOST_WIDE_INT_M1U << c16lsb)) == 0)
> > + {
> > + if (val)
> > + {
> > + c = (c >> c16lsb) | (HOST_WIDE_INT_M1U << 15);
> > + /* Canonicalize by removing trailing zeros in the constant.
> > + eg. 0x8000 -> 0xffff. */
> > + trail_zeros = ctz_hwi (c);
> > + c = (HOST_WIDE_INT) c >> trail_zeros;
> > + c16lsb += trail_zeros;
> > + *val = c;
> > + *shift = c16lsb;
> > + *mask = ((HOST_WIDE_INT_M1U << c16lsb)
> > + | ~(HOST_WIDE_INT_M1U << trail_ones));
> > + }
> > + return true;
> > + }
> > +
> > + return false;
> > +}
ok
> > +
> > /* Subroutine of rs6000_emit_set_const, handling PowerPC64 DImode.
> > Output insns to set DEST equal to the constant C as a series of
> > lis, ori and shl instructions. */
> > @@ -10086,6 +10326,9 @@ rs6000_emit_set_long_const (rtx dest, HOST_WIDE_INT c)
> > {
> > rtx temp;
> > HOST_WIDE_INT ud1, ud2, ud3, ud4;
> > + HOST_WIDE_INT val = c;
> > + int shift;
> > + unsigned HOST_WIDE_INT mask;
> >
> > ud1 = c & 0xffff;
> > c = c >> 16;
> > @@ -10111,6 +10354,17 @@ rs6000_emit_set_long_const (rtx dest, HOST_WIDE_INT c)
> > gen_rtx_IOR (DImode, copy_rtx (temp),
> > GEN_INT (ud1)));
> > }
> > + else if (rotate_and_mask_constant (val, &val, &shift, &mask))
> > + {
> > + temp = !can_create_pseudo_p () ? dest : gen_reg_rtx (DImode);
> > + emit_move_insn (temp, GEN_INT (val));
> > + rtx x = copy_rtx (temp);
> > + if (shift != 0)
> > + x = gen_rtx_ROTATE (DImode, x, GEN_INT (shift));
> > + if (mask != HOST_WIDE_INT_M1U)
> > + x = gen_rtx_AND (DImode, x, GEN_INT (mask));
> > + emit_move_insn (dest, x);
> > + }
> > else if (ud3 == 0 && ud4 == 0)
> > {
> > temp = !can_create_pseudo_p () ? dest : gen_reg_rtx (DImode);
> > diff --git a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md
> > index a84438f8545..a76bb560ae7 100644
> > --- a/gcc/config/rs6000/rs6000.md
> > +++ b/gcc/config/rs6000/rs6000.md
> > @@ -9356,24 +9356,8 @@ (define_insn "*movdi_internal64"
> > *, *, *,
> > p8v, p8v")])
> >
> > -; Some DImode loads are best done as a load of -1 followed by a mask
> > -; instruction.
> > -(define_split
> > - [(set (match_operand:DI 0 "int_reg_operand_not_pseudo")
> > - (match_operand:DI 1 "const_int_operand"))]
> > - "TARGET_POWERPC64
> > - && num_insns_constant (operands[1], DImode) > 1
> > - && !IN_RANGE (INTVAL (operands[1]), -0x80000000, 0xffffffff)
> > - && rs6000_is_valid_and_mask (operands[1], DImode)"
> > - [(set (match_dup 0)
> > - (const_int -1))
> > - (set (match_dup 0)
> > - (and:DI (match_dup 0)
> > - (match_dup 1)))]
> > - "")
> > -
> > -;; Split a load of a large constant into the appropriate five-instruction
> > -;; sequence. Handle anything in a constant number of insns.
> > +;; Split a load of a large constant into a sequence of two to five
> > +;; instructions.
Was/is the five-instruction sequence special aside from being the
resulting number of instructions? I wonder if dropping the "two to
five" entirely is appropriate.
> > ;; When non-easy constants can go in the TOC, this should use
> > ;; easy_fp_constant predicate.
> > (define_split
> > diff --git a/gcc/testsuite/gcc.target/powerpc/rot_cst.h b/gcc/testsuite/gcc.target/powerpc/rot_cst.h
> > new file mode 100644
> > index 00000000000..a57b5aca6cf
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/powerpc/rot_cst.h
> > @@ -0,0 +1,5270 @@
> > +unsigned long long __attribute__ ((__noinline__, __noclone__))
> > +c1 (void)
> > +{
> > + return 0xc000000000000000ULL;
> > +}
> >
<snip>
> >
> > +}
Looks like an exhaustive test, seems good. :-)
> > diff --git a/gcc/testsuite/gcc.target/powerpc/rot_cst1.c b/gcc/testsuite/gcc.target/powerpc/rot_cst1.c
> > new file mode 100644
> > index 00000000000..d7781cf8ac7
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/powerpc/rot_cst1.c
> > @@ -0,0 +1,1077 @@
> > +/* { dg-do run } */
> > +/* { dg-options "-O2" } */
> > +
> > +#include "rot_cst.h"
> > +
> > +struct fun {
> > + unsigned long long (*f) (void);
> > + unsigned long long val;
> > +};
> > +
> > +volatile struct fun values[] = {
> > + { c1, 0xc000000000000000ULL },
> > + { c2, 0x0c00000000000000ULL },
> > + { c3, 0x00c0000000000000ULL },
> > + { c4, 0x000c000000000000ULL },
> > + { c5, 0x0000c00000000000ULL },
> > + { c6, 0x00000c0000000000ULL },
<snip>
> > + { c1995, 0xffffffff3ff63fffULL },
> > + { c1996, 0xda1e000000000001ULL },
> > + { c1997, 0xf3a3800000000001ULL },
> > + { c1998, 0x1e0e000000000001ULL },
> > + { c1999, 0xb7c6000000000001ULL },
> > + { c2000, 0x36ba000000000001ULL }
> > +};
> > +
> > +int
> > +main (void)
> > +{
> > + volatile struct fun *t;
> > +
> > + for (t = values; t < values + sizeof (values) / sizeof (values[0]); ++t)
> > + if (t->f () != t->val)
> > + __builtin_abort ();
> > + return 0;
> > +}
> > diff --git a/gcc/testsuite/gcc.target/powerpc/rot_cst2.c b/gcc/testsuite/gcc.target/powerpc/rot_cst2.c
> > new file mode 100644
> > index 00000000000..7faa9cccadc
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/powerpc/rot_cst2.c
> > @@ -0,0 +1,10 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O2" } */
> > +
> > +#include "rot_cst.h"
> > +
> > +/* c10, c11, c12, c14, c15, c16, c30, c31, c32, c39, c40, c43, c44 are
> > + 2 insn functions, the rest all 3 instructions. */
> > +/* { dg-final { scan-assembler-times {(?n)^\s+[a-z]} 3149 { target { lp64 } } } } */
> > +
> > +/* { dg-final { scan-assembler-times {(?n)^\s+[a-z]} 3784 { target { ilp32 } } } } */
Generic concern about fragility given the large instruction numbers,
but I don't have a better idea for this situation..
lgtm
thanks
-Will
> >
>
>
More information about the Gcc-patches
mailing list