Re: [PATCH] [Aarch64] Variable shift count truncation issues
Michael Collison <Michael.Collison@arm.com> writes:
> Updated the patch per Richard's suggestions to allow scheduling of
> instructions before reload.
Thanks, this looks good to me FWIW, but obviously I can't approve it.
Richard
> Bootstrapped and tested on aarch64-linux-gnu. Okay for trunk?
>
> 2017-05-22 Kyrylo Tkachov <kyrylo.tkachov@arm.com>
> Michael Collison <michael.collison@arm.com>
>
> PR target/70119
> * config/aarch64/aarch64.md (*aarch64_<optab>_reg_<mode>3_mask1):
> New pattern.
> (*aarch64_reg_<mode>3_neg_mask2): New pattern.
> (*aarch64_reg_<mode>3_minus_mask): New pattern.
> (*aarch64_<optab>_reg_di3_mask2): New pattern.
> * config/aarch64/aarch64.c (aarch64_rtx_costs): Account for cost
> of shift when the shift amount is masked with constant equal to
> the size of the mode.
> * config/aarch64/predicates.md (subreg_lowpart_operator): New
> predicate.
>
>
> 2017-05-22 Kyrylo Tkachov <kyrylo.tkachov@arm.com>
> Michael Collison <michael.collison@arm.com>
>
> PR target/70119
> * gcc.target/aarch64/var_shift_mask_1.c: New test.
>
> -----Original Message-----
> From: Richard Sandiford [mailto:richard.sandiford@linaro.org]
> Sent: Thursday, June 15, 2017 6:40 AM
> To: Michael Collison <Michael.Collison@arm.com>
> Cc: Wilco Dijkstra <Wilco.Dijkstra@arm.com>; Christophe Lyon <christophe.lyon@linaro.org>; GCC Patches <gcc-patches@gcc.gnu.org>; nd <nd@arm.com>
> Subject: Re: [PATCH] [Aarch64] Variable shift count truncation issues
>
> Michael Collison <Michael.Collison@arm.com> writes:
>> +(define_insn_and_split "*aarch64_reg_<mode>3_neg_mask2"
>> + [(set (match_operand:GPI 0 "register_operand" "=r")
>> + (SHIFT:GPI
>> + (match_operand:GPI 1 "register_operand" "r")
>> + (match_operator 4 "subreg_lowpart_operator"
>> + [(neg:SI (and:SI (match_operand:SI 2 "register_operand" "r")
>> + (match_operand 3 "const_int_operand" "n")))])))
>> + (clobber (match_scratch:SI 5 "=&r"))]
>> + "((~INTVAL (operands[3]) & (GET_MODE_BITSIZE (<MODE>mode)-1)) == 0)"
>> + "#"
>> + "&& reload_completed"
>> + [(const_int 0)]
>> + {
>> + emit_insn (gen_negsi2 (operands[5], operands[2]));
>> +
>> + rtx and_op = gen_rtx_AND (SImode, operands[5], operands[3]);
>> + rtx subreg_tmp = gen_rtx_SUBREG (GET_MODE (operands[4]), and_op,
>> + SUBREG_BYTE (operands[4]));
>> + emit_insn (gen_<optab><mode>3 (operands[0], operands[1], subreg_tmp));
>> + DONE;
>> + }
>> +)
>
> Thanks, I agree this looks correct from the split/reload_completed POV.
> I think we can go one better though, either:
>
> (a) Still allow the split when !reload_completed, and use:
>
> if (GET_MODE (operands[5]) == SCRATCH)
> operands[5] = gen_reg_rtx (SImode);
>
> This will allow the individual instructions to be scheduled by sched1.
>
> (b) Continue to restrict the split to reload_completed, change operand 0
> to =&r so that it can be used as a temporary, and drop operand 5 entirely.
>
> Or perhaps do both:
>
> (define_insn_and_split "*aarch64_reg_<mode>3_neg_mask2"
> [(set (match_operand:GPI 0 "register_operand" "=&r")
> (SHIFT:GPI
> (match_operand:GPI 1 "register_operand" "r")
> (match_operator 4 "subreg_lowpart_operator"
> [(neg:SI (and:SI (match_operand:SI 2 "register_operand" "r")
> (match_operand 3 "const_int_operand" "n")))])))]
> "((~INTVAL (operands[3]) & (GET_MODE_BITSIZE (<MODE>mode)-1)) == 0)"
> "#"
> "&& 1"
> [(const_int 0)]
> {
> rtx tmp = (can_create_pseudo_p () ? gen_reg_rtx (<GPI:MODE>mode)
> : operands[0]);
> emit_insn (gen_negsi2 (tmp, operands[2]));
>
> rtx and_op = gen_rtx_AND (SImode, tmp, operands[3]);
> rtx subreg_tmp = gen_rtx_SUBREG (GET_MODE (operands[4]), and_op,
> SUBREG_BYTE (operands[4]));
> emit_insn (gen_<optab><mode>3 (operands[0], operands[1], subreg_tmp));
> DONE;
> }
> )
>
> Sorry for the run-around. I should have realised earlier that these patterns didn't really need a distinct register after RA.
>
> Thanks,
> Richard
>
> diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
> index 5707e53..45377a2 100644
> --- a/gcc/config/aarch64/aarch64.c
> +++ b/gcc/config/aarch64/aarch64.c
> @@ -7407,17 +7407,26 @@ cost_plus:
> }
> else
> {
> - if (speed)
> + if (VECTOR_MODE_P (mode))
> {
> - if (VECTOR_MODE_P (mode))
> - {
> - /* Vector shift (register). */
> - *cost += extra_cost->vect.alu;
> - }
> - else
> + if (speed)
> + /* Vector shift (register). */
> + *cost += extra_cost->vect.alu;
> + }
> + else
> + {
> + if (speed)
> + /* LSLV. */
> + *cost += extra_cost->alu.shift_reg;
> +
> + if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
> + && CONST_INT_P (XEXP (op1, 1))
> + && INTVAL (XEXP (op1, 1)) == GET_MODE_BITSIZE (mode) - 1)
> {
> - /* LSLV. */
> - *cost += extra_cost->alu.shift_reg;
> + *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
> + /* We already demanded XEXP (op1, 0) to be REG_P, so
> + don't recurse into it. */
> + return true;
> }
> }
> return false; /* All arguments need to be in registers. */
> @@ -7446,14 +7455,27 @@ cost_plus:
> }
> else
> {
> -
> - /* ASR (register) and friends. */
> - if (speed)
> + if (VECTOR_MODE_P (mode))
> {
> - if (VECTOR_MODE_P (mode))
> + if (speed)
> + /* Vector shift (register). */
> *cost += extra_cost->vect.alu;
> - else
> + }
> + else
> + {
> + if (speed)
> + /* ASR (register) and friends. */
> *cost += extra_cost->alu.shift_reg;
> +
> + if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
> + && CONST_INT_P (XEXP (op1, 1))
> + && INTVAL (XEXP (op1, 1)) == GET_MODE_BITSIZE (mode) - 1)
> + {
> + *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
> + /* We already demanded XEXP (op1, 0) to be REG_P, so
> + don't recurse into it. */
> + return true;
> + }
> }
> return false; /* All arguments need to be in registers. */
> }
> diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
> index d89df66..ff5720c 100644
> --- a/gcc/config/aarch64/aarch64.md
> +++ b/gcc/config/aarch64/aarch64.md
> @@ -3942,6 +3942,97 @@
> }
> )
>
> +;; When the LSL, LSR, ASR, ROR instructions operate on all register arguments
> +;; they truncate the shift/rotate amount by the size of the registers they
> +;; operate on: 32 for W-regs, 64 for X-regs. This allows us to optimise away
> +;; such redundant masking instructions. GCC can do that automatically when
> +;; SHIFT_COUNT_TRUNCATED is true, but we can't enable it for TARGET_SIMD
> +;; because some of the SISD shift alternatives don't perform this truncation.
> +;; So this pattern exists to catch such cases.
> +
> +(define_insn "*aarch64_<optab>_reg_<mode>3_mask1"
> + [(set (match_operand:GPI 0 "register_operand" "=r")
> + (SHIFT:GPI
> + (match_operand:GPI 1 "register_operand" "r")
> + (match_operator 4 "subreg_lowpart_operator"
> + [(and:GPI (match_operand:GPI 2 "register_operand" "r")
> + (match_operand 3 "const_int_operand" "n"))])))]
> + "(~INTVAL (operands[3]) & (GET_MODE_BITSIZE (<MODE>mode)-1)) == 0"
> + "<shift>\t%<w>0, %<w>1, %<w>2"
> + [(set_attr "type" "shift_reg")]
> +)
> +
> +(define_insn_and_split "*aarch64_reg_<mode>3_neg_mask2"
> + [(set (match_operand:GPI 0 "register_operand" "=&r")
> + (SHIFT:GPI
> + (match_operand:GPI 1 "register_operand" "r")
> + (match_operator 4 "subreg_lowpart_operator"
> + [(neg:SI (and:SI (match_operand:SI 2 "register_operand" "r")
> + (match_operand 3 "const_int_operand" "n")))])))]
> + "((~INTVAL (operands[3]) & (GET_MODE_BITSIZE (<MODE>mode)-1)) == 0)"
> + "#"
> + "&& 1"
> + [(const_int 0)]
> + {
> + rtx tmp = (can_create_pseudo_p () ? gen_reg_rtx (SImode)
> + : operands[0]);
> + emit_insn (gen_negsi2 (tmp, operands[2]));
> +
> + rtx and_op = gen_rtx_AND (SImode, tmp, operands[3]);
> + rtx subreg_tmp = gen_rtx_SUBREG (GET_MODE (operands[4]), and_op,
> + SUBREG_BYTE (operands[4]));
> + emit_insn (gen_<optab><mode>3 (operands[0], operands[1], subreg_tmp));
> + DONE;
> + }
> +)
> +
> +(define_insn_and_split "*aarch64_reg_<mode>3_minus_mask"
> + [(set (match_operand:GPI 0 "register_operand" "=&r")
> + (ashift:GPI
> + (match_operand:GPI 1 "register_operand" "r")
> + (minus:QI (match_operand 2 "const_int_operand" "n")
> + (match_operator 5 "subreg_lowpart_operator"
> + [(and:SI (match_operand:SI 3 "register_operand" "r")
> + (match_operand 4 "const_int_operand" "n"))]))))]
> + "((~INTVAL (operands[4]) & (GET_MODE_BITSIZE (<MODE>mode)-1)) == 0)
> + && INTVAL (operands[2]) == GET_MODE_BITSIZE (<MODE>mode)"
> + "#"
> + "&& 1"
> + [(const_int 0)]
> + {
> + rtx tmp = (can_create_pseudo_p () ? gen_reg_rtx (SImode)
> + : operands[0]);
> +
> + emit_insn (gen_negsi2 (tmp, operands[3]));
> +
> + rtx and_op = gen_rtx_AND (SImode, tmp, operands[4]);
> + rtx subreg_tmp = gen_rtx_SUBREG (GET_MODE (operands[5]), and_op,
> + SUBREG_BYTE (operands[5]));
> +
> + emit_insn (gen_ashl<mode>3 (operands[0], operands[1], subreg_tmp));
> + DONE;
> + }
> +)
> +
> +(define_insn "*aarch64_<optab>_reg_di3_mask2"
> + [(set (match_operand:DI 0 "register_operand" "=r")
> + (SHIFT:DI
> + (match_operand:DI 1 "register_operand" "r")
> + (match_operator 4 "subreg_lowpart_operator"
> + [(and:SI (match_operand:SI 2 "register_operand" "r")
> + (match_operand 3 "aarch64_shift_imm_di" "Usd"))])))]
> + "((~INTVAL (operands[3]) & (GET_MODE_BITSIZE (DImode)-1)) == 0)"
> +{
> + rtx xop[3];
> + xop[0] = operands[0];
> + xop[1] = operands[1];
> + xop[2] = gen_lowpart (GET_MODE (operands[4]), operands[2]);
> + output_asm_insn ("<shift>\t%x0, %x1, %x2", xop);
> + return "";
> +}
> + [(set_attr "type" "shift_reg")]
> +)
> +
> ;; Logical left shift using SISD or Integer instruction
> (define_insn "*aarch64_ashl_sisd_or_int_<mode>3"
> [(set (match_operand:GPI 0 "register_operand" "=r,r,w,w")
> diff --git a/gcc/config/aarch64/predicates.md b/gcc/config/aarch64/predicates.md
> index cd7ded9..ad8a43c 100644
> --- a/gcc/config/aarch64/predicates.md
> +++ b/gcc/config/aarch64/predicates.md
> @@ -35,6 +35,10 @@
> (and (match_code "const_int")
> (match_test "op == CONST0_RTX (mode)")))
>
> +(define_special_predicate "subreg_lowpart_operator"
> + (and (match_code "subreg")
> + (match_test "subreg_lowpart_p (op)")))
> +
> (define_predicate "aarch64_ccmp_immediate"
> (and (match_code "const_int")
> (match_test "IN_RANGE (INTVAL (op), -31, 31)")))
> diff --git a/gcc/testsuite/gcc.target/aarch64/var_shift_mask_1.c b/gcc/testsuite/gcc.target/aarch64/var_shift_mask_1.c
> new file mode 100644
> index 0000000..e2b020e
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/var_shift_mask_1.c
> @@ -0,0 +1,61 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2" } */
> +
> +/* The integer variable shift and rotate instructions truncate their
> + shift amounts modulo the data size. Make sure that we don't emit a redundant
> + masking operation. */
> +
> +unsigned
> +f1 (unsigned x, int y)
> +{
> + return x << (y & 31);
> +}
> +
> +unsigned long
> +f2 (unsigned long x, int y)
> +{
> + return x << (y & 63);
> +}
> +
> +unsigned long
> +f3 (unsigned long bit_addr, int y)
> +{
> + unsigned long bitnumb = bit_addr & 63;
> + return (1L << bitnumb);
> +}
> +
> +unsigned int
> +f4 (unsigned int x, unsigned int y)
> +{
> + y &= 31;
> + return x >> y | (x << (32 - y));
> +}
> +
> +unsigned long
> +f5 (unsigned long x, unsigned long y)
> +{
> + y &= 63;
> + return x >> y | (x << (64 - y));
> +}
> +
> +unsigned long
> +f6 (unsigned long x, unsigned long y)
> +{
> +
> + return (x << (64 - (y & 63)));
> +
> +}
> +
> +unsigned long
> +f7 (unsigned long x, unsigned long y)
> +{
> + return (x << -(y & 63));
> +}
> +
> +/* { dg-final { scan-assembler-times "lsl\tw\[0-9\]+, w\[0-9\]+, w\[0-9\]+" 1 } } */
> +/* { dg-final { scan-assembler-times "lsl\tx\[0-9\]+, x\[0-9\]+, x\[0-9\]+" 4 } } */
> +/* { dg-final { scan-assembler-times "ror\tw\[0-9\]+, w\[0-9\]+, w\[0-9\]+" 1 } } */
> +/* { dg-final { scan-assembler-times "ror\tx\[0-9\]+, x\[0-9\]+, x\[0-9\]+" 1 } } */
> +/* { dg-final { scan-assembler-not "and\tw\[0-9\]+, w\[0-9\]+, 31" } } */
> +/* { dg-final { scan-assembler-not "and\tx\[0-9\]+, x\[0-9\]+, 63" } } */
> +/* { dg-final { scan-assembler-not "sub\tw\[0-9\]+, w\[0-9\]+, w\[0-9\]+" } } */
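
As a quick illustration of what the new patterns buy, here is a small C sketch (not part of the patch); the assembly in the comments is what the scan-assembler directives above suggest the output should look like, with assumed register numbers, rather than captured compiler output.

/* Illustrative sketch only.  The AArch64 variable shift instructions
   (LSLV and friends) truncate the shift count modulo the register width,
   so masking the count with width-1 in the source is redundant.  */
unsigned int
shift_left_masked (unsigned int x, int count)
{
  /* Without the new *aarch64_<optab>_reg_<mode>3_mask1 pattern, -O2 output
     would be expected to contain something like:
         and  w1, w1, 31
         lsl  w0, w0, w1
     With it, the AND should be folded away, leaving a single
         lsl  w0, w0, w1
     which is what the scan-assembler directives above check for.  */
  return x << (count & 31);
}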