This is the mail archive of the
gcc-patches@gcc.gnu.org
mailing list for the GCC project.
[PATCH][ARM] Improve 64-bit shifts (non-NEON)
- From: Andrew Stubbs <ams at codesourcery dot com>
- To: "gcc-patches at gcc dot gnu dot org" <gcc-patches at gcc dot gnu dot org>, "patches at linaro dot org" <patches at linaro dot org>
- Date: Fri, 27 Jan 2012 16:07:14 +0000
- Subject: [PATCH][ARM] Improve 64-bit shifts (non-NEON)
Hi all,
This patch introduces a new, more efficient set of DImode shift
sequences for values stored in core-registers (as opposed to VFP/NEON
registers).
The new sequences take advantage of knowledge of what the ARM
instructions do with out-of-range shift amounts.
The following are examples of a simple test case, like this one:
long long
f (long long *a, int b)
{
return *a << b;
}
In ARM mode, old left-shift vs. the new one:
stmfd sp!, {r4, r5} | ldrd r2, [r0]
rsb r4, r1, #32 | mov ip, r1
ldr r5, [r0, #4] | stmfd sp!, {r4, r5}
subs ip, r1, #32 | sub r5, ip, #32
ldr r0, [r0, #0] | rsb r4, ip, #32
mov r3, r5, asl r1 | mov r1, r3, asl ip
orr r3, r3, r0, lsr r4 | mov r0, r2, asl ip
mov r2, r0, asl r1 | orr r1, r1, r2, asl r5
movpl r3, r0, asl ip | orr r1, r1, r2, lsr r4
mov r0, r2 | ldmfd sp!, {r4, r5}
mov r1, r3 | bx lr
ldmfd sp!, {r4, r5} |
bx lr |
In Thumb mode, old left-shift vs. new:
ldr r2, [r0, #0] | ldrd r2, [r0]
ldr r3, [r0, #4] | push {r4, r5, r6}
push {r4, r5, r6} | sub r6, r1, #32
rsb r6, r1, #32 | mov r4, r1
sub r4, r1, #32 | rsb r5, r1, #32
lsls r3, r3, r1 | lsls r6, r2, r6
lsrs r6, r2, r6 | lsls r1, r3, r1
lsls r5, r2, r4 | lsrs r5, r2, r5
orrs r3, r3, r6 | lsls r0, r2, r4
lsls r0, r2, r1 | orrs r1, r1, r6
bics r1, r5, r4, asr #32 | orrs r1, r1, r5
it cs | pop {r4, r5, r6}
movcs r1, r3 | bx lr
pop {r4, r5, r6} |
bx lr |
Logical right shift is essentially the same sequence as the left shift
above. However, arithmetic right shift requires something slightly
different. Here it is in ARM mode, old vs. new:
stmfd sp!, {r4, r5} | ldrd r2, [r0]
rsb r4, r1, #32 | mov ip, r1
ldr r5, [r0, #0] | stmfd sp!, {r4, r5}
subs ip, r1, #32 | rsb r5, ip, #32
ldr r0, [r0, #4] | subs r4, ip, #32
mov r2, r5, lsr r1 | mov r0, r2, lsr ip
orr r2, r2, r0, asl r4 | mov r1, r3, asr ip
mov r3, r0, asr r1 | orr r0, r0, r3, asl r5
movpl r2, r0, asr ip | orrge r0, r0, r3, asr r4
mov r1, r3 | ldmfd sp!, {r4, r5}
mov r0, r2 | bx lr
ldmfd sp!, {r4, r5} |
bx lr |
I won't bore you with the Thumb mode comparison.
The shift-by-constant cases have also been reimplemented, although the
resultant sequences are much the same as before. (Doing this isn't
strictly necessary just yet, but when I post my next patch to do 64-bit
shifts in NEON, this feature will be required by the fall-back
alternatives.)
I've run a regression test on a cross-compiler, and I should have native
test results next week sometime, along with some benchmark results.
Is this OK for stage 1?
Andrew
2012-01-27 Andrew Stubbs <ams@codesourcery.com>
* config/arm/arm-protos.h (arm_emit_coreregs_64bit_shift): New
prototype.
* config/arm/arm.c (arm_emit_coreregs_64bit_shift): New function.
* config/arm/arm.md (ashldi3): Use arm_emit_coreregs_64bit_shift.
(ashrdi3,lshrdi3): Likewise.
---
gcc/config/arm/arm-protos.h | 3 +
gcc/config/arm/arm.c | 198 +++++++++++++++++++++++++++++++++++++++++++
gcc/config/arm/arm.md | 90 ++++++++++++++------
3 files changed, 264 insertions(+), 27 deletions(-)
diff --git a/gcc/config/arm/arm-protos.h b/gcc/config/arm/arm-protos.h
index 296550a..df8d7a9 100644
--- a/gcc/config/arm/arm-protos.h
+++ b/gcc/config/arm/arm-protos.h
@@ -242,6 +242,9 @@ struct tune_params
extern const struct tune_params *current_tune;
extern int vfp3_const_double_for_fract_bits (rtx);
+
+extern void arm_emit_coreregs_64bit_shift (enum rtx_code, rtx, rtx, rtx, rtx,
+ rtx);
#endif /* RTX_CODE */
#endif /* ! GCC_ARM_PROTOS_H */
diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c
index 0bded8d..eefc45c 100644
--- a/gcc/config/arm/arm.c
+++ b/gcc/config/arm/arm.c
@@ -25139,5 +25139,203 @@ vfp3_const_double_for_fract_bits (rtx operand)
return 0;
}
+/* The default expansion of general 64-bit shifts in core-regs is suboptimal
+ on ARM, since we know that shifts by negative amounts are no-ops.
+
+ It's safe for the input and output to be the same register, but
+ early-clobber rules apply for the shift amount and scratch registers.
+
+ Shift by register requires both scratch registers. Shift by a constant
+ less than 32 in Thumb2 mode requires SCRATCH1 only. In all other cases
+ the scratch registers may be NULL.
+
+ Additionally, ashiftrt by a register also clobbers the CC register. */
+void
+arm_emit_coreregs_64bit_shift (enum rtx_code code, rtx out, rtx in,
+ rtx amount, rtx scratch1, rtx scratch2)
+{
+ rtx out_high = gen_highpart (SImode, out);
+ rtx out_low = gen_lowpart (SImode, out);
+ rtx in_high = gen_highpart (SImode, in);
+ rtx in_low = gen_lowpart (SImode, in);
+
+ /* Bits flow from up-stream to down-stream. */
+ rtx out_up = code == ASHIFT ? out_low : out_high;
+ rtx out_down = code == ASHIFT ? out_high : out_low;
+ rtx in_up = code == ASHIFT ? in_low : in_high;
+ rtx in_down = code == ASHIFT ? in_high : in_low;
+
+ gcc_assert (code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT);
+ gcc_assert (out
+ && (REG_P (out) || GET_CODE (out) == SUBREG)
+ && GET_MODE (out) == DImode);
+ gcc_assert (in
+ && (REG_P (in) || GET_CODE (in) == SUBREG)
+ && GET_MODE (in) == DImode);
+ gcc_assert (amount
+ && (((REG_P (amount) || GET_CODE (amount) == SUBREG)
+ && GET_MODE (amount) == SImode)
+ || CONST_INT_P (amount)));
+ gcc_assert (scratch1 == NULL
+ || (GET_CODE (scratch1) == SCRATCH)
+ || (GET_MODE (scratch1) == SImode
+ && REG_P (scratch1)));
+ gcc_assert (scratch2 == NULL
+ || (GET_CODE (scratch2) == SCRATCH)
+ || (GET_MODE (scratch2) == SImode
+ && REG_P (scratch2)));
+ gcc_assert (!REG_P (out) || !REG_P (amount)
+ || !HARD_REGISTER_P (out)
+ || (REGNO (out) != REGNO (amount)
+ && REGNO (out) + 1 != REGNO (amount)));
+
+ /* Macros to make following code more readable. */
+ #define SUB_32(DEST,SRC) \
+ gen_addsi3 ((DEST), (SRC), gen_rtx_CONST_INT (VOIDmode, -32))
+ #define RSB_32(DEST,SRC) \
+ gen_subsi3 ((DEST), gen_rtx_CONST_INT (VOIDmode, 32), (SRC))
+ #define SUB_S_32(DEST,SRC) \
+ gen_addsi3_compare0 ((DEST), (SRC), \
+ gen_rtx_CONST_INT (VOIDmode, -32))
+ #define SET(DEST,SRC) \
+ gen_rtx_SET (SImode, (DEST), (SRC))
+ #define SHIFT(CODE,SRC,AMOUNT) \
+ gen_rtx_fmt_ee ((CODE), SImode, (SRC), (AMOUNT))
+ #define LSHIFT(CODE,SRC,AMOUNT) \
+ gen_rtx_fmt_ee ((CODE) == ASHIFT ? ASHIFT : LSHIFTRT, \
+ SImode, (SRC), (AMOUNT))
+ #define REV_LSHIFT(CODE,SRC,AMOUNT) \
+ gen_rtx_fmt_ee ((CODE) == ASHIFT ? LSHIFTRT : ASHIFT, \
+ SImode, (SRC), (AMOUNT))
+ #define ORR(A,B) \
+ gen_rtx_IOR (SImode, (A), (B))
+ #define IF(COND,RTX) \
+ gen_rtx_COND_EXEC (VOIDmode, \
+ gen_rtx_ ## COND (CCmode, cc_reg, \
+ const0_rtx), \
+ (RTX))
+
+ if (CONST_INT_P (amount))
+ {
+ /* Shifts by a constant amount. */
+ if (INTVAL (amount) <= 0)
+ /* Match what shift-by-register would do. */
+ emit_insn (gen_movdi (out, in));
+ else if (INTVAL (amount) >= 64)
+ {
+ /* Match what shift-by-register would do. */
+ if (code == ASHIFTRT)
+ {
+ rtx const31_rtx = gen_rtx_CONST_INT (VOIDmode, 31);
+ emit_insn (SET (out_down, SHIFT (code, in_up, const31_rtx)));
+ emit_insn (SET (out_up, SHIFT (code, in_up, const31_rtx)));
+ }
+ else
+ emit_insn (gen_movdi (out, const0_rtx));
+ }
+ else if (INTVAL (amount) < 32)
+ {
+ /* Shifts by a constant less than 32. */
+ rtx reverse_amount = gen_rtx_CONST_INT (VOIDmode,
+ 32 - INTVAL (amount));
+
+ emit_insn (SET (out_down, LSHIFT (code, in_down, amount)));
+ emit_insn (SET (out_down,
+ ORR (REV_LSHIFT (code, in_up, reverse_amount),
+ out_down)));
+ emit_insn (SET (out_up, SHIFT (code, in_up, amount)));
+ }
+ else
+ {
+ /* Shifts by a constant greater than 31. */
+ rtx adj_amount = gen_rtx_CONST_INT (VOIDmode, INTVAL (amount) - 32);
+
+ emit_insn (SET (out_down, SHIFT (code, in_up, adj_amount)));
+ if (code == ASHIFTRT)
+ emit_insn (gen_ashrsi3 (out_up, in_up,
+ gen_rtx_CONST_INT (VOIDmode, 31)));
+ else
+ emit_insn (SET (out_up, const0_rtx));
+ }
+ }
+ else
+ {
+ /* Shifts by a variable amount. */
+ rtx cc_reg = gen_rtx_REG (CC_NCVmode, CC_REGNUM);
+
+ gcc_assert (scratch1 && REG_P (scratch1));
+ gcc_assert (scratch2 && REG_P (scratch2));
+
+ switch (code)
+ {
+ case ASHIFT:
+ emit_insn (SUB_32 (scratch1, amount));
+ emit_insn (RSB_32 (scratch2, amount));
+ break;
+ case ASHIFTRT:
+ emit_insn (RSB_32 (scratch1, amount));
+ emit_insn (SUB_S_32 (scratch2, amount));
+ break;
+ case LSHIFTRT:
+ emit_insn (RSB_32 (scratch1, amount));
+ emit_insn (SUB_32 (scratch2, amount));
+ break;
+ default:
+ gcc_unreachable ();
+ }
+
+ emit_insn (SET (out_down, LSHIFT (code, in_down, amount)));
+
+ if (!TARGET_THUMB2)
+ {
+ /* If this were only called during expand we could just use the else
+ case and let combine deal with it, but this can also be called
+ from post-reload splitters. */
+ emit_insn (SET (out_down,
+ ORR (SHIFT (ASHIFT, in_up, scratch1), out_down)));
+ if (code == ASHIFTRT)
+ {
+ emit_insn (IF (GE,
+ SET (out_down,
+ ORR (SHIFT (ASHIFTRT, in_up, scratch2),
+ out_down))));
+ }
+ else
+ emit_insn (SET (out_down, ORR (SHIFT (LSHIFTRT, in_up, scratch2),
+ out_down)));
+ }
+ else
+ {
+ /* Thumb2 can't do shift and or in one insn. */
+ emit_insn (SET (scratch1, SHIFT (ASHIFT, in_up, scratch1)));
+ emit_insn (gen_iorsi3 (out_down, out_down, scratch1));
+
+ if (code == ASHIFTRT)
+ {
+ emit_insn (IF (GE, SET (scratch2,
+ SHIFT (ASHIFTRT, in_up, scratch2))));
+ emit_insn (IF (GE, SET (out_down, ORR (out_down, scratch2))));
+ }
+ else
+ {
+ emit_insn (SET (scratch2, SHIFT (LSHIFTRT, in_up, scratch2)));
+ emit_insn (gen_iorsi3 (out_down, out_down, scratch2));
+ }
+ }
+
+ emit_insn (SET (out_up, SHIFT (code, in_up, amount)));
+ }
+
+ #undef SUB_32
+ #undef RSB_32
+ #undef SUB_S_32
+ #undef SET
+ #undef SHIFT
+ #undef LSHIFT
+ #undef REV_LSHIFT
+ #undef ORR
+ #undef IF
+}
+
#include "gt-arm.h"
diff --git a/gcc/config/arm/arm.md b/gcc/config/arm/arm.md
index 751997f..7cae822 100644
--- a/gcc/config/arm/arm.md
+++ b/gcc/config/arm/arm.md
@@ -3466,21 +3466,33 @@
(match_operand:SI 2 "reg_or_int_operand" "")))]
"TARGET_32BIT"
"
- if (GET_CODE (operands[2]) == CONST_INT)
+ if (!CONST_INT_P (operands[2])
+ && (TARGET_REALLY_IWMMXT || (TARGET_HARD_FLOAT && TARGET_MAVERICK)))
+ ; /* No special preparation statements; expand pattern as above. */
+ else
{
- if ((HOST_WIDE_INT) INTVAL (operands[2]) == 1)
+ rtx scratch1, scratch2;
+
+ if (GET_CODE (operands[2]) == CONST_INT
+ && (HOST_WIDE_INT) INTVAL (operands[2]) == 1)
{
emit_insn (gen_arm_ashldi3_1bit (operands[0], operands[1]));
DONE;
}
- /* Ideally we shouldn't fail here if we could know that operands[1]
- ends up already living in an iwmmxt register. Otherwise it's
- cheaper to have the alternate code being generated than moving
- values to iwmmxt regs and back. */
- FAIL;
+
+ /* Ideally we should use iwmmxt here if we could know that operands[1]
+ ends up already living in an iwmmxt register. Otherwise it's
+ cheaper to have the alternate code being generated than moving
+ values to iwmmxt regs and back. */
+
+ /* Expand operation using core-registers.
+ 'FAIL' would achieve the same thing, but this is a bit smarter. */
+ scratch1 = gen_reg_rtx (SImode);
+ scratch2 = gen_reg_rtx (SImode);
+ arm_emit_coreregs_64bit_shift (ASHIFT, operands[0], operands[1],
+ operands[2], scratch1, scratch2);
+ DONE;
}
- else if (!TARGET_REALLY_IWMMXT && !(TARGET_HARD_FLOAT && TARGET_MAVERICK))
- FAIL;
"
)
@@ -3525,21 +3537,33 @@
(match_operand:SI 2 "reg_or_int_operand" "")))]
"TARGET_32BIT"
"
- if (GET_CODE (operands[2]) == CONST_INT)
+ if (!CONST_INT_P (operands[2])
+ && (TARGET_REALLY_IWMMXT || (TARGET_HARD_FLOAT && TARGET_MAVERICK)))
+ ; /* No special preparation statements; expand pattern as above. */
+ else
{
- if ((HOST_WIDE_INT) INTVAL (operands[2]) == 1)
+ rtx scratch1, scratch2;
+
+ if (GET_CODE (operands[2]) == CONST_INT
+ && (HOST_WIDE_INT) INTVAL (operands[2]) == 1)
{
emit_insn (gen_arm_ashrdi3_1bit (operands[0], operands[1]));
DONE;
}
- /* Ideally we shouldn't fail here if we could know that operands[1]
- ends up already living in an iwmmxt register. Otherwise it's
- cheaper to have the alternate code being generated than moving
- values to iwmmxt regs and back. */
- FAIL;
+
+ /* Ideally we should use iwmmxt here if we could know that operands[1]
+ ends up already living in an iwmmxt register. Otherwise it's
+ cheaper to have the alternate code being generated than moving
+ values to iwmmxt regs and back. */
+
+ /* Expand operation using core-registers.
+ 'FAIL' would achieve the same thing, but this is a bit smarter. */
+ scratch1 = gen_reg_rtx (SImode);
+ scratch2 = gen_reg_rtx (SImode);
+ arm_emit_coreregs_64bit_shift (ASHIFTRT, operands[0], operands[1],
+ operands[2], scratch1, scratch2);
+ DONE;
}
- else if (!TARGET_REALLY_IWMMXT)
- FAIL;
"
)
@@ -3582,21 +3606,33 @@
(match_operand:SI 2 "reg_or_int_operand" "")))]
"TARGET_32BIT"
"
- if (GET_CODE (operands[2]) == CONST_INT)
+ if (!CONST_INT_P (operands[2])
+ && (TARGET_REALLY_IWMMXT || (TARGET_HARD_FLOAT && TARGET_MAVERICK)))
+ ; /* No special preparation statements; expand pattern as above. */
+ else
{
- if ((HOST_WIDE_INT) INTVAL (operands[2]) == 1)
+ rtx scratch1, scratch2;
+
+ if (GET_CODE (operands[2]) == CONST_INT
+ && (HOST_WIDE_INT) INTVAL (operands[2]) == 1)
{
emit_insn (gen_arm_lshrdi3_1bit (operands[0], operands[1]));
DONE;
}
- /* Ideally we shouldn't fail here if we could know that operands[1]
- ends up already living in an iwmmxt register. Otherwise it's
- cheaper to have the alternate code being generated than moving
- values to iwmmxt regs and back. */
- FAIL;
+
+ /* Ideally we should use iwmmxt here if we could know that operands[1]
+ ends up already living in an iwmmxt register. Otherwise it's
+ cheaper to have the alternate code being generated than moving
+ values to iwmmxt regs and back. */
+
+ /* Expand operation using core-registers.
+ 'FAIL' would achieve the same thing, but this is a bit smarter. */
+ scratch1 = gen_reg_rtx (SImode);
+ scratch2 = gen_reg_rtx (SImode);
+ arm_emit_coreregs_64bit_shift (LSHIFTRT, operands[0], operands[1],
+ operands[2], scratch1, scratch2);
+ DONE;
}
- else if (!TARGET_REALLY_IWMMXT)
- FAIL;
"
)