This is the mail archive of the
gcc-patches@gcc.gnu.org
mailing list for the GCC project.
Re: [PATCH: ARM] PR 45335 Use ldrd and strd to access two consecutive words
- From: Carrot Wei <carrot at google dot com>
- To: Richard Earnshaw <rearnsha at arm dot com>
- Cc: ramana dot radhakrishnan at arm dot com, gcc-patches at gcc dot gnu dot org
- Date: Sat, 4 Sep 2010 20:41:41 +0800
- Subject: Re: [PATCH: ARM] PR 45335 Use ldrd and strd to access two consecutive words
- References: <AANLkTim+nD_6m5PwDBkz0BVz1Ftr+DYyih17nXX5LMue@mail.gmail.com> <AANLkTimisL+aARNpQZGxD2tFFkO0dsZgAPWC84gC3J0t@mail.gmail.com> <1282658136.22948.34.camel@e102325-lin.cambridge.arm.com> <AANLkTin6a-gkgSvuhLpRHfxFVK_DJgUpRhuqnN9PD_QT@mail.gmail.com> <1283354531.25967.50.camel@e102346-lin.cambridge.arm.com>
On Wed, Sep 1, 2010 at 11:22 PM, Richard Earnshaw <rearnsha@arm.com> wrote:
> If you submit an updated patch, please re-include the changelog entry,
> even if it's the same.
>
> There are two obvious problems with this patch:
>
> 1) You presume that ldrd is always cheaper than ldm(2 regs). ?This isn't
> the case on Cortex-a9. ?I'm not expecting you to work out all the
> details of when A9 should use LDM and when it should use ldrd, but your
> code needs to ascertain the costs of each alternative and make a
> decision based on that answer, not on a static choice.
>
> 2) Your code fails to check for volatile mems. ?These must not be
> transformed and the original load/store instructions must be preserved.
>
1. A new function thumb2_prefer_ldmstm is used to choose ldm/stm or ldrd/strd.
The default behavior is to output ldrd/strd. One should update this function if
ldm/stm is better.
2. Function thumb2_legitimate_ldrd_p is updated to check volatile memory access.
Following is the new patch
ChangeLog:
2010-09-04 Wei Guozhi <carrot@google.com>
PR target/45335
* gcc/config/arm/thumb2.md (thumb2_ldrd, thumb2_ldrd_reg1,
thumb2_ldrd_reg2 and peephole2): New insn pattern and related
peephole2.
(thumb2_strd, thumb2_strd_reg1, thumb2_strd_reg2 and peephole2):
New insn pattern and related peephole2.
* gcc/config/arm/arm.c (thumb2_legitimate_ldrd_p): New function.
(thumb2_check_ldrd_operands): New function.
(thumb2_prefer_ldmstm): New function.
* gcc/config/arm/arm-protos.h (thumb2_legitimate_ldrd_p): New prototype.
(thumb2_check_ldrd_operands): New prototype.
(thumb2_prefer_ldmstm): New prototype.
* gcc/config/arm/ldmstm.md (ldm2_ia, stm2_ia, ldm2_db, stm2_db):
Change the ldm/stm patterns with 2 words to ARM only.
* gcc/config/arm/constraints.md (Py): New thumb2 constant constraint
suitable to ldrd/strd instructions.
2010-09-04 Wei Guozhi <carrot@google.com>
PR target/45335
* gcc.target/arm/pr45335.c: New test.
* gcc.target/arm/pr40457-1.c: Changed to load 3 words.
* gcc.target/arm/pr40457-2.c: Changed to store 3 words.
Index: thumb2.md
===================================================================
--- thumb2.md (revision 163853)
+++ thumb2.md (working copy)
@@ -1257,3 +1257,226 @@ (define_peephole2
"
operands[2] = GEN_INT (32 - INTVAL (operands[2]));
")
+
+(define_insn "*thumb2_ldrd"
+ [(parallel [(set (match_operand:SI 0 "s_register_operand" "")
+ (mem:SI (plus:SI
+ (match_operand:SI 2 "s_register_operand" "rk")
+ (match_operand:SI 3 "const_int_operand" "Py"))))
+ (set (match_operand:SI 1 "s_register_operand" "")
+ (mem:SI (plus:SI (match_dup 2)
+ (match_operand:SI 4 "const_int_operand" "Py"))))])]
+ "TARGET_THUMB2 && thumb2_check_ldrd_operands (operands[0], operands[1],
+ operands[2], operands[3], operands[4], 1)"
+ "*
+ {
+ HOST_WIDE_INT offset1 = INTVAL (operands[3]);
+ HOST_WIDE_INT offset2 = INTVAL (operands[4]);
+ if (offset1 > offset2)
+ {
+ /* Swap the operands so that memory [base+offset1] is loaded into
+ operands[0]. */
+ rtx tmp = operands[0];
+ operands[0] = operands[1];
+ operands[1] = tmp;
+ tmp = operands[3];
+ operands[3] = operands[4];
+ operands[4] = tmp;
+ offset1 = INTVAL (operands[3]);
+ offset2 = INTVAL (operands[4]);
+ }
+ if (thumb2_prefer_ldmstm (operands[0], operands[1],
+ operands[2], operands[3], operands[4], 1))
+ return \"ldmdb\\t%2, {%0, %1}\";
+ else if (fix_cm3_ldrd && (operands[2] == operands[0]))
+ {
+ if (offset1 <= -256)
+ {
+ output_asm_insn (\"sub\\t%2, %2, %n3\", operands);
+ output_asm_insn (\"ldr\\t%1, [%2, #4]\", operands);
+ output_asm_insn (\"ldr\\t%0, [%2]\", operands);
+ }
+ else
+ {
+ output_asm_insn (\"ldr\\t%1, [%2, %4]\", operands);
+ output_asm_insn (\"ldr\\t%0, [%2, %3]\", operands);
+ }
+ return \"\";
+ }
+ else
+ return \"ldrd\\t%0, %1, [%2, %3]\";
+ }"
+)
+
+(define_insn "*thumb2_ldrd_reg1"
+ [(parallel [(set (match_operand:SI 0 "s_register_operand" "")
+ (mem:SI (match_operand:SI 2 "s_register_operand" "rk")))
+ (set (match_operand:SI 1 "s_register_operand" "")
+ (mem:SI (plus:SI (match_dup 2)
+ (match_operand:SI 3 "const_int_operand" "Py"))))])]
+ "TARGET_THUMB2 && thumb2_check_ldrd_operands (operands[0], operands[1],
+ operands[2], 0, operands[3], 1)"
+ "*
+ {
+ HOST_WIDE_INT offset2 = INTVAL (operands[3]);
+ if (offset2 == 4)
+ {
+ if (thumb2_prefer_ldmstm (operands[0], operands[1],
+ operands[2], 0, operands[3], 1))
+ return \"ldmia\\t%2, {%0, %1}\";
+ if (fix_cm3_ldrd && (operands[2] == operands[0]))
+ {
+ output_asm_insn (\"ldr\\t%1, [%2, %3]\", operands);
+ output_asm_insn (\"ldr\\t%0, [%2]\", operands);
+ return \"\";
+ }
+ return \"ldrd\\t%0, %1, [%2]\";
+ }
+ else
+ {
+ if (fix_cm3_ldrd && (operands[2] == operands[1]))
+ {
+ output_asm_insn (\"ldr\\t%0, [%2]\", operands);
+ output_asm_insn (\"ldr\\t%1, [%2, %3]\", operands);
+ }
+ return \"ldrd\\t%1, %0, [%2, %3]\";
+ }
+ }"
+)
+
+(define_insn "*thumb2_ldrd_reg2"
+ [(parallel [(set (match_operand:SI 0 "s_register_operand" "")
+ (mem:SI (plus:SI
+ (match_operand:SI 2 "s_register_operand" "rk")
+ (match_operand:SI 3 "const_int_operand" "Py"))))
+ (set (match_operand:SI 1 "s_register_operand" "")
+ (mem:SI (match_dup 2)))])]
+ "TARGET_THUMB2 && thumb2_check_ldrd_operands (operands[0], operands[1],
+ operands[2], operands[3], 0, 1)"
+ "*
+ {
+ HOST_WIDE_INT offset1 = INTVAL (operands[3]);
+ if (offset1 == -4)
+ {
+ if (fix_cm3_ldrd && (operands[2] == operands[0]))
+ {
+ output_asm_insn (\"ldr\\t%1, [%2]\", operands);
+ output_asm_insn (\"ldr\\t%0, [%2, %3]\", operands);
+ return \"\";
+ }
+ return \"ldrd\\t%0, %1, [%2, %3]\";
+ }
+ else
+ {
+ if (thumb2_prefer_ldmstm (operands[0], operands[1],
+ operands[2], operands[3], 0, 1))
+ return \"ldmia\\t%2, {%1, %0}\";
+ if (fix_cm3_ldrd && (operands[2] == operands[1]))
+ {
+ output_asm_insn (\"ldr\\t%0, [%2, %3]\", operands);
+ output_asm_insn (\"ldr\\t%1, [%2]\", operands);
+ return \"\";
+ }
+ return \"ldrd\\t%1, %0, [%2]\";
+ }
+ }"
+)
+
+(define_peephole2
+ [(set (match_operand:SI 0 "s_register_operand" "")
+ (match_operand:SI 2 "memory_operand" ""))
+ (set (match_operand:SI 1 "s_register_operand" "")
+ (match_operand:SI 3 "memory_operand" ""))]
+ "TARGET_THUMB2 && thumb2_legitimate_ldrd_p (operands[0], operands[1],
+ operands[2], operands[3], 1)"
+ [(parallel [(set (match_operand:SI 0 "s_register_operand" "")
+ (match_operand:SI 2 "memory_operand" ""))
+ (set (match_operand:SI 1 "s_register_operand" "")
+ (match_operand:SI 3 "memory_operand" ""))])]
+ ""
+)
+
+(define_insn "*thumb2_strd"
+ [(parallel [(set (mem:SI
+ (plus:SI (match_operand:SI 2 "s_register_operand" "rk")
+ (match_operand:SI 3 "const_int_operand" "Py")))
+ (match_operand:SI 0 "s_register_operand" ""))
+ (set (mem:SI (plus:SI (match_dup 2)
+ (match_operand:SI 4 "const_int_operand" "Py")))
+ (match_operand:SI 1 "s_register_operand" ""))])]
+ "TARGET_THUMB2 && thumb2_check_ldrd_operands (operands[0], operands[1],
+ operands[2], operands[3], operands[4], 0)"
+ "*
+ {
+ HOST_WIDE_INT offset1 = INTVAL (operands[3]);
+ HOST_WIDE_INT offset2 = INTVAL (operands[4]);
+ if (thumb2_prefer_ldmstm (operands[0], operands[1],
+ operands[2], operands[3], operands[4], 0))
+ return \"stmdb\\t%2, {%0, %1}\";
+ if (offset1 < offset2 )
+ return \"strd\\t%0, %1, [%2, %3]\";
+ else
+ return \"strd\\t%1, %0, [%2, %4]\";
+ }"
+)
+
+(define_insn "*thumb2_strd_reg1"
+ [(parallel [(set (mem:SI (match_operand:SI 2 "s_register_operand" "rk"))
+ (match_operand:SI 0 "s_register_operand" ""))
+ (set (mem:SI (plus:SI (match_dup 2)
+ (match_operand:SI 3 "const_int_operand" "Py")))
+ (match_operand:SI 1 "s_register_operand" ""))])]
+ "TARGET_THUMB2 && thumb2_check_ldrd_operands (operands[0], operands[1],
+ operands[2], 0, operands[3], 0)"
+ "*
+ {
+ HOST_WIDE_INT offset2 = INTVAL (operands[3]);
+ if (offset2 == 4)
+ {
+ if (thumb2_prefer_ldmstm (operands[0], operands[1],
+ operands[2], 0, operands[3], 0))
+ return \"stmia\\t%2, {%0, %1}\";
+ return \"strd\\t%0, %1, [%2]\";
+ }
+ else
+ return \"strd\\t%1, %0, [%2, %3]\";
+ }"
+)
+
+(define_insn "*thumb2_strd_reg2"
+ [(parallel [(set (mem:SI (plus:SI
+ (match_operand:SI 2 "s_register_operand" "rk")
+ (match_operand:SI 3 "const_int_operand" "Py")))
+ (match_operand:SI 0 "s_register_operand" ""))
+ (set (mem:SI (match_dup 2))
+ (match_operand:SI 1 "s_register_operand" ""))])]
+ "TARGET_THUMB2 && thumb2_check_ldrd_operands (operands[0], operands[1],
+ operands[2], operands[3], 0, 0)"
+ "*
+ {
+ HOST_WIDE_INT offset1 = INTVAL (operands[3]);
+ if (offset1 == -4)
+ return \"strd\\t%0, %1, [%2, %3]\";
+ else
+ {
+ if (thumb2_prefer_ldmstm (operands[0], operands[1],
+ operands[2], operands[3], 0, 0))
+ return \"stmia\\t%2, {%1, %0}\";
+ return \"strd\\t%1, %0, [%2]\";
+ }
+ }"
+)
+
+(define_peephole2
+ [(set (match_operand:SI 2 "memory_operand" "")
+ (match_operand:SI 0 "s_register_operand" ""))
+ (set (match_operand:SI 3 "memory_operand" "")
+ (match_operand:SI 1 "s_register_operand" ""))]
+ "TARGET_THUMB2 && thumb2_legitimate_ldrd_p (operands[0], operands[1],
+ operands[2], operands[3], 0)"
+ [(parallel [(set (match_operand:SI 2 "memory_operand" "")
+ (match_operand:SI 0 "s_register_operand" ""))
+ (set (match_operand:SI 3 "memory_operand" "")
+ (match_operand:SI 1 "s_register_operand" ""))])]
+ ""
+)
Index: arm.c
===================================================================
--- arm.c (revision 163853)
+++ arm.c (working copy)
@@ -22976,4 +22976,125 @@ arm_expand_sync (enum machine_mode mode,
}
}
+/* Check the legality of operands in an ldrd/strd instruction. */
+bool
+thumb2_check_ldrd_operands (rtx reg1, rtx reg2, rtx base,
+ rtx off1, rtx off2, bool ldrd)
+{
+ HOST_WIDE_INT offset1 = 0;
+ HOST_WIDE_INT offset2 = 0;
+
+ if (off1 != NULL)
+ offset1 = INTVAL (off1);
+ if (off2 != NULL)
+ offset2 = INTVAL (off2);
+
+ if (ldrd && (reg1 == reg2))
+ return false;
+
+ if ((offset1 + 4) == offset2)
+ return true;
+ if ((offset2 + 4) == offset1)
+ return true;
+
+ return false;
+}
+
+/* Check if the two memory accesses can be merged to an ldrd/strd instruction.
+ That is they use the same base register, and the gap between constant
+ offsets should be 4. */
+bool
+thumb2_legitimate_ldrd_p (rtx reg1, rtx reg2, rtx mem1, rtx mem2, bool ldrd)
+{
+ rtx base1, base2, op1;
+ rtx addr1 = XEXP (mem1, 0);
+ rtx addr2 = XEXP (mem2, 0);
+ HOST_WIDE_INT offset1 = 0;
+ HOST_WIDE_INT offset2 = 0;
+
+ if (MEM_VOLATILE_P (mem1) || MEM_VOLATILE_P (mem2))
+ return false;
+
+ if (REG_P (addr1))
+ base1 = addr1;
+ else if (GET_CODE (addr1) == PLUS)
+ {
+ base1 = XEXP (addr1, 0);
+ op1 = XEXP (addr1, 1);
+ if (!REG_P (base1) || (GET_CODE (op1) != CONST_INT))
+ return false;
+ offset1 = INTVAL (op1);
+ }
+ else
+ return false;
+
+ if (REG_P (addr2))
+ base2 = addr2;
+ else if (GET_CODE (addr2) == PLUS)
+ {
+ base2 = XEXP (addr2, 0);
+ op1 = XEXP (addr2, 1);
+ if (!REG_P (base2) || (GET_CODE (op1) != CONST_INT))
+ return false;
+ offset2 = INTVAL (op1);
+ }
+ else
+ return false;
+
+ if (base1 != base2)
+ return false;
+
+ if ((offset1 > 1024) || (offset1 < -1020) || ((offset1 & 3) != 0))
+ return false;
+ if ((offset2 > 1024) || (offset2 < -1020) || ((offset2 & 3) != 0))
+ return false;
+
+ if (ldrd && ((reg1 == reg2) || (reg1 == base1)))
+ return false;
+
+ if ((offset1 + 4) == offset2)
+ return true;
+ if ((offset2 + 4) == offset1)
+ return true;
+
+ return false;
+}
+
+/* Check if the insn can be expressed as ldm/stm with less cost. */
+bool
+thumb2_prefer_ldmstm (rtx reg1, rtx reg2, rtx base,
+ rtx off1, rtx off2, bool ldrd)
+{
+ HOST_WIDE_INT offset1 = 0;
+ HOST_WIDE_INT offset2 = 0;
+
+ if (off1 != NULL)
+ offset1 = INTVAL (off1);
+ if (off2 != NULL)
+ offset2 = INTVAL (off2);
+
+ if (offset1 > offset2)
+ {
+ rtx tmp;
+ HOST_WIDE_INT t = offset1;
+ offset1 = offset2;
+ offset2 = t;
+ tmp = reg1;
+ reg1 = reg2;
+ reg2 = tmp;
+ }
+
+ /* The offset of ldmdb is -8, the offset of ldmia is 0. */
+ if ((offset1 != -8) && (offset1 != 0))
+ return false;
+
+ /* Lower register corresponds to lower memory. */
+ if (REGNO (reg1) > REGNO (reg2))
+ return false;
+
+ /* Now ldm/stm is possible. Check for special cases ldm/stm has lower
+ cost. */
+ return false;
+}
+
#include "gt-arm.h"
Index: arm-protos.h
===================================================================
--- arm-protos.h (revision 163853)
+++ arm-protos.h (working copy)
@@ -149,7 +149,9 @@ extern void arm_expand_sync (enum machin
extern const char *arm_output_memory_barrier (rtx *);
extern const char *arm_output_sync_insn (rtx, rtx *);
extern unsigned int arm_sync_loop_insns (rtx , rtx *);
-
+extern bool thumb2_check_ldrd_operands (rtx, rtx, rtx, rtx, rtx, bool);
+extern bool thumb2_legitimate_ldrd_p (rtx, rtx, rtx, rtx, bool);
+extern bool thumb2_prefer_ldmstm (rtx, rtx, rtx, rtx, rtx, bool);
extern bool arm_output_addr_const_extra (FILE *, rtx);
#if defined TREE_CODE
Index: ldmstm.md
===================================================================
--- ldmstm.md (revision 163853)
+++ ldmstm.md (working copy)
@@ -852,7 +852,7 @@ (define_insn "*ldm2_ia"
(set (match_operand:SI 2 "arm_hard_register_operand" "")
(mem:SI (plus:SI (match_dup 3)
(const_int 4))))])]
- "TARGET_32BIT && XVECLEN (operands[0], 0) == 2"
+ "TARGET_ARM && XVECLEN (operands[0], 0) == 2"
"ldm%(ia%)\t%3, {%1, %2}"
[(set_attr "type" "load2")
(set_attr "predicable" "yes")])
@@ -901,7 +901,7 @@ (define_insn "*stm2_ia"
(match_operand:SI 1 "arm_hard_register_operand" ""))
(set (mem:SI (plus:SI (match_dup 3) (const_int 4)))
(match_operand:SI 2 "arm_hard_register_operand" ""))])]
- "TARGET_32BIT && XVECLEN (operands[0], 0) == 2"
+ "TARGET_ARM && XVECLEN (operands[0], 0) == 2"
"stm%(ia%)\t%3, {%1, %2}"
[(set_attr "type" "store2")
(set_attr "predicable" "yes")])
@@ -1041,7 +1041,7 @@ (define_insn "*ldm2_db"
(set (match_operand:SI 2 "arm_hard_register_operand" "")
(mem:SI (plus:SI (match_dup 3)
(const_int -4))))])]
- "TARGET_32BIT && XVECLEN (operands[0], 0) == 2"
+ "TARGET_ARM && XVECLEN (operands[0], 0) == 2"
"ldm%(db%)\t%3, {%1, %2}"
[(set_attr "type" "load2")
(set_attr "predicable" "yes")])
@@ -1067,7 +1067,7 @@ (define_insn "*stm2_db"
(match_operand:SI 1 "arm_hard_register_operand" ""))
(set (mem:SI (plus:SI (match_dup 3) (const_int -4)))
(match_operand:SI 2 "arm_hard_register_operand" ""))])]
- "TARGET_32BIT && XVECLEN (operands[0], 0) == 2"
+ "TARGET_ARM && XVECLEN (operands[0], 0) == 2"
"stm%(db%)\t%3, {%1, %2}"
[(set_attr "type" "store2")
(set_attr "predicable" "yes")])
Index: constraints.md
===================================================================
--- constraints.md (revision 163853)
+++ constraints.md (working copy)
@@ -31,7 +31,7 @@
;; The following multi-letter normal constraints have been used:
;; in ARM/Thumb-2 state: Da, Db, Dc, Dn, Dl, DL, Dv, Dy, Di, Dz
;; in Thumb-1 state: Pa, Pb, Pc, Pd
-;; in Thumb-2 state: Ps, Pt, Pu, Pv, Pw, Px
+;; in Thumb-2 state: Ps, Pt, Pu, Pv, Pw, Px, Py
;; The following memory constraints have been used:
;; in ARM/Thumb-2 state: Q, Ut, Uv, Uy, Un, Um, Us
@@ -189,6 +189,13 @@ (define_constraint "Px"
(and (match_code "const_int")
(match_test "TARGET_THUMB2 && ival >= -7 && ival <= -1")))
+(define_constraint "Py"
+ "@internal In Thumb-2 state a constant that is a multiple of 4 in the
+ range -1020 to 1024"
+ (and (match_code "const_int")
+ (match_test "TARGET_THUMB2 && ival >= -1020 && ival <= 1024
+ && (ival & 3) == 0")))
+
(define_constraint "G"
"In ARM/Thumb-2 state a valid FPA immediate constant."
(and (match_code "const_double")
Index: pr40457-1.c
===================================================================
--- pr40457-1.c (revision 163853)
+++ pr40457-1.c (working copy)
@@ -1,9 +1,9 @@
-/* { dg-options "-Os" } */
+/* { dg-options "-O2" } */
/* { dg-do compile } */
int bar(int* p)
{
- int x = p[0] + p[1];
+ int x = p[0] + p[1] + p[2];
return x;
}
Index: pr40457-2.c
===================================================================
--- pr40457-2.c (revision 163853)
+++ pr40457-2.c (working copy)
@@ -5,6 +5,7 @@ void foo(int* p)
{
p[0] = 1;
p[1] = 0;
+ p[2] = 2;
}
/* { dg-final { scan-assembler "stm" } } */
Index: pr45335.c
===================================================================
--- pr45335.c (revision 0)
+++ pr45335.c (revision 0)
@@ -0,0 +1,22 @@
+/* { dg-options "-mthumb -O2" } */
+/* { dg-require-effective-target arm_thumb2_ok } */
+/* { dg-final { scan-assembler "ldrd" } } */
+/* { dg-final { scan-assembler "strd" } } */
+
+struct S
+{
+ void* p1;
+ void* p2;
+ void* p3;
+ void* p4;
+};
+
+extern printf(char*, ...);
+
+void foo1(struct S* fp, struct S* otherSaveArea)
+{
+ struct S* saveA = fp - 1;
+ printf("StackSaveArea for fp %p [%p/%p]:\n", fp, saveA, otherSaveArea);
+ printf("prevFrame=%p savedPc=%p meth=%p curPc=%p fp[0]=0x%08x\n",
+ saveA->p1, saveA->p2, saveA->p3, saveA->p4, *(unsigned int*)fp);
+}