This is the mail archive of the
gcc-patches@gcc.gnu.org
mailing list for the GCC project.
Tweak ARM vld3q and vld4q patterns
- From: Richard Sandiford <richard dot sandiford at linaro dot org>
- To: gcc-patches at gcc dot gnu dot org
- Date: Thu, 24 Mar 2011 15:57:25 +0000
- Subject: Tweak ARM vld3q and vld4q patterns
The ARM vld3q and vld4q .md patterns expand into two individual vld3/vld4
instructions. Each instruction loads half of the total elements.
The problem is that this is implemented as:
array = vld3a (array, mem1)
array = vld3b (array, mem2)
with "array" being an input to the _first_ load as well as the second.
The input to the first load is dead, but it still results in unnecessary loads from the stack.
E.g. for:
#include <arm_neon.h>
void
foo (uint32_t *a, uint32_t *b, uint32_t *c)
{
uint32x4x3_t x, y;
x = vld3q_u32 (a);
y = vld3q_u32 (b);
x.val[0] = vaddq_u32 (x.val[0], y.val[0]);
x.val[1] = vaddq_u32 (x.val[1], y.val[1]);
x.val[2] = vaddq_u32 (x.val[2], y.val[2]);
vst3q_u32 (a, x);
}
we get:
stmfd sp!, {r3, fp}
ldr r2, .L2
add fp, sp, #4
vldmia r2, {d16-d21}
sub sp, sp, #112
vmov q11, q8 @ ti
vmov q12, q9 @ ti
vmov q13, q10 @ ti
...
where the vldmia is loading the x and y "inputs" to the two vld3q_u32s
from the corresponding stack slots.
It's true that vld?a doesn't _change_ the whole of the array (it only
loads half of the elements), but that doesn't matter; we no longer care
what values the other elements have, because the following vld?b load
fills them in.
Tested on arm-linux-gnueabi. OK to install?
Richard
gcc/
* config/arm/neon.md (neon_vld3qa<mode>, neon_vld4qa<mode>): Remove
operand 1 and reshuffle the operands to match.
(neon_vld3<mode>, neon_vld4<mode>): Update accordingly.
Index: gcc/config/arm/neon.md
===================================================================
--- gcc/config/arm/neon.md 2011-03-24 13:47:13.000000000 +0000
+++ gcc/config/arm/neon.md 2011-03-24 15:51:59.000000000 +0000
@@ -4605,8 +4605,7 @@ (define_expand "neon_vld3<mode>"
(unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
"TARGET_NEON"
{
- emit_insn (gen_neon_vld3qa<mode> (operands[0], operands[0],
- operands[1], operands[1]));
+ emit_insn (gen_neon_vld3qa<mode> (operands[0], operands[1], operands[1]));
emit_insn (gen_neon_vld3qb<mode> (operands[0], operands[0],
operands[1], operands[1]));
DONE;
@@ -4614,12 +4613,11 @@ (define_expand "neon_vld3<mode>"
(define_insn "neon_vld3qa<mode>"
[(set (match_operand:CI 0 "s_register_operand" "=w")
- (unspec:CI [(mem:CI (match_operand:SI 3 "s_register_operand" "2"))
- (match_operand:CI 1 "s_register_operand" "0")
+ (unspec:CI [(mem:CI (match_operand:SI 2 "s_register_operand" "1"))
(unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
UNSPEC_VLD3A))
- (set (match_operand:SI 2 "s_register_operand" "=r")
- (plus:SI (match_dup 3)
+ (set (match_operand:SI 1 "s_register_operand" "=r")
+ (plus:SI (match_dup 2)
(const_int 24)))]
"TARGET_NEON"
{
@@ -4628,7 +4626,7 @@ (define_insn "neon_vld3qa<mode>"
ops[0] = gen_rtx_REG (DImode, regno);
ops[1] = gen_rtx_REG (DImode, regno + 4);
ops[2] = gen_rtx_REG (DImode, regno + 8);
- ops[3] = operands[2];
+ ops[3] = operands[1];
output_asm_insn ("vld3.<V_sz_elem>\t{%P0, %P1, %P2}, [%3]!", ops);
return "";
}
@@ -4897,8 +4895,7 @@ (define_expand "neon_vld4<mode>"
(unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
"TARGET_NEON"
{
- emit_insn (gen_neon_vld4qa<mode> (operands[0], operands[0],
- operands[1], operands[1]));
+ emit_insn (gen_neon_vld4qa<mode> (operands[0], operands[1], operands[1]));
emit_insn (gen_neon_vld4qb<mode> (operands[0], operands[0],
operands[1], operands[1]));
DONE;
@@ -4906,12 +4903,11 @@ (define_expand "neon_vld4<mode>"
(define_insn "neon_vld4qa<mode>"
[(set (match_operand:XI 0 "s_register_operand" "=w")
- (unspec:XI [(mem:XI (match_operand:SI 3 "s_register_operand" "2"))
- (match_operand:XI 1 "s_register_operand" "0")
+ (unspec:XI [(mem:XI (match_operand:SI 2 "s_register_operand" "1"))
(unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
UNSPEC_VLD4A))
- (set (match_operand:SI 2 "s_register_operand" "=r")
- (plus:SI (match_dup 3)
+ (set (match_operand:SI 1 "s_register_operand" "=r")
+ (plus:SI (match_dup 2)
(const_int 32)))]
"TARGET_NEON"
{
@@ -4921,7 +4917,7 @@ (define_insn "neon_vld4qa<mode>"
ops[1] = gen_rtx_REG (DImode, regno + 4);
ops[2] = gen_rtx_REG (DImode, regno + 8);
ops[3] = gen_rtx_REG (DImode, regno + 12);
- ops[4] = operands[2];
+ ops[4] = operands[1];
output_asm_insn ("vld4.<V_sz_elem>\t{%P0, %P1, %P2, %P3}, [%4]!", ops);
return "";
}