This is the mail archive of the
gcc-patches@gcc.gnu.org
mailing list for the GCC project.
[PATCH][ARM] Enable auto-vectorization for copysignf
- From: Jiong Wang <jiong dot wang at arm dot com>
- To: "gcc-patches at gcc dot gnu dot org" <gcc-patches at gcc dot gnu dot org>
- Date: Mon, 18 Aug 2014 11:31:41 +0100
- Subject: [PATCH][ARM] Enable auto-vectorization for copysignf
- Authentication-results: sourceware.org; auth=none
This patch enables auto-vectorization for copysignf by using the vector
bit-select instruction on arm32 when NEON is available.
For a simple testcase:
for (i = 0; i < N; i++)
r[i] = __builtin_copysignf (a[i], b[i]);
Assuming a vectorization factor of 4, the generated instruction sequence is:
vmov.i32 q10, #2147483648 @ v4si
.L2:
vld1.64 {d18-d19}, [ip:64]
add r3, r3, #16
add ip, ip, #16
vldr d16, [r3, #-16]
vldr d17, [r3, #-8]
vbif q8, q9, q10
vst1.32 {q8}, [r1]
add r1, r1, #16
cmp r1, lr
bne .L2
OK to install?
Thanks.
gcc/
* config/arm/arm.c (NEON_COPYSIGNF): New enum.
(arm_init_neon_builtins): Support NEON_COPYSIGNF.
(arm_builtin_vectorized_function): Likewise.
* config/arm/arm_neon_builtins.def: New macro for copysignf.
* config/arm/neon.md (neon_copysignf<mode>): New pattern for vector copysignf.
gcc/testsuite/
* gcc.target/arm/vect-copysignf.c: New testcase.
commit 533b209f1899a1070394506ab32cc640de6a58e3
Author: Jiong Wang <jiong.wang@arm.com>
Date: Thu Aug 14 11:54:41 2014 +0100
vect copysignf.
diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c
index 2f8d327..045c56e 100644
--- a/gcc/config/arm/arm.c
+++ b/gcc/config/arm/arm.c
@@ -23243,6 +23243,7 @@ typedef enum {
NEON_SETLANE,
NEON_CREATE,
NEON_RINT,
+ NEON_COPYSIGNF,
NEON_DUP,
NEON_DUPLANE,
NEON_COMBINE,
@@ -24237,6 +24238,22 @@ arm_init_neon_builtins (void)
ftype = build_function_type_list (eltype, eltype, NULL);
break;
}
+ case NEON_COPYSIGNF:
+ {
+ tree eltype = NULL_TREE;
+ switch (insn_data[d->code].operand[1].mode)
+ {
+ case V2SFmode:
+ eltype = V2SF_type_node;
+ break;
+ case V4SFmode:
+ eltype = V4SF_type_node;
+ break;
+ default: gcc_unreachable ();
+ }
+ ftype = build_function_type_list (eltype, eltype, NULL);
+ break;
+ }
default:
gcc_unreachable ();
}
@@ -25440,6 +25457,7 @@ arm_expand_neon_builtin (int fcode, tree exp, rtx target)
return arm_expand_neon_args (target, icode, 1, type_mode, exp, fcode,
NEON_ARG_COPY_TO_REG, NEON_ARG_STOP);
+ case NEON_COPYSIGNF:
case NEON_COMBINE:
case NEON_VTBL:
return arm_expand_neon_args (target, icode, 1, type_mode, exp, fcode,
@@ -29984,27 +30002,34 @@ arm_builtin_vectorized_function (tree fndecl, tree type_out, tree type_in)
return ARM_FIND_VRINT_VARIANT (vrinta);
#undef ARM_CHECK_BUILTIN_MODE
#define ARM_CHECK_BUILTIN_MODE(C, N) \
- (out_mode == N##Imode && out_n == C \
- && in_mode == N##Imode && in_n == C)
+ (out_mode == N##mode && out_n == C \
+ && in_mode == N##mode && in_n == C)
case BUILT_IN_BSWAP16:
- if (ARM_CHECK_BUILTIN_MODE (4, H))
+ if (ARM_CHECK_BUILTIN_MODE (4, HI))
return arm_builtin_decl (ARM_BUILTIN_NEON_bswapv4hi, false);
- else if (ARM_CHECK_BUILTIN_MODE (8, H))
+ else if (ARM_CHECK_BUILTIN_MODE (8, HI))
return arm_builtin_decl (ARM_BUILTIN_NEON_bswapv8hi, false);
else
return NULL_TREE;
case BUILT_IN_BSWAP32:
- if (ARM_CHECK_BUILTIN_MODE (2, S))
+ if (ARM_CHECK_BUILTIN_MODE (2, SI))
return arm_builtin_decl (ARM_BUILTIN_NEON_bswapv2si, false);
- else if (ARM_CHECK_BUILTIN_MODE (4, S))
+ else if (ARM_CHECK_BUILTIN_MODE (4, SI))
return arm_builtin_decl (ARM_BUILTIN_NEON_bswapv4si, false);
else
return NULL_TREE;
case BUILT_IN_BSWAP64:
- if (ARM_CHECK_BUILTIN_MODE (2, D))
+ if (ARM_CHECK_BUILTIN_MODE (2, DI))
return arm_builtin_decl (ARM_BUILTIN_NEON_bswapv2di, false);
else
return NULL_TREE;
+ case BUILT_IN_COPYSIGNF:
+ if (ARM_CHECK_BUILTIN_MODE (2, SF))
+ return arm_builtin_decl (ARM_BUILTIN_NEON_copysignfv2sf, false);
+ else if (ARM_CHECK_BUILTIN_MODE (4, SF))
+ return arm_builtin_decl (ARM_BUILTIN_NEON_copysignfv4sf, false);
+ else
+ return NULL_TREE;
default:
return NULL_TREE;
diff --git a/gcc/config/arm/arm_neon_builtins.def b/gcc/config/arm/arm_neon_builtins.def
index f4531f3..a2e1301 100644
--- a/gcc/config/arm/arm_neon_builtins.def
+++ b/gcc/config/arm/arm_neon_builtins.def
@@ -135,6 +135,7 @@ VAR1 (FLOAT_WIDEN, vcvtv4sf, v4hf),
VAR1 (FLOAT_NARROW, vcvtv4hf, v4sf),
VAR10 (SELECT, vbsl,
v8qi, v4hi, v2si, v2sf, di, v16qi, v8hi, v4si, v4sf, v2di),
+VAR2 (COPYSIGNF, copysignf, v2sf, v4sf),
VAR2 (RINT, vrintn, v2sf, v4sf),
VAR2 (RINT, vrinta, v2sf, v4sf),
VAR2 (RINT, vrintp, v2sf, v4sf),
diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
index dc364ee..7a21a42 100644
--- a/gcc/config/arm/neon.md
+++ b/gcc/config/arm/neon.md
@@ -2569,6 +2569,33 @@
DONE;
})
+(define_expand "neon_copysignf<mode>"
+ [(match_operand:VCVTF 0 "register_operand")
+ (match_operand:VCVTF 1 "register_operand")
+ (match_operand:VCVTF 2 "register_operand")]
+ "TARGET_NEON"
+ "{
+ rtx v_bitmask_cast;
+ rtx v_bitmask = gen_reg_rtx (<VCVTF:V_cmp_result>mode);
+ int i, n_elt = GET_MODE_NUNITS (<MODE>mode);
+ rtvec v = rtvec_alloc (n_elt);
+
+ /* Create bitmask for vector select. */
+ for (i = 0; i < n_elt; ++i)
+ RTVEC_ELT (v, i) = GEN_INT (0x80000000);
+
+ emit_move_insn (v_bitmask,
+ gen_rtx_CONST_VECTOR (<VCVTF:V_cmp_result>mode, v));
+ emit_move_insn (operands[0], operands[2]);
+ v_bitmask_cast = simplify_gen_subreg (<MODE>mode, v_bitmask,
+ <VCVTF:V_cmp_result>mode, 0);
+ emit_insn (gen_neon_vbsl<mode> (operands[0], v_bitmask_cast, operands[0],
+ operands[1]));
+
+ DONE;
+ }"
+)
+
(define_insn "neon_vqneg<mode>"
[(set (match_operand:VDQIW 0 "s_register_operand" "=w")
(unspec:VDQIW [(match_operand:VDQIW 1 "s_register_operand" "w")
diff --git a/gcc/testsuite/gcc.target/arm/vect-copysignf.c b/gcc/testsuite/gcc.target/arm/vect-copysignf.c
new file mode 100644
index 0000000..42f5560
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/vect-copysignf.c
@@ -0,0 +1,36 @@
+/* { dg-do run } */
+/* { dg-require-effective-target arm_neon_hw } */
+/* { dg-options "-O2 -ftree-vectorize -fdump-tree-vect-details" } */
+/* { dg-add-options "arm_neon" } */
+
+extern void abort ();
+
+#define N 16
+float a[N] = {-0.1f, -3.2f, -6.3f, -9.4f,
+ -12.5f, -15.6f, -18.7f, -21.8f,
+ 24.9f, 27.1f, 30.2f, 33.3f,
+ 36.4f, 39.5f, 42.6f, 45.7f};
+float b[N] = {-1.2f, 3.4f, -5.6f, 7.8f,
+ -9.0f, 1.0f, -2.0f, 3.0f,
+ -4.0f, -5.0f, 6.0f, 7.0f,
+ -8.0f, -9.0f, 10.0f, 11.0f};
+float r[N];
+
+int
+main (void)
+{
+ int i;
+
+ for (i = 0; i < N; i++)
+ r[i] = __builtin_copysignf (a[i], b[i]);
+
+ /* check results: */
+ for (i = 0; i < N; i++)
+ if (r[i] != __builtin_copysignf (a[i], b[i]))
+ abort ();
+
+ return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */