This is the mail archive of the
gcc-patches@gcc.gnu.org
mailing list for the GCC project.
Re: [ARM] Use vector wide add for mixed-mode adds
- From: Michael Collison <michael dot collison at linaro dot org>
- To: gcc Patches <gcc-patches at gcc dot gnu dot org>, Ramana Radhakrishnan <Ramana dot Radhakrishnan at arm dot com>, Kyrill Tkachov <kyrylo dot tkachov at arm dot com>
- Date: Sun, 29 Nov 2015 18:18:04 -0700
- Subject: Re: [ARM] Use vector wide add for mixed-mode adds
- Authentication-results: sourceware.org; auth=none
This is a modified version of my previous patch that supports vector
wide add. I added support for vaddw on big endian when generating the
parallel operand for the vector select.
There are four failing test cases on arm big endian with similar code.
They are:
gcc.dg/vect/vect-outer-4f.c -flto -ffat-lto-objects execution test
gcc.dg/vect/vect-outer-4g.c -flto -ffat-lto-objects execution test
gcc.dg/vect/vect-outer-4k.c -flto -ffat-lto-objects execution test
gcc.dg/vect/vect-outer-4l.c -flto -ffat-lto-objects execution test
The failures occur without my patch and are related to a bug with vector
loads using VUZP operations.
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=68532
Validated on arm-none-eabi, arm-none-linux-gnueabi,
arm-none-linux-gnueabihf, and armeb-none-linux-gnueabihf.
2015-11-29 Michael Collison <michael.collison@linaro.org>
* config/arm/neon.md (widen_<us>sum<mode>): New patterns where
mode is VQI to improve mixed mode vectorization.
* config/arm/neon.md (vec_sel_widen_ssum_lo<VQI:mode><VW:mode>3): New
define_insn to match low half of signed vaddw.
* config/arm/neon.md (vec_sel_widen_ssum_hi<VQI:mode><VW:mode>3): New
define_insn to match high half of signed vaddw.
* config/arm/neon.md (vec_sel_widen_usum_lo<VQI:mode><VW:mode>3): New
define_insn to match low half of unsigned vaddw.
* config/arm/neon.md (vec_sel_widen_usum_hi<VQI:mode><VW:mode>3): New
define_insn to match high half of unsigned vaddw.
* config/arm/arm.c (aarch32_simd_vect_par_cnst_half): New function.
(aarch32_simd_check_vect_par_cnst_half): Likewise.
* config/arm/arm-protos.h (aarch32_simd_vect_par_cnst_half): Prototype
for new function.
(aarch32_simd_check_vect_par_cnst_half): Likewise.
* config/arm/predicates.md (vect_par_constant_high): Support
big endian and simplify by calling
aarch32_simd_check_vect_par_cnst_half
(vect_par_constant_low): Likewise.
* testsuite/gcc.target/arm/neon-vaddws16.c: New test.
* testsuite/gcc.target/arm/neon-vaddws32.c: New test.
* testsuite/gcc.target/arm/neon-vaddwu16.c: New test.
* testsuite/gcc.target/arm/neon-vaddwu32.c: New test.
* testsuite/gcc.target/arm/neon-vaddwu8.c: New test.
* testsuite/lib/target-supports.exp
(check_effective_target_vect_widen_sum_hi_to_si_pattern): Indicate
that arm neon support vector widen sum of HImode TO SImode.
Okay for trunk?
--
Michael Collison
Linaro Toolchain Working Group
michael.collison@linaro.org
diff --git a/gcc/config/arm/arm-protos.h b/gcc/config/arm/arm-protos.h
index f9b1276..26fe370 100644
--- a/gcc/config/arm/arm-protos.h
+++ b/gcc/config/arm/arm-protos.h
@@ -50,7 +50,9 @@ extern tree arm_builtin_decl (unsigned code, bool initialize_p
ATTRIBUTE_UNUSED);
extern void arm_init_builtins (void);
extern void arm_atomic_assign_expand_fenv (tree *hold, tree *clear, tree *update);
-
+extern rtx aarch32_simd_vect_par_cnst_half (machine_mode mode, bool high);
+extern bool aarch32_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
+ bool high);
#ifdef RTX_CODE
extern bool arm_vector_mode_supported_p (machine_mode);
extern bool arm_small_register_classes_for_mode_p (machine_mode);
diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c
index 61e2aa2..158c2e8 100644
--- a/gcc/config/arm/arm.c
+++ b/gcc/config/arm/arm.c
@@ -30111,4 +30111,80 @@ arm_sched_fusion_priority (rtx_insn *insn, int max_pri,
*pri = tmp;
return;
}
+
+/* Construct and return a PARALLEL RTX vector with elements numbering the
+ lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
+ the vector - from the perspective of the architecture. This does not
+ line up with GCC's perspective on lane numbers, so we end up with
+ different masks depending on our target endian-ness. The diagram
+ below may help. We must draw the distinction when building masks
+ which select one half of the vector. An instruction selecting
+ architectural low-lanes for a big-endian target, must be described using
+ a mask selecting GCC high-lanes.
+
+ Big-Endian Little-Endian
+
+GCC 0 1 2 3 3 2 1 0
+ | x | x | x | x | | x | x | x | x |
+Architecture 3 2 1 0 3 2 1 0
+
+Low Mask: { 2, 3 } { 0, 1 }
+High Mask: { 0, 1 } { 2, 3 }
+*/
+
+rtx
+aarch32_simd_vect_par_cnst_half (machine_mode mode, bool high)
+{
+ int nunits = GET_MODE_NUNITS (mode);
+ rtvec v = rtvec_alloc (nunits / 2);
+ int high_base = nunits / 2;
+ int low_base = 0;
+ int base;
+ rtx t1;
+ int i;
+
+ if (BYTES_BIG_ENDIAN)
+ base = high ? low_base : high_base;
+ else
+ base = high ? high_base : low_base;
+
+ for (i = 0; i < nunits / 2; i++)
+ RTVEC_ELT (v, i) = GEN_INT (base + i);
+
+ t1 = gen_rtx_PARALLEL (mode, v);
+ return t1;
+}
+
+/* Check OP for validity as a PARALLEL RTX vector with elements
+ numbering the lanes of either the high (HIGH == TRUE) or low lanes,
+ from the perspective of the architecture. See the diagram above
+ aarch64_simd_vect_par_cnst_half for more details. */
+
+bool
+aarch32_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
+ bool high)
+{
+ rtx ideal = aarch32_simd_vect_par_cnst_half (mode, high);
+ HOST_WIDE_INT count_op = XVECLEN (op, 0);
+ HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
+ int i = 0;
+
+ if (!VECTOR_MODE_P (mode))
+ return false;
+
+ if (count_op != count_ideal)
+ return false;
+
+ for (i = 0; i < count_ideal; i++)
+ {
+ rtx elt_op = XVECEXP (op, 0, i);
+ rtx elt_ideal = XVECEXP (ideal, 0, i);
+
+ if (!CONST_INT_P (elt_op)
+ || INTVAL (elt_ideal) != INTVAL (elt_op))
+ return false;
+ }
+ return true;
+}
+
#include "gt-arm.h"
diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
index e5a2b0f..8d311b7 100644
--- a/gcc/config/arm/neon.md
+++ b/gcc/config/arm/neon.md
@@ -1174,6 +1174,51 @@
;; Widening operations
+(define_expand "widen_ssum<mode>3"
+ [(set (match_operand:<V_double_width> 0 "s_register_operand" "")
+ (plus:<V_double_width> (sign_extend:<V_double_width> (match_operand:VQI 1 "s_register_operand" ""))
+ (match_operand:<V_double_width> 2 "s_register_operand" "")))]
+ "TARGET_NEON"
+ {
+ machine_mode mode = GET_MODE (operands[1]);
+ rtx p1, p2;
+
+ p1 = aarch32_simd_vect_par_cnst_half (mode, false);
+ p2 = aarch32_simd_vect_par_cnst_half (mode, true);
+
+ if (operands[0] != operands[2])
+ emit_move_insn (operands[0], operands[2]);
+
+ emit_insn (gen_vec_sel_widen_ssum_lo<mode><V_half>3 (operands[0], operands[1], p1, operands[0]));
+ emit_insn (gen_vec_sel_widen_ssum_hi<mode><V_half>3 (operands[0], operands[1], p2, operands[0]));
+ DONE;
+ }
+)
+
+(define_insn "vec_sel_widen_ssum_lo<VQI:mode><VW:mode>3"
+ [(set (match_operand:<VW:V_widen> 0 "s_register_operand" "=w")
+ (plus:<VW:V_widen> (sign_extend:<VW:V_widen> (vec_select:VW (match_operand:VQI 1 "s_register_operand" "%w")
+ (match_operand:VQI 2 "vect_par_constant_low" "")))
+ (match_operand:<VW:V_widen> 3 "s_register_operand" "0")))]
+ "TARGET_NEON"
+{
+ return BYTES_BIG_ENDIAN ? "vaddw.<V_s_elem>\t%q0, %q3, %f1" :
+ "vaddw.<V_s_elem>\t%q0, %q3, %e1";
+}
+ [(set_attr "type" "neon_add_widen")])
+
+(define_insn "vec_sel_widen_ssum_hi<VQI:mode><VW:mode>3"
+ [(set (match_operand:<VW:V_widen> 0 "s_register_operand" "=w")
+ (plus:<VW:V_widen> (sign_extend:<VW:V_widen> (vec_select:VW (match_operand:VQI 1 "s_register_operand" "%w")
+ (match_operand:VQI 2 "vect_par_constant_high" "")))
+ (match_operand:<VW:V_widen> 3 "s_register_operand" "0")))]
+ "TARGET_NEON"
+{
+ return BYTES_BIG_ENDIAN ? "vaddw.<V_s_elem>\t%q0, %q3, %e1" :
+ "vaddw.<V_s_elem>\t%q0, %q3, %f1";
+}
+ [(set_attr "type" "neon_add_widen")])
+
(define_insn "widen_ssum<mode>3"
[(set (match_operand:<V_widen> 0 "s_register_operand" "=w")
(plus:<V_widen> (sign_extend:<V_widen>
@@ -1184,6 +1229,51 @@
[(set_attr "type" "neon_add_widen")]
)
+(define_expand "widen_usum<mode>3"
+ [(set (match_operand:<V_double_width> 0 "s_register_operand" "")
+ (plus:<V_double_width> (zero_extend:<V_double_width> (match_operand:VQI 1 "s_register_operand" ""))
+ (match_operand:<V_double_width> 2 "s_register_operand" "")))]
+ "TARGET_NEON"
+ {
+ machine_mode mode = GET_MODE (operands[1]);
+ rtx p1, p2;
+
+ p1 = aarch32_simd_vect_par_cnst_half (mode, false);
+ p2 = aarch32_simd_vect_par_cnst_half (mode, true);
+
+ if (operands[0] != operands[2])
+ emit_move_insn (operands[0], operands[2]);
+
+ emit_insn (gen_vec_sel_widen_usum_lo<mode><V_half>3 (operands[0], operands[1], p1, operands[0]));
+ emit_insn (gen_vec_sel_widen_usum_hi<mode><V_half>3 (operands[0], operands[1], p2, operands[0]));
+ DONE;
+ }
+)
+
+(define_insn "vec_sel_widen_usum_lo<VQI:mode><VW:mode>3"
+ [(set (match_operand:<VW:V_widen> 0 "s_register_operand" "=w")
+ (plus:<VW:V_widen> (zero_extend:<VW:V_widen> (vec_select:VW (match_operand:VQI 1 "s_register_operand" "%w")
+ (match_operand:VQI 2 "vect_par_constant_low" "")))
+ (match_operand:<VW:V_widen> 3 "s_register_operand" "0")))]
+ "TARGET_NEON"
+{
+ return BYTES_BIG_ENDIAN ? "vaddw.<V_u_elem>\t%q0, %q3, %f1" :
+ "vaddw.<V_u_elem>\t%q0, %q3, %e1";
+}
+ [(set_attr "type" "neon_add_widen")])
+
+(define_insn "vec_sel_widen_usum_hi<VQI:mode><VW:mode>3"
+ [(set (match_operand:<VW:V_widen> 0 "s_register_operand" "=w")
+ (plus:<VW:V_widen> (zero_extend:<VW:V_widen> (vec_select:VW (match_operand:VQI 1 "s_register_operand" "%w")
+ (match_operand:VQI 2 "vect_par_constant_high" "")))
+ (match_operand:<VW:V_widen> 3 "s_register_operand" "0")))]
+ "TARGET_NEON"
+{
+ return BYTES_BIG_ENDIAN ? "vaddw.<V_u_elem>\t%q0, %q3, %e1" :
+ "vaddw.<V_u_elem>\t%q0, %q3, %f1";
+}
+ [(set_attr "type" "neon_add_widen")])
+
(define_insn "widen_usum<mode>3"
[(set (match_operand:<V_widen> 0 "s_register_operand" "=w")
(plus:<V_widen> (zero_extend:<V_widen>
@@ -5331,7 +5421,7 @@ if (BYTES_BIG_ENDIAN)
[(set (match_operand:<V_unpack> 0 "register_operand" "=w")
(mult:<V_unpack> (SE:<V_unpack> (vec_select:<V_HALF>
(match_operand:VU 1 "register_operand" "w")
- (match_operand:VU 2 "vect_par_constant_low" "")))
+ (match_operand:VU 2 "vect_par_constant_low" "")))
(SE:<V_unpack> (vec_select:<V_HALF>
(match_operand:VU 3 "register_operand" "w")
(match_dup 2)))))]
diff --git a/gcc/config/arm/predicates.md b/gcc/config/arm/predicates.md
index 48e4ba8..e1c24bd 100644
--- a/gcc/config/arm/predicates.md
+++ b/gcc/config/arm/predicates.md
@@ -605,59 +605,13 @@
(define_special_predicate "vect_par_constant_high"
(match_code "parallel")
{
- HOST_WIDE_INT count = XVECLEN (op, 0);
- int i;
- int base = GET_MODE_NUNITS (mode);
-
- if ((count < 1)
- || (count != base/2))
- return false;
-
- if (!VECTOR_MODE_P (mode))
- return false;
-
- for (i = 0; i < count; i++)
- {
- rtx elt = XVECEXP (op, 0, i);
- int val;
-
- if (!CONST_INT_P (elt))
- return false;
-
- val = INTVAL (elt);
- if (val != (base/2) + i)
- return false;
- }
- return true;
+ return aarch32_simd_check_vect_par_cnst_half (op, mode, true);
})
(define_special_predicate "vect_par_constant_low"
(match_code "parallel")
{
- HOST_WIDE_INT count = XVECLEN (op, 0);
- int i;
- int base = GET_MODE_NUNITS (mode);
-
- if ((count < 1)
- || (count != base/2))
- return false;
-
- if (!VECTOR_MODE_P (mode))
- return false;
-
- for (i = 0; i < count; i++)
- {
- rtx elt = XVECEXP (op, 0, i);
- int val;
-
- if (!CONST_INT_P (elt))
- return false;
-
- val = INTVAL (elt);
- if (val != i)
- return false;
- }
- return true;
+ return aarch32_simd_check_vect_par_cnst_half (op, mode, false);
})
(define_predicate "const_double_vcvt_power_of_two_reciprocal"
diff --git a/gcc/testsuite/gcc.target/arm/neon-vaddws16.c b/gcc/testsuite/gcc.target/arm/neon-vaddws16.c
new file mode 100644
index 0000000..96c657e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/neon-vaddws16.c
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_neon_hw } */
+/* { dg-add-options arm_neon_ok } */
+/* { dg-options "-O3" } */
+
+
+int
+t6(int len, void * dummy, short * __restrict x)
+{
+ len = len & ~31;
+ int result = 0;
+ __asm volatile ("");
+ for (int i = 0; i < len; i++)
+ result += x[i];
+ return result;
+}
+
+/* { dg-final { scan-assembler "vaddw\.s16" } } */
diff --git a/gcc/testsuite/gcc.target/arm/neon-vaddws32.c b/gcc/testsuite/gcc.target/arm/neon-vaddws32.c
new file mode 100644
index 0000000..1bfdc13
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/neon-vaddws32.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_neon_hw } */
+/* { dg-add-options arm_neon_ok } */
+/* { dg-options "-O3" } */
+
+int
+t6(int len, void * dummy, int * __restrict x)
+{
+ len = len & ~31;
+ long long result = 0;
+ __asm volatile ("");
+ for (int i = 0; i < len; i++)
+ result += x[i];
+ return result;
+}
+
+/* { dg-final { scan-assembler "vaddw\.s32" } } */
diff --git a/gcc/testsuite/gcc.target/arm/neon-vaddwu16.c b/gcc/testsuite/gcc.target/arm/neon-vaddwu16.c
new file mode 100644
index 0000000..98f8768
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/neon-vaddwu16.c
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_neon_hw } */
+/* { dg-add-options arm_neon_ok } */
+/* { dg-options "-O3" } */
+
+
+int
+t6(int len, void * dummy, unsigned short * __restrict x)
+{
+ len = len & ~31;
+ unsigned int result = 0;
+ __asm volatile ("");
+ for (int i = 0; i < len; i++)
+ result += x[i];
+ return result;
+}
+
+/* { dg-final { scan-assembler "vaddw.u16" } } */
diff --git a/gcc/testsuite/gcc.target/arm/neon-vaddwu32.c b/gcc/testsuite/gcc.target/arm/neon-vaddwu32.c
new file mode 100644
index 0000000..4a72a39
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/neon-vaddwu32.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_neon_hw } */
+/* { dg-add-options arm_neon_ok } */
+/* { dg-options "-O3" } */
+
+int
+t6(int len, void * dummy, unsigned int * __restrict x)
+{
+ len = len & ~31;
+ unsigned long long result = 0;
+ __asm volatile ("");
+ for (int i = 0; i < len; i++)
+ result += x[i];
+ return result;
+}
+
+/* { dg-final { scan-assembler "vaddw\.u32" } } */
diff --git a/gcc/testsuite/gcc.target/arm/neon-vaddwu8.c b/gcc/testsuite/gcc.target/arm/neon-vaddwu8.c
new file mode 100644
index 0000000..9c9c68a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/neon-vaddwu8.c
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_neon_hw } */
+/* { dg-add-options arm_neon_ok } */
+/* { dg-options "-O3" } */
+
+
+int
+t6(int len, void * dummy, char * __restrict x)
+{
+ len = len & ~31;
+ unsigned short result = 0;
+ __asm volatile ("");
+ for (int i = 0; i < len; i++)
+ result += x[i];
+ return result;
+}
+
+/* { dg-final { scan-assembler "vaddw\.u8" } } */
diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp
index b543519..4deca1f 100644
--- a/gcc/testsuite/lib/target-supports.exp
+++ b/gcc/testsuite/lib/target-supports.exp
@@ -3943,6 +3943,7 @@ proc check_effective_target_vect_widen_sum_hi_to_si_pattern { } {
} else {
set et_vect_widen_sum_hi_to_si_pattern_saved 0
if { [istarget powerpc*-*-*]
+ || [check_effective_target_arm_neon_ok]
|| [istarget ia64-*-*] } {
set et_vect_widen_sum_hi_to_si_pattern_saved 1
}
--
1.9.1