gcc.gnu.org Git - gcc.git/commitdiff
aarch64: Don't include vec_select high-half in SIMD add cost
authorJonathan Wright <jonathan.wright@arm.com>
Wed, 28 Jul 2021 14:49:29 +0000 (15:49 +0100)
committerJonathan Wright <jonathan.wright@arm.com>
Thu, 5 Aug 2021 10:51:57 +0000 (11:51 +0100)
The Neon add-long/add-widen instructions can select the top or bottom
half of the operand registers. This selection does not change the
cost of the underlying instruction and this should be reflected by
the RTL cost function.

This patch adds RTL tree traversal in the Neon add cost function to
match vec_select high-half patterns in its operands. This traversal
prevents the cost of the vec_select from being added into the cost of
the add - meaning that these instructions can now be emitted in the
combine pass as they are no longer deemed prohibitively expensive.

gcc/ChangeLog:

2021-07-28  Jonathan Wright  <jonathan.wright@arm.com>

* config/aarch64/aarch64.c: Traverse RTL tree to prevent cost
of vec_select high-half from being added into Neon add cost.

gcc/testsuite/ChangeLog:

* gcc.target/aarch64/vaddX_high_cost.c: New test.

gcc/config/aarch64/aarch64.c
gcc/testsuite/gcc.target/aarch64/vaddX_high_cost.c [new file with mode: 0644]

index e02cbcbcb381d22c7ef9c03c7fd45748434f7692..aa687c579468d45091a05cfc55ebbd873fb86630 100644 (file)
@@ -13161,6 +13161,21 @@ cost_minus:
        op1 = XEXP (x, 1);
 
 cost_plus:
+       if (VECTOR_MODE_P (mode))
+         {
+           /* ADDL2 and ADDW2.  */
+           unsigned int vec_flags = aarch64_classify_vector_mode (mode);
+           if (vec_flags & VEC_ADVSIMD)
+             {
+               /* The select-operand-high-half versions of the add instruction
+                  have the same cost as the regular three vector version -
+                  don't add the costs of the select into the costs of the add.
+                  */
+               op0 = aarch64_strip_extend_vec_half (op0);
+               op1 = aarch64_strip_extend_vec_half (op1);
+             }
+         }
+
        if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
            || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
          {
diff --git a/gcc/testsuite/gcc.target/aarch64/vaddX_high_cost.c b/gcc/testsuite/gcc.target/aarch64/vaddX_high_cost.c
new file mode 100644 (file)
index 0000000..43f28d5
--- /dev/null
@@ -0,0 +1,38 @@
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
+
+#include <arm_neon.h>
+
+#define TEST_ADDL(rettype, intype, ts, rs) \
+  rettype test_vaddl_ ## ts (intype a, intype b, intype c) \
+       { \
+               rettype t0 = vaddl_ ## ts (vget_high_ ## ts (a), \
+                                          vget_high_ ## ts (c)); \
+               rettype t1 = vaddl_ ## ts (vget_high_ ## ts (b), \
+                                          vget_high_ ## ts (c)); \
+               return vaddq ## _ ## rs (t0, t1); \
+       }
+
+TEST_ADDL (int16x8_t, int8x16_t, s8, s16)
+TEST_ADDL (uint16x8_t, uint8x16_t, u8, u16)
+TEST_ADDL (int32x4_t, int16x8_t, s16, s32)
+TEST_ADDL (uint32x4_t, uint16x8_t, u16, u32)
+TEST_ADDL (int64x2_t, int32x4_t, s32, s64)
+TEST_ADDL (uint64x2_t, uint32x4_t, u32, u64)
+
+#define TEST_ADDW(rettype, intype, intypel, ts, rs) \
+  rettype test_vaddw_ ## ts (intype a, intype b, intypel c) \
+       { \
+               rettype t0 = vaddw_ ## ts (a, vget_high_ ## ts (c)); \
+               rettype t1 = vaddw_ ## ts (b, vget_high_ ## ts (c)); \
+               return vaddq ## _ ## rs (t0, t1); \
+       }
+
+TEST_ADDW (int16x8_t, int16x8_t, int8x16_t, s8, s16)
+TEST_ADDW (uint16x8_t, uint16x8_t, uint8x16_t, u8, u16)
+TEST_ADDW (int32x4_t, int32x4_t, int16x8_t, s16, s32)
+TEST_ADDW (uint32x4_t, uint32x4_t, uint16x8_t, u16, u32)
+TEST_ADDW (int64x2_t, int64x2_t, int32x4_t, s32, s64)
+TEST_ADDW (uint64x2_t, uint64x2_t, uint32x4_t, u32, u64)
+
+/* { dg-final { scan-assembler-not "dup\\t" } } */
This page took 0.083636 seconds and 5 git commands to generate.