This is the mail archive of the
gcc-patches@gcc.gnu.org
mailing list for the GCC project.
[AArch64][SVE2] Fix for r277110 (BSL variants)
- From: Yuliang Wang <Yuliang dot Wang at arm dot com>
- To: "gcc-patches at gcc dot gnu dot org" <gcc-patches at gcc dot gnu dot org>
- Cc: nd <nd at arm dot com>, Richard Sandiford <Richard dot Sandiford at arm dot com>
- Date: Thu, 17 Oct 2019 16:17:24 +0000
- Subject: [AArch64][SVE2] Fix for r277110 (BSL variants)
- Arc-authentication-results: i=1; mx.microsoft.com 1; spf=pass smtp.mailfrom=arm.com; dmarc=pass action=none header.from=arm.com; dkim=pass header.d=arm.com; arc=none
- Arc-message-signature: i=1; a=rsa-sha256; c=relaxed/relaxed; d=microsoft.com; s=arcselector9901; h=From:Date:Subject:Message-ID:Content-Type:MIME-Version:X-MS-Exchange-SenderADCheck; bh=PO2Y610aQ/BpzO80xdGvq5qGiFLfa3WAXZ9M1o9vkjw=; b=Bq6rKMz1Jv/5Wa6HucRUIn4QFHM8Cyk6yJ0zQlkctun1tJ664C9IMR90lHYoajvA5ug+WDWE7yEaITK3bgIO6C7AKEFt6bHKkBAonA6+i3CT0fxc8iOQpomC2H8FBKU1yz3eUMFgH4cnSQAheHT/vauvFM3OHsb2QlrCUaXxfNExSs5+dSynhraQtEVDEyTrxvjVcACoEWxsuG7SefxKIQsPyNSSLiRDrV78AAJiY7NZFyBW8YUI1f8dPlqm9JGkMf2phnpdi9FxNLqbpn5GmO11jDr/Bfx4KaE0om17wPTu7up3BNFT9ANtAZf30YdhZr72/akhFZZ2p38DF3GqPg==
- Arc-seal: i=1; a=rsa-sha256; s=arcselector9901; d=microsoft.com; cv=none; b=Noaa1Tc0NDa+/NJNlyqSte8FaUfCzTnEKH9Dj5i8D2TiIgsxMPETYx4VkYU9q1ZNX1uyhw5r0AZxi/lm/nm3wC83iTTv2nMJrXkYtsonLo9+qv3ljVqbez5W67sjSBsqG+LJqmnvBISSeIKu9OJJgn0wSoqiN8YYwQ1DZZ/hjv9wtDoO9lWQg7D2cOmDvACUuGpPVbCZeLA53uo0lWQwCy+0hpkb94KI/4gOCLWy7vTKhAiTLCSYTUTQG+nMssz8JdTDgyjhwosj4PcbB8s5wMOxP5d8YMCtoQutbOv2fv4pKxsHS1cdxBVtutGC5S9RLQqv6JwysSzrUORbBnPH4Q==
- Original-authentication-results: spf=none (sender IP is ) smtp.mailfrom=Yuliang dot Wang at arm dot com;
Hi,
SVE2 vectorization for BSL and NBSL fails when the element type is unsigned 8/16-bit.
The operands are implicitly converted to the corresponding signed types, which the mid-end fold pattern does not take into account. This patch fixes that by extending the pattern to look through such conversions and adding the corresponding type conversion checks.
#define TYPE uint{8,16}_t
void
foo (TYPE *a, TYPE *b, TYPE *c, TYPE *d, int n)
{
for (int i = 0; i < n; i++)
a[i] = OP (b[i], c[i], d[i]);
}
BSL:
// #define OP(x,y,z) (((x) & (z)) | ((y) & ~(z)))
before:  and z1.d, z2.d, z1.d
         bic z0.d, z0.d, z2.d
         orr z0.d, z0.d, z1.d
         ...
after:   bsl z0.d, z0.d, z1.d, z2.d
NBSL:
// #define OP(x,y,z) ~(((x) & (z)) | ((y) & ~(z)))
before:  and z1.d, z2.d, z1.d
         bic z0.d, z0.d, z2.d
         orr z0.d, z0.d, z1.d
         not z0.{b,h}, p1/m, z0.{b,h}
         ...
after:   nbsl z0.d, z0.d, z1.d, z2.d
The GIMPLE output for BSL with an unsigned 16-bit element type shows where the implicit conversions are inserted:
_1 = b[i];
_2 = d[i];
_3 = _1 & _2;
_4 = (signed short) _3;
_5 = c[i];
_6 = (signed short) _5;
_7 = d[i];
_8 = (signed short) _7;
_9 = ~_8;
_10 = _6 & _9;
_11 = _4 | _10;
_12 = (short unsigned int) _11;
a[i] = _12;
In contrast, for 32/64-bit types (regardless of signedness) no conversions are inserted (the dump below is for the NBSL variant, hence the final bitwise NOT):
_1 = b[i];
_2 = d[i];
_3 = _1 & _2;
_4 = c[i];
_5 = d[i];
_6 = ~_5;
_7 = _4 & _6;
_8 = _3 | _7;
_9 = ~_8;
a[i] = _9;
Built and tested on aarch64-none-elf.
Regards,
Yuliang Wang
gcc/ChangeLog:
2019-10-17 Yuliang Wang <yuliang.wang@arm.com>
* match.pd (/* (x & ~m) | (y & m) -> ... */): Modified fold pattern.
* genmatch.c (convert3): New convert operation to support the above.
gcc/testsuite/ChangeLog:
2019-10-17 Yuliang Wang <yuliang.wang@arm.com>
* gcc.target/aarch64/sve2/bitsel_1.c: Add testing for unsigned types.
* gcc.target/aarch64/sve2/bitsel_2.c: As above.
* gcc.target/aarch64/sve2/bitsel_3.c: As above.
* gcc.target/aarch64/sve2/bitsel_4.c: As above.
* gcc.target/aarch64/sve2/eor3_1.c: As above.
diff --git a/gcc/genmatch.c b/gcc/genmatch.c
index 7db1f135840e09e794e2921859fa8e9b76666fa8..ce87ae33e0b3c06f4d1fde8d8e74bf2210ee7a5a 100644
--- a/gcc/genmatch.c
+++ b/gcc/genmatch.c
@@ -227,6 +227,7 @@ enum tree_code {
CONVERT0,
CONVERT1,
CONVERT2,
+CONVERT3,
VIEW_CONVERT0,
VIEW_CONVERT1,
VIEW_CONVERT2,
@@ -1176,6 +1177,7 @@ lower_opt_convert (operand *o)
= { CONVERT0, CONVERT_EXPR,
CONVERT1, CONVERT_EXPR,
CONVERT2, CONVERT_EXPR,
+ CONVERT3, CONVERT_EXPR,
VIEW_CONVERT0, VIEW_CONVERT_EXPR,
VIEW_CONVERT1, VIEW_CONVERT_EXPR,
VIEW_CONVERT2, VIEW_CONVERT_EXPR };
@@ -4145,8 +4147,8 @@ parser::record_operlist (location_t loc, user_id *p)
}
}
-/* Parse the operator ID, special-casing convert?, convert1? and
- convert2? */
+/* Parse the operator ID, special-casing convert?, convert1?, convert2? and
+ convert3? */
id_base *
parser::parse_operation ()
@@ -4167,6 +4169,8 @@ parser::parse_operation ()
;
else if (strcmp (id, "convert2") == 0)
;
+ else if (strcmp (id, "convert3") == 0)
+ ;
else if (strcmp (id, "view_convert") == 0)
id = "view_convert0";
else if (strcmp (id, "view_convert1") == 0)
@@ -4183,6 +4187,7 @@ parser::parse_operation ()
}
else if (strcmp (id, "convert1") == 0
|| strcmp (id, "convert2") == 0
+ || strcmp (id, "convert3") == 0
|| strcmp (id, "view_convert1") == 0
|| strcmp (id, "view_convert2") == 0)
fatal_at (id_tok, "expected '?' after conditional operator");
@@ -4723,9 +4728,9 @@ parser::parse_for (location_t)
id_base *idb = get_operator (oper, true);
if (idb == NULL)
fatal_at (token, "no such operator '%s'", oper);
- if (*idb == CONVERT0 || *idb == CONVERT1 || *idb == CONVERT2
- || *idb == VIEW_CONVERT0 || *idb == VIEW_CONVERT1
- || *idb == VIEW_CONVERT2)
+ if (*idb == CONVERT0 || *idb == VIEW_CONVERT0
+ || *idb == CONVERT1 || *idb == CONVERT2|| *idb == CONVERT3
+ || *idb == VIEW_CONVERT1 || *idb == VIEW_CONVERT2)
fatal_at (token, "conditional operators cannot be used inside for");
if (arity == -1)
@@ -5136,6 +5141,7 @@ main (int argc, char **argv)
add_operator (CONVERT0, "convert0", "tcc_unary", 1);
add_operator (CONVERT1, "convert1", "tcc_unary", 1);
add_operator (CONVERT2, "convert2", "tcc_unary", 1);
+add_operator (CONVERT3, "convert3", "tcc_unary", 1);
add_operator (VIEW_CONVERT0, "view_convert0", "tcc_unary", 1);
add_operator (VIEW_CONVERT1, "view_convert1", "tcc_unary", 1);
add_operator (VIEW_CONVERT2, "view_convert2", "tcc_unary", 1);
diff --git a/gcc/match.pd b/gcc/match.pd
index e3ac06c8ef5b893bd344734095b11047a43f98b8..0aa065c2941dd79477434fd3b6691c9a9b68d20c 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -1461,8 +1461,13 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
/* (x & ~m) | (y & m) -> ((x ^ y) & m) ^ x */
(simplify
- (bit_ior:c (bit_and:cs @0 (bit_not @2)) (bit_and:cs @1 @2))
- (bit_xor (bit_and (bit_xor @0 @1) @2) @0))
+ (bit_ior:c
+ (convert? (bit_and:cs @0 (bit_not (convert2? @2))))
+ (convert1? (bit_and:cs @1 (convert3? @2))))
+ (if (tree_nop_conversion_p (type, TREE_TYPE (@0))
+ && tree_nop_conversion_p (type, TREE_TYPE (@1)))
+ (bit_xor (bit_and
+ (bit_xor (convert @0) (convert @1)) (convert @2)) (convert @0))))
/* Fold A - (A & B) into ~B & A. */
(simplify
diff --git a/gcc/testsuite/gcc.target/aarch64/sve2/bitsel_1.c b/gcc/testsuite/gcc.target/aarch64/sve2/bitsel_1.c
index 5c58ff54231d88a4ebf0a91fe4fac97079c8d992..05431e591887c589a1bc1516f99db39c66c353c4 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve2/bitsel_1.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve2/bitsel_1.c
@@ -7,27 +7,31 @@
#define OP(x,y,z) (((x) & (z)) | ((y) & ~(z)))
#endif
-#define TYPE(N) int##N##_t
-
-#define TEMPLATE(SIZE) \
-void __attribute__ ((noinline, noclone)) \
-f_##SIZE##_##OP \
- (TYPE(SIZE) *restrict a, TYPE(SIZE) *restrict b, \
- TYPE(SIZE) *restrict c, TYPE(SIZE) *restrict d, int n) \
-{ \
- for (int i = 0; i < n; i++) \
- a[i] = OP (b[i], c[i], d[i]); \
+#define TYPE(S,N) S##int##N##_t
+
+#define TEMPLATE(SIGN,SIZE) \
+void __attribute__ ((noinline, noclone)) \
+f_##SIGN##_##SIZE##_##OP \
+ (TYPE(SIGN,SIZE) *restrict a, TYPE(SIGN,SIZE) *restrict b, \
+ TYPE(SIGN,SIZE) *restrict c, TYPE(SIGN,SIZE) *restrict d, int n) \
+{ \
+ for (int i = 0; i < n; i++) \
+ a[i] = OP (b[i], c[i], d[i]); \
}
-TEMPLATE (8);
-TEMPLATE (16);
-TEMPLATE (32);
-TEMPLATE (64);
+TEMPLATE (,8);
+TEMPLATE (,16);
+TEMPLATE (,32);
+TEMPLATE (,64);
+TEMPLATE (u,8);
+TEMPLATE (u,16);
+TEMPLATE (u,32);
+TEMPLATE (u,64);
-/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 4 "vect" } } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 8 "vect" } } */
/* { dg-final { scan-assembler-not {\teor\tz[0-9]+\.[bhsd]} } } */
/* { dg-final { scan-assembler-not {\tand\tz[0-9]+\.[bhsd]} } } */
-/* { dg-final { scan-assembler-times {\tbsl\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 4 } } */
+/* { dg-final { scan-assembler-times {\tbsl\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 8 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve2/bitsel_2.c b/gcc/testsuite/gcc.target/aarch64/sve2/bitsel_2.c
index ac0d27213e84bb5c7f3d236f3cac59c71ac674ed..da6ac527e8c93e25e69a8db368fba79190b65202 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve2/bitsel_2.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve2/bitsel_2.c
@@ -5,11 +5,11 @@
#include "bitsel_1.c"
-/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 4 "vect" } } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 8 "vect" } } */
/* { dg-final { scan-assembler-not {\teor\tz[0-9]+\.[bhsd]} } } */
/* { dg-final { scan-assembler-not {\tand\tz[0-9]+\.[bhsd]} } } */
/* { dg-final { scan-assembler-not {\tnot\tz[0-9]+\.[bhsd]} } } */
-/* { dg-final { scan-assembler-times {\tnbsl\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 4 } } */
+/* { dg-final { scan-assembler-times {\tnbsl\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 8 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve2/bitsel_3.c b/gcc/testsuite/gcc.target/aarch64/sve2/bitsel_3.c
index 93995bb8bade89cd821ed85153d13e96bd4422a5..1036046a8119ef6aa19f7e975c90b2401cc43c0b 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve2/bitsel_3.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve2/bitsel_3.c
@@ -5,10 +5,10 @@
#include "bitsel_1.c"
-/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 4 "vect" } } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 8 "vect" } } */
/* { dg-final { scan-assembler-not {\teor\tz[0-9]+\.[bhsd]} } } */
/* { dg-final { scan-assembler-not {\tbic\tz[0-9]+\.[bhsd]} } } */
-/* { dg-final { scan-assembler-times {\tbsl1n\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 4 } } */
+/* { dg-final { scan-assembler-times {\tbsl1n\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 8 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve2/bitsel_4.c b/gcc/testsuite/gcc.target/aarch64/sve2/bitsel_4.c
index 7ccec619b4d1e8de366c0b0c53879a89a00c2c49..527dcf1a42009f484b2cf3d01e7aeb7448a4d1cc 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve2/bitsel_4.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve2/bitsel_4.c
@@ -5,11 +5,11 @@
#include "bitsel_1.c"
-/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 4 "vect" } } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 8 "vect" } } */
/* { dg-final { scan-assembler-not {\torr\tz[0-9]+\.[bhsd]} } } */
/* { dg-final { scan-assembler-not {\tand\tz[0-9]+\.[bhsd]} } } */
/* { dg-final { scan-assembler-not {\tnot\tz[0-9]+\.[bhsd]} } } */
-/* { dg-final { scan-assembler-times {\tbsl2n\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 4 } } */
+/* { dg-final { scan-assembler-times {\tbsl2n\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 8 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve2/eor3_1.c b/gcc/testsuite/gcc.target/aarch64/sve2/eor3_1.c
index 551802a0c9f007273ddc68cc4ce77defe700d76e..29a023f9be705dcc67f96e0d2b97f8aef3e3ab4d 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve2/eor3_1.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve2/eor3_1.c
@@ -5,9 +5,9 @@
#include "bitsel_1.c"
-/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 4 "vect" } } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 8 "vect" } } */
/* { dg-final { scan-assembler-not {\teor\tz[0-9]+\.[bhsd]} } } */
-/* { dg-final { scan-assembler-times {\teor3\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 4 } } */
+/* { dg-final { scan-assembler-times {\teor3\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 8 } } */