[Patch AArch64] Stop generating BSL for simple integer code
- From: James Greenhalgh <james dot greenhalgh at arm dot com>
- To: <gcc-patches at gcc dot gnu dot org>
- Cc: <nd at arm dot com>, <richard dot earnshaw at arm dot com>, <marcus dot shawcroft at arm dot com>
- Date: Mon, 12 Jun 2017 14:35:52 +0100
- Subject: [Patch AArch64] Stop generating BSL for simple integer code
Hi,
In this testcase, all argument registers and the return register
will be general purpose registers:
long long
foo (long long a, long long b, long long c)
{
return ((a ^ b) & c) ^ b;
}
However, due to the implementation of aarch64_simd_bsl<mode>_internal
we'll match that pattern and emit a BSL, which requires moving all the
arguments and the result between the general purpose and Advanced SIMD
registers:
fmov d2, x0
fmov d0, x2
fmov d1, x1
bsl v0.8b, v2.8b, v1.8b
fmov x0, d0
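What we want instead, as checked by the new no-dimode-bsl.c test below, is
the all-integer sequence:
eor x0, x0, x1
and x0, x0, x2
eor x0, x0, x1
ret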
To fix this, we turn aarch64_simd_bsldi_internal into an insn_and_split that
knows to split back to integer operations if the register allocation
falls that way.
We could have used an unspec, but then we would lose some of the nice
simplifications that can be made from explicitly spelling out the semantics
of BSL.
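For reference, the semantics being spelled out are those of a bitwise
select: each result bit comes from one source where the corresponding mask
bit is set, and from the other where it is clear. A minimal C sketch of the
operation (the function name here is illustrative, not part of the patch):

/* Bitwise select: take the bit from A where MASK is set, from B where
   it is clear.  This is the ((a ^ b) & mask) ^ b form the pattern
   matches: where a mask bit is 1 we get (a ^ b) ^ b == a, and where
   it is 0 we get 0 ^ b == b.  */
long long
bsl_di (long long mask, long long a, long long b)
{
  return ((a ^ b) & mask) ^ b;
}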
Bootstrapped on aarch64-none-linux-gnu.
OK?
Thanks,
James
---
gcc/
2017-06-12 James Greenhalgh <james.greenhalgh@arm.com>
* config/aarch64/aarch64-simd.md
(aarch64_simd_bsl<mode>_internal): Remove DImode.
(*aarch64_simd_bsl<mode>_alt): Likewise.
(aarch64_simd_bsldi_internal): New.
gcc/testsuite/
2017-06-12 James Greenhalgh <james.greenhalgh@arm.com>
* gcc.target/aarch64/no-dimode-bsl.c: New.
* gcc.target/aarch64/dimode-bsl.c: New.
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index c5a86ff..eea4d25 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -2256,13 +2256,13 @@
;; in *aarch64_simd_bsl<mode>_alt.
(define_insn "aarch64_simd_bsl<mode>_internal"
- [(set (match_operand:VSDQ_I_DI 0 "register_operand" "=w,w,w")
- (xor:VSDQ_I_DI
- (and:VSDQ_I_DI
- (xor:VSDQ_I_DI
+ [(set (match_operand:VDQ_I 0 "register_operand" "=w,w,w")
+ (xor:VDQ_I
+ (and:VDQ_I
+ (xor:VDQ_I
(match_operand:<V_cmp_result> 3 "register_operand" "w,0,w")
- (match_operand:VSDQ_I_DI 2 "register_operand" "w,w,0"))
- (match_operand:VSDQ_I_DI 1 "register_operand" "0,w,w"))
+ (match_operand:VDQ_I 2 "register_operand" "w,w,0"))
+ (match_operand:VDQ_I 1 "register_operand" "0,w,w"))
(match_dup:<V_cmp_result> 3)
))]
"TARGET_SIMD"
@@ -2280,14 +2280,14 @@
;; permutations of commutative operations, we have to have a separate pattern.
(define_insn "*aarch64_simd_bsl<mode>_alt"
- [(set (match_operand:VSDQ_I_DI 0 "register_operand" "=w,w,w")
- (xor:VSDQ_I_DI
- (and:VSDQ_I_DI
- (xor:VSDQ_I_DI
- (match_operand:VSDQ_I_DI 3 "register_operand" "w,w,0")
- (match_operand:VSDQ_I_DI 2 "register_operand" "w,0,w"))
- (match_operand:VSDQ_I_DI 1 "register_operand" "0,w,w"))
- (match_dup:VSDQ_I_DI 2)))]
+ [(set (match_operand:VDQ_I 0 "register_operand" "=w,w,w")
+ (xor:VDQ_I
+ (and:VDQ_I
+ (xor:VDQ_I
+ (match_operand:VDQ_I 3 "register_operand" "w,w,0")
+ (match_operand:VDQ_I 2 "register_operand" "w,0,w"))
+ (match_operand:VDQ_I 1 "register_operand" "0,w,w"))
+ (match_dup:VDQ_I 2)))]
"TARGET_SIMD"
"@
bsl\\t%0.<Vbtype>, %3.<Vbtype>, %2.<Vbtype>
@@ -2296,6 +2296,44 @@
[(set_attr "type" "neon_bsl<q>")]
)
+;; DImode is special, we want to avoid computing operations which are
+;; more naturally computed in general purpose registers in the vector
+;; registers. If we do that, we need to move all three operands from general
+;; purpose registers to vector registers, then back again. However, we
+;; don't want to make this pattern an UNSPEC as we'd lose scope for
+;; optimizations based on the component operations of a BSL.
+;;
+;; That means we need a splitter back to the individual operations, if they
+;; would be better calculated on the integer side.
+
+(define_insn_and_split "aarch64_simd_bsldi_internal"
+ [(set (match_operand:DI 0 "register_operand" "=w,w,w,&r")
+ (xor:DI
+ (and:DI
+ (xor:DI
+ (match_operand:DI 3 "register_operand" "w,0,w,r")
+ (match_operand:DI 2 "register_operand" "w,w,0,r"))
+ (match_operand:DI 1 "register_operand" "0,w,w,r"))
+ (match_dup:DI 3)
+ ))]
+ "TARGET_SIMD"
+ "@
+ bsl\\t%0.8b, %2.8b, %3.8b
+ bit\\t%0.8b, %2.8b, %1.8b
+ bif\\t%0.8b, %3.8b, %1.8b
+ #"
+ "&& GP_REGNUM_P (REGNO (operands[0]))"
+ [(match_dup 0) (match_dup 1) (match_dup 2) (match_dup 3)]
+{
+ /* Split back to individual operations. */
+ emit_insn (gen_xordi3 (operands[0], operands[2], operands[3]));
+ emit_insn (gen_anddi3 (operands[0], operands[0], operands[1]));
+ emit_insn (gen_xordi3 (operands[0], operands[0], operands[3]));
+ DONE;
+}
+ [(set_attr "type" "neon_bsl")]
+)
+
(define_expand "aarch64_simd_bsl<mode>"
[(match_operand:VALLDIF 0 "register_operand")
(match_operand:<V_cmp_result> 1 "register_operand")
diff --git a/gcc/testsuite/gcc.target/aarch64/dimode-bsl.c b/gcc/testsuite/gcc.target/aarch64/dimode-bsl.c
new file mode 100644
index 0000000..4e63511
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/dimode-bsl.c
@@ -0,0 +1,13 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+/* Test that we can generate DImode BSL when we are using
+ copysign. */
+
+double
+foo (double a, double b)
+{
+ return __builtin_copysign (a, b);
+}
+
+/* { dg-final { scan-assembler "bsl\tv\[0-9\]" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/no-dimode-bsl.c b/gcc/testsuite/gcc.target/aarch64/no-dimode-bsl.c
new file mode 100644
index 0000000..67dfda0
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/no-dimode-bsl.c
@@ -0,0 +1,30 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+/* Test that we don't combine to BSL when in DImode, avoiding register
+ moves in the general case.
+
+ We want:
+ eor x0, x0, x1
+ and x0, x0, x2
+ eor x0, x0, x1
+ ret
+
+ Rather than:
+ fmov d2, x0
+ fmov d0, x2
+ fmov d1, x1
+ bsl v0.8b, v2.8b, v1.8b
+ fmov x0, d0
+ ret */
+
+long long
+foo (long long a, long long b, long long c)
+{
+ return ((a ^ b) & c) ^ b;
+}
+
+/* { dg-final { scan-assembler-not "bsl\tv\[0-9\]" } } */
+/* { dg-final { scan-assembler-not "bif\tv\[0-9\]" } } */
+/* { dg-final { scan-assembler-not "bit\tv\[0-9\]" } } */
+/* { dg-final { scan-assembler-not "fmov\td\[0-9\]" } } */