[PATCH] AArch64: Add NEON, SVE and SVE2 RTL patterns for Multiply, FMS and FMA.
Tamar Christina
tamar.christina@arm.com
Fri Jan 15 15:30:19 GMT 2021
Hi All,
This adds implementations of the optabs for complex operations. With this, the
following C code:
#include <complex.h>

/* N deduced from the loop bounds in the assembly below.  */
#define N 200

void g (float complex a[restrict N], float complex b[restrict N],
        float complex c[restrict N])
{
  for (int i = 0; i < N; i++)
    c[i] = a[i] * b[i];
}
generates
NEON:
g:
        movi    v3.4s, 0
        mov     x3, 0
        .p2align 3,,7
.L2:
        mov     v0.16b, v3.16b
        ldr     q2, [x1, x3]
        ldr     q1, [x0, x3]
        fcmla   v0.4s, v1.4s, v2.4s, #0
        fcmla   v0.4s, v1.4s, v2.4s, #90
        str     q0, [x2, x3]
        add     x3, x3, 16
        cmp     x3, 1600
        bne     .L2
        ret
SVE:
g:
        mov     x3, 0
        mov     x4, 400
        ptrue   p1.b, all
        whilelo p0.s, xzr, x4
        mov     z3.s, #0
        .p2align 3,,7
.L2:
        ld1w    z1.s, p0/z, [x0, x3, lsl 2]
        ld1w    z2.s, p0/z, [x1, x3, lsl 2]
        movprfx z0, z3
        fcmla   z0.s, p1/m, z1.s, z2.s, #0
        fcmla   z0.s, p1/m, z1.s, z2.s, #90
        st1w    z0.s, p0, [x2, x3, lsl 2]
        incw    x3
        whilelo p0.s, x3, x4
        b.any   .L2
        ret
SVE2 (with int instead of float):
g:
        mov     x3, 0
        mov     x4, 400
        mov     z3.b, #0
        whilelo p0.s, xzr, x4
        .p2align 3,,7
.L2:
        ld1w    z1.s, p0/z, [x0, x3, lsl 2]
        ld1w    z2.s, p0/z, [x1, x3, lsl 2]
        movprfx z0, z3
        cmla    z0.s, z1.s, z2.s, #0
        cmla    z0.s, z1.s, z2.s, #90
        st1w    z0.s, p0, [x2, x3, lsl 2]
        incw    x3
        whilelo p0.s, x3, x4
        b.any   .L2
        ret
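
To see why two fcmla (or cmla) instructions with rotations #0 and #90 add up
to one complex multiplication, here is a minimal scalar model of the two
rotations (the fcmla_rot0/fcmla_rot90 helpers are illustrative, not ACLE
intrinsics):

#include <stdio.h>

typedef struct { float re, im; } cplx;

/* FCMLA #0: acc.re += n.re * m.re;  acc.im += n.re * m.im.  */
static cplx fcmla_rot0 (cplx acc, cplx n, cplx m)
{
  acc.re += n.re * m.re;
  acc.im += n.re * m.im;
  return acc;
}

/* FCMLA #90: acc.re -= n.im * m.im;  acc.im += n.im * m.re.  */
static cplx fcmla_rot90 (cplx acc, cplx n, cplx m)
{
  acc.re -= n.im * m.im;
  acc.im += n.im * m.re;
  return acc;
}

int main (void)
{
  cplx a = { 1.0f, 2.0f }, b = { 3.0f, 4.0f }, acc = { 0.0f, 0.0f };
  /* The two rotations together accumulate the full complex product:
     (1 + 2i) * (3 + 4i) = -5 + 10i.  */
  acc = fcmla_rot90 (fcmla_rot0 (acc, a, b), a, b);
  printf ("%.1f%+.1fi\n", acc.re, acc.im);   /* prints -5.0+10.0i */
  return 0;
}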
The patch also defines a new iterator, VALL_ARITH, which contains the types
for which we can do general arithmetic (it excludes bfloat16).
Bootstrapped and regtested on aarch64-none-linux-gnu with no issues.
Checked with armv8-a+sve2+fp16 and no issues. Note that due to a mid-end
limitation, SLP for SVE currently fails for some permutes; the tests have
these marked as XFAIL. I intend to fix this soon.
Execution tests verified with QEMU.
Matching tests for these are in the mid-end patches; I will turn them on for
these patterns in a separate patch.
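
For reference, the FMA, FMS and conjugate forms that the cml*/cmul*_conj
patterns target look roughly as follows (a sketch under the same assumptions
as the example above; the function names are illustrative and the exact loop
shapes the vectorizer recognises are defined by the mid-end patches):

#include <complex.h>

#define N 200

/* Complex FMA: maps to the cmla* optabs (fcmac1 == "a").  */
void fma_loop (float complex a[restrict N], float complex b[restrict N],
               float complex c[restrict N])
{
  for (int i = 0; i < N; i++)
    c[i] += a[i] * b[i];
}

/* Complex FMS: maps to the cmls* optabs (fcmac1 == "s").  */
void fms_loop (float complex a[restrict N], float complex b[restrict N],
               float complex c[restrict N])
{
  for (int i = 0; i < N; i++)
    c[i] -= a[i] * b[i];
}

/* Conjugate multiply: maps to the cmul_conj* optabs (shown here with the
   conjugate taken of the second operand).  */
void mul_conj_loop (float complex a[restrict N], float complex b[restrict N],
                    float complex c[restrict N])
{
  for (int i = 0; i < N; i++)
    c[i] = a[i] * conjf (b[i]);
}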
Ok for master?
Thanks,
Tamar
gcc/ChangeLog:
	* config/aarch64/aarch64-simd.md (cml<fcmac1><rot_op><mode>4,
	cmul<rot_op><mode>3): New.
	* config/aarch64/iterators.md (VALL_ARITH, UNSPEC_FCMUL,
	UNSPEC_FCMUL_CONJ, UNSPEC_FCMLA_CONJ, UNSPEC_FCMLA180_CONJ,
	UNSPEC_CMLA_CONJ, UNSPEC_CMLA180_CONJ, UNSPEC_CMUL,
	UNSPEC_CMUL_CONJ, FCMLA_OP, FCMUL_OP, rot_op, rotsplit1,
	rotsplit2, fcmac1, sve_rot1, sve_rot2, SVE2_INT_CMLA_OP,
	SVE2_INT_CMUL_OP, SVE2_INT_CADD_OP): New.
	(rot): Add UNSPEC_FCMUL, UNSPEC_FCMUL_CONJ.
	* config/aarch64/aarch64-sve.md (cml<fcmac1><rot_op><mode>4,
	cmul<rot_op><mode>3): New.
	* config/aarch64/aarch64-sve2.md (cml<fcmac1><rot_op><mode>4,
	cmul<rot_op><mode>3): New.
--- inline copy of patch ---
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 4b869ded918fd91ffd41e6ba068239a752b331e5..8a5f1dad224a99a8ba30669139259922a1250d0e 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -516,6 +516,47 @@ (define_insn "aarch64_fcmlaq_lane<rot><mode>"
[(set_attr "type" "neon_fcmla")]
)
+;; The complex mla/mls operations always need to expand to two instructions.
+;; The first operation does half the computation and the second does the
+;; remainder. Because of this, expand early.
+(define_expand "cml<fcmac1><rot_op><mode>4"
+ [(set (match_operand:VHSDF 0 "register_operand")
+ (plus:VHSDF (match_operand:VHSDF 1 "register_operand")
+ (unspec:VHSDF [(match_operand:VHSDF 2 "register_operand")
+ (match_operand:VHSDF 3 "register_operand")]
+ FCMLA_OP)))]
+ "TARGET_COMPLEX && !BYTES_BIG_ENDIAN"
+{
+ rtx tmp = gen_reg_rtx (<MODE>mode);
+ emit_insn (gen_aarch64_fcmla<rotsplit1><mode> (tmp, operands[1],
+ operands[3], operands[2]));
+ emit_insn (gen_aarch64_fcmla<rotsplit2><mode> (operands[0], tmp,
+ operands[3], operands[2]));
+ DONE;
+})
+
+;; The complex mul operations always need to expand to two instructions.
+;; The first operation does half the computation and the second does the
+;; remainder. Because of this, expand early.
+(define_expand "cmul<rot_op><mode>3"
+ [(set (match_operand:VHSDF 0 "register_operand")
+ (unspec:VHSDF [(match_operand:VHSDF 1 "register_operand")
+ (match_operand:VHSDF 2 "register_operand")]
+ FCMUL_OP))]
+ "TARGET_COMPLEX && !BYTES_BIG_ENDIAN"
+{
+ rtx tmp = gen_reg_rtx (<MODE>mode);
+ rtx res1 = gen_reg_rtx (<MODE>mode);
+ emit_move_insn (tmp, CONST0_RTX (<MODE>mode));
+ emit_insn (gen_aarch64_fcmla<rotsplit1><mode> (res1, tmp,
+ operands[2], operands[1]));
+ emit_insn (gen_aarch64_fcmla<rotsplit2><mode> (operands[0], res1,
+ operands[2], operands[1]));
+ DONE;
+})
+
+
+
;; These instructions map to the __builtins for the Dot Product operations.
(define_insn "aarch64_<sur>dot<vsi2qi>"
[(set (match_operand:VS 0 "register_operand" "=w")
diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md
index da15bd8788507feb12d52894c14e099370f34108..9dfe6a3f4512a20ba4f1e66a105ee0ae5d6949ea 100644
--- a/gcc/config/aarch64/aarch64-sve.md
+++ b/gcc/config/aarch64/aarch64-sve.md
@@ -7243,6 +7243,62 @@ (define_insn "@aarch64_pred_<optab><mode>"
[(set_attr "movprfx" "*,yes")]
)
+;; Unpredicated optab pattern for the auto-vectorizer.
+;; The complex mla/mls operations always need to expand to two instructions.
+;; The first operation does half the computation and the second does the
+;; remainder. Because of this, expand early.
+(define_expand "cml<fcmac1><rot_op><mode>4"
+ [(set (match_operand:SVE_FULL_F 0 "register_operand")
+ (unspec:SVE_FULL_F
+ [(match_dup 4)
+ (match_dup 5)
+ (match_operand:SVE_FULL_F 1 "register_operand")
+ (match_operand:SVE_FULL_F 2 "register_operand")
+ (match_operand:SVE_FULL_F 3 "register_operand")]
+ FCMLA_OP))]
+ "TARGET_SVE"
+{
+ operands[4] = aarch64_ptrue_reg (<VPRED>mode);
+ operands[5] = gen_int_mode (SVE_RELAXED_GP, SImode);
+ rtx tmp = gen_reg_rtx (<MODE>mode);
+ emit_insn
+ (gen_aarch64_pred_fcmla<sve_rot1><mode> (tmp, operands[4],
+ operands[3], operands[2],
+ operands[1], operands[5]));
+ emit_insn
+ (gen_aarch64_pred_fcmla<sve_rot2><mode> (operands[0], operands[4],
+ operands[3], operands[2],
+ tmp, operands[5]));
+ DONE;
+})
+
+;; Unpredicated optab pattern for the auto-vectorizer.
+;; The complex mul operations always need to expand to two instructions.
+;; The first operation does half the computation and the second does the
+;; remainder. Because of this, expand early.
+(define_expand "cmul<rot_op><mode>3"
+ [(set (match_operand:SVE_FULL_F 0 "register_operand")
+ (unspec:SVE_FULL_F
+ [(match_operand:SVE_FULL_F 1 "register_operand")
+ (match_operand:SVE_FULL_F 2 "register_operand")]
+ FCMUL_OP))]
+ "TARGET_SVE"
+{
+ rtx pred_reg = aarch64_ptrue_reg (<VPRED>mode);
+ rtx gp_mode = gen_int_mode (SVE_RELAXED_GP, SImode);
+ rtx accum = force_reg (<MODE>mode, CONST0_RTX (<MODE>mode));
+ rtx tmp = gen_reg_rtx (<MODE>mode);
+ emit_insn
+ (gen_aarch64_pred_fcmla<sve_rot1><mode> (tmp, pred_reg,
+ operands[2], operands[1],
+ accum, gp_mode));
+ emit_insn
+ (gen_aarch64_pred_fcmla<sve_rot2><mode> (operands[0], pred_reg,
+ operands[2], operands[1],
+ tmp, gp_mode));
+ DONE;
+})
+
;; Predicated FCMLA with merging.
(define_expand "@cond_<optab><mode>"
[(set (match_operand:SVE_FULL_F 0 "register_operand")
diff --git a/gcc/config/aarch64/aarch64-sve2.md b/gcc/config/aarch64/aarch64-sve2.md
index 5cb9144da98af2d02b83043511a99b5723d7e8c0..b96708d03f4458726b32ec46c0078499e00b8549 100644
--- a/gcc/config/aarch64/aarch64-sve2.md
+++ b/gcc/config/aarch64/aarch64-sve2.md
@@ -1848,6 +1848,48 @@ (define_insn "@aarch64_<optab>_lane_<mode>"
[(set_attr "movprfx" "*,yes")]
)
+;; Unpredicated optab pattern for the auto-vectorizer.
+;; The complex mla/mls operations always need to expand to two instructions.
+;; The first operation does half the computation and the second does the
+;; remainder. Because of this, expand early.
+(define_expand "cml<fcmac1><rot_op><mode>4"
+ [(set (match_operand:SVE_FULL_I 0 "register_operand")
+ (plus:SVE_FULL_I (match_operand:SVE_FULL_I 1 "register_operand")
+ (unspec:SVE_FULL_I
+ [(match_operand:SVE_FULL_I 2 "register_operand")
+ (match_operand:SVE_FULL_I 3 "register_operand")]
+ SVE2_INT_CMLA_OP)))]
+ "TARGET_SVE2"
+{
+ rtx tmp = gen_reg_rtx (<MODE>mode);
+ emit_insn (gen_aarch64_sve_cmla<sve_rot1><mode> (tmp, operands[1],
+ operands[3], operands[2]));
+ emit_insn (gen_aarch64_sve_cmla<sve_rot2><mode> (operands[0], tmp,
+ operands[3], operands[2]));
+ DONE;
+})
+
+;; Unpredicated optab pattern for the auto-vectorizer.
+;; The complex mul operations always need to expand to two instructions.
+;; The first operation does half the computation and the second does the
+;; remainder. Because of this, expand early.
+(define_expand "cmul<rot_op><mode>3"
+ [(set (match_operand:SVE_FULL_I 0 "register_operand")
+ (unspec:SVE_FULL_I
+ [(match_operand:SVE_FULL_I 1 "register_operand")
+ (match_operand:SVE_FULL_I 2 "register_operand")]
+ SVE2_INT_CMUL_OP))]
+ "TARGET_SVE2"
+{
+ rtx accum = force_reg (<MODE>mode, CONST0_RTX (<MODE>mode));
+ rtx tmp = gen_reg_rtx (<MODE>mode);
+ emit_insn (gen_aarch64_sve_cmla<sve_rot1><mode> (tmp, accum,
+ operands[2], operands[1]));
+ emit_insn (gen_aarch64_sve_cmla<sve_rot2><mode> (operands[0], tmp,
+ operands[2], operands[1]));
+ DONE;
+})
+
;; -------------------------------------------------------------------------
;; ---- [INT] Complex dot product
;; -------------------------------------------------------------------------
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index d42a70653edb266f2b76924b75a814db25f08f23..3f61fc8e380abd922d39973f40a966b7ce64fa40 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -182,6 +182,11 @@ (define_mode_iterator V2F [V2SF V2DF])
;; All Advanced SIMD modes on which we support any arithmetic operations.
(define_mode_iterator VALL [V8QI V16QI V4HI V8HI V2SI V4SI V2DI V2SF V4SF V2DF])
+;; All Advanced SIMD modes suitable for performing arithmetic.
+(define_mode_iterator VALL_ARITH [V8QI V16QI V4HI V8HI V2SI V4SI V2DI
+ (V4HF "TARGET_SIMD_F16INST") (V8HF "TARGET_SIMD_F16INST")
+ V2SF V4SF V2DF])
+
;; All Advanced SIMD modes suitable for moving, loading, and storing.
(define_mode_iterator VALL_F16 [V8QI V16QI V4HI V8HI V2SI V4SI V2DI
V4HF V8HF V4BF V8BF V2SF V4SF V2DF])
@@ -712,6 +717,10 @@ (define_c_enum "unspec"
UNSPEC_FCMLA90 ; Used in aarch64-simd.md.
UNSPEC_FCMLA180 ; Used in aarch64-simd.md.
UNSPEC_FCMLA270 ; Used in aarch64-simd.md.
+ UNSPEC_FCMUL ; Used in aarch64-simd.md.
+ UNSPEC_FCMUL_CONJ ; Used in aarch64-simd.md.
+ UNSPEC_FCMLA_CONJ ; Used in aarch64-simd.md.
+ UNSPEC_FCMLA180_CONJ ; Used in aarch64-simd.md.
UNSPEC_ASRD ; Used in aarch64-sve.md.
UNSPEC_ADCLB ; Used in aarch64-sve2.md.
UNSPEC_ADCLT ; Used in aarch64-sve2.md.
@@ -730,6 +739,10 @@ (define_c_enum "unspec"
UNSPEC_CMLA180 ; Used in aarch64-sve2.md.
UNSPEC_CMLA270 ; Used in aarch64-sve2.md.
UNSPEC_CMLA90 ; Used in aarch64-sve2.md.
+ UNSPEC_CMLA_CONJ ; Used in aarch64-sve2.md.
+ UNSPEC_CMLA180_CONJ ; Used in aarch64-sve2.md.
+ UNSPEC_CMUL ; Used in aarch64-sve2.md.
+ UNSPEC_CMUL_CONJ ; Used in aarch64-sve2.md.
UNSPEC_COND_FCVTLT ; Used in aarch64-sve2.md.
UNSPEC_COND_FCVTNT ; Used in aarch64-sve2.md.
UNSPEC_COND_FCVTX ; Used in aarch64-sve2.md.
@@ -1291,7 +1304,7 @@ (define_mode_attr Vwide [(V2SF "v2df") (V4HF "v4sf")
;; Widened mode register suffixes for VD_BHSI/VQW/VQ_HSF.
(define_mode_attr Vwtype [(V8QI "8h") (V4HI "4s")
- (V2SI "2d") (V16QI "8h")
+ (V2SI "2d") (V16QI "8h")
(V8HI "4s") (V4SI "2d")
(V8HF "4s") (V4SF "2d")])
@@ -1313,7 +1326,7 @@ (define_mode_attr Vewtype [(VNx16QI "h")
;; Widened mode register suffixes for VDW/VQW.
(define_mode_attr Vmwtype [(V8QI ".8h") (V4HI ".4s")
- (V2SI ".2d") (V16QI ".8h")
+ (V2SI ".2d") (V16QI ".8h")
(V8HI ".4s") (V4SI ".2d")
(V4HF ".4s") (V2SF ".2d")
(SI "") (HI "")])
@@ -2611,6 +2624,19 @@ (define_int_iterator SVE2_INT_CMLA [UNSPEC_CMLA
UNSPEC_SQRDCMLAH180
UNSPEC_SQRDCMLAH270])
+;; Unlike the normal CMLA instructions, these represent the actual operation
+;; to be performed.  They always need to be expanded into a sequence of
+;; multiple CMLA instructions.
+(define_int_iterator SVE2_INT_CMLA_OP [UNSPEC_CMLA
+ UNSPEC_CMLA_CONJ
+ UNSPEC_CMLA180])
+
+;; Unlike the normal CMLA instructions, these represent the actual operation
+;; to be performed.  They always need to be expanded into a sequence of
+;; multiple CMLA instructions.
+(define_int_iterator SVE2_INT_CMUL_OP [UNSPEC_CMUL
+ UNSPEC_CMUL_CONJ])
+
;; Same as SVE2_INT_CADD but exclude the saturating instructions
(define_int_iterator SVE2_INT_CADD_OP [UNSPEC_CADD90
UNSPEC_CADD270])
@@ -2725,6 +2751,14 @@ (define_int_iterator FMMLA [UNSPEC_FMMLA])
(define_int_iterator BF_MLA [UNSPEC_BFMLALB
UNSPEC_BFMLALT])
+(define_int_iterator FCMLA_OP [UNSPEC_FCMLA
+ UNSPEC_FCMLA180
+ UNSPEC_FCMLA_CONJ
+ UNSPEC_FCMLA180_CONJ])
+
+(define_int_iterator FCMUL_OP [UNSPEC_FCMUL
+ UNSPEC_FCMUL_CONJ])
+
;; Iterators for atomic operations.
(define_int_iterator ATOMIC_LDOP
@@ -3435,7 +3469,79 @@ (define_int_attr rot [(UNSPEC_CADD90 "90")
(UNSPEC_COND_FCMLA "0")
(UNSPEC_COND_FCMLA90 "90")
(UNSPEC_COND_FCMLA180 "180")
- (UNSPEC_COND_FCMLA270 "270")])
+ (UNSPEC_COND_FCMLA270 "270")
+ (UNSPEC_FCMUL "0")
+ (UNSPEC_FCMUL_CONJ "180")])
+
+;; A conjugate is a negation of the imaginary component.
+;; The numbers in the unspecs are the rotation component of the instruction,
+;; e.g. FCMLA180 means use the instruction with #180.
+;; The iterator is used to produce the right name mangling for the function.
+(define_int_attr rot_op [(UNSPEC_FCMLA180 "")
+ (UNSPEC_FCMLA180_CONJ "_conj")
+ (UNSPEC_FCMLA "")
+ (UNSPEC_FCMLA_CONJ "_conj")
+ (UNSPEC_FCMUL "")
+ (UNSPEC_FCMUL_CONJ "_conj")
+ (UNSPEC_CMLA "")
+ (UNSPEC_CMLA180 "")
+ (UNSPEC_CMLA_CONJ "_conj")
+ (UNSPEC_CMUL "")
+ (UNSPEC_CMUL_CONJ "_conj")])
+
+;; The complex operations, when performed on actual complex numbers, require
+;; two instructions per operation; e.g. complex multiplication requires two
+;; FCMLA instructions with particular rotation values.
+;;
+;; These values can be looked up in rotsplit1 and rotsplit2.  As an example,
+;; FCMUL needs the first instruction to use #0 and the second #90.
+(define_int_attr rotsplit1 [(UNSPEC_FCMLA "0")
+ (UNSPEC_FCMLA_CONJ "0")
+ (UNSPEC_FCMUL "0")
+ (UNSPEC_FCMUL_CONJ "0")
+ (UNSPEC_FCMLA180 "270")
+ (UNSPEC_FCMLA180_CONJ "90")])
+
+(define_int_attr rotsplit2 [(UNSPEC_FCMLA "90")
+ (UNSPEC_FCMLA_CONJ "270")
+ (UNSPEC_FCMUL "90")
+ (UNSPEC_FCMUL_CONJ "270")
+ (UNSPEC_FCMLA180 "180")
+ (UNSPEC_FCMLA180_CONJ "180")])
+
+;; SVE has slightly different naming from NEON, so we have to split these
+;; iterators.
+(define_int_attr sve_rot1 [(UNSPEC_FCMLA "")
+ (UNSPEC_FCMLA_CONJ "")
+ (UNSPEC_FCMUL "")
+ (UNSPEC_FCMUL_CONJ "")
+ (UNSPEC_FCMLA180 "270")
+ (UNSPEC_FCMLA180_CONJ "90")
+ (UNSPEC_CMLA "")
+ (UNSPEC_CMLA_CONJ "")
+ (UNSPEC_CMUL "")
+ (UNSPEC_CMUL_CONJ "")
+ (UNSPEC_CMLA180 "270")
+ (UNSPEC_CMLA180_CONJ "90")])
+
+(define_int_attr sve_rot2 [(UNSPEC_FCMLA "90")
+ (UNSPEC_FCMLA_CONJ "270")
+ (UNSPEC_FCMUL "90")
+ (UNSPEC_FCMUL_CONJ "270")
+ (UNSPEC_FCMLA180 "180")
+ (UNSPEC_FCMLA180_CONJ "180")
+ (UNSPEC_CMLA "90")
+ (UNSPEC_CMLA_CONJ "270")
+ (UNSPEC_CMUL "90")
+ (UNSPEC_CMUL_CONJ "270")
+ (UNSPEC_CMLA180 "180")
+ (UNSPEC_CMLA180_CONJ "180")])
+
+
+(define_int_attr fcmac1 [(UNSPEC_FCMLA "a") (UNSPEC_FCMLA_CONJ "a")
+ (UNSPEC_FCMLA180 "s") (UNSPEC_FCMLA180_CONJ "s")
+ (UNSPEC_CMLA "a") (UNSPEC_CMLA_CONJ "a")
+ (UNSPEC_CMLA180 "s") (UNSPEC_CMLA180_CONJ "s")])
(define_int_attr sve_fmla_op [(UNSPEC_COND_FMLA "fmla")
(UNSPEC_COND_FMLS "fmls")
--