This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.



Re: [PATCH 3/9][GCC][AArch64] Add autovectorization support for Complex instructions


Hi Tamar,

On 11/11/18 10:26, Tamar Christina wrote:
Hi All,

This patch adds expander support for autovectorization of complex number
operations, such as complex addition with a rotation along the Argand plane.
It also adds support for complex FMA.

The instructions are described in the ArmARM [1] and are available from Armv8.3-a onwards.
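
For reference, the kind of source loop that gets vectorized this way looks
roughly like the following.  This is a hypothetical reconstruction (the
testcase itself is not shown in this message); the signature and the trip
count of 200 complex doubles are inferred from the assembly below, which
strides 16 bytes per iteration up to 3200:

#include <complex.h>

#define N 200

void
f90 (_Complex double a[restrict N], _Complex double b[restrict N],
     _Complex double c[restrict N])
{
  for (int i = 0; i < N; i++)
    /* Adding b[i] rotated by 90 degrees, i.e. multiplied by I.  */
    c[i] = a[i] + b[i] * I;
}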

Concretely, this generates

f90:
        mov     x3, 0
        .p2align 3,,7
.L2:
        ldr     q0, [x0, x3]
        ldr     q1, [x1, x3]
        fcadd   v0.2d, v0.2d, v1.2d, #90
        str     q0, [x2, x3]
        add     x3, x3, 16
        cmp     x3, 3200
        bne     .L2
        ret

now instead of

f90:
        mov     x4, x1
        mov     x1, x2
        add     x3, x4, 31
        add     x2, x0, 31
        sub     x3, x3, x1
        sub     x2, x2, x1
        cmp     x3, 62
        mov     x3, 62
        ccmp    x2, x3, 0, hi
        bls     .L5
        mov     x2, x4
        add     x3, x0, 3200
        .p2align 3,,7
.L3:
        ld2     {v2.2d - v3.2d}, [x0], 32
        ld2     {v4.2d - v5.2d}, [x2], 32
        cmp     x0, x3
        fsub    v0.2d, v2.2d, v5.2d
        fadd    v1.2d, v4.2d, v3.2d
        st2     {v0.2d - v1.2d}, [x1], 32
        bne     .L3
        ret
.L5:
        add     x6, x0, 8
        add     x5, x4, 8
        add     x2, x1, 8
        mov     x3, 0
        .p2align 3,,7
.L2:
        ldr     d1, [x0, x3]
        ldr     d3, [x5, x3]
        ldr     d0, [x6, x3]
        ldr     d2, [x4, x3]
        fsub    d1, d1, d3
        fadd    d0, d0, d2
        str     d1, [x1, x3]
        str     d0, [x2, x3]
        add     x3, x3, 16
        cmp     x3, 3200
        bne     .L2
        ret

For complex additions with a 90-degree rotation along the Argand plane.
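
As a rough sketch of the semantics at play (paraphrased from the ArmARM
description rather than taken from the patch; the C helper names here are
made up for illustration): FCADD adds one vector of complex numbers to
another after rotating it along the Argand plane, and a full complex
multiply-accumulate is obtained from a pair of FCMLA instructions with
different rotations, which is why the fcmla expander in the patch emits
two instructions:

struct cplx { double re, im; };

/* FCADD #90: r = a + (b rotated by 90 degrees).  */
static struct cplx
fcadd_90 (struct cplx a, struct cplx b)
{
  struct cplx r = { a.re - b.im, a.im + b.re };
  return r;
}

/* FCMLA #0 followed by FCMLA #90 accumulate a full complex multiply:
   acc += n * m.  Each rotation contributes two of the four partial
   products, hence the two-instruction split in the expander.  */
static struct cplx
fcmla_0_90 (struct cplx acc, struct cplx n, struct cplx m)
{
  acc.re += n.re * m.re;   /* FCMLA #0   */
  acc.im += n.re * m.im;   /* FCMLA #0   */
  acc.re -= n.im * m.im;   /* FCMLA #90  */
  acc.im += n.im * m.re;   /* FCMLA #90  */
  return acc;
}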

[1] https://developer.arm.com/docs/ddi0487/latest/arm-architecture-reference-manual-armv8-for-armv8-a-architecture-profile

Bootstrap and regtest on aarch64-none-linux-gnu, arm-none-gnueabihf and
x86_64-pc-linux-gnu are still ongoing, but a previous version of the patch
showed no regressions.

The instructions have also been tested on aarch64-none-elf and arm-none-eabi
on an Armv8.3-a model with -march=armv8.3-a+fp16, and all tests pass.

Ok for trunk?

Thanks,
Tamar

gcc/ChangeLog:

2018-11-11  Tamar Christina  <tamar.christina@arm.com>

        * config/aarch64/aarch64-simd.md (aarch64_fcadd<rot><mode>,
        fcadd<rot><mode>3, aarch64_fcmla<rot><mode>,
        fcmla<rot><mode>4): New.
        * config/aarch64/aarch64.h (TARGET_COMPLEX): New.
        * config/aarch64/iterators.md (UNSPEC_FCADD90, UNSPEC_FCADD270,
        UNSPEC_FCMLA, UNSPEC_FCMLA90, UNSPEC_FCMLA180, UNSPEC_FCMLA270): New.
        (FCADD, FCMLA): New.
        (rot, rotsplit1, rotsplit2): New.
        * config/arm/types.md (neon_fcadd, neon_fcmla): New.

--


diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index c4be3101fdec930707918106cd7c53cf7584553e..12a91183a98ea23015860c77a97955cb1b30bfbb 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -419,6 +419,63 @@
 }
 )
+;; The fcadd and fcmla patterns are made UNSPEC explicitly because their
+;; use needs to guarantee that the source vectors are contiguous.  It would
+;; be wrong to describe the operation without being able to describe the
+;; permute that is also required, but even if that is done the permute
+;; would have been created as a LOAD_LANES, which means the values in the
+;; registers are in the wrong order.
+(define_insn "aarch64_fcadd<rot><mode>"
+  [(set (match_operand:VHSDF 0 "register_operand" "=w")
+	(unspec:VHSDF [(match_operand:VHSDF 1 "register_operand" "w")
+		       (match_operand:VHSDF 2 "register_operand" "w")]
+		       FCADD))]
+  "TARGET_COMPLEX"
+  "fcadd\t%0.<Vtype>, %1.<Vtype>, %2.<Vtype>, #<rot>"
+  [(set_attr "type" "neon_fcadd")]
+)
+
+(define_expand "fcadd<rot><mode>3"
+  [(set (match_operand:VHSDF 0 "register_operand")
+	(unspec:VHSDF [(match_operand:VHSDF 1 "register_operand")
+		       (match_operand:VHSDF 2 "register_operand")]
+		       FCADD))]
+  "TARGET_COMPLEX"
+{
+  emit_insn (gen_aarch64_fcadd<rot><mode> (operands[0], operands[1],
+					   operands[2]));
+  DONE;
+})
+
+(define_insn "aarch64_fcmla<rot><mode>"
+  [(set (match_operand:VHSDF 0 "register_operand" "=w")
+	(plus:VHSDF (match_operand:VHSDF 1 "register_operand" "0")
+		    (unspec:VHSDF [(match_operand:VHSDF 2 "register_operand" "w")
+				   (match_operand:VHSDF 3 "register_operand" "w")]
+				   FCMLA)))]
+  "TARGET_COMPLEX"
+  "fcmla\t%0.<Vtype>, %2.<Vtype>, %3.<Vtype>, #<rot>"
+  [(set_attr "type" "neon_fcmla")]
+)
+
+;; The complex mla operations always need to expand to two instructions.
+;; The first operation does half the computation and the second does the
+;; remainder.  Because of this, expand early.
+(define_expand "fcmla<rot><mode>4"
+  [(set (match_operand:VHSDF 0 "register_operand")
+	(plus:VHSDF (match_operand:VHSDF 1 "register_operand")
+		    (unspec:VHSDF [(match_operand:VHSDF 2 "register_operand")
+				   (match_operand:VHSDF 3 "register_operand")]
+				   FCMLA)))]
+  "TARGET_COMPLEX"
+{
+  emit_insn (gen_aarch64_fcmla<rotsplit1><mode> (operands[0], operands[1],
+						 operands[2], operands[3]));
+  emit_insn (gen_aarch64_fcmla<rotsplit2><mode> (operands[0], operands[0],
+						 operands[2], operands[3]));
+  DONE;
+})
+
 ;; These instructions map to the __builtins for the Dot Product operations.
 (define_insn "aarch64_<sur>dot<vsi2qi>"
   [(set (match_operand:VS 0 "register_operand" "=w")
@@ -3026,22 +3083,22 @@
     operands[2] = aarch64_endian_lane_rtx (<MODE>mode, INTVAL (operands[2]));
     return "smov\\t%<GPI:w>0, %1.<VDQQH:Vetype>[%2]";
   }
-  [(set_attr "type" "neon_to_gp<q>")]
-)
-
-(define_insn "*aarch64_get_lane_zero_extend<GPI:mode><VDQQH:mode>"
-  [(set (match_operand:GPI 0 "register_operand" "=r")
-	(zero_extend:GPI
-	  (vec_select:<VEL>
-	    (match_operand:VDQQH 1 "register_operand" "w")
-	    (parallel [(match_operand:SI 2 "immediate_operand" "i")]))))]
-  "TARGET_SIMD"
-  {
-    operands[2] = aarch64_endian_lane_rtx (<VDQQH:MODE>mode,
-					   INTVAL (operands[2]));
-    return "umov\\t%w0, %1.<Vetype>[%2]";
-  }
-  [(set_attr "type" "neon_to_gp<q>")]
+  [(set_attr "type" "neon_to_gp<q>")]
+)
+
+(define_insn "*aarch64_get_lane_zero_extend<GPI:mode><VDQQH:mode>"
+  [(set (match_operand:GPI 0 "register_operand" "=r")
+	(zero_extend:GPI
+	  (vec_select:<VEL>
+	    (match_operand:VDQQH 1 "register_operand" "w")
+	    (parallel [(match_operand:SI 2 "immediate_operand" "i")]))))]
+  "TARGET_SIMD"
+  {
+    operands[2] = aarch64_endian_lane_rtx (<VDQQH:MODE>mode,
+					   INTVAL (operands[2]));
+    return "umov\\t%w0, %1.<Vetype>[%2]";
+  }
+  [(set_attr "type" "neon_to_gp<q>")]
 )


Is this change intended? Is it just reformatting?
I guess that's okay, but please mention it in the ChangeLog if so.

Thanks,
Kyrill

