[PATCH 10/14][AArch64] Implement vec_shr optab


This allows reduction of non-(plus|min|max) operations using log2(N) whole-vector shifts rather than N vec_extracts. For example, given the code

int
main (unsigned char argc, char **argv)
{
  unsigned char in[16] = { 1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31 };
  unsigned char i = 0;
  unsigned char sum = 1;

  /* Prevent constant propagation of the entire loop below.  */
  asm volatile ("" : : : "memory");

  for (i = 0; i < 16; i++)
    sum *= in[i];

  if (sum != 33)
      __builtin_printf("Failed %d\n", sum);
}

(a simplified, less general version of vect-reduc-mul_1.c), this patch gives:

main:
        ldr     q0, .LC0
        sub     sp, sp, #16
        str     q0, [sp]
        ldr     q1, [sp]
        movi    v0.4s, 0
        ext     v2.16b, v1.16b, v0.16b, #8
        mul     v1.16b, v1.16b, v2.16b
        ext     v2.16b, v1.16b, v0.16b, #4
        mul     v1.16b, v2.16b, v1.16b
        ext     v2.16b, v1.16b, v0.16b, #2
        mul     v1.16b, v2.16b, v1.16b
        ext     v0.16b, v1.16b, v0.16b, #1
        mul     v0.16b, v0.16b, v1.16b
        umov    w1, v0.b[0]
        cmp     w1, 33
        beq     .L2
        ...

rather than the previous sequence, which after a single halving multiply extracts all eight lanes and reduces them with scalar multiplies:

main:
        ldr     q0, .LC0
        sub     sp, sp, #16
        str     q0, [sp]
        ldr     d1, [sp]
        ldr     d0, [sp, 8]
        mul     v0.8b, v0.8b, v1.8b
        umov    w0, v0.b[1]
        umov    w3, v0.b[0]
        umov    w2, v0.b[2]
        umov    w7, v0.b[3]
        umov    w6, v0.b[4]
        mul     w3, w0, w3
        umov    w5, v0.b[5]
        umov    w4, v0.b[6]
        umov    w1, v0.b[7]
        mul     w3, w3, w2
        mul     w2, w3, w7
        mul     w2, w2, w6
        mul     w0, w2, w5
        mul     w0, w0, w4
        mul     w1, w0, w1
        uxtb    w1, w1
        cmp     w1, 33
        beq     .L2
        ...


Tested check-gcc on aarch64-none-elf and aarch64_be-none-elf. (Including new tests from previous patches.)
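
For reference, at the ACLE intrinsics level the two patterns in the patch correspond roughly to the following (function names are hypothetical; this is an illustration, not part of the patch):

#include <arm_neon.h>

/* 64-bit modes: a plain ushr on the D register shifts the whole
   vector right, so no zero operand is needed (little-endian lanes).  */
uint8x8_t
vec_shr_v8qi_by_32 (uint8x8_t x)
{
  return vreinterpret_u8_u64 (vshr_n_u64 (vreinterpret_u64_u8 (x), 32));
}

/* 128-bit modes: EXT against a zero vector; shifting by 64 bits
   selects the top 8 bytes of x followed by 8 zero bytes.  */
uint8x16_t
vec_shr_v16qi_by_64 (uint8x16_t x)
{
  return vextq_u8 (x, vdupq_n_u8 (0), 8);
}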

gcc/ChangeLog:

	* config/aarch64/aarch64-simd.md (vec_shr_<mode>): New define_insn
	(64-bit modes) and define_expand (128-bit modes).

gcc/testsuite/ChangeLog:

	* lib/target-supports.exp (check_effective_target_whole_vector_shift):
	Add aarch64*-*-*.
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index d4a745be59897b4cb2a0de23adb56b5d79203592..3fcf809113d73b37a95653b8c2be432478d2bc1e 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -770,6 +770,45 @@
   }
 )
 
+;; For 64-bit modes we use ushr, as this does not require a SIMD zero.
+(define_insn "vec_shr_<mode>"
+  [(set (match_operand:VD 0 "register_operand" "=w")
+        (lshiftrt:VD (match_operand:VD 1 "register_operand" "w")
+		     (match_operand:SI 2 "immediate_operand" "i")))]
+  "TARGET_SIMD"
+  "ushr %d0, %d1, %2"
+  [(set_attr "type" "neon_shift_imm")]
+)
+
+(define_expand "vec_shr_<mode>"
+  [(set (match_operand:VQ 0 "register_operand" "=w")
+        (lshiftrt:VQ (match_operand:VQ 1 "register_operand" "w")
+		      (match_operand:SI 2 "immediate_operand" "i")))]
+  "TARGET_SIMD"
+{
+  HOST_WIDE_INT num_bits = INTVAL (operands[2]);
+  HOST_WIDE_INT elem_bits = GET_MODE_BITSIZE (GET_MODE_INNER (<MODE>mode));
+
+  gcc_assert (GET_MODE_BITSIZE (<MODE>mode) == 128);
+  gcc_assert (num_bits % elem_bits == 0);
+
+  if (num_bits == 0)
+    {
+      emit_move_insn (operands[0], operands[1]);
+      DONE;
+    }
+  else if (num_bits == 128)
+    {
+      emit_move_insn (operands[0], CONST0_RTX (<MODE>mode));
+      DONE;
+    }
+
+  rtx zero_reg = force_reg (<MODE>mode, CONST0_RTX (<MODE>mode));
+  emit_insn (gen_aarch64_ext<mode> (operands[0], operands[1], zero_reg,
+				    GEN_INT (num_bits / elem_bits)));
+  DONE;
+})
+
 (define_insn "aarch64_simd_vec_setv2di"
   [(set (match_operand:V2DI 0 "register_operand" "=w,w")
         (vec_merge:V2DI
diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp
index 5e40f5fcdfc95e41e804075bb5daa7030eb9bc66..720cc345bf6a76470cc85116d7b3365be07caa97 100644
--- a/gcc/testsuite/lib/target-supports.exp
+++ b/gcc/testsuite/lib/target-supports.exp
@@ -3323,6 +3323,7 @@ proc check_effective_target_vect_shift { } {
 proc check_effective_target_whole_vector_shift { } {
     if { [istarget x86_64-*-*]
 	 || [istarget ia64-*-*]
+	 || [istarget aarch64*-*-*]
 	 || ([check_effective_target_arm32]
 	     && [check_effective_target_arm_little_endian])
 	 || ([istarget mips*-*-*]
