This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.

Index Nav:	[Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav:	[Date Prev] [Date Next]	[Thread Prev] [Thread Next]
Other format:	[Raw text]

[PATCH][ARM] Fix PR 55426

From: Kyrill Tkachov <kyrylo dot tkachov at arm dot com>
To: GCC Patches <gcc-patches at gcc dot gnu dot org>
Cc: Richard Earnshaw <Richard dot Earnshaw at arm dot com>, Ramana Radhakrishnan <ramana dot radhakrishnan at arm dot com>
Date: Tue, 11 Feb 2014 09:38:08 +0000
Subject: [PATCH][ARM] Fix PR 55426
Authentication-results: sourceware.org; auth=none

Hi all,

In this PR the 128-bit load-duplicate intrinsics in neon.exp ICE on big-endianwith an unrecognisable insn error:


neon-vld1_dupQ.c:24:1: error: unrecognizable insn:
(insn 94 93 31 (set (subreg:DI (reg:V2DI 95 d16 [orig:137 D.14400 ] [137]) 0)
        (subreg:DI (reg:V2DI 95 d16 [orig:137 D.14400 ] [137]) 8))

The problem seems to be that the neon_vld1_dupv2di splitter generates subregsafter reload with gen_lowpart and gen_highpart. Since that splitter alwaysmatches after reload, we already know the hard register numbers, so we can justmanipulate those directly to extract the two doubleword parts of a quadword reg.

While we're at it, we might as well use a more general move instruction when thealignment is natural to potentially take advantage of more complex addressingmodes. We're allowed to do that because the vld1Q_dup*64 intrinsics describe abehaviour and do not guarantee that a particular instruction will be used.

Therefore the vld1Q_dup*64 tests are updated to be run-time tests instead totest the functionality. New *_misaligned tests are added, however, to make surethat we still generate vld1.64 when the address is explicitly unaligned, sincevld1.64 is the only instruction that can handle that.


Did an armeb-none-linux-gnueabihf build.
The vld1Q_dup*64* tests now pass on big and little endian.
arm-none-linux-gnueabihf bootstrap on Chromebook successful.

This is a regression since 4.7. I've tested this on trunk. Will test this on the4.8 and 4.7 branches.


Ok for those branches if no regressions?

Thanks,
Kyrill


2014-02-11  Kyrylo Tkachov  <kyrylo.tkachov@arm.com>

    PR target/55426
    * config/arm/neon.md (neon_vld1_dupv2di): Do not generate
    low and high part subregs, use hard reg numbers.
    * config/arm/arm.c (arm_mem_aligned_p): New function.
    (arm_init_neon_builtins): Allow for memory operands
    in load operations.
    * config/arm/arm-protos.h (arm_mem_aligned_p): Declare
    extern.
    * config/arm/constraints.md (Uo): New constraint.

2014-02-11  Kyrylo Tkachov  <kyrylo.tkachov@arm.com>

    PR target/55426
    * gcc.target/arm/neon/vld1Q_dupp64.c: Change to run-time test.
    * gcc.target/arm/neon/vld1Q_dups64.c: Likewise.
    * gcc.target/arm/neon/vld1Q_dupu64.c: Likewise.
    * gcc.target/arm/neon/vld1Q_dupp64_misaligned.c: New test.
    * gcc.target/arm/neon/vld1Q_dups64_misaligned.c: Likewise.
    * gcc.target/arm/neon/vld1Q_dupu64_misaligned.c: Likewise.

diff --git a/gcc/config/arm/arm-protos.h b/gcc/config/arm/arm-protos.h
index 13874ee..56f46e3 100644
--- a/gcc/config/arm/arm-protos.h
+++ b/gcc/config/arm/arm-protos.h
@@ -95,6 +95,7 @@ extern enum reg_class coproc_secondary_reload_class (enum machine_mode, rtx,
 extern bool arm_tls_referenced_p (rtx);
 
 extern int arm_coproc_mem_operand (rtx, bool);
+extern bool arm_mem_aligned_p (rtx, unsigned int);
 extern int neon_vector_mem_operand (rtx, int, bool);
 extern int neon_struct_mem_operand (rtx);
 
diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c
index fc81bf6..33c829d 100644
--- a/gcc/config/arm/arm.c
+++ b/gcc/config/arm/arm.c
@@ -12499,6 +12499,14 @@ arm_coproc_mem_operand (rtx op, bool wb)
   return FALSE;
 }
 
+/* Return true if the MEM RTX x has the given alignment.  */
+bool
+arm_mem_aligned_p (rtx x, unsigned int alignment)
+{
+  gcc_assert (MEM_P (x));
+  return MEM_ALIGN (x) == alignment;
+}
+
 /* Return TRUE if OP is a memory operand which we can load or store a vector
    to/from. TYPE is one of the following values:
     0 - Vector load/stor (vldr)
@@ -23644,7 +23652,9 @@ arm_init_neon_builtins (void)
 		    /* Neon load patterns always have the memory
 		       operand in the operand 1 position.  */
 		    gcc_assert (insn_data[d->code].operand[k].predicate
-				== neon_struct_operand);
+				  == neon_struct_operand
+			        || insn_data[d->code].operand[k].predicate
+			             == memory_operand);
 
 		    switch (d->mode)
 		      {
diff --git a/gcc/config/arm/constraints.md b/gcc/config/arm/constraints.md
index 85dd116..86947dd 100644
--- a/gcc/config/arm/constraints.md
+++ b/gcc/config/arm/constraints.md
@@ -381,6 +381,14 @@
  (and (match_code "mem")
       (match_test "TARGET_32BIT && neon_vector_mem_operand (op, 2, true)")))
 
+(define_memory_constraint "Uo"
+ "@internal
+  In ARM/Thumb-2 state a valid address for Neon element and structure
+  load/store instructions or normal load on doubleword alignment."
+ (and (match_code "mem")
+      (match_test "TARGET_32BIT && (arm_mem_aligned_p (op, DOUBLEWORD_ALIGNMENT)
+                                   || neon_vector_mem_operand (op, 2, true))")))
+
 (define_memory_constraint "Us"
  "@internal
   In ARM/Thumb-2 state a valid address for non-offset loads/stores of
diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
index 2f06e42..e4490ba 100644
--- a/gcc/config/arm/neon.md
+++ b/gcc/config/arm/neon.md
@@ -4406,19 +4406,28 @@
 
 (define_insn_and_split "neon_vld1_dupv2di"
    [(set (match_operand:V2DI 0 "s_register_operand" "=w")
-    (vec_duplicate:V2DI (match_operand:DI 1 "neon_struct_operand" "Um")))]
+    (vec_duplicate:V2DI (match_operand:DI 1 "memory_operand" "Uo")))]
    "TARGET_NEON"
    "#"
    "&& reload_completed"
    [(const_int 0)]
    {
-    rtx tmprtx = gen_lowpart (DImode, operands[0]);
-    emit_insn (gen_neon_vld1_dupdi (tmprtx, operands[1]));
-    emit_move_insn (gen_highpart (DImode, operands[0]), tmprtx );
-    DONE;
-    }
-  [(set_attr "length" "8")
-   (set_attr "type" "neon_load1_all_lanes_q")]
+     rtx lo_reg = gen_rtx_REG (DImode, REGNO (operands[0]));
+     rtx hi_reg = gen_rtx_REG (DImode, REGNO (operands[0]) + 2);
+
+     /* If the alignment is not natural, we have to use vld1.
+        In all other cases we can try to take advantage of a more general
+        move operation.  */
+
+     if (arm_mem_aligned_p (operands[1], DOUBLEWORD_ALIGNMENT))
+       emit_move_insn (lo_reg, operands[1]);
+     else
+       emit_insn (gen_neon_vld1_dupdi (lo_reg, operands[1]));
+
+     emit_move_insn (hi_reg, lo_reg);
+     DONE;
+   }
+  [(set_attr "length" "8")]
 )
 
 (define_expand "vec_store_lanes<mode><mode>"
diff --git a/gcc/testsuite/gcc.target/arm/neon/vld1Q_dupp64.c b/gcc/testsuite/gcc.target/arm/neon/vld1Q_dupp64.c
index 2d504c1..a606eef 100644
--- a/gcc/testsuite/gcc.target/arm/neon/vld1Q_dupp64.c
+++ b/gcc/testsuite/gcc.target/arm/neon/vld1Q_dupp64.c
@@ -1,19 +1,35 @@
-/* Test the `vld1Q_dupp64' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
+/* Test the `vld1Q_dup_p64' ARM Neon intrinsic.
+   If the argument to vld1q_dup_p64 is naturally aligned the compiler has
+   the freedom to choose any instruction equivalent to vld1.64, therefore we
+   do not scan for it explicitly but instead test the runtime functionality.
+   */
 
-/* { dg-do assemble } */
+/* { dg-do run } */
 /* { dg-require-effective-target arm_crypto_ok } */
-/* { dg-options "-save-temps -O0" } */
+/* { dg-require-effective-target arm_neon_hw } */
+/* { dg-options "-O2" } */
 /* { dg-add-options arm_crypto } */
 
 #include "arm_neon.h"
 
-void test_vld1Q_dupp64 (void)
-{
-  poly64x2_t out_poly64x2_t;
+extern void abort (void);
 
-  out_poly64x2_t = vld1q_dup_p64 (0);
+poly64x2_t
+test_vld1Q_dupp64 (poly64_t* p)
+{
+   return vld1q_dup_p64 (p);
 }
 
-/* { dg-final { scan-assembler "vld1\.64\[ 	\]+((\\\{\[dD\]\[0-9\]+\\\})|(\[dD\]\[0-9\]+)), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ 	\]+@\[a-zA-Z0-9 \]+\)?\n" } } */
-/* { dg-final { cleanup-saved-temps } } */
+int
+main (void)
+{
+  uint64_t c = 0xf00ba1;
+  poly64_t pol = vcreate_p64 (c);
+  poly64x2_t b = test_vld1Q_dupp64 (&pol);
+
+  if (vreinterpret_u64_p64 (vget_high_p64 (b)) != c
+      || vreinterpret_u64_p64 (vget_low_p64 (b) != c))
+    abort ();
+
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/arm/neon/vld1Q_dupp64_misaligned.c b/gcc/testsuite/gcc.target/arm/neon/vld1Q_dupp64_misaligned.c
new file mode 100644
index 0000000..3a916ed
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/neon/vld1Q_dupp64_misaligned.c
@@ -0,0 +1,43 @@
+/* Test the `vld1Q_dup_p64' ARM Neon intrinsic.  */
+
+/* { dg-do run } */
+/* { dg-require-effective-target arm_crypto_ok } */
+/* { dg-require-effective-target arm_neon_hw } */
+/* { dg-options "-O2 -save-temps" } */
+/* { dg-add-options arm_crypto } */
+
+#include "arm_neon.h"
+
+extern void abort (void);
+
+struct __attribute__ ((__packed__)) my_struct
+{
+  int i;
+  poly64_t str;
+};
+
+poly64x2_t
+test_vld1Q_dupp64 (struct my_struct* s)
+{
+   return vld1q_dup_p64 (&(s->str));
+}
+
+int
+main (void)
+{
+  uint64_t c = 0xf00ba1;
+  poly64_t pol = vcreate_p64 (c);
+  struct my_struct st;
+  st.i = -1;
+  st.str = pol;
+  poly64x2_t b = test_vld1Q_dupp64 (&st);
+
+  if (vreinterpret_u64_p64 (vget_high_p64 (b)) != c
+      || vreinterpret_u64_p64 (vget_low_p64 (b) != c))
+    abort ();
+
+  return 0;
+}
+
+/* { dg-final { scan-assembler "vld1\.64\[ 	\]+((\\\{\[dD\]\[0-9\]+\\\})|(\[dD\]\[0-9\]+)), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ 	\]+@\[a-zA-Z0-9 \]+\)?\n" } } */
+/* { dg-final { cleanup-saved-temps } } */
diff --git a/gcc/testsuite/gcc.target/arm/neon/vld1Q_dups64.c b/gcc/testsuite/gcc.target/arm/neon/vld1Q_dups64.c
index 4fceee8..a7e57f4 100644
--- a/gcc/testsuite/gcc.target/arm/neon/vld1Q_dups64.c
+++ b/gcc/testsuite/gcc.target/arm/neon/vld1Q_dups64.c
@@ -1,19 +1,33 @@
-/* Test the `vld1Q_dups64' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
+/* Test the `vld1Q_dup_s64' ARM Neon intrinsic.
+   If the argument to vld1q_dup_s64 is naturally aligned the compiler has
+   the freedom to choose any instruction equivalent to vld1.64, therefore we
+   do not scan for it explicitly but instead test the runtime functionality.
+   */
 
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
+/* { dg-do run } */
+/* { dg-require-effective-target arm_neon_hw } */
+/* { dg-options "-O2 -save-temps" } */
 /* { dg-add-options arm_neon } */
 
 #include "arm_neon.h"
 
-void test_vld1Q_dups64 (void)
-{
-  int64x2_t out_int64x2_t;
+extern void abort (void);
 
-  out_int64x2_t = vld1q_dup_s64 (0);
+int64x2_t
+test_vld1Q_dups64 (int64_t* p)
+{
+   return vld1q_dup_s64 (p);
 }
 
-/* { dg-final { scan-assembler "vld1\.64\[ 	\]+((\\\{\[dD\]\[0-9\]+\\\})|(\[dD\]\[0-9\]+)), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ 	\]+@\[a-zA-Z0-9 \]+\)?\n" } } */
-/* { dg-final { cleanup-saved-temps } } */
+int
+main (void)
+{
+  int64_t c = 0xf00ba1;
+  int64x2_t b = test_vld1Q_dups64 (&c);
+
+  if (vget_high_s64 (b) != c
+      || vget_low_s64 (b) != c)
+    abort ();
+
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/arm/neon/vld1Q_dups64_misaligned.c b/gcc/testsuite/gcc.target/arm/neon/vld1Q_dups64_misaligned.c
new file mode 100644
index 0000000..2079a66
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/neon/vld1Q_dups64_misaligned.c
@@ -0,0 +1,41 @@
+/* Test the `vld1Q_dup_s64' ARM Neon intrinsic.  */
+
+/* { dg-do run } */
+/* { dg-require-effective-target arm_neon_hw } */
+/* { dg-options "-O2 -save-temps" } */
+/* { dg-add-options arm_neon } */
+
+#include "arm_neon.h"
+
+extern void abort (void);
+
+struct __attribute__ ((__packed__)) my_struct
+{
+  int i;
+  int64_t str;
+};
+
+int64x2_t
+test_vld1Q_dups64 (struct my_struct* s)
+{
+   return vld1q_dup_s64 (&(s->str));
+}
+
+int
+main (void)
+{
+  int64_t c = 0xf00ba1;
+  struct my_struct st;
+  st.i = -1;
+  st.str = c;
+  int64x2_t b = test_vld1Q_dups64 (&st);
+
+  if (vget_high_s64 (b) != c
+      || vget_low_s64 (b) != c)
+    abort ();
+
+  return 0;
+}
+
+/* { dg-final { scan-assembler "vld1\.64\[ 	\]+((\\\{\[dD\]\[0-9\]+\\\})|(\[dD\]\[0-9\]+)), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ 	\]+@\[a-zA-Z0-9 \]+\)?\n" } } */
+/* { dg-final { cleanup-saved-temps } } */
diff --git a/gcc/testsuite/gcc.target/arm/neon/vld1Q_dupu64.c b/gcc/testsuite/gcc.target/arm/neon/vld1Q_dupu64.c
index ef0a382..985670f 100644
--- a/gcc/testsuite/gcc.target/arm/neon/vld1Q_dupu64.c
+++ b/gcc/testsuite/gcc.target/arm/neon/vld1Q_dupu64.c
@@ -1,19 +1,33 @@
-/* Test the `vld1Q_dupu64' ARM Neon intrinsic.  */
-/* This file was autogenerated by neon-testgen.  */
+/* Test the `vld1Q_dup_u64' ARM Neon intrinsic.
+   If the argument to vld1q_dup_u64 is naturally aligned the compiler has
+   the freedom to choose any instruction equivalent to vld1.64, therefore we
+   do not scan for it explicitly but instead test the runtime functionality.
+   */
 
-/* { dg-do assemble } */
-/* { dg-require-effective-target arm_neon_ok } */
-/* { dg-options "-save-temps -O0" } */
+/* { dg-do run } */
+/* { dg-require-effective-target arm_neon_hw } */
+/* { dg-options "-O2 -save-temps" } */
 /* { dg-add-options arm_neon } */
 
 #include "arm_neon.h"
 
-void test_vld1Q_dupu64 (void)
-{
-  uint64x2_t out_uint64x2_t;
+extern void abort (void);
 
-  out_uint64x2_t = vld1q_dup_u64 (0);
+uint64x2_t
+test_vld1Q_dupu64 (uint64_t* p)
+{
+   return vld1q_dup_u64 (p);
 }
 
-/* { dg-final { scan-assembler "vld1\.64\[ 	\]+((\\\{\[dD\]\[0-9\]+\\\})|(\[dD\]\[0-9\]+)), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ 	\]+@\[a-zA-Z0-9 \]+\)?\n" } } */
-/* { dg-final { cleanup-saved-temps } } */
+int
+main (void)
+{
+  uint64_t c = 0xf00ba1;
+  uint64x2_t b = test_vld1Q_dupu64 (&c);
+
+  if (vget_high_u64 (b) != c
+      || vget_low_u64 (b) != c)
+    abort ();
+
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/arm/neon/vld1Q_dupu64_misaligned.c b/gcc/testsuite/gcc.target/arm/neon/vld1Q_dupu64_misaligned.c
new file mode 100644
index 0000000..5468792
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/neon/vld1Q_dupu64_misaligned.c
@@ -0,0 +1,42 @@
+/* Test the `vld1Q_dup_u64' ARM Neon intrinsic.  */
+/* This file was autogenerated by neon-testgen.  */
+
+/* { dg-do run } */
+/* { dg-require-effective-target arm_neon_hw } */
+/* { dg-options "-O2 -save-temps" } */
+/* { dg-add-options arm_neon } */
+
+#include "arm_neon.h"
+
+extern void abort (void);
+
+struct __attribute__ ((__packed__)) my_struct
+{
+  int i;
+  uint64_t str;
+};
+
+uint64x2_t
+test_vld1Q_dupu64 (struct my_struct* s)
+{
+   return vld1q_dup_u64 (&(s->str));
+}
+
+int
+main (void)
+{
+  uint64_t c = 0xf00ba1;
+  struct my_struct st;
+  st.i = -1;
+  st.str = c;
+  uint64x2_t b = test_vld1Q_dupu64 (&st);
+
+  if (vget_high_u64 (b) != c
+      || vget_low_u64 (b) != c)
+    abort ();
+
+  return 0;
+}
+
+/* { dg-final { scan-assembler "vld1\.64\[ 	\]+((\\\{\[dD\]\[0-9\]+\\\})|(\[dD\]\[0-9\]+)), \\\[\[rR\]\[0-9\]+\(:\[0-9\]+\)?\\\]!?\(\[ 	\]+@\[a-zA-Z0-9 \]+\)?\n" } } */
+/* { dg-final { cleanup-saved-temps } } */

Follow-Ups:
- Re: [PATCH][ARM] Fix PR 55426
  - From: Kyrill Tkachov

Index Nav:	[Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav:	[Date Prev] [Date Next]	[Thread Prev] [Thread Next]