This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

ARM: Improve NEON vector creation


Right now, neon_expand_vector_init will always create vectors by using
the stack.  We have several other perfectly good instructions for
this...

This patch teaches it a couple of new tricks, some of which are
borrowed from GCC's Altivec support.

* If the vector is constant, use vmov, vdup, or vldr (constant pool).

* If the vector has all elements the same, use vdup.

* If the vector has all but one element constant, load the constant
vector and then use vec_set for the non-constant element.

This isn't the full range of possible improvements, but it handles enough
common cases to improve the generated code substantially.  The only
one I'm unsure about is the constant vdup case; we'll generate this
sometimes to initialize a vector to a repeating float:

  vldr s0, [pc, #offset]
  vdup.32 d0, d0[0]

  .word <FLOAT_CONSTANT>

This isn't obviously a win, even for -mvectorize-with-neon-quad.
Should I limit vdup to the non-constant case instead?  I had hoped to
avoid the constant pool entry by using movw / movt / vdup, but GCC
doesn't realize (or doesn't agree) that such a sequence is cheaper
than a constant pool load.

I adjusted the vdup pattern to support a source in a VFP register.
This required adjusting the generated NEON tests; the builtin for vdup
(core register) may now generate a vdup (scalar lane) instruction
instead, which I think is entirely reasonable.  ML is not a
comfortable tongue for me, but I think I got that part right.
ML review appreciated.  Doug, I copied you since IIRC you were trying
to do a related change several months ago; just FYI.

Tested on arm-none-eabi.  OK to commit?

-- 
Daniel Jacobowitz
CodeSourcery

2009-11-10  Daniel Jacobowitz  <dan@codesourcery.com>

	* config/arm/arm.c (neon_vdup_constant, neon_make_constant): New.
	(neon_expand_vector_init): Use them.  Also handle non-constant
	vectors with identical elements and vectors with only one
	non-constant element.
	(arm_print_operand): Handle 'y' modifier.
	* config/arm/arm-protos.h (neon_make_constant): Declare.
	* config/arm/neon.md (neon_vdup_n<mode>): Split into two
	patterns.  Use VX instead of VDQW for the first one.  Allow
	a VFP alternative and V32 modes for the second one.
	* config/arm/neon.ml (shape_elt): Add Alternatives.
	(ops): Use Alternatives for vdup lane instructions.
	* config/arm/neon-testgen.ml (analyze_shape): Handle Alternatives.
	* config/arm/vec-common.md (mov<mode>): Use neon_make_constant.

	* gcc.target/arm/neon: Regenerate generated tests.

---
 gcc/config/arm/arm-protos.h                    |    1 
 gcc/config/arm/arm.c                           |  190 +++++++++++++++++++++++--
 gcc/config/arm/neon-testgen.ml                 |    1 
 gcc/config/arm/neon.md                         |   19 ++
 gcc/config/arm/neon.ml                         |   21 ++
 gcc/config/arm/vec-common.md                   |    5 
 gcc/testsuite/gcc.target/arm/neon/vdupQ_nf32.c |    2 
 gcc/testsuite/gcc.target/arm/neon/vdupQ_np16.c |    2 
 gcc/testsuite/gcc.target/arm/neon/vdupQ_np8.c  |    2 
 gcc/testsuite/gcc.target/arm/neon/vdupQ_ns16.c |    2 
 gcc/testsuite/gcc.target/arm/neon/vdupQ_ns32.c |    2 
 gcc/testsuite/gcc.target/arm/neon/vdupQ_ns8.c  |    2 
 gcc/testsuite/gcc.target/arm/neon/vdupQ_nu16.c |    2 
 gcc/testsuite/gcc.target/arm/neon/vdupQ_nu32.c |    2 
 gcc/testsuite/gcc.target/arm/neon/vdupQ_nu8.c  |    2 
 gcc/testsuite/gcc.target/arm/neon/vdup_nf32.c  |    2 
 gcc/testsuite/gcc.target/arm/neon/vdup_np16.c  |    2 
 gcc/testsuite/gcc.target/arm/neon/vdup_np8.c   |    2 
 gcc/testsuite/gcc.target/arm/neon/vdup_ns16.c  |    2 
 gcc/testsuite/gcc.target/arm/neon/vdup_ns32.c  |    2 
 gcc/testsuite/gcc.target/arm/neon/vdup_ns8.c   |    2 
 gcc/testsuite/gcc.target/arm/neon/vdup_nu16.c  |    2 
 gcc/testsuite/gcc.target/arm/neon/vdup_nu32.c  |    2 
 gcc/testsuite/gcc.target/arm/neon/vdup_nu8.c   |    2 
 gcc/testsuite/gcc.target/arm/neon/vmovQ_nf32.c |    2 
 gcc/testsuite/gcc.target/arm/neon/vmovQ_np16.c |    2 
 gcc/testsuite/gcc.target/arm/neon/vmovQ_np8.c  |    2 
 gcc/testsuite/gcc.target/arm/neon/vmovQ_ns16.c |    2 
 gcc/testsuite/gcc.target/arm/neon/vmovQ_ns32.c |    2 
 gcc/testsuite/gcc.target/arm/neon/vmovQ_ns8.c  |    2 
 gcc/testsuite/gcc.target/arm/neon/vmovQ_nu16.c |    2 
 gcc/testsuite/gcc.target/arm/neon/vmovQ_nu32.c |    2 
 gcc/testsuite/gcc.target/arm/neon/vmovQ_nu8.c  |    2 
 gcc/testsuite/gcc.target/arm/neon/vmov_nf32.c  |    2 
 gcc/testsuite/gcc.target/arm/neon/vmov_np16.c  |    2 
 gcc/testsuite/gcc.target/arm/neon/vmov_np8.c   |    2 
 gcc/testsuite/gcc.target/arm/neon/vmov_ns16.c  |    2 
 gcc/testsuite/gcc.target/arm/neon/vmov_ns32.c  |    2 
 gcc/testsuite/gcc.target/arm/neon/vmov_ns8.c   |    2 
 gcc/testsuite/gcc.target/arm/neon/vmov_nu16.c  |    2 
 gcc/testsuite/gcc.target/arm/neon/vmov_nu32.c  |    2 
 gcc/testsuite/gcc.target/arm/neon/vmov_nu8.c   |    2 
 42 files changed, 256 insertions(+), 53 deletions(-)

Index: gcc/testsuite/gcc.target/arm/neon/vdup_ns8.c
===================================================================
--- gcc/testsuite/gcc.target/arm/neon/vdup_ns8.c.orig	2009-11-10 06:40:14.000000000 -0800
+++ gcc/testsuite/gcc.target/arm/neon/vdup_ns8.c	2009-11-10 06:47:06.000000000 -0800
@@ -15,5 +15,5 @@ void test_vdup_ns8 (void)
   out_int8x8_t = vdup_n_s8 (arg0_int8_t);
 }
 
-/* { dg-final { scan-assembler "vdup\.8\[ 	\]+\[dD\]\[0-9\]+, \[rR\]\[0-9\]+!?\(\[ 	\]+@\[a-zA-Z0-9 \]+\)?\n" } } */
+/* { dg-final { scan-assembler "vdup\.8\[ 	\]+\[dD\]\[0-9\]+, (\[rR\]\[0-9\]+|\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])!?\(\[ 	\]+@\[a-zA-Z0-9 \]+\)?\n" } } */
 /* { dg-final { cleanup-saved-temps } } */
Index: gcc/testsuite/gcc.target/arm/neon/vmov_ns8.c
===================================================================
--- gcc/testsuite/gcc.target/arm/neon/vmov_ns8.c.orig	2009-11-10 06:40:14.000000000 -0800
+++ gcc/testsuite/gcc.target/arm/neon/vmov_ns8.c	2009-11-10 06:47:06.000000000 -0800
@@ -15,5 +15,5 @@ void test_vmov_ns8 (void)
   out_int8x8_t = vmov_n_s8 (arg0_int8_t);
 }
 
-/* { dg-final { scan-assembler "vdup\.8\[ 	\]+\[dD\]\[0-9\]+, \[rR\]\[0-9\]+!?\(\[ 	\]+@\[a-zA-Z0-9 \]+\)?\n" } } */
+/* { dg-final { scan-assembler "vdup\.8\[ 	\]+\[dD\]\[0-9\]+, (\[rR\]\[0-9\]+|\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])!?\(\[ 	\]+@\[a-zA-Z0-9 \]+\)?\n" } } */
 /* { dg-final { cleanup-saved-temps } } */
Index: gcc/testsuite/gcc.target/arm/neon/vdup_np16.c
===================================================================
--- gcc/testsuite/gcc.target/arm/neon/vdup_np16.c.orig	2009-11-10 06:40:14.000000000 -0800
+++ gcc/testsuite/gcc.target/arm/neon/vdup_np16.c	2009-11-10 06:47:06.000000000 -0800
@@ -15,5 +15,5 @@ void test_vdup_np16 (void)
   out_poly16x4_t = vdup_n_p16 (arg0_poly16_t);
 }
 
-/* { dg-final { scan-assembler "vdup\.16\[ 	\]+\[dD\]\[0-9\]+, \[rR\]\[0-9\]+!?\(\[ 	\]+@\[a-zA-Z0-9 \]+\)?\n" } } */
+/* { dg-final { scan-assembler "vdup\.16\[ 	\]+\[dD\]\[0-9\]+, (\[rR\]\[0-9\]+|\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])!?\(\[ 	\]+@\[a-zA-Z0-9 \]+\)?\n" } } */
 /* { dg-final { cleanup-saved-temps } } */
Index: gcc/testsuite/gcc.target/arm/neon/vdup_nu8.c
===================================================================
--- gcc/testsuite/gcc.target/arm/neon/vdup_nu8.c.orig	2009-11-10 06:40:14.000000000 -0800
+++ gcc/testsuite/gcc.target/arm/neon/vdup_nu8.c	2009-11-10 06:47:06.000000000 -0800
@@ -15,5 +15,5 @@ void test_vdup_nu8 (void)
   out_uint8x8_t = vdup_n_u8 (arg0_uint8_t);
 }
 
-/* { dg-final { scan-assembler "vdup\.8\[ 	\]+\[dD\]\[0-9\]+, \[rR\]\[0-9\]+!?\(\[ 	\]+@\[a-zA-Z0-9 \]+\)?\n" } } */
+/* { dg-final { scan-assembler "vdup\.8\[ 	\]+\[dD\]\[0-9\]+, (\[rR\]\[0-9\]+|\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])!?\(\[ 	\]+@\[a-zA-Z0-9 \]+\)?\n" } } */
 /* { dg-final { cleanup-saved-temps } } */
Index: gcc/testsuite/gcc.target/arm/neon/vmov_nu8.c
===================================================================
--- gcc/testsuite/gcc.target/arm/neon/vmov_nu8.c.orig	2009-11-10 06:40:14.000000000 -0800
+++ gcc/testsuite/gcc.target/arm/neon/vmov_nu8.c	2009-11-10 06:47:06.000000000 -0800
@@ -15,5 +15,5 @@ void test_vmov_nu8 (void)
   out_uint8x8_t = vmov_n_u8 (arg0_uint8_t);
 }
 
-/* { dg-final { scan-assembler "vdup\.8\[ 	\]+\[dD\]\[0-9\]+, \[rR\]\[0-9\]+!?\(\[ 	\]+@\[a-zA-Z0-9 \]+\)?\n" } } */
+/* { dg-final { scan-assembler "vdup\.8\[ 	\]+\[dD\]\[0-9\]+, (\[rR\]\[0-9\]+|\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])!?\(\[ 	\]+@\[a-zA-Z0-9 \]+\)?\n" } } */
 /* { dg-final { cleanup-saved-temps } } */
Index: gcc/testsuite/gcc.target/arm/neon/vmovQ_np8.c
===================================================================
--- gcc/testsuite/gcc.target/arm/neon/vmovQ_np8.c.orig	2009-11-10 06:40:14.000000000 -0800
+++ gcc/testsuite/gcc.target/arm/neon/vmovQ_np8.c	2009-11-10 06:47:06.000000000 -0800
@@ -15,5 +15,5 @@ void test_vmovQ_np8 (void)
   out_poly8x16_t = vmovq_n_p8 (arg0_poly8_t);
 }
 
-/* { dg-final { scan-assembler "vdup\.8\[ 	\]+\[qQ\]\[0-9\]+, \[rR\]\[0-9\]+!?\(\[ 	\]+@\[a-zA-Z0-9 \]+\)?\n" } } */
+/* { dg-final { scan-assembler "vdup\.8\[ 	\]+\[qQ\]\[0-9\]+, (\[rR\]\[0-9\]+|\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])!?\(\[ 	\]+@\[a-zA-Z0-9 \]+\)?\n" } } */
 /* { dg-final { cleanup-saved-temps } } */
Index: gcc/testsuite/gcc.target/arm/neon/vdup_ns32.c
===================================================================
--- gcc/testsuite/gcc.target/arm/neon/vdup_ns32.c.orig	2009-11-10 06:40:14.000000000 -0800
+++ gcc/testsuite/gcc.target/arm/neon/vdup_ns32.c	2009-11-10 06:47:06.000000000 -0800
@@ -15,5 +15,5 @@ void test_vdup_ns32 (void)
   out_int32x2_t = vdup_n_s32 (arg0_int32_t);
 }
 
-/* { dg-final { scan-assembler "vdup\.32\[ 	\]+\[dD\]\[0-9\]+, \[rR\]\[0-9\]+!?\(\[ 	\]+@\[a-zA-Z0-9 \]+\)?\n" } } */
+/* { dg-final { scan-assembler "vdup\.32\[ 	\]+\[dD\]\[0-9\]+, (\[rR\]\[0-9\]+|\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])!?\(\[ 	\]+@\[a-zA-Z0-9 \]+\)?\n" } } */
 /* { dg-final { cleanup-saved-temps } } */
Index: gcc/testsuite/gcc.target/arm/neon/vmov_np16.c
===================================================================
--- gcc/testsuite/gcc.target/arm/neon/vmov_np16.c.orig	2009-11-10 06:40:14.000000000 -0800
+++ gcc/testsuite/gcc.target/arm/neon/vmov_np16.c	2009-11-10 06:47:06.000000000 -0800
@@ -15,5 +15,5 @@ void test_vmov_np16 (void)
   out_poly16x4_t = vmov_n_p16 (arg0_poly16_t);
 }
 
-/* { dg-final { scan-assembler "vdup\.16\[ 	\]+\[dD\]\[0-9\]+, \[rR\]\[0-9\]+!?\(\[ 	\]+@\[a-zA-Z0-9 \]+\)?\n" } } */
+/* { dg-final { scan-assembler "vdup\.16\[ 	\]+\[dD\]\[0-9\]+, (\[rR\]\[0-9\]+|\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])!?\(\[ 	\]+@\[a-zA-Z0-9 \]+\)?\n" } } */
 /* { dg-final { cleanup-saved-temps } } */
Index: gcc/testsuite/gcc.target/arm/neon/vdup_ns16.c
===================================================================
--- gcc/testsuite/gcc.target/arm/neon/vdup_ns16.c.orig	2009-11-10 06:40:14.000000000 -0800
+++ gcc/testsuite/gcc.target/arm/neon/vdup_ns16.c	2009-11-10 06:47:06.000000000 -0800
@@ -15,5 +15,5 @@ void test_vdup_ns16 (void)
   out_int16x4_t = vdup_n_s16 (arg0_int16_t);
 }
 
-/* { dg-final { scan-assembler "vdup\.16\[ 	\]+\[dD\]\[0-9\]+, \[rR\]\[0-9\]+!?\(\[ 	\]+@\[a-zA-Z0-9 \]+\)?\n" } } */
+/* { dg-final { scan-assembler "vdup\.16\[ 	\]+\[dD\]\[0-9\]+, (\[rR\]\[0-9\]+|\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])!?\(\[ 	\]+@\[a-zA-Z0-9 \]+\)?\n" } } */
 /* { dg-final { cleanup-saved-temps } } */
Index: gcc/testsuite/gcc.target/arm/neon/vmovQ_np16.c
===================================================================
--- gcc/testsuite/gcc.target/arm/neon/vmovQ_np16.c.orig	2009-11-10 06:40:14.000000000 -0800
+++ gcc/testsuite/gcc.target/arm/neon/vmovQ_np16.c	2009-11-10 06:47:06.000000000 -0800
@@ -15,5 +15,5 @@ void test_vmovQ_np16 (void)
   out_poly16x8_t = vmovq_n_p16 (arg0_poly16_t);
 }
 
-/* { dg-final { scan-assembler "vdup\.16\[ 	\]+\[qQ\]\[0-9\]+, \[rR\]\[0-9\]+!?\(\[ 	\]+@\[a-zA-Z0-9 \]+\)?\n" } } */
+/* { dg-final { scan-assembler "vdup\.16\[ 	\]+\[qQ\]\[0-9\]+, (\[rR\]\[0-9\]+|\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])!?\(\[ 	\]+@\[a-zA-Z0-9 \]+\)?\n" } } */
 /* { dg-final { cleanup-saved-temps } } */
Index: gcc/testsuite/gcc.target/arm/neon/vdup_nu32.c
===================================================================
--- gcc/testsuite/gcc.target/arm/neon/vdup_nu32.c.orig	2009-11-10 06:40:14.000000000 -0800
+++ gcc/testsuite/gcc.target/arm/neon/vdup_nu32.c	2009-11-10 06:47:06.000000000 -0800
@@ -15,5 +15,5 @@ void test_vdup_nu32 (void)
   out_uint32x2_t = vdup_n_u32 (arg0_uint32_t);
 }
 
-/* { dg-final { scan-assembler "vdup\.32\[ 	\]+\[dD\]\[0-9\]+, \[rR\]\[0-9\]+!?\(\[ 	\]+@\[a-zA-Z0-9 \]+\)?\n" } } */
+/* { dg-final { scan-assembler "vdup\.32\[ 	\]+\[dD\]\[0-9\]+, (\[rR\]\[0-9\]+|\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])!?\(\[ 	\]+@\[a-zA-Z0-9 \]+\)?\n" } } */
 /* { dg-final { cleanup-saved-temps } } */
Index: gcc/testsuite/gcc.target/arm/neon/vdup_np8.c
===================================================================
--- gcc/testsuite/gcc.target/arm/neon/vdup_np8.c.orig	2009-11-10 06:40:14.000000000 -0800
+++ gcc/testsuite/gcc.target/arm/neon/vdup_np8.c	2009-11-10 06:47:06.000000000 -0800
@@ -15,5 +15,5 @@ void test_vdup_np8 (void)
   out_poly8x8_t = vdup_n_p8 (arg0_poly8_t);
 }
 
-/* { dg-final { scan-assembler "vdup\.8\[ 	\]+\[dD\]\[0-9\]+, \[rR\]\[0-9\]+!?\(\[ 	\]+@\[a-zA-Z0-9 \]+\)?\n" } } */
+/* { dg-final { scan-assembler "vdup\.8\[ 	\]+\[dD\]\[0-9\]+, (\[rR\]\[0-9\]+|\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])!?\(\[ 	\]+@\[a-zA-Z0-9 \]+\)?\n" } } */
 /* { dg-final { cleanup-saved-temps } } */
Index: gcc/testsuite/gcc.target/arm/neon/vdup_nu16.c
===================================================================
--- gcc/testsuite/gcc.target/arm/neon/vdup_nu16.c.orig	2009-11-10 06:40:14.000000000 -0800
+++ gcc/testsuite/gcc.target/arm/neon/vdup_nu16.c	2009-11-10 06:47:06.000000000 -0800
@@ -15,5 +15,5 @@ void test_vdup_nu16 (void)
   out_uint16x4_t = vdup_n_u16 (arg0_uint16_t);
 }
 
-/* { dg-final { scan-assembler "vdup\.16\[ 	\]+\[dD\]\[0-9\]+, \[rR\]\[0-9\]+!?\(\[ 	\]+@\[a-zA-Z0-9 \]+\)?\n" } } */
+/* { dg-final { scan-assembler "vdup\.16\[ 	\]+\[dD\]\[0-9\]+, (\[rR\]\[0-9\]+|\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])!?\(\[ 	\]+@\[a-zA-Z0-9 \]+\)?\n" } } */
 /* { dg-final { cleanup-saved-temps } } */
Index: gcc/testsuite/gcc.target/arm/neon/vmov_ns32.c
===================================================================
--- gcc/testsuite/gcc.target/arm/neon/vmov_ns32.c.orig	2009-11-10 06:40:14.000000000 -0800
+++ gcc/testsuite/gcc.target/arm/neon/vmov_ns32.c	2009-11-10 06:47:06.000000000 -0800
@@ -15,5 +15,5 @@ void test_vmov_ns32 (void)
   out_int32x2_t = vmov_n_s32 (arg0_int32_t);
 }
 
-/* { dg-final { scan-assembler "vdup\.32\[ 	\]+\[dD\]\[0-9\]+, \[rR\]\[0-9\]+!?\(\[ 	\]+@\[a-zA-Z0-9 \]+\)?\n" } } */
+/* { dg-final { scan-assembler "vdup\.32\[ 	\]+\[dD\]\[0-9\]+, (\[rR\]\[0-9\]+|\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])!?\(\[ 	\]+@\[a-zA-Z0-9 \]+\)?\n" } } */
 /* { dg-final { cleanup-saved-temps } } */
Index: gcc/testsuite/gcc.target/arm/neon/vdup_nf32.c
===================================================================
--- gcc/testsuite/gcc.target/arm/neon/vdup_nf32.c.orig	2009-11-10 06:40:14.000000000 -0800
+++ gcc/testsuite/gcc.target/arm/neon/vdup_nf32.c	2009-11-10 06:47:06.000000000 -0800
@@ -15,5 +15,5 @@ void test_vdup_nf32 (void)
   out_float32x2_t = vdup_n_f32 (arg0_float32_t);
 }
 
-/* { dg-final { scan-assembler "vdup\.32\[ 	\]+\[dD\]\[0-9\]+, \[rR\]\[0-9\]+!?\(\[ 	\]+@\[a-zA-Z0-9 \]+\)?\n" } } */
+/* { dg-final { scan-assembler "vdup\.32\[ 	\]+\[dD\]\[0-9\]+, (\[rR\]\[0-9\]+|\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])!?\(\[ 	\]+@\[a-zA-Z0-9 \]+\)?\n" } } */
 /* { dg-final { cleanup-saved-temps } } */
Index: gcc/testsuite/gcc.target/arm/neon/vmov_np8.c
===================================================================
--- gcc/testsuite/gcc.target/arm/neon/vmov_np8.c.orig	2009-11-10 06:40:14.000000000 -0800
+++ gcc/testsuite/gcc.target/arm/neon/vmov_np8.c	2009-11-10 06:47:06.000000000 -0800
@@ -15,5 +15,5 @@ void test_vmov_np8 (void)
   out_poly8x8_t = vmov_n_p8 (arg0_poly8_t);
 }
 
-/* { dg-final { scan-assembler "vdup\.8\[ 	\]+\[dD\]\[0-9\]+, \[rR\]\[0-9\]+!?\(\[ 	\]+@\[a-zA-Z0-9 \]+\)?\n" } } */
+/* { dg-final { scan-assembler "vdup\.8\[ 	\]+\[dD\]\[0-9\]+, (\[rR\]\[0-9\]+|\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])!?\(\[ 	\]+@\[a-zA-Z0-9 \]+\)?\n" } } */
 /* { dg-final { cleanup-saved-temps } } */
Index: gcc/testsuite/gcc.target/arm/neon/vmovQ_ns32.c
===================================================================
--- gcc/testsuite/gcc.target/arm/neon/vmovQ_ns32.c.orig	2009-11-10 06:40:14.000000000 -0800
+++ gcc/testsuite/gcc.target/arm/neon/vmovQ_ns32.c	2009-11-10 06:47:06.000000000 -0800
@@ -15,5 +15,5 @@ void test_vmovQ_ns32 (void)
   out_int32x4_t = vmovq_n_s32 (arg0_int32_t);
 }
 
-/* { dg-final { scan-assembler "vdup\.32\[ 	\]+\[qQ\]\[0-9\]+, \[rR\]\[0-9\]+!?\(\[ 	\]+@\[a-zA-Z0-9 \]+\)?\n" } } */
+/* { dg-final { scan-assembler "vdup\.32\[ 	\]+\[qQ\]\[0-9\]+, (\[rR\]\[0-9\]+|\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])!?\(\[ 	\]+@\[a-zA-Z0-9 \]+\)?\n" } } */
 /* { dg-final { cleanup-saved-temps } } */
Index: gcc/testsuite/gcc.target/arm/neon/vmov_ns16.c
===================================================================
--- gcc/testsuite/gcc.target/arm/neon/vmov_ns16.c.orig	2009-11-10 06:40:14.000000000 -0800
+++ gcc/testsuite/gcc.target/arm/neon/vmov_ns16.c	2009-11-10 06:47:06.000000000 -0800
@@ -15,5 +15,5 @@ void test_vmov_ns16 (void)
   out_int16x4_t = vmov_n_s16 (arg0_int16_t);
 }
 
-/* { dg-final { scan-assembler "vdup\.16\[ 	\]+\[dD\]\[0-9\]+, \[rR\]\[0-9\]+!?\(\[ 	\]+@\[a-zA-Z0-9 \]+\)?\n" } } */
+/* { dg-final { scan-assembler "vdup\.16\[ 	\]+\[dD\]\[0-9\]+, (\[rR\]\[0-9\]+|\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])!?\(\[ 	\]+@\[a-zA-Z0-9 \]+\)?\n" } } */
 /* { dg-final { cleanup-saved-temps } } */
Index: gcc/testsuite/gcc.target/arm/neon/vdupQ_np16.c
===================================================================
--- gcc/testsuite/gcc.target/arm/neon/vdupQ_np16.c.orig	2009-11-10 06:40:14.000000000 -0800
+++ gcc/testsuite/gcc.target/arm/neon/vdupQ_np16.c	2009-11-10 06:47:06.000000000 -0800
@@ -15,5 +15,5 @@ void test_vdupQ_np16 (void)
   out_poly16x8_t = vdupq_n_p16 (arg0_poly16_t);
 }
 
-/* { dg-final { scan-assembler "vdup\.16\[ 	\]+\[qQ\]\[0-9\]+, \[rR\]\[0-9\]+!?\(\[ 	\]+@\[a-zA-Z0-9 \]+\)?\n" } } */
+/* { dg-final { scan-assembler "vdup\.16\[ 	\]+\[qQ\]\[0-9\]+, (\[rR\]\[0-9\]+|\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])!?\(\[ 	\]+@\[a-zA-Z0-9 \]+\)?\n" } } */
 /* { dg-final { cleanup-saved-temps } } */
Index: gcc/testsuite/gcc.target/arm/neon/vmovQ_ns16.c
===================================================================
--- gcc/testsuite/gcc.target/arm/neon/vmovQ_ns16.c.orig	2009-11-10 06:40:14.000000000 -0800
+++ gcc/testsuite/gcc.target/arm/neon/vmovQ_ns16.c	2009-11-10 06:47:06.000000000 -0800
@@ -15,5 +15,5 @@ void test_vmovQ_ns16 (void)
   out_int16x8_t = vmovq_n_s16 (arg0_int16_t);
 }
 
-/* { dg-final { scan-assembler "vdup\.16\[ 	\]+\[qQ\]\[0-9\]+, \[rR\]\[0-9\]+!?\(\[ 	\]+@\[a-zA-Z0-9 \]+\)?\n" } } */
+/* { dg-final { scan-assembler "vdup\.16\[ 	\]+\[qQ\]\[0-9\]+, (\[rR\]\[0-9\]+|\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])!?\(\[ 	\]+@\[a-zA-Z0-9 \]+\)?\n" } } */
 /* { dg-final { cleanup-saved-temps } } */
Index: gcc/testsuite/gcc.target/arm/neon/vmov_nu32.c
===================================================================
--- gcc/testsuite/gcc.target/arm/neon/vmov_nu32.c.orig	2009-11-10 06:40:14.000000000 -0800
+++ gcc/testsuite/gcc.target/arm/neon/vmov_nu32.c	2009-11-10 06:47:06.000000000 -0800
@@ -15,5 +15,5 @@ void test_vmov_nu32 (void)
   out_uint32x2_t = vmov_n_u32 (arg0_uint32_t);
 }
 
-/* { dg-final { scan-assembler "vdup\.32\[ 	\]+\[dD\]\[0-9\]+, \[rR\]\[0-9\]+!?\(\[ 	\]+@\[a-zA-Z0-9 \]+\)?\n" } } */
+/* { dg-final { scan-assembler "vdup\.32\[ 	\]+\[dD\]\[0-9\]+, (\[rR\]\[0-9\]+|\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])!?\(\[ 	\]+@\[a-zA-Z0-9 \]+\)?\n" } } */
 /* { dg-final { cleanup-saved-temps } } */
Index: gcc/testsuite/gcc.target/arm/neon/vmovQ_nu32.c
===================================================================
--- gcc/testsuite/gcc.target/arm/neon/vmovQ_nu32.c.orig	2009-11-10 06:40:14.000000000 -0800
+++ gcc/testsuite/gcc.target/arm/neon/vmovQ_nu32.c	2009-11-10 06:47:06.000000000 -0800
@@ -15,5 +15,5 @@ void test_vmovQ_nu32 (void)
   out_uint32x4_t = vmovq_n_u32 (arg0_uint32_t);
 }
 
-/* { dg-final { scan-assembler "vdup\.32\[ 	\]+\[qQ\]\[0-9\]+, \[rR\]\[0-9\]+!?\(\[ 	\]+@\[a-zA-Z0-9 \]+\)?\n" } } */
+/* { dg-final { scan-assembler "vdup\.32\[ 	\]+\[qQ\]\[0-9\]+, (\[rR\]\[0-9\]+|\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])!?\(\[ 	\]+@\[a-zA-Z0-9 \]+\)?\n" } } */
 /* { dg-final { cleanup-saved-temps } } */
Index: gcc/testsuite/gcc.target/arm/neon/vmov_nu16.c
===================================================================
--- gcc/testsuite/gcc.target/arm/neon/vmov_nu16.c.orig	2009-11-10 06:40:14.000000000 -0800
+++ gcc/testsuite/gcc.target/arm/neon/vmov_nu16.c	2009-11-10 06:47:06.000000000 -0800
@@ -15,5 +15,5 @@ void test_vmov_nu16 (void)
   out_uint16x4_t = vmov_n_u16 (arg0_uint16_t);
 }
 
-/* { dg-final { scan-assembler "vdup\.16\[ 	\]+\[dD\]\[0-9\]+, \[rR\]\[0-9\]+!?\(\[ 	\]+@\[a-zA-Z0-9 \]+\)?\n" } } */
+/* { dg-final { scan-assembler "vdup\.16\[ 	\]+\[dD\]\[0-9\]+, (\[rR\]\[0-9\]+|\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])!?\(\[ 	\]+@\[a-zA-Z0-9 \]+\)?\n" } } */
 /* { dg-final { cleanup-saved-temps } } */
Index: gcc/testsuite/gcc.target/arm/neon/vmov_nf32.c
===================================================================
--- gcc/testsuite/gcc.target/arm/neon/vmov_nf32.c.orig	2009-11-10 06:40:14.000000000 -0800
+++ gcc/testsuite/gcc.target/arm/neon/vmov_nf32.c	2009-11-10 06:47:06.000000000 -0800
@@ -15,5 +15,5 @@ void test_vmov_nf32 (void)
   out_float32x2_t = vmov_n_f32 (arg0_float32_t);
 }
 
-/* { dg-final { scan-assembler "vdup\.32\[ 	\]+\[dD\]\[0-9\]+, \[rR\]\[0-9\]+!?\(\[ 	\]+@\[a-zA-Z0-9 \]+\)?\n" } } */
+/* { dg-final { scan-assembler "vdup\.32\[ 	\]+\[dD\]\[0-9\]+, (\[rR\]\[0-9\]+|\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])!?\(\[ 	\]+@\[a-zA-Z0-9 \]+\)?\n" } } */
 /* { dg-final { cleanup-saved-temps } } */
Index: gcc/testsuite/gcc.target/arm/neon/vmovQ_nu16.c
===================================================================
--- gcc/testsuite/gcc.target/arm/neon/vmovQ_nu16.c.orig	2009-11-10 06:40:14.000000000 -0800
+++ gcc/testsuite/gcc.target/arm/neon/vmovQ_nu16.c	2009-11-10 06:47:06.000000000 -0800
@@ -15,5 +15,5 @@ void test_vmovQ_nu16 (void)
   out_uint16x8_t = vmovq_n_u16 (arg0_uint16_t);
 }
 
-/* { dg-final { scan-assembler "vdup\.16\[ 	\]+\[qQ\]\[0-9\]+, \[rR\]\[0-9\]+!?\(\[ 	\]+@\[a-zA-Z0-9 \]+\)?\n" } } */
+/* { dg-final { scan-assembler "vdup\.16\[ 	\]+\[qQ\]\[0-9\]+, (\[rR\]\[0-9\]+|\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])!?\(\[ 	\]+@\[a-zA-Z0-9 \]+\)?\n" } } */
 /* { dg-final { cleanup-saved-temps } } */
Index: gcc/testsuite/gcc.target/arm/neon/vmovQ_nf32.c
===================================================================
--- gcc/testsuite/gcc.target/arm/neon/vmovQ_nf32.c.orig	2009-11-10 06:40:14.000000000 -0800
+++ gcc/testsuite/gcc.target/arm/neon/vmovQ_nf32.c	2009-11-10 06:47:06.000000000 -0800
@@ -15,5 +15,5 @@ void test_vmovQ_nf32 (void)
   out_float32x4_t = vmovq_n_f32 (arg0_float32_t);
 }
 
-/* { dg-final { scan-assembler "vdup\.32\[ 	\]+\[qQ\]\[0-9\]+, \[rR\]\[0-9\]+!?\(\[ 	\]+@\[a-zA-Z0-9 \]+\)?\n" } } */
+/* { dg-final { scan-assembler "vdup\.32\[ 	\]+\[qQ\]\[0-9\]+, (\[rR\]\[0-9\]+|\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])!?\(\[ 	\]+@\[a-zA-Z0-9 \]+\)?\n" } } */
 /* { dg-final { cleanup-saved-temps } } */
Index: gcc/testsuite/gcc.target/arm/neon/vdupQ_ns32.c
===================================================================
--- gcc/testsuite/gcc.target/arm/neon/vdupQ_ns32.c.orig	2009-11-10 06:40:14.000000000 -0800
+++ gcc/testsuite/gcc.target/arm/neon/vdupQ_ns32.c	2009-11-10 06:47:06.000000000 -0800
@@ -15,5 +15,5 @@ void test_vdupQ_ns32 (void)
   out_int32x4_t = vdupq_n_s32 (arg0_int32_t);
 }
 
-/* { dg-final { scan-assembler "vdup\.32\[ 	\]+\[qQ\]\[0-9\]+, \[rR\]\[0-9\]+!?\(\[ 	\]+@\[a-zA-Z0-9 \]+\)?\n" } } */
+/* { dg-final { scan-assembler "vdup\.32\[ 	\]+\[qQ\]\[0-9\]+, (\[rR\]\[0-9\]+|\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])!?\(\[ 	\]+@\[a-zA-Z0-9 \]+\)?\n" } } */
 /* { dg-final { cleanup-saved-temps } } */
Index: gcc/testsuite/gcc.target/arm/neon/vdupQ_ns8.c
===================================================================
--- gcc/testsuite/gcc.target/arm/neon/vdupQ_ns8.c.orig	2009-11-10 06:40:14.000000000 -0800
+++ gcc/testsuite/gcc.target/arm/neon/vdupQ_ns8.c	2009-11-10 06:47:06.000000000 -0800
@@ -15,5 +15,5 @@ void test_vdupQ_ns8 (void)
   out_int8x16_t = vdupq_n_s8 (arg0_int8_t);
 }
 
-/* { dg-final { scan-assembler "vdup\.8\[ 	\]+\[qQ\]\[0-9\]+, \[rR\]\[0-9\]+!?\(\[ 	\]+@\[a-zA-Z0-9 \]+\)?\n" } } */
+/* { dg-final { scan-assembler "vdup\.8\[ 	\]+\[qQ\]\[0-9\]+, (\[rR\]\[0-9\]+|\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])!?\(\[ 	\]+@\[a-zA-Z0-9 \]+\)?\n" } } */
 /* { dg-final { cleanup-saved-temps } } */
Index: gcc/testsuite/gcc.target/arm/neon/vdupQ_ns16.c
===================================================================
--- gcc/testsuite/gcc.target/arm/neon/vdupQ_ns16.c.orig	2009-11-10 06:40:14.000000000 -0800
+++ gcc/testsuite/gcc.target/arm/neon/vdupQ_ns16.c	2009-11-10 06:47:06.000000000 -0800
@@ -15,5 +15,5 @@ void test_vdupQ_ns16 (void)
   out_int16x8_t = vdupq_n_s16 (arg0_int16_t);
 }
 
-/* { dg-final { scan-assembler "vdup\.16\[ 	\]+\[qQ\]\[0-9\]+, \[rR\]\[0-9\]+!?\(\[ 	\]+@\[a-zA-Z0-9 \]+\)?\n" } } */
+/* { dg-final { scan-assembler "vdup\.16\[ 	\]+\[qQ\]\[0-9\]+, (\[rR\]\[0-9\]+|\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])!?\(\[ 	\]+@\[a-zA-Z0-9 \]+\)?\n" } } */
 /* { dg-final { cleanup-saved-temps } } */
Index: gcc/testsuite/gcc.target/arm/neon/vdupQ_nu8.c
===================================================================
--- gcc/testsuite/gcc.target/arm/neon/vdupQ_nu8.c.orig	2009-11-10 06:40:14.000000000 -0800
+++ gcc/testsuite/gcc.target/arm/neon/vdupQ_nu8.c	2009-11-10 06:47:06.000000000 -0800
@@ -15,5 +15,5 @@ void test_vdupQ_nu8 (void)
   out_uint8x16_t = vdupq_n_u8 (arg0_uint8_t);
 }
 
-/* { dg-final { scan-assembler "vdup\.8\[ 	\]+\[qQ\]\[0-9\]+, \[rR\]\[0-9\]+!?\(\[ 	\]+@\[a-zA-Z0-9 \]+\)?\n" } } */
+/* { dg-final { scan-assembler "vdup\.8\[ 	\]+\[qQ\]\[0-9\]+, (\[rR\]\[0-9\]+|\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])!?\(\[ 	\]+@\[a-zA-Z0-9 \]+\)?\n" } } */
 /* { dg-final { cleanup-saved-temps } } */
Index: gcc/testsuite/gcc.target/arm/neon/vdupQ_nu32.c
===================================================================
--- gcc/testsuite/gcc.target/arm/neon/vdupQ_nu32.c.orig	2009-11-10 06:40:14.000000000 -0800
+++ gcc/testsuite/gcc.target/arm/neon/vdupQ_nu32.c	2009-11-10 06:47:06.000000000 -0800
@@ -15,5 +15,5 @@ void test_vdupQ_nu32 (void)
   out_uint32x4_t = vdupq_n_u32 (arg0_uint32_t);
 }
 
-/* { dg-final { scan-assembler "vdup\.32\[ 	\]+\[qQ\]\[0-9\]+, \[rR\]\[0-9\]+!?\(\[ 	\]+@\[a-zA-Z0-9 \]+\)?\n" } } */
+/* { dg-final { scan-assembler "vdup\.32\[ 	\]+\[qQ\]\[0-9\]+, (\[rR\]\[0-9\]+|\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])!?\(\[ 	\]+@\[a-zA-Z0-9 \]+\)?\n" } } */
 /* { dg-final { cleanup-saved-temps } } */
Index: gcc/testsuite/gcc.target/arm/neon/vdupQ_nu16.c
===================================================================
--- gcc/testsuite/gcc.target/arm/neon/vdupQ_nu16.c.orig	2009-11-10 06:40:14.000000000 -0800
+++ gcc/testsuite/gcc.target/arm/neon/vdupQ_nu16.c	2009-11-10 06:47:06.000000000 -0800
@@ -15,5 +15,5 @@ void test_vdupQ_nu16 (void)
   out_uint16x8_t = vdupq_n_u16 (arg0_uint16_t);
 }
 
-/* { dg-final { scan-assembler "vdup\.16\[ 	\]+\[qQ\]\[0-9\]+, \[rR\]\[0-9\]+!?\(\[ 	\]+@\[a-zA-Z0-9 \]+\)?\n" } } */
+/* { dg-final { scan-assembler "vdup\.16\[ 	\]+\[qQ\]\[0-9\]+, (\[rR\]\[0-9\]+|\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])!?\(\[ 	\]+@\[a-zA-Z0-9 \]+\)?\n" } } */
 /* { dg-final { cleanup-saved-temps } } */
Index: gcc/testsuite/gcc.target/arm/neon/vdupQ_nf32.c
===================================================================
--- gcc/testsuite/gcc.target/arm/neon/vdupQ_nf32.c.orig	2009-11-10 06:40:14.000000000 -0800
+++ gcc/testsuite/gcc.target/arm/neon/vdupQ_nf32.c	2009-11-10 06:47:06.000000000 -0800
@@ -15,5 +15,5 @@ void test_vdupQ_nf32 (void)
   out_float32x4_t = vdupq_n_f32 (arg0_float32_t);
 }
 
-/* { dg-final { scan-assembler "vdup\.32\[ 	\]+\[qQ\]\[0-9\]+, \[rR\]\[0-9\]+!?\(\[ 	\]+@\[a-zA-Z0-9 \]+\)?\n" } } */
+/* { dg-final { scan-assembler "vdup\.32\[ 	\]+\[qQ\]\[0-9\]+, (\[rR\]\[0-9\]+|\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])!?\(\[ 	\]+@\[a-zA-Z0-9 \]+\)?\n" } } */
 /* { dg-final { cleanup-saved-temps } } */
Index: gcc/testsuite/gcc.target/arm/neon/vmovQ_ns8.c
===================================================================
--- gcc/testsuite/gcc.target/arm/neon/vmovQ_ns8.c.orig	2009-11-10 06:40:14.000000000 -0800
+++ gcc/testsuite/gcc.target/arm/neon/vmovQ_ns8.c	2009-11-10 06:47:06.000000000 -0800
@@ -15,5 +15,5 @@ void test_vmovQ_ns8 (void)
   out_int8x16_t = vmovq_n_s8 (arg0_int8_t);
 }
 
-/* { dg-final { scan-assembler "vdup\.8\[ 	\]+\[qQ\]\[0-9\]+, \[rR\]\[0-9\]+!?\(\[ 	\]+@\[a-zA-Z0-9 \]+\)?\n" } } */
+/* { dg-final { scan-assembler "vdup\.8\[ 	\]+\[qQ\]\[0-9\]+, (\[rR\]\[0-9\]+|\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])!?\(\[ 	\]+@\[a-zA-Z0-9 \]+\)?\n" } } */
 /* { dg-final { cleanup-saved-temps } } */
Index: gcc/testsuite/gcc.target/arm/neon/vmovQ_nu8.c
===================================================================
--- gcc/testsuite/gcc.target/arm/neon/vmovQ_nu8.c.orig	2009-11-10 06:40:14.000000000 -0800
+++ gcc/testsuite/gcc.target/arm/neon/vmovQ_nu8.c	2009-11-10 06:47:06.000000000 -0800
@@ -15,5 +15,5 @@ void test_vmovQ_nu8 (void)
   out_uint8x16_t = vmovq_n_u8 (arg0_uint8_t);
 }
 
-/* { dg-final { scan-assembler "vdup\.8\[ 	\]+\[qQ\]\[0-9\]+, \[rR\]\[0-9\]+!?\(\[ 	\]+@\[a-zA-Z0-9 \]+\)?\n" } } */
+/* { dg-final { scan-assembler "vdup\.8\[ 	\]+\[qQ\]\[0-9\]+, (\[rR\]\[0-9\]+|\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])!?\(\[ 	\]+@\[a-zA-Z0-9 \]+\)?\n" } } */
 /* { dg-final { cleanup-saved-temps } } */
Index: gcc/testsuite/gcc.target/arm/neon/vdupQ_np8.c
===================================================================
--- gcc/testsuite/gcc.target/arm/neon/vdupQ_np8.c.orig	2009-11-10 06:40:14.000000000 -0800
+++ gcc/testsuite/gcc.target/arm/neon/vdupQ_np8.c	2009-11-10 06:47:06.000000000 -0800
@@ -15,5 +15,5 @@ void test_vdupQ_np8 (void)
   out_poly8x16_t = vdupq_n_p8 (arg0_poly8_t);
 }
 
-/* { dg-final { scan-assembler "vdup\.8\[ 	\]+\[qQ\]\[0-9\]+, \[rR\]\[0-9\]+!?\(\[ 	\]+@\[a-zA-Z0-9 \]+\)?\n" } } */
+/* { dg-final { scan-assembler "vdup\.8\[ 	\]+\[qQ\]\[0-9\]+, (\[rR\]\[0-9\]+|\[dD\]\[0-9\]+\\\[\[0-9\]+\\\])!?\(\[ 	\]+@\[a-zA-Z0-9 \]+\)?\n" } } */
 /* { dg-final { cleanup-saved-temps } } */
Index: gcc/config/arm/arm.c
===================================================================
--- gcc/config/arm/arm.c.orig	2009-11-10 06:46:28.000000000 -0800
+++ gcc/config/arm/arm.c	2009-11-10 06:47:13.000000000 -0800
@@ -8085,25 +8085,171 @@ neon_pairwise_reduce (rtx op0, rtx op1, 
     }
 }
 
-/* Initialize a vector with non-constant elements.  FIXME: We can do better
-   than the current implementation (building a vector on the stack and then
-   loading it) in many cases.  See rs6000.c.  */
+/* If VALS is a vector constant that can be loaded into a register
+   using VDUP, generate instructions to do so and return an RTX to
+   assign to the register.  Otherwise return NULL_RTX.  */
+
+static rtx
+neon_vdup_constant (rtx vals)
+{
+  enum machine_mode mode = GET_MODE (vals);
+  enum machine_mode inner_mode = GET_MODE_INNER (mode);
+  int n_elts = GET_MODE_NUNITS (mode);
+  bool all_same = true;
+  rtx x;
+  int i;
+
+  if (GET_CODE (vals) != CONST_VECTOR || GET_MODE_SIZE (inner_mode) > 4)
+    return NULL_RTX;
+
+  for (i = 0; i < n_elts; ++i)
+    {
+      x = XVECEXP (vals, 0, i);
+      if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
+	all_same = false;
+    }
+
+  if (!all_same)
+    /* The elements are not all the same.  We could handle repeating
+       patterns of a mode larger than INNER_MODE here (e.g. int8x8_t
+       {0, C, 0, C, 0, C, 0, C} which can be loaded using
+       vdup.i16).  */
+    return NULL_RTX;
+
+  /* We can load this constant by using VDUP and a constant in a
+     single ARM register.  This will be cheaper than a vector
+     load.  */
+
+  x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, 0));
+  return gen_rtx_UNSPEC (mode, gen_rtvec (1, x),
+			 UNSPEC_VDUP_N);
+}
+
+/* Generate code to load VALS, which is a PARALLEL containing only
+   constants (for vec_init) or CONST_VECTOR, efficiently into a
+   register.  Returns an RTX to copy into the register, or NULL_RTX
+   for a PARALLEL that cannot be converted into a CONST_VECTOR.  */
+
+rtx
+neon_make_constant (rtx vals)
+{
+  enum machine_mode mode = GET_MODE (vals);
+  rtx target;
+  rtx const_vec = NULL_RTX;
+  int n_elts = GET_MODE_NUNITS (mode);
+  int n_const = 0;
+  int i;
+
+  if (GET_CODE (vals) == CONST_VECTOR)
+    const_vec = vals;
+  else if (GET_CODE (vals) == PARALLEL)
+    {
+      /* A CONST_VECTOR must contain only CONST_INTs and
+	 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
+	 Only store valid constants in a CONST_VECTOR.  */
+      for (i = 0; i < n_elts; ++i)
+	{
+	  rtx x = XVECEXP (vals, 0, i);
+	  if (GET_CODE (x) == CONST_INT || GET_CODE (x) == CONST_DOUBLE)
+	    n_const++;
+	}
+      if (n_const == n_elts)
+	const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
+    }
+  else
+    gcc_unreachable ();
+
+  if (const_vec != NULL
+      && neon_immediate_valid_for_move (const_vec, mode, NULL, NULL))
+    /* Load using VMOV.  On Cortex-A8 this takes one cycle.  */
+    return const_vec;
+  else if ((target = neon_vdup_constant (vals)) != NULL_RTX)
+    /* Load using VDUP.  On Cortex-A8 the VDUP takes one NEON
+       pipeline cycle; creating the constant takes one or two ARM
+       pipeline cycles.  */
+    return target;
+  else if (const_vec != NULL_RTX)
+    /* Load from constant pool.  On Cortex-A8 this takes two cycles
+       (for either double or quad vectors).  We cannot take advantage
+       of single-cycle VLD1 because we need a PC-relative addressing
+       mode.  */
+    return const_vec;
+  else
+    /* A PARALLEL containing something not valid inside CONST_VECTOR.
+       We cannot construct an initializer.  */
+    return NULL_RTX;
+}
+
+/* Initialize vector TARGET to VALS.  */
 
 void
 neon_expand_vector_init (rtx target, rtx vals)
 {
   enum machine_mode mode = GET_MODE (target);
-  enum machine_mode inner = GET_MODE_INNER (mode);
-  unsigned int i, n_elts = GET_MODE_NUNITS (mode);
-  rtx mem;
+  enum machine_mode inner_mode = GET_MODE_INNER (mode);
+  int n_elts = GET_MODE_NUNITS (mode);
+  int n_var = 0, one_var = -1;
+  bool all_same = true;
+  rtx x, mem;
+  int i;
 
-  gcc_assert (VECTOR_MODE_P (mode));
+  for (i = 0; i < n_elts; ++i)
+    {
+      x = XVECEXP (vals, 0, i);
+      if (!CONSTANT_P (x))
+	++n_var, one_var = i;
+
+      if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
+	all_same = false;
+    }
 
+  if (n_var == 0)
+    {
+      rtx constant = neon_make_constant (vals);
+      if (constant != NULL_RTX)
+	{
+	  emit_move_insn (target, constant);
+	  return;
+	}
+    }
+
+  /* Splat a single non-constant element if we can.  */
+  if (all_same && GET_MODE_SIZE (inner_mode) <= 4)
+    {
+      x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, 0));
+      emit_insn (gen_rtx_SET (VOIDmode, target,
+			      gen_rtx_UNSPEC (mode, gen_rtvec (1, x),
+					      UNSPEC_VDUP_N)));
+      return;
+    }
+
+  /* One field is non-constant.  Load constant then overwrite varying
+     field.  This is more efficient than using the stack.  */
+  if (n_var == 1)
+    {
+      rtx copy = copy_rtx (vals);
+      rtvec ops;
+
+      /* Load constant part of vector, substitute neighboring value for
+	 varying element.  */
+      XVECEXP (copy, 0, one_var) = XVECEXP (vals, 0, (one_var + 1) % n_elts);
+      neon_expand_vector_init (target, copy);
+
+      /* Insert variable.  */
+      x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, one_var));
+      ops = gen_rtvec (3, x, target, GEN_INT (one_var));
+      emit_insn (gen_rtx_SET (VOIDmode, target,
+			      gen_rtx_UNSPEC (mode, ops, UNSPEC_VSET_LANE)));
+      return;
+    }
+
+  /* Construct the vector in memory one field at a time
+     and load the whole vector.  */
   mem = assign_stack_temp (mode, GET_MODE_SIZE (mode), 0);
   for (i = 0; i < n_elts; i++)
-    emit_move_insn (adjust_address_nv (mem, inner, i * GET_MODE_SIZE (inner)),
-                   XVECEXP (vals, 0, i));
-
+    emit_move_insn (adjust_address_nv (mem, inner_mode,
+				    i * GET_MODE_SIZE (inner_mode)),
+		    XVECEXP (vals, 0, i));
   emit_move_insn (target, mem);
 }
 
@@ -15253,6 +15399,30 @@ arm_print_operand (FILE *stream, rtx x, 
       }
       return;
 
+    /* Translate an S register number into a D register number and element index.  */
+    case 'y':
+      {
+        int mode = GET_MODE (x);
+        int regno;
+
+        if (GET_MODE_SIZE (mode) != 4 || GET_CODE (x) != REG)
+          {
+	    output_operand_lossage ("invalid operand for code '%c'", code);
+	    return;
+          }
+
+        regno = REGNO (x);
+        if (!VFP_REGNO_OK_FOR_SINGLE (regno))
+          {
+	    output_operand_lossage ("invalid operand for code '%c'", code);
+	    return;
+          }
+
+	regno = regno - FIRST_VFP_REGNUM;
+	fprintf (stream, "d%d[%d]", regno / 2, regno % 2);
+      }
+      return;
+
     /* Register specifier for vld1.16/vst1.16.  Translate the S register
        number into a D register number and element index.  */
     case 'z':
Index: gcc/config/arm/arm-protos.h
===================================================================
--- gcc/config/arm/arm-protos.h.orig	2009-11-10 06:40:14.000000000 -0800
+++ gcc/config/arm/arm-protos.h	2009-11-10 06:47:06.000000000 -0800
@@ -68,6 +68,7 @@ extern char *neon_output_logic_immediate
 					  enum machine_mode, int, int);
 extern void neon_pairwise_reduce (rtx, rtx, enum machine_mode,
 				  rtx (*) (rtx, rtx, rtx));
+extern rtx neon_make_constant (rtx);
 extern void neon_expand_vector_init (rtx, rtx);
 extern void neon_lane_bounds (rtx, HOST_WIDE_INT, HOST_WIDE_INT);
 extern void neon_const_bounds (rtx, HOST_WIDE_INT, HOST_WIDE_INT);
Index: gcc/config/arm/neon.md
===================================================================
--- gcc/config/arm/neon.md.orig	2009-11-10 06:46:42.000000000 -0800
+++ gcc/config/arm/neon.md	2009-11-10 06:47:06.000000000 -0800
@@ -2687,9 +2687,9 @@
 })
 
 (define_insn "neon_vdup_n<mode>"
-  [(set (match_operand:VDQW 0 "s_register_operand" "=w")
-	(unspec:VDQW [(match_operand:<V_elem> 1 "s_register_operand" "r")]
-                    UNSPEC_VDUP_N))]
+  [(set (match_operand:VX 0 "s_register_operand" "=w")
+	(unspec:VX [(match_operand:<V_elem> 1 "s_register_operand" "r")]
+		   UNSPEC_VDUP_N))]
   "TARGET_NEON"
   "vdup%?.<V_sz_elem>\t%<V_reg>0, %1"
   ;; Assume this schedules like vmov.
@@ -2697,6 +2697,19 @@
    (set_attr "neon_type" "neon_bp_simple")]
 )
 
+(define_insn "neon_vdup_n<mode>"
+  [(set (match_operand:V32 0 "s_register_operand" "=w,w")
+	(unspec:V32 [(match_operand:<V_elem> 1 "s_register_operand" "r,t")]
+		    UNSPEC_VDUP_N))]
+  "TARGET_NEON"
+  "@
+  vdup%?.<V_sz_elem>\t%<V_reg>0, %1
+  vdup%?.<V_sz_elem>\t%<V_reg>0, %y1"
+  ;; Assume this schedules like vmov.
+  [(set_attr "predicable" "yes")
+   (set_attr "neon_type" "neon_bp_simple")]
+)
+
 (define_insn "neon_vdup_ndi"
   [(set (match_operand:DI 0 "s_register_operand" "=w")
 	(unspec:DI [(match_operand:DI 1 "s_register_operand" "r")]
Index: gcc/config/arm/neon.ml
===================================================================
--- gcc/config/arm/neon.ml.orig	2009-11-10 06:40:14.000000000 -0800
+++ gcc/config/arm/neon.ml	2009-11-10 06:47:06.000000000 -0800
@@ -68,6 +68,7 @@ type shape_elt = Dreg | Qreg | Corereg |
 	       | Element_of_dreg	(* Used for "lane" variants.  *)
 	       | Element_of_qreg	(* Likewise.  *)
 	       | All_elements_of_dreg	(* Used for "dup" variants.  *)
+	       | Alternatives of shape_elt list (* Used for multiple valid operands.  *)
 
 type shape_form = All of int * shape_elt
                 | Long
@@ -1008,7 +1009,10 @@ let ops =
       pf_su_8_64;
 
     (* Set all lanes to the same value.  *)
-    Vdup_n, [],
+    Vdup_n,
+      [Disassembles_as [Use_operands [| Dreg;
+                                        Alternatives [ Corereg;
+                                                       Element_of_dreg ] |]]],
       Use_operands [| Dreg; Corereg |], "vdup_n", bits_1,
       pf_su_8_32;
     Vdup_n,
@@ -1016,7 +1020,10 @@ let ops =
        Disassembles_as [Use_operands [| Dreg; Corereg; Corereg |]]],
       Use_operands [| Dreg; Corereg |], "vdup_n", notype_1,
       [S64; U64];
-    Vdup_n, [],
+    Vdup_n,
+      [Disassembles_as [Use_operands [| Qreg;
+                                        Alternatives [ Corereg;
+                                                       Element_of_dreg ] |]]],
       Use_operands [| Qreg; Corereg |], "vdupQ_n", bits_1,
       pf_su_8_32;
     Vdup_n,
@@ -1028,7 +1035,10 @@ let ops =
 
     (* These are just aliases for the above.  *)
     Vmov_n,
-      [Builtin_name "vdup_n"],
+      [Builtin_name "vdup_n";
+       Disassembles_as [Use_operands [| Dreg;
+                                        Alternatives [ Corereg;
+                                                       Element_of_dreg ] |]]],
       Use_operands [| Dreg; Corereg |],
       "vmov_n", bits_1, pf_su_8_32;
     Vmov_n,
@@ -1038,7 +1048,10 @@ let ops =
       Use_operands [| Dreg; Corereg |],
       "vmov_n", notype_1, [S64; U64];
     Vmov_n,
-      [Builtin_name "vdupQ_n"],
+      [Builtin_name "vdupQ_n";
+       Disassembles_as [Use_operands [| Qreg;
+                                        Alternatives [ Corereg;
+                                                       Element_of_dreg ] |]]],
       Use_operands [| Qreg; Corereg |],
       "vmovQ_n", bits_1, pf_su_8_32;
     Vmov_n,
Index: gcc/config/arm/neon-testgen.ml
===================================================================
--- gcc/config/arm/neon-testgen.ml.orig	2009-11-10 06:40:14.000000000 -0800
+++ gcc/config/arm/neon-testgen.ml	2009-11-10 06:47:06.000000000 -0800
@@ -175,6 +175,7 @@ let rec analyze_shape shape =
     | Element_of_dreg -> (analyze_shape_elt Dreg) ^ "\\\\\\[\\[0-9\\]+\\\\\\]"
     | Element_of_qreg -> (analyze_shape_elt Qreg) ^ "\\\\\\[\\[0-9\\]+\\\\\\]"
     | All_elements_of_dreg -> (analyze_shape_elt Dreg) ^ "\\\\\\[\\\\\\]"
+    | Alternatives (elts) -> "(" ^ (String.concat "|" (List.map analyze_shape_elt elts)) ^ ")"
   in
     match shape with
       All (n, elt) -> commas analyze_shape_elt (n_things n elt) ""
Index: gcc/config/arm/vec-common.md
===================================================================
--- gcc/config/arm/vec-common.md.orig	2009-11-10 06:46:42.000000000 -0800
+++ gcc/config/arm/vec-common.md	2009-11-10 06:47:46.000000000 -0800
@@ -42,6 +42,11 @@
     {
       if (GET_CODE (operands[0]) != REG)
 	operands[1] = force_reg (<MODE>mode, operands[1]);
+      else if (TARGET_NEON && CONSTANT_P (operands[1]))
+	{
+	  operands[1] = neon_make_constant (operands[1]);
+	  gcc_assert (operands[1] != NULL_RTX);
+	}
     }
 })
 


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]