v4sf vector reductions for i386

Sat Jun 18 22:48:00 GMT 2005

The expr.c change elides an extra move that has crept in since we
changed clobbered values to get new registers in reload.

I've only done V4SF because it looks like V2DF would be most
efficient if it were expanded to scalar operations, and the integer
vector modes will want to use the shifting patterns that Dorit is
still working on.

But SSE1 doesn't have shifting patterns, and even if we are actually
using SSE2, there's a re-formatting penalty associated with using the
integer shifts on float data, so we'd like to use the insns
specifically designated for floats.

Tested by hand on a version of the vect-reduc-1.c test case, modified
to work on floats, which I accidentally deleted instead of checking
in.  Oh well, maybe later...


r~


        * expr.c (store_constructor): Use store of 0 to indicate value
        death instead of a clobber.

        * config/i386/i386.c (ix86_expand_reduc_v4sf): New.
        * config/i386/i386-protos.h (ix86_expand_reduc_v4sf): Declare.
        * config/i386/sse.md (reduc_plus_v4sf): New.
        (reduc_smax_v4sf, reduc_smin_v4sf): New.

Index: expr.c
===================================================================
RCS file: /cvs/gcc/gcc/gcc/expr.c,v
retrieving revision 1.795
diff -u -p -d -r1.795 expr.c

--- expr.c	18 Jun 2005 13:18:36 -0000	1.795
+++ expr.c	18 Jun 2005 22:38:34 -0000
@@ -5151,9 +5151,9 @@ store_constructor (tree exp, rtx target,
 	    cleared = 1;
 	  }
 	
+	/* Inform later passes that the old value is dead.  */
 	if (!cleared && REG_P (target))
-	  /* Inform later passes that the old value is dead.  */
-	  emit_insn (gen_rtx_CLOBBER (VOIDmode, target));
+	  emit_move_insn (target, CONST0_RTX (GET_MODE (target)));
 
         /* Store each element of the constructor into the corresponding
 	   element of TARGET, determined by counting the elements.  */
Index: config/i386/i386-protos.h
===================================================================
RCS file: /cvs/gcc/gcc/gcc/config/i386/i386-protos.h,v
retrieving revision 1.140
diff -u -p -d -r1.140 i386-protos.h
--- config/i386/i386-protos.h	8 Jun 2005 05:05:18 -0000	1.140
+++ config/i386/i386-protos.h	18 Jun 2005 22:38:35 -0000
@@ -230,6 +230,7 @@ extern rtx ix86_tls_get_addr (void);
 extern void ix86_expand_vector_init (bool, rtx, rtx);
 extern void ix86_expand_vector_set (bool, rtx, rtx, int);
 extern void ix86_expand_vector_extract (bool, rtx, rtx, int);
+extern void ix86_expand_reduc_v4sf (rtx (*)(rtx, rtx, rtx), rtx, rtx);
 
 /* In winnt.c  */
 extern int i386_pe_dllexport_name_p (const char *);
Index: config/i386/i386.c
===================================================================
RCS file: /cvs/gcc/gcc/gcc/config/i386/i386.c,v
retrieving revision 1.831
diff -u -p -d -r1.831 i386.c
--- config/i386/i386.c	16 Jun 2005 16:39:49 -0000	1.831
+++ config/i386/i386.c	18 Jun 2005 22:38:38 -0000
@@ -17366,6 +17366,27 @@ ix86_expand_vector_extract (bool mmx_ok,
       emit_move_insn (target, tmp);
     }
 }
+
+/* Expand a vector reduction on V4SFmode for SSE1.  FN is the binary
+   pattern to reduce; DEST is the destination; IN is the input vector.  */
+
+void
+ix86_expand_reduc_v4sf (rtx (*fn) (rtx, rtx, rtx), rtx dest, rtx in)
+{
+  rtx tmp1, tmp2, tmp3;
+
+  tmp1 = gen_reg_rtx (V4SFmode);
+  tmp2 = gen_reg_rtx (V4SFmode);
+  tmp3 = gen_reg_rtx (V4SFmode);
+
+  emit_insn (gen_sse_movhlps (tmp1, in, in));
+  emit_insn (fn (tmp2, tmp1, in));
+
+  emit_insn (gen_sse_shufps_1 (tmp3, tmp2, tmp2,
+			       GEN_INT (1), GEN_INT (1),
+			       GEN_INT (1+4), GEN_INT (1+4)));
+  emit_insn (fn (dest, tmp2, tmp3));
+}
 
 /* Implements target hook vector_mode_supported_p.  */
 static bool
Index: config/i386/sse.md
===================================================================
RCS file: /cvs/gcc/gcc/gcc/config/i386/sse.md,v
retrieving revision 1.15
diff -u -p -d -r1.15 sse.md
--- config/i386/sse.md	26 May 2005 18:14:58 -0000	1.15
+++ config/i386/sse.md	18 Jun 2005 22:38:42 -0000
@@ -653,6 +653,40 @@
   [(set_attr "type" "sseadd")
    (set_attr "mode" "V4SF")])
 
+(define_expand "reduc_plus_v4sf"
+  [(match_operand:V4SF 0 "register_operand" "")
+   (match_operand:V4SF 1 "register_operand" "")]
+  "TARGET_SSE"
+{
+  if (TARGET_SSE3)
+    {
+      rtx tmp = gen_reg_rtx (V4SFmode);
+      emit_insn (gen_sse3_haddv4sf3 (tmp, operands[1], operands[1]));
+      emit_insn (gen_sse3_haddv4sf3 (operands[0], tmp, tmp));
+    }
+  else
+    ix86_expand_reduc_v4sf (gen_addv4sf3, operands[0], operands[1]);
+  DONE;
+})
+
+(define_expand "reduc_smax_v4sf"
+  [(match_operand:V4SF 0 "register_operand" "")
+   (match_operand:V4SF 1 "register_operand" "")]
+  "TARGET_SSE"
+{
+  ix86_expand_reduc_v4sf (gen_smaxv4sf3, operands[0], operands[1]);
+  DONE;
+})
+
+(define_expand "reduc_smin_v4sf"
+  [(match_operand:V4SF 0 "register_operand" "")
+   (match_operand:V4SF 1 "register_operand" "")]
+  "TARGET_SSE"
+{
+  ix86_expand_reduc_v4sf (gen_sminv4sf3, operands[0], operands[1]);
+  DONE;
+})
+
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;;
 ;; Parallel single-precision floating point comparisons