This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

[PATCH, i386]: Fix PR target/13958:Conversion from unsigned to double is painfully slow on P4


Hello!

Due to store forwarding penalty (this is how partial memory access is called nowadays), the code from PR runs "painfully" slow:

--cut here--
unsigned a[2]={1,2};

inline unsigned foo1(int i) { return a[i]; }

int main()
{
   double x=0;
   int    i;

   for ( i=0; i<100000000; ++i )
       x+=foo1(i%2);

   return (int)x;
}
--cut here--

The inner loop is compiled (-O2 -march=pentium4 -malign-double) to:

.L4:
       movl    %ecx, %eax
       andl    $1, %eax
       movl    a(,%eax,4), %eax
       xorl    %edx, %edx
(*)    pushl   %edx
(*)    pushl   %eax
(*)    fildll  (%esp)
       addl    $8, %esp
       faddp   %st, %st(1)
       addl    $1, %ecx
       cmpl    $100000000, %ecx
       jne     .L4

Instructions marked with (*) form partial memory access.

Runtime:

time ./a.out

real    0m0.794s
user    0m0.724s
sys     0m0.000s

Patched gcc creates:

.L4:
       movl    %edx, %eax
       andl    $1, %eax
       movd    a(,%eax,4), %xmm0
       movq    %xmm0, -16(%ebp)
       fildll  -16(%ebp)
       faddp   %st, %st(1)
       addl    $1, %edx
       cmpl    $100000000, %edx
       jne     .L4

time ./a.out

real    0m0.123s
user    0m0.124s
sys     0m0.000s

This represents more than 5.8x speedup on what is claimed as:

--quote--

Btw, such conversions are quite common in numerical codes that deal
with uniform grids: the array index can be used as a coordinate (usually
after some trivial scaling). Given that the indices used in libstdc++
are usually of the type size_t the slow conversion can have quite a
negative performance impact.

--unqoute--

I guess that such a speedup comes quite handy. This code prefers DImode aligned to 8, since we are dealing with real DImode values. H.J. - should we align DImode values to 8 for TARGET_MMX/TARGET_SSE ?

2008-03-21 Uros Bizjak <ubizjak@gmail.com>

       PR target/13958
       * config/i386/i386.md ("*floatunssi<mode2>_1"): New pattern with
       corresponding post-reload splitters.
       ("floatunssi<mode>2"): Expand to unsigned_float x87 insn pattern
       when x87 FP math is selected.
       * config/i386/i386-protos.h (ix86_expand_convert_uns_sixf_sse):
       New function prototype.
       * config/i386/i386.c (ix86_expand_convert_uns_sixf_sse): New
       unreachable function to ease macroization of insn patterns.

The patch was bootstrapped and regression tested on x86_64-pc-linux-gnu {,-m32}, patch is committed to SVN.

RMs, Do we want this patch in 4.3.1, although it isn't strictly a regression?

Uros.
Index: i386.md
===================================================================
--- i386.md	(revision 133430)
+++ i386.md	(working copy)
@@ -5313,13 +5313,76 @@
   DONE;
 })
 
+;; Avoid store forwarding (partial memory) stall penalty by extending
+;; SImode value to DImode through XMM register instead of pushing two
+;; SImode values to stack. Note that even !TARGET_INTER_UNIT_MOVES
+;; targets benefit from this optimization. Also note that fild
+;; loads from memory only.
+
+(define_insn "*floatunssi<mode>2_1"
+  [(set (match_operand:X87MODEF 0 "register_operand" "=f,f")
+	(unsigned_float:X87MODEF
+	  (match_operand:SI 1 "nonimmediate_operand" "x,m")))
+   (clobber (match_operand:DI 2 "memory_operand" "=m,m"))
+   (clobber (match_scratch:SI 3 "=X,x"))]
+  "!TARGET_64BIT
+   && TARGET_80387 && TARGET_SSE"
+  "#"
+  [(set_attr "type" "multi")
+   (set_attr "mode" "<MODE>")])
+
+(define_split
+  [(set (match_operand:X87MODEF 0 "register_operand" "")
+	(unsigned_float:X87MODEF
+	  (match_operand:SI 1 "register_operand" "")))
+   (clobber (match_operand:DI 2 "memory_operand" ""))
+   (clobber (match_scratch:SI 3 ""))]
+  "!TARGET_64BIT
+   && TARGET_80387 && TARGET_SSE
+   && reload_completed"
+  [(set (match_dup 2) (match_dup 1))
+   (set (match_dup 0)
+	(float:X87MODEF (match_dup 2)))]
+  "operands[1] = simplify_gen_subreg (DImode, operands[1], SImode, 0);")
+
+(define_split
+  [(set (match_operand:X87MODEF 0 "register_operand" "")
+	(unsigned_float:X87MODEF
+	  (match_operand:SI 1 "memory_operand" "")))
+   (clobber (match_operand:DI 2 "memory_operand" ""))
+   (clobber (match_scratch:SI 3 ""))]
+  "!TARGET_64BIT
+   && TARGET_80387 && TARGET_SSE
+   && reload_completed"
+  [(set (match_dup 2) (match_dup 3))
+   (set (match_dup 0)
+	(float:X87MODEF (match_dup 2)))]
+{
+  emit_move_insn (operands[3], operands[1]);
+  operands[3] = simplify_gen_subreg (DImode, operands[3], SImode, 0);
+})
+
 (define_expand "floatunssi<mode>2"
-  [(use (match_operand:MODEF 0 "register_operand" ""))
-   (use (match_operand:SI 1 "nonimmediate_operand" ""))]
-  "!TARGET_64BIT && SSE_FLOAT_MODE_P (<MODE>mode) && TARGET_SSE_MATH"
+  [(parallel
+     [(set (match_operand:X87MODEF 0 "register_operand" "")
+	   (unsigned_float:X87MODEF
+	     (match_operand:SI 1 "nonimmediate_operand" "")))
+      (clobber (match_dup 2))
+      (clobber (match_scratch:SI 3 ""))])]
+  "!TARGET_64BIT
+   && ((TARGET_80387 && TARGET_SSE)
+       || (SSE_FLOAT_MODE_P (<MODE>mode) && TARGET_SSE_MATH))"
 {
-  ix86_expand_convert_uns_si<mode>_sse (operands[0], operands[1]);
-  DONE;
+  if (SSE_FLOAT_MODE_P (<MODE>mode) && TARGET_SSE_MATH)
+    {
+      ix86_expand_convert_uns_si<mode>_sse (operands[0], operands[1]);
+      DONE;
+    }
+  else
+    {
+      int slot = virtuals_instantiated ? SLOT_TEMP : SLOT_VIRTUAL;
+      operands[2] = assign_386_stack_local (DImode, slot);
+    }
 })
 
 (define_expand "floatunsdisf2"
Index: i386-protos.h
===================================================================
--- i386-protos.h	(revision 133430)
+++ i386-protos.h	(working copy)
@@ -91,6 +91,7 @@
 extern rtx ix86_build_const_vector (enum machine_mode, bool, rtx);
 extern void ix86_split_convert_uns_si_sse (rtx[]);
 extern void ix86_expand_convert_uns_didf_sse (rtx, rtx);
+extern void ix86_expand_convert_uns_sixf_sse (rtx, rtx);
 extern void ix86_expand_convert_uns_sidf_sse (rtx, rtx);
 extern void ix86_expand_convert_uns_sisf_sse (rtx, rtx);
 extern void ix86_expand_convert_sign_didf_sse (rtx, rtx);
Index: i386.c
===================================================================
--- i386.c	(revision 133430)
+++ i386.c	(working copy)
@@ -10903,6 +10903,14 @@
   ix86_expand_vector_extract (false, target, fp_xmm, 0);
 }
 
+/* Not used, but eases macroization of patterns.  */
+void
+ix86_expand_convert_uns_sixf_sse (rtx target ATTRIBUTE_UNUSED,
+				  rtx input ATTRIBUTE_UNUSED)
+{
+  gcc_unreachable ();
+}
+
 /* Convert an unsigned SImode value into a DFmode.  Only currently used
    for SSE, but applicable anywhere.  */
 

Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]