This is the mail archive of the
gcc-patches@gcc.gnu.org
mailing list for the GCC project.
[PATCH, i386]: Fix PR target/13958:Conversion from unsigned to double is painfully slow on P4
- From: Uros Bizjak <ubizjak at gmail dot com>
- To: GCC Patches <gcc-patches at gcc dot gnu dot org>
- Cc: "H. J. Lu" <hjl at lucon dot org>
- Date: Fri, 21 Mar 2008 20:26:42 +0100
- Subject: [PATCH, i386]: Fix PR target/13958:Conversion from unsigned to double is painfully slow on P4
Hello!
Due to store forwarding penalty (this is how partial memory access is
called nowadays), the code from PR runs "painfully" slow:
--cut here--
unsigned a[2]={1,2};
inline unsigned foo1(int i) { return a[i]; }
int main()
{
double x=0;
int i;
for ( i=0; i<100000000; ++i )
x+=foo1(i%2);
return (int)x;
}
--cut here--
The inner loop is compiled (-O2 -march=pentium4 -malign-double) to:
.L4:
movl %ecx, %eax
andl $1, %eax
movl a(,%eax,4), %eax
xorl %edx, %edx
(*) pushl %edx
(*) pushl %eax
(*) fildll (%esp)
addl $8, %esp
faddp %st, %st(1)
addl $1, %ecx
cmpl $100000000, %ecx
jne .L4
Instructions marked with (*) form partial memory access.
Runtime:
time ./a.out
real 0m0.794s
user 0m0.724s
sys 0m0.000s
Patched gcc creates:
.L4:
movl %edx, %eax
andl $1, %eax
movd a(,%eax,4), %xmm0
movq %xmm0, -16(%ebp)
fildll -16(%ebp)
faddp %st, %st(1)
addl $1, %edx
cmpl $100000000, %edx
jne .L4
time ./a.out
real 0m0.123s
user 0m0.124s
sys 0m0.000s
This represents more than 5.8x speedup on what is claimed as:
--quote--
Btw, such conversions are quite common in numerical codes that deal
with uniform grids: the array index can be used as a coordinate (usually
after some trivial scaling). Given that the indices used in libstdc++
are usually of the type size_t the slow conversion can have quite a
negative performance impact.
--unqoute--
I guess that such a speedup comes quite handy. This code prefers DImode
aligned to 8, since we are dealing with real DImode values. H.J. -
should we align DImode values to 8 for TARGET_MMX/TARGET_SSE ?
2008-03-21 Uros Bizjak <ubizjak@gmail.com>
PR target/13958
* config/i386/i386.md ("*floatunssi<mode2>_1"): New pattern with
corresponding post-reload splitters.
("floatunssi<mode>2"): Expand to unsigned_float x87 insn pattern
when x87 FP math is selected.
* config/i386/i386-protos.h (ix86_expand_convert_uns_sixf_sse):
New function prototype.
* config/i386/i386.c (ix86_expand_convert_uns_sixf_sse): New
unreachable function to ease macroization of insn patterns.
The patch was bootstrapped and regression tested on x86_64-pc-linux-gnu
{,-m32}, patch is committed to SVN.
RMs, Do we want this patch in 4.3.1, although it isn't strictly a
regression?
Uros.
Index: i386.md
===================================================================
--- i386.md (revision 133430)
+++ i386.md (working copy)
@@ -5313,13 +5313,76 @@
DONE;
})
+;; Avoid store forwarding (partial memory) stall penalty by extending
+;; SImode value to DImode through XMM register instead of pushing two
+;; SImode values to stack. Note that even !TARGET_INTER_UNIT_MOVES
+;; targets benefit from this optimization. Also note that fild
+;; loads from memory only.
+
+(define_insn "*floatunssi<mode>2_1"
+ [(set (match_operand:X87MODEF 0 "register_operand" "=f,f")
+ (unsigned_float:X87MODEF
+ (match_operand:SI 1 "nonimmediate_operand" "x,m")))
+ (clobber (match_operand:DI 2 "memory_operand" "=m,m"))
+ (clobber (match_scratch:SI 3 "=X,x"))]
+ "!TARGET_64BIT
+ && TARGET_80387 && TARGET_SSE"
+ "#"
+ [(set_attr "type" "multi")
+ (set_attr "mode" "<MODE>")])
+
+(define_split
+ [(set (match_operand:X87MODEF 0 "register_operand" "")
+ (unsigned_float:X87MODEF
+ (match_operand:SI 1 "register_operand" "")))
+ (clobber (match_operand:DI 2 "memory_operand" ""))
+ (clobber (match_scratch:SI 3 ""))]
+ "!TARGET_64BIT
+ && TARGET_80387 && TARGET_SSE
+ && reload_completed"
+ [(set (match_dup 2) (match_dup 1))
+ (set (match_dup 0)
+ (float:X87MODEF (match_dup 2)))]
+ "operands[1] = simplify_gen_subreg (DImode, operands[1], SImode, 0);")
+
+(define_split
+ [(set (match_operand:X87MODEF 0 "register_operand" "")
+ (unsigned_float:X87MODEF
+ (match_operand:SI 1 "memory_operand" "")))
+ (clobber (match_operand:DI 2 "memory_operand" ""))
+ (clobber (match_scratch:SI 3 ""))]
+ "!TARGET_64BIT
+ && TARGET_80387 && TARGET_SSE
+ && reload_completed"
+ [(set (match_dup 2) (match_dup 3))
+ (set (match_dup 0)
+ (float:X87MODEF (match_dup 2)))]
+{
+ emit_move_insn (operands[3], operands[1]);
+ operands[3] = simplify_gen_subreg (DImode, operands[3], SImode, 0);
+})
+
(define_expand "floatunssi<mode>2"
- [(use (match_operand:MODEF 0 "register_operand" ""))
- (use (match_operand:SI 1 "nonimmediate_operand" ""))]
- "!TARGET_64BIT && SSE_FLOAT_MODE_P (<MODE>mode) && TARGET_SSE_MATH"
+ [(parallel
+ [(set (match_operand:X87MODEF 0 "register_operand" "")
+ (unsigned_float:X87MODEF
+ (match_operand:SI 1 "nonimmediate_operand" "")))
+ (clobber (match_dup 2))
+ (clobber (match_scratch:SI 3 ""))])]
+ "!TARGET_64BIT
+ && ((TARGET_80387 && TARGET_SSE)
+ || (SSE_FLOAT_MODE_P (<MODE>mode) && TARGET_SSE_MATH))"
{
- ix86_expand_convert_uns_si<mode>_sse (operands[0], operands[1]);
- DONE;
+ if (SSE_FLOAT_MODE_P (<MODE>mode) && TARGET_SSE_MATH)
+ {
+ ix86_expand_convert_uns_si<mode>_sse (operands[0], operands[1]);
+ DONE;
+ }
+ else
+ {
+ int slot = virtuals_instantiated ? SLOT_TEMP : SLOT_VIRTUAL;
+ operands[2] = assign_386_stack_local (DImode, slot);
+ }
})
(define_expand "floatunsdisf2"
Index: i386-protos.h
===================================================================
--- i386-protos.h (revision 133430)
+++ i386-protos.h (working copy)
@@ -91,6 +91,7 @@
extern rtx ix86_build_const_vector (enum machine_mode, bool, rtx);
extern void ix86_split_convert_uns_si_sse (rtx[]);
extern void ix86_expand_convert_uns_didf_sse (rtx, rtx);
+extern void ix86_expand_convert_uns_sixf_sse (rtx, rtx);
extern void ix86_expand_convert_uns_sidf_sse (rtx, rtx);
extern void ix86_expand_convert_uns_sisf_sse (rtx, rtx);
extern void ix86_expand_convert_sign_didf_sse (rtx, rtx);
Index: i386.c
===================================================================
--- i386.c (revision 133430)
+++ i386.c (working copy)
@@ -10903,6 +10903,14 @@
ix86_expand_vector_extract (false, target, fp_xmm, 0);
}
+/* Not used, but eases macroization of patterns. */
+void
+ix86_expand_convert_uns_sixf_sse (rtx target ATTRIBUTE_UNUSED,
+ rtx input ATTRIBUTE_UNUSED)
+{
+ gcc_unreachable ();
+}
+
/* Convert an unsigned SImode value into a DFmode. Only currently used
for SSE, but applicable anywhere. */