[PATCH] disable use_vector_fp_converts for m_CORE_ALL
Zamyatin, Igor
igor.zamyatin@intel.com
Wed Sep 18 14:06:00 GMT 2013
Ccing Uros. Changes in i386.md could be related to the fix for PR57954.
Thanks,
Igor
-----Original Message-----
From: Wei Mi [mailto:wmi@google.com]
Sent: Thursday, September 12, 2013 2:51 AM
To: GCC Patches
Cc: David Li; Zamyatin, Igor
Subject: [PATCH] disable use_vector_fp_converts for m_CORE_ALL
For the following testcase 1.c, on Westmere and Sandy Bridge, performance with the option -mtune-ctrl=^use_vector_fp_converts is better (it improves from 3.46s to 2.83s). This means cvtss2sd is often better than
unpcklps+cvtps2pd on recent x86 platforms.
1.c:
float total = 0.2;
int k = 5;
int main() {
int i;
for (i = 0; i < 1000000000; i++) {
total += (0.5 + k);
}
return total == 0.3;
}
assembly generated by gcc-r201963 without -mtune-ctrl=^use_vector_fp_converts
.L2:
unpcklps %xmm0, %xmm0
subl $1, %eax
cvtps2pd %xmm0, %xmm0
addsd %xmm1, %xmm0
unpcklpd %xmm0, %xmm0
cvtpd2ps %xmm0, %xmm0
jne .L2
assembly generated by gcc-r201963 with -mtune-ctrl=^use_vector_fp_converts
.L2:
cvtss2sd %xmm0, %xmm0
subl $1, %eax
addsd %xmm1, %xmm0
cvtsd2ss %xmm0, %xmm0
jne .L2
But for testcase 2.c (thanks to Igor Zamyatin for the testcase), performance with the option -mtune-ctrl=^use_vector_fp_converts is worse.
Analysis of the assembly shows the performance degradation comes from a partial reg stall caused by cvtsd2ss. Adding "pxor %xmm0, %xmm0" before "cvtsd2ss b(,%rdx,8), %xmm0" gets the performance back.
2.c:
double b[1024];
float a[1024];
int main()
{
int i;
for(i = 0 ; i < 1024 * 1024 * 256; i++)
a[i & 1023] = a[i & 1023] * (float)b[i & 1023];
return (int)a[512];
}
without -mtune-ctrl=^use_vector_fp_converts
.L2:
movl %eax, %edx
addl $1, %eax
andl $1023, %edx
cmpl $268435456, %eax
movsd b(,%rdx,8), %xmm0
cvtpd2ps %xmm0, %xmm0 ==> without partial reg stall
because of movsd.
mulss a(,%rdx,4), %xmm0
movss %xmm0, a(,%rdx,4)
jne .L2
with -mtune-ctrl=^use_vector_fp_converts
.L2:
movl %eax, %edx
addl $1, %eax
andl $1023, %edx
cmpl $268435456, %eax
cvtsd2ss b(,%rdx,8), %xmm0 ==> with partial reg
stall. Needs to insert "pxor %xmm0, %xmm0" before current insn.
mulss a(,%rdx,4), %xmm0
movss %xmm0, a(,%rdx,4)
jne .L2
So the patch is to turn off use_vector_fp_converts for m_CORE_ALL so that cvtss2sd/cvtsd2ss are used directly, and to add "pxor %xmmreg, %xmmreg" before cvtss2sd/cvtsd2ss to break the partial reg stall (similar to what r201308 does for cvtsi2ss/cvtsi2sd). Bootstrap and regression tests pass. Ok for trunk?
Thanks,
Wei Mi.
2013-09-11 Wei Mi <wmi@google.com>
* config/i386/x86-tune.def (DEF_TUNE): Remove
m_CORE_ALL.
* config/i386/i386.md: Add define_peephole2 to
break partial reg stall for cvtss2sd/cvtsd2ss.
Index: config/i386/x86-tune.def
===================================================================
--- config/i386/x86-tune.def (revision 201963)
+++ config/i386/x86-tune.def (working copy)
@@ -189,7 +189,7 @@ DEF_TUNE (X86_TUNE_NOT_VECTORMODE, "not_
/* X86_TUNE_USE_VECTOR_FP_CONVERTS: Prefer vector packed SSE conversion
from FP to FP. */
DEF_TUNE (X86_TUNE_USE_VECTOR_FP_CONVERTS, "use_vector_fp_converts",
- m_CORE_ALL | m_AMDFAM10 | m_GENERIC)
+ m_AMDFAM10 | m_GENERIC)
/* X86_TUNE_USE_VECTOR_CONVERTS: Prefer vector packed SSE conversion
from integer to FP. */
DEF_TUNE (X86_TUNE_USE_VECTOR_CONVERTS, "use_vector_converts", m_AMDFAM10)
Index: config/i386/i386.md
===================================================================
--- config/i386/i386.md (revision 201963)
+++ config/i386/i386.md (working copy)
@@ -5075,6 +5075,63 @@
emit_move_insn (operands[0], CONST0_RTX (<ssevecmode>mode));
})
+;; Break partial reg stall for cvtsd2ss.
+
+(define_peephole2
+ [(set (match_operand:SF 0 "register_operand")
+ (float_truncate:SF
+ (match_operand:DF 1 "nonimmediate_operand")))]
+ "TARGET_SSE2 && TARGET_SSE_MATH
+ && TARGET_SSE_PARTIAL_REG_DEPENDENCY
+ && optimize_function_for_speed_p (cfun)
+ && reload_completed && SSE_REG_P (operands[0])
+ && peep2_reg_dead_p (0, operands[0])
+ && (!SSE_REG_P (operands[1])
+ || REGNO (operands[0]) != REGNO (operands[1]))"
+ [(set (match_dup 0)
+ (vec_merge:V4SF
+ (vec_duplicate:V4SF
+ (float_truncate:V2SF
+ (match_dup 1)))
+ (match_dup 0)
+ (const_int 1)))]
+{
+ operands[0] = simplify_gen_subreg (V4SFmode, operands[0],
+ SFmode, 0);
+ operands[1] = simplify_gen_subreg (V2DFmode, operands[1],
+ DFmode, 0);
+ emit_move_insn (operands[0], CONST0_RTX (V4SFmode));
+})
+
+;; Break partial reg stall for cvtss2sd.
+
+(define_peephole2
+ [(set (match_operand:DF 0 "register_operand")
+ (float_extend:DF
+ (match_operand:SF 1 "nonimmediate_operand")))]
+ "TARGET_SSE2 && TARGET_SSE_MATH
+ && TARGET_SSE_PARTIAL_REG_DEPENDENCY
+ && optimize_function_for_speed_p (cfun)
+ && reload_completed && SSE_REG_P (operands[0])
+ && peep2_reg_dead_p (0, operands[0])
+ && (!SSE_REG_P (operands[1])
+ || REGNO (operands[0]) != REGNO (operands[1]))"
+ [(set (match_dup 0)
+ (vec_merge:V2DF
+ (float_extend:V2DF
+ (vec_select:V2SF
+ (match_dup 1)
+ (parallel [(const_int 0) (const_int 1)])))
+ (match_dup 0)
+ (const_int 1)))]
+{
+ operands[0] = simplify_gen_subreg (V2DFmode, operands[0],
+ DFmode, 0);
+ operands[1] = simplify_gen_subreg (V4SFmode, operands[1],
+ SFmode, 0);
+ emit_move_insn (operands[0], CONST0_RTX (V2DFmode));
+})
+
;; Avoid store forwarding (partial memory) stall penalty
;; by passing DImode value through XMM registers.
More information about the Gcc-patches
mailing list