[RFC PATCH] i386: Do not sanitize upper part of V2SFmode reg with -fno-trapping-math [PR110832]
Uros Bizjak
ubizjak@gmail.com
Sun Jul 30 20:12:53 GMT 2023
Also introduce the -m[no-]mmxfp-with-sse option to disable trapping V2SF
named patterns in order to avoid generation of partial vector V4SFmode
trapping instructions.
The new option is enabled by default because, even with sanitization,
a small but consistent speedup of 2 to 3% on the Polyhedron capacita
benchmark can be achieved vs. scalar code.
Using -fno-trapping-math improves the Polyhedron capacita runtime by 8 to 9%
vs. scalar code. This is what clang does by default, as it defaults
to -fno-trapping-math.
PR target/110832
gcc/ChangeLog:
* config/i386/i386.h (TARGET_MMXFP_WITH_SSE): New macro.
* config/i386/i386.opt (mmmxfp-with-sse): New option.
* config/i386/mmx.md (movq_<mode>_to_sse): Do not sanitize
upper part of V2SFmode register with -fno-trapping-math.
(<plusminusmult:insn>v2sf3): Enable for TARGET_MMXFP_WITH_SSE.
(divv2sf3): Ditto.
(<smaxmin:code>v2sf3): Ditto.
(sqrtv2sf2): Ditto.
(*mmx_haddv2sf3_low): Ditto.
(*mmx_hsubv2sf3_low): Ditto.
(vec_addsubv2sf3): Ditto.
(vec_cmpv2sfv2si): Ditto.
(vcond<V2FI:mode>v2sf): Ditto.
(fmav2sf4): Ditto.
(fmsv2sf4): Ditto.
(fnmav2sf4): Ditto.
(fnmsv2sf4): Ditto.
(fix_truncv2sfv2si2): Ditto.
(fixuns_truncv2sfv2si2): Ditto.
(floatv2siv2sf2): Ditto.
(floatunsv2siv2sf2): Ditto.
(nearbyintv2sf2): Ditto.
(rintv2sf2): Ditto.
(lrintv2sfv2si2): Ditto.
(ceilv2sf2): Ditto.
(lceilv2sfv2si2): Ditto.
(floorv2sf2): Ditto.
(lfloorv2sfv2si2): Ditto.
(btruncv2sf2): Ditto.
(roundv2sf2): Ditto.
(lroundv2sfv2si2): Ditto.
Bootstrapped and regression tested on x86_64-linux-gnu {,-m32}.
Uros.
-------------- next part --------------
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index ef342fcee9b..af72b6c48a9 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -50,6 +50,7 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
#define TARGET_16BIT_P(x) TARGET_CODE16_P(x)
#define TARGET_MMX_WITH_SSE (TARGET_64BIT && TARGET_SSE2)
+#define TARGET_MMXFP_WITH_SSE (TARGET_MMX_WITH_SSE && ix86_mmxfp_with_sse)
#include "config/vxworks-dummy.h"
diff --git a/gcc/config/i386/i386.opt b/gcc/config/i386/i386.opt
index 1cc8563477a..1b65fed5daf 100644
--- a/gcc/config/i386/i386.opt
+++ b/gcc/config/i386/i386.opt
@@ -670,6 +670,10 @@ m3dnowa
Target Mask(ISA_3DNOW_A) Var(ix86_isa_flags) Save
Support Athlon 3Dnow! built-in functions.
+mmmxfp-with-sse
+Target Var(ix86_mmxfp_with_sse) Init(1)
+Enable MMX floating point vectors in SSE registers
+
msse
Target Mask(ISA_SSE) Var(ix86_isa_flags) Save
Support MMX and SSE built-in functions and code generation.
diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md
index 896af76a33f..0555da9022b 100644
--- a/gcc/config/i386/mmx.md
+++ b/gcc/config/i386/mmx.md
@@ -597,7 +597,18 @@ (define_expand "movq_<mode>_to_sse"
(match_operand:V2FI 1 "nonimmediate_operand")
(match_dup 2)))]
"TARGET_SSE2"
- "operands[2] = CONST0_RTX (<MODE>mode);")
+{
+ if (<MODE>mode == V2SFmode
+ && !flag_trapping_math)
+ {
+ rtx op1 = force_reg (<MODE>mode, operands[1]);
+ emit_move_insn (operands[0], lowpart_subreg (<mmxdoublevecmode>mode,
+ op1, <MODE>mode));
+ DONE;
+ }
+
+ operands[2] = CONST0_RTX (<MODE>mode);
+})
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;
@@ -650,7 +661,7 @@ (define_expand "<insn>v2sf3"
(plusminusmult:V2SF
(match_operand:V2SF 1 "nonimmediate_operand")
(match_operand:V2SF 2 "nonimmediate_operand")))]
- "TARGET_MMX_WITH_SSE"
+ "TARGET_MMXFP_WITH_SSE"
{
rtx op2 = gen_reg_rtx (V4SFmode);
rtx op1 = gen_reg_rtx (V4SFmode);
@@ -728,7 +739,7 @@ (define_expand "divv2sf3"
[(set (match_operand:V2SF 0 "register_operand")
(div:V2SF (match_operand:V2SF 1 "register_operand")
(match_operand:V2SF 2 "register_operand")))]
- "TARGET_MMX_WITH_SSE"
+ "TARGET_MMXFP_WITH_SSE"
{
rtx op2 = gen_reg_rtx (V4SFmode);
rtx op1 = gen_reg_rtx (V4SFmode);
@@ -750,7 +761,7 @@ (define_expand "<code>v2sf3"
(smaxmin:V2SF
(match_operand:V2SF 1 "register_operand")
(match_operand:V2SF 2 "register_operand")))]
- "TARGET_MMX_WITH_SSE"
+ "TARGET_MMXFP_WITH_SSE"
{
rtx op2 = gen_reg_rtx (V4SFmode);
rtx op1 = gen_reg_rtx (V4SFmode);
@@ -852,7 +863,7 @@ (define_insn "mmx_rcpit2v2sf3"
(define_expand "sqrtv2sf2"
[(set (match_operand:V2SF 0 "register_operand")
(sqrt:V2SF (match_operand:V2SF 1 "nonimmediate_operand")))]
- "TARGET_MMX_WITH_SSE"
+ "TARGET_MMXFP_WITH_SSE"
{
rtx op1 = gen_reg_rtx (V4SFmode);
rtx op0 = gen_reg_rtx (V4SFmode);
@@ -933,7 +944,7 @@ (define_insn_and_split "*mmx_haddv2sf3_low"
(vec_select:SF
(match_dup 1)
(parallel [(match_operand:SI 3 "const_0_to_1_operand")]))))]
- "TARGET_SSE3 && TARGET_MMX_WITH_SSE
+ "TARGET_SSE3 && TARGET_MMXFP_WITH_SSE
&& INTVAL (operands[2]) != INTVAL (operands[3])
&& ix86_pre_reload_split ()"
"#"
@@ -979,7 +990,7 @@ (define_insn_and_split "*mmx_hsubv2sf3_low"
(vec_select:SF
(match_dup 1)
(parallel [(const_int 1)]))))]
- "TARGET_SSE3 && TARGET_MMX_WITH_SSE
+ "TARGET_SSE3 && TARGET_MMXFP_WITH_SSE
&& ix86_pre_reload_split ()"
"#"
"&& 1"
@@ -1041,7 +1052,7 @@ (define_expand "vec_addsubv2sf3"
(match_operand:V2SF 2 "nonimmediate_operand"))
(plus:V2SF (match_dup 1) (match_dup 2))
(const_int 1)))]
- "TARGET_SSE3 && TARGET_MMX_WITH_SSE"
+ "TARGET_SSE3 && TARGET_MMXFP_WITH_SSE"
{
rtx op2 = gen_reg_rtx (V4SFmode);
rtx op1 = gen_reg_rtx (V4SFmode);
@@ -1104,7 +1115,7 @@ (define_expand "vec_cmpv2sfv2si"
(match_operator:V2SI 1 ""
[(match_operand:V2SF 2 "nonimmediate_operand")
(match_operand:V2SF 3 "nonimmediate_operand")]))]
- "TARGET_MMX_WITH_SSE"
+ "TARGET_MMXFP_WITH_SSE"
{
rtx ops[4];
ops[3] = gen_reg_rtx (V4SFmode);
@@ -1130,7 +1141,7 @@ (define_expand "vcond<mode>v2sf"
(match_operand:V2SF 5 "nonimmediate_operand")])
(match_operand:V2FI 1 "general_operand")
(match_operand:V2FI 2 "general_operand")))]
- "TARGET_MMX_WITH_SSE"
+ "TARGET_MMXFP_WITH_SSE"
{
rtx ops[6];
ops[5] = gen_reg_rtx (V4SFmode);
@@ -1320,7 +1331,7 @@ (define_expand "fmav2sf4"
(match_operand:V2SF 2 "nonimmediate_operand")
(match_operand:V2SF 3 "nonimmediate_operand")))]
"(TARGET_FMA || TARGET_FMA4 || TARGET_AVX512VL)
- && TARGET_MMX_WITH_SSE"
+ && TARGET_MMXFP_WITH_SSE"
{
rtx op3 = gen_reg_rtx (V4SFmode);
rtx op2 = gen_reg_rtx (V4SFmode);
@@ -1345,7 +1356,7 @@ (define_expand "fmsv2sf4"
(neg:V2SF
(match_operand:V2SF 3 "nonimmediate_operand"))))]
"(TARGET_FMA || TARGET_FMA4 || TARGET_AVX512VL)
- && TARGET_MMX_WITH_SSE"
+ && TARGET_MMXFP_WITH_SSE"
{
rtx op3 = gen_reg_rtx (V4SFmode);
rtx op2 = gen_reg_rtx (V4SFmode);
@@ -1370,7 +1381,7 @@ (define_expand "fnmav2sf4"
(match_operand:V2SF 2 "nonimmediate_operand")
(match_operand:V2SF 3 "nonimmediate_operand")))]
"(TARGET_FMA || TARGET_FMA4 || TARGET_AVX512VL)
- && TARGET_MMX_WITH_SSE"
+ && TARGET_MMXFP_WITH_SSE"
{
rtx op3 = gen_reg_rtx (V4SFmode);
rtx op2 = gen_reg_rtx (V4SFmode);
@@ -1396,7 +1407,7 @@ (define_expand "fnmsv2sf4"
(neg:V2SF
(match_operand:V2SF 3 "nonimmediate_operand"))))]
"(TARGET_FMA || TARGET_FMA4 || TARGET_AVX512VL)
- && TARGET_MMX_WITH_SSE"
+ && TARGET_MMXFP_WITH_SSE"
{
rtx op3 = gen_reg_rtx (V4SFmode);
rtx op2 = gen_reg_rtx (V4SFmode);
@@ -1422,7 +1433,7 @@ (define_expand "fnmsv2sf4"
(define_expand "fix_truncv2sfv2si2"
[(set (match_operand:V2SI 0 "register_operand")
(fix:V2SI (match_operand:V2SF 1 "nonimmediate_operand")))]
- "TARGET_MMX_WITH_SSE"
+ "TARGET_MMXFP_WITH_SSE"
{
rtx op1 = gen_reg_rtx (V4SFmode);
rtx op0 = gen_reg_rtx (V4SImode);
@@ -1438,7 +1449,7 @@ (define_expand "fix_truncv2sfv2si2"
(define_expand "fixuns_truncv2sfv2si2"
[(set (match_operand:V2SI 0 "register_operand")
(unsigned_fix:V2SI (match_operand:V2SF 1 "nonimmediate_operand")))]
- "TARGET_AVX512VL && TARGET_MMX_WITH_SSE"
+ "TARGET_AVX512VL && TARGET_MMXFP_WITH_SSE"
{
rtx op1 = gen_reg_rtx (V4SFmode);
rtx op0 = gen_reg_rtx (V4SImode);
@@ -1463,7 +1474,7 @@ (define_insn "mmx_fix_truncv2sfv2si2"
(define_expand "floatv2siv2sf2"
[(set (match_operand:V2SF 0 "register_operand")
(float:V2SF (match_operand:V2SI 1 "nonimmediate_operand")))]
- "TARGET_MMX_WITH_SSE"
+ "TARGET_MMXFP_WITH_SSE"
{
rtx op1 = gen_reg_rtx (V4SImode);
rtx op0 = gen_reg_rtx (V4SFmode);
@@ -1479,7 +1490,7 @@ (define_expand "floatv2siv2sf2"
(define_expand "floatunsv2siv2sf2"
[(set (match_operand:V2SF 0 "register_operand")
(unsigned_float:V2SF (match_operand:V2SI 1 "nonimmediate_operand")))]
- "TARGET_AVX512VL && TARGET_MMX_WITH_SSE"
+ "TARGET_AVX512VL && TARGET_MMXFP_WITH_SSE"
{
rtx op1 = gen_reg_rtx (V4SImode);
rtx op0 = gen_reg_rtx (V4SFmode);
@@ -1756,7 +1767,7 @@ (define_expand "vec_initv2sfsf"
(define_expand "nearbyintv2sf2"
[(match_operand:V2SF 0 "register_operand")
(match_operand:V2SF 1 "nonimmediate_operand")]
- "TARGET_SSE4_1 && TARGET_MMX_WITH_SSE"
+ "TARGET_SSE4_1 && TARGET_MMXFP_WITH_SSE"
{
rtx op1 = gen_reg_rtx (V4SFmode);
rtx op0 = gen_reg_rtx (V4SFmode);
@@ -1772,7 +1783,7 @@ (define_expand "nearbyintv2sf2"
(define_expand "rintv2sf2"
[(match_operand:V2SF 0 "register_operand")
(match_operand:V2SF 1 "nonimmediate_operand")]
- "TARGET_SSE4_1 && TARGET_MMX_WITH_SSE"
+ "TARGET_SSE4_1 && TARGET_MMXFP_WITH_SSE"
{
rtx op1 = gen_reg_rtx (V4SFmode);
rtx op0 = gen_reg_rtx (V4SFmode);
@@ -1788,8 +1799,8 @@ (define_expand "rintv2sf2"
(define_expand "lrintv2sfv2si2"
[(match_operand:V2SI 0 "register_operand")
(match_operand:V2SF 1 "nonimmediate_operand")]
- "TARGET_SSE4_1 && !flag_trapping_math
- && TARGET_MMX_WITH_SSE"
+ "TARGET_SSE4_1 && !flag_trapping_math
+ && TARGET_MMXFP_WITH_SSE"
{
rtx op1 = gen_reg_rtx (V4SFmode);
rtx op0 = gen_reg_rtx (V4SImode);
@@ -1806,7 +1817,7 @@ (define_expand "ceilv2sf2"
[(match_operand:V2SF 0 "register_operand")
(match_operand:V2SF 1 "nonimmediate_operand")]
"TARGET_SSE4_1 && !flag_trapping_math
- && TARGET_MMX_WITH_SSE"
+ && TARGET_MMXFP_WITH_SSE"
{
rtx op1 = gen_reg_rtx (V4SFmode);
rtx op0 = gen_reg_rtx (V4SFmode);
@@ -1822,8 +1833,8 @@ (define_expand "ceilv2sf2"
(define_expand "lceilv2sfv2si2"
[(match_operand:V2SI 0 "register_operand")
(match_operand:V2SF 1 "nonimmediate_operand")]
- "TARGET_SSE4_1 && !flag_trapping_math
- && TARGET_MMX_WITH_SSE"
+ "TARGET_SSE4_1 && !flag_trapping_math
+ && TARGET_MMXFP_WITH_SSE"
{
rtx op1 = gen_reg_rtx (V4SFmode);
rtx op0 = gen_reg_rtx (V4SImode);
@@ -1840,7 +1851,7 @@ (define_expand "floorv2sf2"
[(match_operand:V2SF 0 "register_operand")
(match_operand:V2SF 1 "nonimmediate_operand")]
"TARGET_SSE4_1 && !flag_trapping_math
- && TARGET_MMX_WITH_SSE"
+ && TARGET_MMXFP_WITH_SSE"
{
rtx op1 = gen_reg_rtx (V4SFmode);
rtx op0 = gen_reg_rtx (V4SFmode);
@@ -1856,8 +1867,8 @@ (define_expand "floorv2sf2"
(define_expand "lfloorv2sfv2si2"
[(match_operand:V2SI 0 "register_operand")
(match_operand:V2SF 1 "nonimmediate_operand")]
- "TARGET_SSE4_1 && !flag_trapping_math
- && TARGET_MMX_WITH_SSE"
+ "TARGET_SSE4_1 && !flag_trapping_math
+ && TARGET_MMXFP_WITH_SSE"
{
rtx op1 = gen_reg_rtx (V4SFmode);
rtx op0 = gen_reg_rtx (V4SImode);
@@ -1874,7 +1885,7 @@ (define_expand "btruncv2sf2"
[(match_operand:V2SF 0 "register_operand")
(match_operand:V2SF 1 "nonimmediate_operand")]
"TARGET_SSE4_1 && !flag_trapping_math
- && TARGET_MMX_WITH_SSE"
+ && TARGET_MMXFP_WITH_SSE"
{
rtx op1 = gen_reg_rtx (V4SFmode);
rtx op0 = gen_reg_rtx (V4SFmode);
@@ -1891,7 +1902,7 @@ (define_expand "roundv2sf2"
[(match_operand:V2SF 0 "register_operand")
(match_operand:V2SF 1 "nonimmediate_operand")]
"TARGET_SSE4_1 && !flag_trapping_math
- && TARGET_MMX_WITH_SSE"
+ && TARGET_MMXFP_WITH_SSE"
{
rtx op1 = gen_reg_rtx (V4SFmode);
rtx op0 = gen_reg_rtx (V4SFmode);
@@ -1907,8 +1918,8 @@ (define_expand "roundv2sf2"
(define_expand "lroundv2sfv2si2"
[(match_operand:V2SI 0 "register_operand")
(match_operand:V2SF 1 "nonimmediate_operand")]
- "TARGET_SSE4_1 && !flag_trapping_math
- && TARGET_MMX_WITH_SSE"
+ "TARGET_SSE4_1 && !flag_trapping_math
+ && TARGET_MMXFP_WITH_SSE"
{
rtx op1 = gen_reg_rtx (V4SFmode);
rtx op0 = gen_reg_rtx (V4SImode);
More information about the Gcc-patches
mailing list