[PATCH, i386]: (Partially) fix PR89071, break SSE reg dependency for a few scalar insns
Uros Bizjak
ubizjak@gmail.com
Sun Feb 3 16:47:00 GMT 2019
The following patch may help with partial SSE reg dependencies for
{R,}SQRTS{S,D}, RCPS{S,D} and ROUNDS{S,D} instructions. It takes the
same strategy as both ICC and clang, that is:
a) load from memory with MOVS{S,D} and
b) in case of SSE, match input and output register.
The implementation uses preferred_for_speed attribute, so in cold
sections or when compiled with -Os, the compiler is still able to
create direct load from memory (SSE, AVX) and use unmatched registers
for SSE targets.
The sqrt from memory is now compiled to:
movsd z(%rip), %xmm0
sqrtsd %xmm0, %xmm0
(SSE) or
vmovsd z(%rip), %xmm1
vsqrtsd %xmm1, %xmm1, %xmm0
(AVX).
And sqrt from unmatched input register will compile to:
sqrtsd %xmm1, %xmm1
movapd %xmm1, %xmm0
(SSE) or
vsqrtsd %xmm1, %xmm1, %xmm0
(AVX).
The patch doesn't touch conversion instructions, where XOR clearing is
preferred (pending patch for PR 87007).
2019-02-03 Uroš Bizjak <ubizjak@gmail.com>
PR target/89071
* config/i386/i386.md (*sqrt<mode>2_sse): Add (v,0) alternative.
Do not prefer (v,v) alternative for non-AVX targets and (m,v)
alternative for speed when TARGET_SSE_PARTIAL_REG_DEPENDENCY is set.
(*rcpsf2_sse): Ditto.
(*rsqrtsf2_sse): Ditto.
(sse4_1_round<mode>2): Ditto.
Bootstrapped and regression tested on x86_64-linux-gnu {,-m32}.
Committed to mainline SVN.
Uros.
-------------- next part --------------
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index 744f155fca6f..9948f77fca53 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -4472,9 +4472,9 @@
(set (match_dup 0) (float_extend:DF (match_dup 2)))]
"operands[2] = lowpart_subreg (SFmode, operands[0], DFmode);")
-;; Break partial reg stall for cvtss2sd. This splitter should split
-;; late in the pass sequence (after register rename pass),
-;; so allocated registers won't change anymore.
+;; Break partial SSE register dependency stall. This splitter should split
+;; late in the pass sequence (after register rename pass), so allocated
+;; registers won't change anymore
(define_split
[(set (match_operand:DF 0 "sse_reg_operand")
@@ -4632,9 +4632,9 @@
(set (match_dup 0) (float_truncate:SF (match_dup 2)))]
"operands[2] = lowpart_subreg (DFmode, operands[0], SFmode);")
-;; Break partial reg stall for cvtsd2ss. This splitter should split
-;; late in the pass sequence (after register rename pass),
-;; so allocated registers won't change anymore.
+;; Break partial SSE register dependency stall. This splitter should split
+;; late in the pass sequence (after register rename pass), so allocated
+;; registers won't change anymore
(define_split
[(set (match_operand:SF 0 "sse_reg_operand")
@@ -5137,7 +5137,7 @@
(set_attr "unit" "i387")
(set_attr "fp_int_src" "true")])
-;; Avoid partial SSE register dependency stalls. This splitter should split
+;; Break partial SSE register dependency stall. This splitter should split
;; late in the pass sequence (after register rename pass), so allocated
;; registers won't change anymore
@@ -14765,18 +14765,26 @@
(symbol_ref "false"))))])
(define_insn "*rcpsf2_sse"
- [(set (match_operand:SF 0 "register_operand" "=x,x")
- (unspec:SF [(match_operand:SF 1 "nonimmediate_operand" "x,m")]
+ [(set (match_operand:SF 0 "register_operand" "=x,x,x")
+ (unspec:SF [(match_operand:SF 1 "nonimmediate_operand" "0,x,m")]
UNSPEC_RCP))]
"TARGET_SSE && TARGET_SSE_MATH"
"@
+ %vrcpss\t{%d1, %0|%0, %d1}
%vrcpss\t{%d1, %0|%0, %d1}
%vrcpss\t{%1, %d0|%d0, %1}"
[(set_attr "type" "sse")
(set_attr "atom_sse_attr" "rcp")
(set_attr "btver2_sse_attr" "rcp")
(set_attr "prefix" "maybe_vex")
- (set_attr "mode" "SF")])
+ (set_attr "mode" "SF")
+ (set (attr "preferred_for_speed")
+ (cond [(eq_attr "alternative" "1")
+ (symbol_ref "TARGET_AVX || !TARGET_SSE_PARTIAL_REG_DEPENDENCY")
+ (eq_attr "alternative" "2")
+ (symbol_ref "!TARGET_SSE_PARTIAL_REG_DEPENDENCY")
+ ]
+ (symbol_ref "true")))])
(define_insn "*fop_xf_1_i387"
[(set (match_operand:XF 0 "register_operand" "=f,f")
@@ -15003,18 +15011,26 @@
(set_attr "bdver1_decode" "direct")])
(define_insn "*rsqrtsf2_sse"
- [(set (match_operand:SF 0 "register_operand" "=x,x")
- (unspec:SF [(match_operand:SF 1 "nonimmediate_operand" "x,m")]
+ [(set (match_operand:SF 0 "register_operand" "=x,x,x")
+ (unspec:SF [(match_operand:SF 1 "nonimmediate_operand" "0,x,m")]
UNSPEC_RSQRT))]
"TARGET_SSE && TARGET_SSE_MATH"
"@
+ %vrsqrtss\t{%d1, %0|%0, %d1}
%vrsqrtss\t{%d1, %0|%0, %d1}
%vrsqrtss\t{%1, %d0|%d0, %1}"
[(set_attr "type" "sse")
(set_attr "atom_sse_attr" "rcp")
(set_attr "btver2_sse_attr" "rcp")
(set_attr "prefix" "maybe_vex")
- (set_attr "mode" "SF")])
+ (set_attr "mode" "SF")
+ (set (attr "preferred_for_speed")
+ (cond [(eq_attr "alternative" "1")
+ (symbol_ref "TARGET_AVX || !TARGET_SSE_PARTIAL_REG_DEPENDENCY")
+ (eq_attr "alternative" "2")
+ (symbol_ref "!TARGET_SSE_PARTIAL_REG_DEPENDENCY")
+ ]
+ (symbol_ref "true")))])
(define_expand "rsqrtsf2"
[(set (match_operand:SF 0 "register_operand")
@@ -15027,11 +15043,12 @@
})
(define_insn "*sqrt<mode>2_sse"
- [(set (match_operand:MODEF 0 "register_operand" "=v,v")
+ [(set (match_operand:MODEF 0 "register_operand" "=v,v,v")
(sqrt:MODEF
- (match_operand:MODEF 1 "nonimmediate_operand" "v,m")))]
+ (match_operand:MODEF 1 "nonimmediate_operand" "0,v,m")))]
"SSE_FLOAT_MODE_P (<MODE>mode) && TARGET_SSE_MATH"
"@
+ %vsqrt<ssemodesuffix>\t{%d1, %0|%0, %d1}
%vsqrt<ssemodesuffix>\t{%d1, %0|%0, %d1}
%vsqrt<ssemodesuffix>\t{%1, %d0|%d0, %1}"
[(set_attr "type" "sse")
@@ -15039,9 +15056,13 @@
(set_attr "btver2_sse_attr" "sqrt")
(set_attr "prefix" "maybe_vex")
(set_attr "mode" "<MODE>")
- (set_attr "athlon_decode" "*")
- (set_attr "amdfam10_decode" "*")
- (set_attr "bdver1_decode" "*")])
+ (set (attr "preferred_for_speed")
+ (cond [(eq_attr "alternative" "1")
+ (symbol_ref "TARGET_AVX || !TARGET_SSE_PARTIAL_REG_DEPENDENCY")
+ (eq_attr "alternative" "2")
+ (symbol_ref "!TARGET_SSE_PARTIAL_REG_DEPENDENCY")
+ ]
+ (symbol_ref "true")))])
(define_expand "sqrt<mode>2"
[(set (match_operand:MODEF 0 "register_operand")
@@ -16175,21 +16196,30 @@
(define_insn "sse4_1_round<mode>2"
- [(set (match_operand:MODEF 0 "register_operand" "=x,x,v")
- (unspec:MODEF [(match_operand:MODEF 1 "nonimmediate_operand" "x,m,vm")
- (match_operand:SI 2 "const_0_to_15_operand" "n,n,n")]
- UNSPEC_ROUND))]
+ [(set (match_operand:MODEF 0 "register_operand" "=x,x,x,v")
+ (unspec:MODEF
+ [(match_operand:MODEF 1 "nonimmediate_operand" "0,x,m,vm")
+ (match_operand:SI 2 "const_0_to_15_operand" "n,n,n,n")]
+ UNSPEC_ROUND))]
"TARGET_SSE4_1"
"@
+ %vround<ssemodesuffix>\t{%2, %d1, %0|%0, %d1, %2}
%vround<ssemodesuffix>\t{%2, %d1, %0|%0, %d1, %2}
%vround<ssemodesuffix>\t{%2, %1, %d0|%d0, %1, %2}
vrndscale<ssemodesuffix>\t{%2, %1, %d0|%d0, %1, %2}"
[(set_attr "type" "ssecvt")
- (set_attr "prefix_extra" "1,1,*")
- (set_attr "length_immediate" "*,*,1")
- (set_attr "prefix" "maybe_vex,maybe_vex,evex")
- (set_attr "isa" "noavx512f,noavx512f,avx512f")
- (set_attr "mode" "<MODE>")])
+ (set_attr "prefix_extra" "1,1,1,*")
+ (set_attr "length_immediate" "*,*,*,1")
+ (set_attr "prefix" "maybe_vex,maybe_vex,maybe_vex,evex")
+ (set_attr "isa" "noavx512f,noavx512f,noavx512f,avx512f")
+ (set_attr "mode" "<MODE>")
+ (set (attr "preferred_for_speed")
+ (cond [(eq_attr "alternative" "1")
+ (symbol_ref "TARGET_AVX || !TARGET_SSE_PARTIAL_REG_DEPENDENCY")
+ (eq_attr "alternative" "2")
+ (symbol_ref "!TARGET_SSE_PARTIAL_REG_DEPENDENCY")
+ ]
+ (symbol_ref "true")))])
(define_insn "rintxf2"
[(set (match_operand:XF 0 "register_operand" "=f")
More information about the Gcc-patches
mailing list