[Bug target/94298] New: x86 duplicates loads
rguenth at gcc dot gnu.org
gcc-bugzilla@gcc.gnu.org
Tue Mar 24 10:17:34 GMT 2020
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=94298
Bug ID: 94298
Summary: x86 duplicates loads
Product: gcc
Version: 10.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: target
Assignee: unassigned at gcc dot gnu.org
Reporter: rguenth at gcc dot gnu.org
Target Milestone: ---
For the following testcase, compiled at -O3 -fgimple (it is a GIMPLE testcase
because the vectorizer-generated code depends on patches that are not yet
committed), we somehow duplicate the load from y:
typedef double v2df __attribute__((vector_size(16)));
typedef long v2di __attribute__((vector_size(16)));
double x[1024], y[1024];
void __GIMPLE (ssa,guessed_local(10737416))
foo ()
{
v2df * vectp_x7;
v2df vect__56;
v2df vect__45;
v2df * vectp_y3;
v2df _12;
v2df _13;
unsigned int _19;
unsigned int _24;
__BB(2,guessed_local(10737416)):
goto __BB3(precise(134217728));
__BB(3,loop_header(1),guessed_local(1063004409)):
vectp_y3_21 = __PHI (__BB2: &y, __BB3: vectp_y3_17);
vectp_x7_6 = __PHI (__BB2: &x, __BB3: vectp_x7_20);
_19 = __PHI (__BB2: 0u, __BB3: _24);
vect__45_14 = __MEM <v2df> ((double *)vectp_y3_21);
_13 = __VEC_PERM (vect__45_14, vect__45_14, _Literal (v2di) { 1l, 1l });
_12 = __VEC_PERM (vect__45_14, vect__45_14, _Literal (v2di) { 0l, 0l });
vect__56_7 = _12 + _13;
__MEM <v2df> ((double *)vectp_x7_6) = vect__56_7;
vectp_y3_17 = vectp_y3_21 + 16ul;
vectp_x7_20 = vectp_x7_6 + 16ul;
_24 = _19 + 1u;
if (_24 != 512u)
goto __BB3(adjusted(132875551));
else
goto __BB4(adjusted(1342177));
__BB(4,guessed_local(10737416)):
return;
}
results in
foo:
.LFB0:
.cfi_startproc
xorl %eax, %eax
.p2align 4,,10
.p2align 3
.L2:
movapd y(%rax), %xmm1
movapd y(%rax), %xmm0
addq $16, %rax
unpcklpd %xmm1, %xmm1
unpckhpd %xmm0, %xmm0
addpd %xmm1, %xmm0
movaps %xmm0, x-16(%rax)
cmpq $8192, %rax
jne .L2
ret
The duplication happens in IRA/LRA, but I suspect that either x86 costing or
the operand constraints make them think this is cheaper. LRA is fed with
(insn 8 6 11 3 (set (reg/v:V2DF 85 [ vect__45 ])
(mem:V2DF (plus:DI (reg:DI 86 [ ivtmp.6 ])
(symbol_ref:DI ("y") [flags 0x2] <var_decl 0x7f2a94b8dbd0 y>))
[1 MEM[symbol: y, index: ivtmp.6_7, offset: 0B]+0 S16 A128])) 1338
{movv2df_internal}
(expr_list:REG_EQUIV (mem:V2DF (plus:DI (reg:DI 86 [ ivtmp.6 ])
(symbol_ref:DI ("y") [flags 0x2] <var_decl 0x7f2a94b8dbd0 y>))
[1 MEM[symbol: y, index: ivtmp.6_7, offset: 0B]+0 S16 A128])
(nil)))
(insn 11 8 12 3 (set (reg:V2DF 89)
(vec_select:V2DF (vec_concat:V4DF (reg/v:V2DF 85 [ vect__45 ])
(reg/v:V2DF 85 [ vect__45 ]))
(parallel [
(const_int 0 [0])
(const_int 2 [0x2])
]))) "t2.c":27:3 2995 {*vec_interleave_lowv2df}
(nil))
(insn 12 11 13 3 (set (reg:V2DF 90)
(vec_select:V2DF (vec_concat:V4DF (reg/v:V2DF 85 [ vect__45 ])
(reg/v:V2DF 85 [ vect__45 ]))
(parallel [
(const_int 1 [0x1])
(const_int 3 [0x3])
]))) "t2.c":27:3 2989 {*vec_interleave_highv2df}
(expr_list:REG_DEAD (reg/v:V2DF 85 [ vect__45 ])
(nil)))
(insn 13 12 14 3 (set (reg:V2DF 91 [ vect__56 ])
(plus:V2DF (reg:V2DF 89)
(reg:V2DF 90))) "t2.c":27:3 1519 {*addv2df3}
(expr_list:REG_DEAD (reg:V2DF 90)
(expr_list:REG_DEAD (reg:V2DF 89)
(expr_list:REG_EQUIV (mem:V2DF (plus:DI (reg:DI 86 [ ivtmp.6 ])
(symbol_ref:DI ("x") [flags 0x2] <var_decl 0x7f2a94b8db40 x>))
[1 MEM[symbol: x, index: ivtmp.6_7, offset: 0B]+0 S16 A128])
(nil)))))
(insn 14 13 15 3 (set (mem:V2DF (plus:DI (reg:DI 86 [ ivtmp.6 ])
(symbol_ref:DI ("x") [flags 0x2] <var_decl 0x7f2a94b8db40 x>))
[1 MEM[symbol: x, index: ivtmp.6_7, offset: 0B]+0 S16 A128])
(reg:V2DF 91 [ vect__56 ])) "t2.c":27:3 1338 {movv2df_internal}
(expr_list:REG_DEAD (reg:V2DF 91 [ vect__56 ])
(nil)))
and the LRA dump shows:
Choosing alt 3 in insn 11: (0) x (1) 0 (2) m
{*vec_interleave_lowv2df}
Creating newreg=92 from oldreg=89, assigning class SSE_REGS to r92
11: r92:V2DF=vec_select(vec_concat(r92:V2DF,[r86:DI+`y']),parallel)
Inserting insn reload before:
26: r92:V2DF=[r86:DI+`y']
Inserting insn reload after:
27: r89:V2DF=r92:V2DF
and postreload CSE cannot do anything because the shuffle clobbers the
register we loaded into (only sched2 moves things such that CSE would be
possible again, but there is no CSE pass after sched2).
More information about the Gcc-bugs
mailing list