[Bug middle-end/99416] New: s211 benchmark of TSVC is vectorized by icc and not by gcc
hubicka at gcc dot gnu.org
gcc-bugzilla@gcc.gnu.org
Fri Mar 5 16:20:54 GMT 2021
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=99416
Bug ID: 99416
Summary: s211 benchmark of TSVC is vectorized by icc and not by gcc
Product: gcc
Version: 11.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: middle-end
Assignee: unassigned at gcc dot gnu.org
Reporter: hubicka at gcc dot gnu.org
Target Milestone: ---
/* Element type of every array in this test case. */
typedef float real_t;
/* Trip count of the outer (repeat) loop in main(). */
#define iterations 100000
/* Number of elements in each of the global arrays below. */
#define LEN_1D 32000
/* Not referenced anywhere in this snippet. */
#define LEN_2D 256
/* Global arrays: main() writes a and b, and only reads c, d and e. */
real_t a[LEN_1D],b[LEN_1D],c[LEN_1D],d[LEN_1D],e[LEN_1D];
/*
 * Test case from the bug report (s211 kernel of TSVC): runs the inner loop
 * over i = 1 .. LEN_1D-2 and repeats it `iterations` times.
 *
 * NOTE(review): `void main()` is not a standard-conforming signature; it is
 * left exactly as posted because this is the quoted reproducer.
 */
void main()
{
/* Outer loop only repeats the kernel; nl is not used in the body. */
for (int nl = 0; nl < iterations; nl++) {
for (int i = 1; i < LEN_1D-1; i++) {
/* Reads b[i - 1], which the statement below wrote in the previous
   iteration of i (for i > 1). */
a[i] = b[i - 1] + c[i] * d[i];
/* Reads b[i + 1], which is only overwritten in the next iteration of i. */
b[i] = b[i + 1] - e[i] * d[i];
}
}
}
icc produces the following assembly for this loop:
# icc-generated x86-64 assembly (AT&T syntax, AVX/FMA instructions) for the
# test case in this report.  The trailing "#N.M" annotations are emitted by
# the compiler; they look like source line.column positions -- TODO confirm
# against the original file.
# NOTE(review): the label reads "ain" although the C snippet defines main();
# the listing is kept exactly as posted.
#
# Overall shape, as read from the addressing below: on every pass of the
# outer loop (counter in %edx) the stores to b (tagged #14.13) are done by
# one set of loops (..B1.2 - ..B1.6) and the stores to a (tagged #13.13) by a
# second set (..B1.6 - ..B1.11), i.e. the two source statements run as two
# separate loops, each with a scalar head, a 256-bit main loop, a 128-bit
# remainder loop and a scalar tail.
ain:
# Prologue and loop preheader.  Reserves 136 bytes of stack for spill slots,
# zeroes the outer-loop counter %edx and sets the constant start indices
# that later blocks copy into %r8/%r9 (%rdi = 7, %rsi = 0, %rcx = 31977,
# %rax = 31975).  %r8/%r9/%r10 point at d+12, e+12, c+12 and %r11 at b+24.
# The vmulss instructions form d*e and d*c products for the elements at byte
# offsets 4..28 and 127984..127992; c, d and e are never stored to in this
# listing, so these products are computed once here and reused.
..B1.1: # Preds ..B1.0
# Execution count [0.00e+00]
.cfi_startproc
..___tag_value_ain.1:
..L2:
#9.1
subq $136, %rsp #9.1
.cfi_def_cfa_offset 144
xorl %edx, %edx #11.5
lea 12+d(%rip), %r8 #14.38
vmovss (%r8), %xmm0 #14.38
movl $7, %edi #13.38
lea 12+e(%rip), %r9 #14.38
vmulss (%r9), %xmm0, %xmm12 #14.38
xorl %esi, %esi #13.38
lea 12+c(%rip), %r10 #13.38
vmulss (%r10), %xmm0, %xmm0 #13.38
vmovss 16(%r8), %xmm4 #14.38
movl $31977, %ecx #12.9
vmulss 16(%r9), %xmm4, %xmm14 #14.38
movl $31975, %eax #12.9
lea 24+b(%rip), %r11 #14.20
vmovss (%r11), %xmm11 #14.20
vmovss 4(%r8), %xmm6 #14.38
vmovss %xmm12, 104(%rsp) #14.38[spill]
vmovss %xmm11, 8(%rsp) #14.20[spill]
vmulss 4(%r9), %xmm6, %xmm12 #14.38
vmulss 4(%r10), %xmm6, %xmm11 #13.38
vmovss 127984+d(%rip), %xmm6 #14.38
vmovss 8(%r8), %xmm13 #14.38
vmovss %xmm14, 96(%rsp) #14.38[spill]
vmulss 127984+e(%rip), %xmm6, %xmm14 #14.38
vmulss 8(%r9), %xmm13, %xmm1 #14.38
vmovss %xmm14, 112(%rsp) #14.38[spill]
vmovss 127988+d(%rip), %xmm14 #14.38
vmovss %xmm1, 16(%rsp) #14.38[spill]
vmulss 8(%r10), %xmm13, %xmm1 #13.38
vmulss 16(%r10), %xmm4, %xmm13 #13.38
vmulss 127988+e(%rip), %xmm14, %xmm4 #14.38
vmovss %xmm4, 120(%rsp) #14.38[spill]
vmulss 127988+c(%rip), %xmm14, %xmm4 #13.38
vmovss -4(%r11), %xmm5 #14.20
vmovss -8(%r8), %xmm2 #14.38
vmovss 12(%r8), %xmm15 #14.38
vmovss %xmm4, 24(%rsp) #13.38[spill]
vmovss 127992+d(%rip), %xmm4 #14.38
vmovss %xmm5, (%rsp) #14.20[spill]
vmulss -8(%r9), %xmm2, %xmm3 #14.38
vmulss -8(%r10), %xmm2, %xmm5 #13.38
vmulss 12(%r9), %xmm15, %xmm2 #14.38
vmulss 12(%r10), %xmm15, %xmm15 #13.38
vmulss 127992+e(%rip), %xmm4, %xmm14 #14.38
vmulss 127992+c(%rip), %xmm4, %xmm4 #13.38
vmovss -4(%r8), %xmm10 #14.38
vmulss -4(%r9), %xmm10, %xmm7 #14.38
vmulss -4(%r10), %xmm10, %xmm10 #13.38
vmovss %xmm7, 88(%rsp) #14.38[spill]
vmovss %xmm4, 32(%rsp) #13.38[spill]
vmovss %xmm15, 56(%rsp) #13.31[spill]
vmovss %xmm14, 40(%rsp) #13.31[spill]
vmovss %xmm3, 80(%rsp) #13.31[spill]
vmovss -16(%r11), %xmm9 #14.20
vmovss -12(%r11), %xmm8 #14.20
vmovss -8(%r11), %xmm7 #14.20
vmovss 127984+c(%rip), %xmm4 #13.31
vmovss %xmm1, 64(%rsp) #13.31[spill]
vmovss %xmm0, 48(%rsp) #13.31[spill]
vmovss %xmm2, 72(%rsp) #13.31[spill]
vmovss 16(%rsp), %xmm14 #13.31[spill]
vmovss 8(%rsp), %xmm15 #13.31[spill]
vmovss (%rsp), %xmm3 #13.31[spill]
# LOE rax rcx rbx rbp rsi rdi r12 r13 r14 r15
edx xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm12 xmm13 xmm14 xmm15
# Outer-loop head (entered from the preheader and from ..B1.10).
# Scalar head of the b update: seven vsubss results are stored to byte
# offsets 4..28 from b (b[1]..b[7]).  Also loads %r8 = 7 as the starting
# index for ..B1.3.
..B1.2: # Preds ..B1.10 ..B1.1
# Execution count [1.00e+05]
movq %rdi, %r8 #12.9
vsubss 80(%rsp), %xmm9, %xmm0 #14.38[spill]
vsubss 88(%rsp), %xmm8, %xmm1 #14.38[spill]
vsubss 104(%rsp), %xmm7, %xmm2 #14.38[spill]
vsubss %xmm14, %xmm15, %xmm7 #14.38
vsubss %xmm12, %xmm3, %xmm3 #14.38
vmovss 28+b(%rip), %xmm8 #14.20
vmovss 32+b(%rip), %xmm15 #14.20
vmovss %xmm0, 4+b(%rip) #14.13
vmovss %xmm1, 8+b(%rip) #14.13
vmovss %xmm2, 12+b(%rip) #14.13
vmovss %xmm3, 16+b(%rip) #14.13
vmovss %xmm7, 20+b(%rip) #14.13
vsubss 72(%rsp), %xmm8, %xmm9 #14.38[spill]
vsubss 96(%rsp), %xmm15, %xmm0 #14.38[spill]
vmovss %xmm9, 24+b(%rip) #14.13
vmovss %xmm0, 28+b(%rip) #14.13
# LOE rax rcx rbx rbp rsi rdi r8 r12 r13 r14
r15 edx xmm4 xmm5 xmm6 xmm10 xmm11 xmm12 xmm13 xmm14
# Main vector loop of the b update.  Each iteration handles four 8-float
# ymm vectors (32 elements).  With i = %r8 + 1, vfnmadd213ps computes
# b[i+1] - d[i]*e[i] and the result is stored to b[i].
# %r8 starts at 7 and advances by 32 while %r8 < 31975 (unsigned compare).
..B1.3: # Preds ..B1.3 ..B1.2
# Execution count [3.20e+09]
vmovups 4+e(,%r8,4), %ymm1 #14.31
lea (,%r8,4), %r9 #14.13
vmovups 36+e(,%r8,4), %ymm3 #14.31
vmovups 68+e(,%r8,4), %ymm8 #14.31
vmovups 100+e(,%r8,4), %ymm15 #14.31
vmovups 4+d(,%r8,4), %ymm0 #14.38
vmovups 36+d(,%r8,4), %ymm2 #14.38
vmovups 68+d(,%r8,4), %ymm7 #14.38
vmovups 100+d(,%r8,4), %ymm9 #14.38
vfnmadd213ps 8+b(,%r8,4), %ymm0, %ymm1 #14.38
vfnmadd213ps 40+b(,%r8,4), %ymm2, %ymm3 #14.38
vfnmadd213ps 72+b(,%r8,4), %ymm7, %ymm8 #14.38
vfnmadd213ps 104+b(,%r8,4), %ymm9, %ymm15 #14.38
vmovups %ymm1, 4+b(%r9) #14.13
vmovups %ymm3, 36+b(%r9) #14.13
vmovups %ymm8, 68+b(%r9) #14.13
vmovups %ymm15, 100+b(%r9) #14.13
addq $32, %r8 #12.9
cmpq $31975, %r8 #12.9
jb ..B1.3 # Prob 99% #12.9
# LOE rax rcx rbx rbp rsi rdi r8 r12 r13 r14
r15 edx xmm4 xmm5 xmm6 xmm10 xmm11 xmm12 xmm13 xmm14
# Set up the indices for the 128-bit remainder loop of the b update:
# %r9 = 0 (from %rsi) and %r8 = 31977 (from %rcx).
..B1.4: # Preds ..B1.3
# Execution count [1.00e+05]
movq %rsi, %r9 #12.9
movq %rcx, %r8 #12.9
# LOE rax rcx rbx rbp rsi rdi r8 r9 r12 r13 r14
r15 edx xmm4 xmm5 xmm6 xmm10 xmm11 xmm12 xmm13 xmm14
# Remainder loop of the b update using 4-float xmm vectors.  Byte offset
# 127904 is element 31976, so this stores b[31976 + %r9 ...] computed from
# b[%r8 ...] (one element further on) and d, e at 31976 + %r9.
# %r9 and %r8 advance by 4 while %r9 < 20.
..B1.5: # Preds ..B1.5 ..B1.4
# Execution count [3.20e+09]
vmovups 127904+e(,%r9,4), %xmm1 #14.31
vmovups 127904+d(,%r9,4), %xmm0 #14.38
vfnmadd213ps b(,%r8,4), %xmm0, %xmm1 #14.38
addq $4, %r8 #12.9
vmovups %xmm1, 127904+b(,%r9,4) #14.13
addq $4, %r9 #12.9
cmpq $20, %r9 #12.9
jb ..B1.5 # Prob 99% #12.9
# LOE rax rcx rbx rbp rsi rdi r8 r9 r12 r13 r14
r15 edx xmm4 xmm5 xmm6 xmm10 xmm11 xmm12 xmm13 xmm14
# Scalar tail of the b update interleaved with the scalar head of the a
# computation.  Stores to b at byte offsets 127984, 127988, 127992
# (b[31996]..b[31998]) and to a at byte offsets 4..28 (a[1]..a[7]).
# Also resets %r8 = 7 for ..B1.7 and spills %xmm9 to (%rsp); ..B1.10
# reloads that slot into %xmm9.
..B1.6: # Preds ..B1.5
# Execution count [1.00e+05]
vmovss 127996+b(%rip), %xmm9 #14.20
movq %rdi, %r8 #12.9
vmovss 127992+b(%rip), %xmm1 #14.20
vmovss 127988+b(%rip), %xmm2 #14.20
vaddss b(%rip), %xmm5, %xmm7 #13.38
vaddss 4+b(%rip), %xmm10, %xmm3 #13.38
vsubss 40(%rsp), %xmm9, %xmm8 #14.38[spill]
vsubss 112(%rsp), %xmm2, %xmm2 #14.38[spill]
vsubss 120(%rsp), %xmm1, %xmm1 #14.38[spill]
vmovss %xmm7, 4+a(%rip) #13.13
vmovss 16+b(%rip), %xmm7 #13.20
vmovss %xmm3, 8+a(%rip) #13.13
vmovss 8+b(%rip), %xmm9 #13.20
vmovss %xmm8, 127992+b(%rip) #14.13
vmovss 12+b(%rip), %xmm8 #13.20
vmovss %xmm2, 127984+b(%rip) #14.13
vaddss %xmm11, %xmm8, %xmm0 #13.38
vaddss 64(%rsp), %xmm7, %xmm3 #13.38[spill]
vaddss 48(%rsp), %xmm9, %xmm15 #13.38[spill]
vmovss %xmm3, 20+a(%rip) #13.13
vmovss 20+b(%rip), %xmm3 #13.20
vmovss %xmm15, 12+a(%rip) #13.13
vmovss %xmm0, 16+a(%rip) #13.13
vmovss %xmm1, 127988+b(%rip) #14.13
vmovss %xmm9, (%rsp) #13.13[spill]
vaddss 56(%rsp), %xmm3, %xmm15 #13.38[spill]
vmovss %xmm15, 24+a(%rip) #13.13
vmovss 24+b(%rip), %xmm15 #13.20
vaddss %xmm13, %xmm15, %xmm0 #13.38
vmovss %xmm0, 28+a(%rip) #13.13
# LOE rax rcx rbx rbp rsi rdi r8 r12 r13 r14 r15
edx xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm10 xmm11 xmm12 xmm13 xmm14 xmm15
# Main vector loop of the a computation.  Each iteration handles four
# 8-float ymm vectors (32 elements).  With i = %r8 + 1, vfmadd213ps computes
# c[i]*d[i] + b[i-1] and the result is stored to a[i].
# Same trip control as ..B1.3: %r8 from 7, step 32, while %r8 < 31975.
..B1.7: # Preds ..B1.7 ..B1.6
# Execution count [3.20e+09]
vmovups 4+c(,%r8,4), %ymm9 #13.31
lea (,%r8,4), %r9 #13.13
vmovups 4+d(,%r8,4), %ymm0 #13.38
vfmadd213ps b(,%r8,4), %ymm0, %ymm9 #13.38
vmovups 36+d(,%r8,4), %ymm0 #13.38
vmovups %ymm9, 4+a(%r9) #13.13
vmovups 36+c(,%r8,4), %ymm9 #13.31
vfmadd213ps 32+b(,%r8,4), %ymm0, %ymm9 #13.38
vmovups 68+d(,%r8,4), %ymm0 #13.38
vmovups %ymm9, 36+a(%r9) #13.13
vmovups 68+c(,%r8,4), %ymm9 #13.31
vfmadd213ps 64+b(,%r8,4), %ymm0, %ymm9 #13.38
vmovups 100+d(,%r8,4), %ymm0 #13.38
vmovups %ymm9, 68+a(%r9) #13.13
vmovups 100+c(,%r8,4), %ymm9 #13.31
vfmadd213ps 96+b(,%r8,4), %ymm0, %ymm9 #13.38
addq $32, %r8 #12.9
vmovups %ymm9, 100+a(%r9) #13.13
cmpq $31975, %r8 #12.9
jb ..B1.7 # Prob 99% #12.9
# LOE rax rcx rbx rbp rsi rdi r8 r12 r13 r14
r15 edx xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm10 xmm11 xmm12 xmm13 xmm14
xmm15
# Set up the indices for the 128-bit remainder loop of the a computation:
# %r9 = 0 (from %rsi) and %r8 = 31975 (from %rax).
..B1.8: # Preds ..B1.7
# Execution count [1.00e+05]
movq %rsi, %r9 #12.9
movq %rax, %r8 #12.9
# LOE rax rcx rbx rbp rsi rdi r8 r9 r12 r13 r14
r15 edx xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm10 xmm11 xmm12 xmm13 xmm14
xmm15
# Remainder loop of the a computation using 4-float xmm vectors: stores
# a[31976 + %r9 ...] computed from c, d at 31976 + %r9 and b[%r8 ...]
# (one element earlier).  %r9 and %r8 advance by 4 while %r9 < 20.
..B1.9: # Preds ..B1.9 ..B1.8
# Execution count [3.20e+09]
vmovups 127904+c(,%r9,4), %xmm9 #13.31
vmovups 127904+d(,%r9,4), %xmm0 #13.38
vfmadd213ps b(,%r8,4), %xmm0, %xmm9 #13.38
addq $4, %r8 #12.9
vmovups %xmm9, 127904+a(,%r9,4) #13.13
addq $4, %r9 #12.9
cmpq $20, %r9 #12.9
jb ..B1.9 # Prob 99% #12.9
# LOE rax rcx rbx rbp rsi rdi r8 r9 r12 r13 r14
r15 edx xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm10 xmm11 xmm12 xmm13 xmm14
xmm15
# Outer-loop latch: increments %edx and branches back to ..B1.2 while
# %edx < 100000 (unsigned compare).  It also forms
# %xmm0 = %xmm4*%xmm6 + b[31995] (byte offset 127980) with vfmadd231ss and
# reloads %xmm9 from the (%rsp) spill slot; the result in %xmm0 is not
# stored in this block.
..B1.10: # Preds ..B1.9
# Execution count [1.07e+09]
incl %edx #11.5
vmovss 127980+b(%rip), %xmm0 #13.20
vmovss (%rsp), %xmm9 #[spill]
vfmadd231ss %xmm6, %xmm4, %xmm0 #13.38
cmpl $100000, %edx #11.5
jb ..B1.2 # Prob 99% #11.5
# LOE rax rcx rbx rbp rsi rdi r12 r13 r14 r15
edx xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm12 xmm13
xmm14 xmm15
# Exit block, reached once the outer loop has finished.  Stores three scalar
# results at byte offsets 127984, 127988, 127992 from a
# (a[31996]..a[31998]); these are the only stores to those elements in the
# listing.  Then vzeroupper, release the 136-byte frame and return.
..B1.11: # Preds ..B1.10
# Execution count [1.00e+00]
vmovss %xmm0, 127984+a(%rip) #13.13
vaddss 32(%rsp), %xmm1, %xmm1 #13.38[spill]
vaddss 24(%rsp), %xmm2, %xmm2 #13.38[spill]
vmovss %xmm1, 127992+a(%rip) #13.13
vmovss %xmm2, 127988+a(%rip) #13.13
vzeroupper #17.1
addq $136, %rsp #17.1
.cfi_def_cfa_offset 8
ret #17.1
More information about the Gcc-bugs
mailing list