[Bug middle-end/99415] New: s115 benchmark of TSVC is vectorized by icc and not by gcc
hubicka at gcc dot gnu.org
gcc-bugzilla@gcc.gnu.org
Fri Mar 5 15:50:14 GMT 2021
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=99415
Bug ID: 99415
Summary: s115 benchmark of TSVC is vectorized by icc and not by gcc
Product: gcc
Version: 11.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: middle-end
Assignee: unassigned at gcc dot gnu.org
Reporter: hubicka at gcc dot gnu.org
Target Milestone: ---
/* Reduced test case for the TSVC s115 kernel discussed in this report. */
typedef float real_t;
/* iterations/LEN_2D below is integer division: 100000/256 = 390. */
#define iterations 100000
#define LEN_1D 32000
#define LEN_2D 256
/* a has LEN_1D elements, but the loops below only index 0..LEN_2D-1. */
real_t a[LEN_1D],aa[LEN_2D][LEN_2D];
void main()
{
/* Repeat the kernel 1000*(100000/256) = 390000 times. */
for (int nl = 0; nl < 1000*(iterations/LEN_2D); nl++) {
for (int j = 0; j < LEN_2D; j++) {
/* i starts at j+1, so a[j] is never written inside this loop: the
   multiplier a[j] is invariant here, and a[i] / aa[j][i] are accessed
   with unit stride. */
for (int i = j+1; i < LEN_2D; i++) {
a[i] -= aa[j][i] * a[j];
}
}
}
}
With icc, this is built as:
# icc-generated x86-64 code (AT&T syntax) for main() of the s115 test case.
# NOTE: long `# LOE` and `.cfi_escape` lines appear wrapped in this listing; a
# line holding only register names or hex bytes continues the line before it.
#
# Register roles inside the j loop, as set up in ..B1.2 / ..B1.3 / ..B1.21:
#   eax  = nl, the outer repetition counter (compared against 390000)
#   r11  = j;  rbx = 4*j (byte offset of a[j]);  rsi = 1024*j (byte offset of row aa[j])
#   edi  = j+1 (first value of i);  r9d = -(j+1)
#   r10d = 256-(j+1) = trip count of the i loop
main:
..B1.1: # Preds ..B1.0
# Execution count [1.17e-01]
.cfi_startproc
..___tag_value_main.1:
..L2:
#9.1
# Prologue: build an rbp frame, align rsp down to 128 bytes, save r14/r15/rbx
# and reserve 104 bytes of stack.
pushq %rbp #9.1
.cfi_def_cfa_offset 16
movq %rsp, %rbp #9.1
.cfi_def_cfa 6, 16
.cfi_offset 6, -16
andq $-128, %rsp #9.1
pushq %r14 #9.1
pushq %r15 #9.1
pushq %rbx #9.1
subq $104, %rsp #9.1
# Call the Intel runtime helper with edi=3, esi=0; what it does is not
# visible in this listing.
movl $3, %edi #9.1
xorl %esi, %esi #9.1
call __intel_new_feature_proc_init #9.1
.cfi_escape 0x10, 0x03, 0x0e, 0x38, 0x1c, 0x0d, 0x80, 0xff, 0xff, 0xff,
0x1a, 0x0d, 0xe8, 0xff, 0xff, 0xff, 0x22
.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0x80, 0xff, 0xff, 0xff,
0x1a, 0x0d, 0xf8, 0xff, 0xff, 0xff, 0x22
.cfi_escape 0x10, 0x0f, 0x0e, 0x38, 0x1c, 0x0d, 0x80, 0xff, 0xff, 0xff,
0x1a, 0x0d, 0xf0, 0xff, 0xff, 0xff, 0x22
# LOE rbx r12 r13 r14 r15
# Store MXCSR to the stack, OR in 32832 (0x8040) and reload it. Per the Intel
# SDM those are bit 15 (flush-to-zero) and bit 6 (denormals-are-zero).
# The interleaved xorl sets nl = 0.
..B1.29: # Preds ..B1.1
# Execution count [1.17e-01]
vstmxcsr (%rsp) #9.1
xorl %eax, %eax #11.5
orl $32832, (%rsp) #9.1
vldmxcsr (%rsp) #9.1
# LOE r12 r13 eax
# Body of the nl loop: reset every j-loop induction register to 0.
..B1.2: # Preds ..B1.22 ..B1.29
# Execution count [4.50e+04]
xorl %r11d, %r11d #12.9
xorl %edi, %edi #12.9
xorl %ebx, %ebx #12.9
xorl %r9d, %r9d #12.9
xorl %esi, %esi #12.9
# LOE rbx rsi r11 r12 r13 eax edi r9d
# Top of the j loop: edi becomes j+1, r9d becomes -(j+1). If j+1 >= 256 the
# i loop is empty, so jump to the j-loop latch.
..B1.3: # Preds ..B1.21 ..B1.2
# Execution count [1.15e+07]
incl %edi #13.28
decl %r9d #13.28
cmpl $256, %edi #13.35
jge ..B1.21 # Prob 50% #13.35
# LOE rbx rsi r11 r12 r13 eax edi r9d
# r10d = 256 - (j+1) = i-loop trip count; fewer than 16 iterations take the
# scalar-only path at ..B1.25.
..B1.4: # Preds ..B1.3
# Execution count [1.04e+07]
lea 256(%r9), %r10d #13.35
cmpl $16, %r10d #13.13
jl ..B1.25 # Prob 10% #13.13
# LOE rbx rsi r11 r12 r13 eax edi r9d r10d
# Compute the peel count: r8 = (address of aa[j][j+1]) & 31; if that is
# nonzero, r8 = (32 - r8) >> 2, the number of floats up to the next 32-byte
# boundary of aa. r14 = 1024*j + 4*j (byte offset of aa[j][j]).
# If the trip count is below peel + 16, use the scalar-only path.
..B1.5: # Preds ..B1.4
# Execution count [1.04e+07]
lea 4+aa(%rsi,%rbx), %r8 #14.25
andq $31, %r8 #13.13
lea (%rsi,%rbx), %r14 #14.25
movl %r8d, %edx #13.13
negl %edx #13.13
addl $32, %edx #13.13
shrl $2, %edx #13.13
testl %r8d, %r8d #13.13
cmovne %edx, %r8d #13.13
lea 16(%r8), %ecx #13.13
cmpl %ecx, %r10d #13.13
jl ..B1.25 # Prob 10% #13.13
# LOE rbx rsi r8 r11 r12 r13 r14 eax edi r9d
r10d
# ecx = r10d - ((r10d - peel) & 15): the iteration index at which the 16-wide
# loop stops. r15 and rdx are zeroed as the peel-loop counter and byte offset.
# The peel loop is skipped when peel == 0.
..B1.6: # Preds ..B1.5
# Execution count [1.15e+07]
movl %r10d, %ecx #13.13
xorl %r15d, %r15d #13.13
subl %r8d, %ecx #13.13
xorl %edx, %edx #13.13
andl $15, %ecx #13.13
negl %ecx #13.13
addl %r10d, %ecx #13.13
testl %r8d, %r8d #13.13
jbe ..B1.10 # Prob 9% #13.13
# LOE rdx rbx rsi r8 r11 r12 r13 r14 r15 eax
ecx edi r9d r10d
# xmm0 = a[j], loaded once for the peel loop.
..B1.7: # Preds ..B1.6
# Execution count [1.04e+07]
vmovss a(%rbx), %xmm0 #14.36
# LOE rdx rbx rsi r8 r11 r12 r13 r14 r15 eax
ecx edi r9d r10d xmm0
# Scalar peel loop, one float per iteration, for r15 = 0 .. peel-1:
# xmm1 = aa[j][j+1+k]; xmm1 = -(xmm1 * a[j]) + a[j+1+k]; store to a[j+1+k].
..B1.8: # Preds ..B1.8 ..B1.7
# Execution count [3.33e+11]
vmovss 4+aa(%rdx,%r14), %xmm1 #14.25
incq %r15 #13.13
vfnmadd213ss 4+a(%rdx,%rbx), %xmm0, %xmm1 #14.17
vmovss %xmm1, 4+a(%rdx,%rbx) #14.17
addq $4, %rdx #13.13
cmpq %r8, %r15 #13.13
jb ..B1.8 # Prob 99% #13.13
# LOE rdx rbx rsi r8 r11 r12 r13 r14 r15 eax
ecx edi r9d r10d xmm0
# Set up the main vector loop: ymm0 = a[j] broadcast to all 8 lanes,
# r15 = peel + j (element index into a), rdx = ecx sign-extended (loop bound).
..B1.10: # Preds ..B1.8 ..B1.6
# Execution count [1.04e+07]
vbroadcastss a(,%r11,4), %ymm0 #14.36
lea (%r8,%r11), %r15 #13.13
movslq %ecx, %rdx #13.13
.align 16,0x90
# LOE rdx rbx rsi r8 r11 r12 r13 r14 r15 eax
ecx edi r9d r10d ymm0
# Main vector loop: two 8-float vectors (16 elements) per iteration.
# r8 indexes aa relative to aa[j][j+1], r15 indexes a; both advance by 16.
..B1.11: # Preds ..B1.11 ..B1.10
# Execution count [3.33e+11]
vmovups 4+aa(%r14,%r8,4), %ymm1 #14.25
vmovups 36+aa(%r14,%r8,4), %ymm2 #14.25
vfnmadd213ps 4+a(,%r15,4), %ymm0, %ymm1 #14.17
vfnmadd213ps 36+a(,%r15,4), %ymm0, %ymm2 #14.17
vmovups %ymm1, 4+a(,%r15,4) #14.17
vmovups %ymm2, 36+a(,%r15,4) #14.17
addq $16, %r8 #13.13
addq $16, %r15 #13.13
cmpq %rdx, %r8 #13.13
jb ..B1.11 # Prob 99% #13.13
# LOE rdx rbx rsi r8 r11 r12 r13 r14 r15 eax
ecx edi r9d r10d ymm0
# If ecx + 1 > trip count (unsigned compare) nothing is left: go to the next j.
..B1.12: # Preds ..B1.11
# Execution count [1.04e+07]
lea 1(%rcx), %r8d #13.13
cmpl %r10d, %r8d #13.13
ja ..B1.21 # Prob 50% #13.13
# LOE rdx rbx rsi r11 r12 r13 r14 eax ecx edi
r9d r10d
# r10 = trip count - ecx = remaining iterations; with fewer than 4 left, skip
# the 4-wide loop (..B1.24).
..B1.13: # Preds ..B1.12
# Execution count [1.04e+07]
movslq %r10d, %r10 #13.13
subq %rdx, %r10 #13.13
cmpq $4, %r10 #13.13
jl ..B1.24 # Prob 10% #13.13
# LOE rdx rbx rsi r10 r11 r12 r13 r14 eax ecx
edi r9d
# r8 = remaining count rounded down to a multiple of 4. r14 and rdx are
# rebased to the first remaining element (r14 += 4*ecx bytes, rdx = ecx + j);
# r15 = 0 counts elements processed by the 4-wide loop.
..B1.14: # Preds ..B1.13
# Execution count [1.04e+07]
movl %r10d, %r8d #13.13
lea (%r14,%rdx,4), %r14 #14.25
andl $-4, %r8d #13.13
addq %r11, %rdx #13.13
movslq %r8d, %r8 #13.13
xorl %r15d, %r15d #13.13
# LOE rdx rbx rsi r8 r10 r11 r12 r13 r14 r15
eax ecx edi r9d
# 4-wide (xmm) remainder loop. Note that a[j] is re-broadcast from memory on
# every iteration rather than kept in a register.
..B1.15: # Preds ..B1.15 ..B1.14
# Execution count [3.33e+11]
vbroadcastss a(%rbx), %xmm1 #14.36
vmovups 4+aa(%r14,%r15,4), %xmm0 #14.25
vfnmadd213ps 4+a(,%rdx,4), %xmm0, %xmm1 #14.17
addq $4, %r15 #13.13
vmovups %xmm1, 4+a(,%rdx,4) #14.17
addq $4, %rdx #13.13
cmpq %r8, %r15 #13.13
jb ..B1.15 # Prob 99% #13.13
# LOE rdx rbx rsi r8 r10 r11 r12 r13 r14 r15
eax ecx edi r9d
# Entry to the scalar remainder: r8 = elements of this tail already done,
# r14 = 4*r8 as a byte offset. Finished when r8 >= r10.
..B1.17: # Preds ..B1.15 ..B1.24 ..B1.26
# Execution count [1.15e+07]
lea (,%r8,4), %r14 #13.13
cmpq %r10, %r8 #13.13
jae ..B1.21 # Prob 9% #13.13
# LOE rbx rsi r8 r10 r11 r12 r13 r14 eax ecx
edi r9d
# Byte offsets of the first tail element, before the +4 displacement and r14
# are applied: rdx = 4*ecx + 4*j (into a), rcx = 1024*j + 4*j + 4*ecx (into aa).
..B1.18: # Preds ..B1.17
# Execution count [1.04e+07]
movslq %ecx, %rcx #14.17
lea (%rsi,%r11,4), %r15 #14.25
lea (,%rcx,4), %rdx #14.25
lea (%rdx,%r11,4), %rdx #14.17
lea (%r15,%rcx,4), %rcx #14.25
# LOE rdx rcx rbx rsi r8 r10 r11 r12 r13 r14
eax edi r9d
# Scalar remainder loop, one float per iteration; a[j] is reloaded each time.
..B1.19: # Preds ..B1.19 ..B1.18
# Execution count [3.33e+11]
vmovss a(,%r11,4), %xmm1 #14.36
incq %r8 #13.13
vmovss 4+aa(%r14,%rcx), %xmm0 #14.25
vfnmadd213ss 4+a(%r14,%rdx), %xmm0, %xmm1 #14.17
vmovss %xmm1, 4+a(%r14,%rdx) #14.17
addq $4, %r14 #13.13
cmpq %r10, %r8 #13.13
jb ..B1.19 # Prob 99% #13.13
# LOE rdx rcx rbx rsi r8 r10 r11 r12 r13 r14
eax edi r9d
# j-loop latch: advance j (r11), its byte offset into a (rbx, +4) and into
# aa (rsi, +1024 = one row of 256 floats); continue while j+1 < 256.
..B1.21: # Preds ..B1.19 ..B1.25 ..B1.12 ..B1.17 ..B1.3
#
# Execution count [1.15e+07]
addq $4, %rbx #13.28
addq $1024, %rsi #13.28
incq %r11 #13.28
cmpl $256, %edi #12.9
jb ..B1.3 # Prob 99% #12.9
# LOE rbx rsi r11 r12 r13 eax edi r9d
# nl-loop latch. The seven .byte values are 0F 1F 80 00 00 00 00, the 7-byte
# NOP encoding (nopl 0(%rax)). Then nl++ and repeat while nl < 390000,
# which is 1000*(100000/256) from the C source.
..B1.22: # Preds ..B1.21
# Execution count [4.50e+04]
.byte 15 #11.5
.byte 31 #11.5
.byte 128 #11.5
.byte 0 #11.5
.byte 0 #11.5
.byte 0 #11.5
.byte 0 #11.5
incl %eax #11.5
cmpl $390000, %eax #11.5
jb ..B1.2 # Prob 99% #11.5
# LOE r12 r13 eax
# Epilogue: vzeroupper, eax = 0 as the return value, release the 104-byte
# area, restore rbx/r15/r14, then restore rsp from rbp and pop rbp.
..B1.23: # Preds ..B1.22
# Execution count [1.17e-01]
vzeroupper #19.1
xorl %eax, %eax #19.1
addq $104, %rsp #19.1
.cfi_restore 3
popq %rbx #19.1
.cfi_restore 15
popq %r15 #19.1
.cfi_restore 14
popq %r14 #19.1
movq %rbp, %rsp #19.1
popq %rbp #19.1
.cfi_def_cfa 7, 8
.cfi_restore 6
ret #19.1
.cfi_def_cfa 6, 16
.cfi_escape 0x10, 0x03, 0x0e, 0x38, 0x1c, 0x0d, 0x80, 0xff, 0xff, 0xff,
0x1a, 0x0d, 0xe8, 0xff, 0xff, 0xff, 0x22
.cfi_offset 6, -16
.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0x80, 0xff, 0xff, 0xff,
0x1a, 0x0d, 0xf8, 0xff, 0xff, 0xff, 0x22
.cfi_escape 0x10, 0x0f, 0x0e, 0x38, 0x1c, 0x0d, 0x80, 0xff, 0xff, 0xff,
0x1a, 0x0d, 0xf0, 0xff, 0xff, 0xff, 0x22
# LOE
# Infrequent path: fewer than 4 iterations remain after the 16-wide loop, so
# set r8 = 0 and go straight to the scalar remainder.
..B1.24: # Preds ..B1.13
# Execution count [1.04e+06]: Infreq
xorl %r8d, %r8d #13.13
jmp ..B1.17 # Prob 100% #13.13
# LOE rbx rsi r8 r10 r11 r12 r13 eax ecx edi
r9d
# Infrequent path: trip count too small for the vector loop. ecx = 0 makes the
# scalar remainder start at the first element; a zero trip count goes to the
# next j.
..B1.25: # Preds ..B1.5 ..B1.4
# Execution count [1.15e+06]: Infreq
xorl %ecx, %ecx #13.13
cmpl $1, %r10d #13.13
jb ..B1.21 # Prob 50% #13.13
# LOE rbx rsi r11 r12 r13 eax ecx edi r9d r10d
# r10 = full trip count (sign-extended), r8 = 0: every iteration of this j
# runs in the scalar loop ..B1.19.
..B1.26: # Preds ..B1.25
# Execution count [5.77e+05]: Infreq
movslq %r10d, %r10 #13.13
xorl %r8d, %r8d #13.13
jmp ..B1.17 # Prob 100% #13.13
The icc binary runs in 0.7s, while the gcc binary needs 5.7s.
More information about the Gcc-bugs
mailing list