typedef float real_t;
#define iterations 100000
#define LEN_1D 32000
#define LEN_2D 256

real_t a[LEN_1D], aa[LEN_2D][LEN_2D];

void main()
{
  for (int nl = 0; nl < 1000*(iterations/LEN_2D); nl++) {
    for (int j = 0; j < LEN_2D; j++) {
      for (int i = j+1; i < LEN_2D; i++) {
        a[i] -= aa[j][i] * a[j];
      }
    }
  }
}

is built as:

main:
..B1.1:                         # Preds ..B1.0
                                # Execution count [1.17e-01]
        .cfi_startproc
..___tag_value_main.1:
..L2:                                                     #9.1
        pushq     %rbp                                          #9.1
        .cfi_def_cfa_offset 16
        movq      %rsp, %rbp                                    #9.1
        .cfi_def_cfa 6, 16
        .cfi_offset 6, -16
        andq      $-128, %rsp                                   #9.1
        pushq     %r14                                          #9.1
        pushq     %r15                                          #9.1
        pushq     %rbx                                          #9.1
        subq      $104, %rsp                                    #9.1
        movl      $3, %edi                                      #9.1
        xorl      %esi, %esi                                    #9.1
        call      __intel_new_feature_proc_init                 #9.1
        .cfi_escape 0x10, 0x03, 0x0e, 0x38, 0x1c, 0x0d, 0x80, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xe8, 0xff, 0xff, 0xff, 0x22
        .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0x80, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf8, 0xff, 0xff, 0xff, 0x22
        .cfi_escape 0x10, 0x0f, 0x0e, 0x38, 0x1c, 0x0d, 0x80, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf0, 0xff, 0xff, 0xff, 0x22
                                # LOE rbx r12 r13 r14 r15
..B1.29:                        # Preds ..B1.1
                                # Execution count [1.17e-01]
        vstmxcsr  (%rsp)                                        #9.1
        xorl      %eax, %eax                                    #11.5
        orl       $32832, (%rsp)                                #9.1
        vldmxcsr  (%rsp)                                        #9.1
                                # LOE r12 r13 eax
..B1.2:                         # Preds ..B1.22 ..B1.29
                                # Execution count [4.50e+04]
        xorl      %r11d, %r11d                                  #12.9
        xorl      %edi, %edi                                    #12.9
        xorl      %ebx, %ebx                                    #12.9
        xorl      %r9d, %r9d                                    #12.9
        xorl      %esi, %esi                                    #12.9
                                # LOE rbx rsi r11 r12 r13 eax edi r9d
..B1.3:                         # Preds ..B1.21 ..B1.2
                                # Execution count [1.15e+07]
        incl      %edi                                          #13.28
        decl      %r9d                                          #13.28
        cmpl      $256, %edi                                    #13.35
        jge       ..B1.21       # Prob 50%                      #13.35
                                # LOE rbx rsi r11 r12 r13 eax edi r9d
..B1.4:                         # Preds ..B1.3
                                # Execution count [1.04e+07]
        lea       256(%r9), %r10d                               #13.35
        cmpl      $16, %r10d                                    #13.13
        jl        ..B1.25       # Prob 10%                      #13.13
                                # LOE rbx rsi r11 r12 r13 eax edi r9d r10d
..B1.5:                         # Preds ..B1.4
                                # Execution count [1.04e+07]
        lea       4+aa(%rsi,%rbx), %r8                          #14.25
        andq      $31, %r8                                      #13.13
        lea       (%rsi,%rbx), %r14                             #14.25
        movl      %r8d, %edx                                    #13.13
        negl      %edx                                          #13.13
        addl      $32, %edx                                     #13.13
        shrl      $2, %edx                                      #13.13
        testl     %r8d, %r8d                                    #13.13
        cmovne    %edx, %r8d                                    #13.13
        lea       16(%r8), %ecx                                 #13.13
        cmpl      %ecx, %r10d                                   #13.13
        jl        ..B1.25       # Prob 10%                      #13.13
                                # LOE rbx rsi r8 r11 r12 r13 r14 eax edi r9d r10d
..B1.6:                         # Preds ..B1.5
                                # Execution count [1.15e+07]
        movl      %r10d, %ecx                                   #13.13
        xorl      %r15d, %r15d                                  #13.13
        subl      %r8d, %ecx                                    #13.13
        xorl      %edx, %edx                                    #13.13
        andl      $15, %ecx                                     #13.13
        negl      %ecx                                          #13.13
        addl      %r10d, %ecx                                   #13.13
        testl     %r8d, %r8d                                    #13.13
        jbe       ..B1.10       # Prob 9%                       #13.13
                                # LOE rdx rbx rsi r8 r11 r12 r13 r14 r15 eax ecx edi r9d r10d
..B1.7:                         # Preds ..B1.6
                                # Execution count [1.04e+07]
        vmovss    a(%rbx), %xmm0                                #14.36
                                # LOE rdx rbx rsi r8 r11 r12 r13 r14 r15 eax ecx edi r9d r10d xmm0
..B1.8:                         # Preds ..B1.8 ..B1.7
                                # Execution count [3.33e+11]
        vmovss    4+aa(%rdx,%r14), %xmm1                        #14.25
        incq      %r15                                          #13.13
        vfnmadd213ss 4+a(%rdx,%rbx), %xmm0, %xmm1               #14.17
        vmovss    %xmm1, 4+a(%rdx,%rbx)                         #14.17
        addq      $4, %rdx                                      #13.13
        cmpq      %r8, %r15                                     #13.13
        jb        ..B1.8        # Prob 99%                      #13.13
                                # LOE rdx rbx rsi r8 r11 r12 r13 r14 r15 eax ecx edi r9d r10d xmm0
..B1.10:                        # Preds ..B1.8 ..B1.6
                                # Execution count [1.04e+07]
        vbroadcastss a(,%r11,4), %ymm0                          #14.36
        lea       (%r8,%r11), %r15                              #13.13
        movslq    %ecx, %rdx                                    #13.13
        .align    16,0x90
                                # LOE rdx rbx rsi r8 r11 r12 r13 r14 r15 eax ecx edi r9d r10d ymm0
..B1.11:                        # Preds ..B1.11 ..B1.10
                                # Execution count [3.33e+11]
        vmovups   4+aa(%r14,%r8,4), %ymm1                       #14.25
        vmovups   36+aa(%r14,%r8,4), %ymm2                      #14.25
        vfnmadd213ps 4+a(,%r15,4), %ymm0, %ymm1                 #14.17
        vfnmadd213ps 36+a(,%r15,4), %ymm0, %ymm2                #14.17
        vmovups   %ymm1, 4+a(,%r15,4)                           #14.17
        vmovups   %ymm2, 36+a(,%r15,4)                          #14.17
        addq      $16, %r8                                      #13.13
        addq      $16, %r15                                     #13.13
        cmpq      %rdx, %r8                                     #13.13
        jb        ..B1.11       # Prob 99%                      #13.13
                                # LOE rdx rbx rsi r8 r11 r12 r13 r14 r15 eax ecx edi r9d r10d ymm0
..B1.12:                        # Preds ..B1.11
                                # Execution count [1.04e+07]
        lea       1(%rcx), %r8d                                 #13.13
        cmpl      %r10d, %r8d                                   #13.13
        ja        ..B1.21       # Prob 50%                      #13.13
                                # LOE rdx rbx rsi r11 r12 r13 r14 eax ecx edi r9d r10d
..B1.13:                        # Preds ..B1.12
                                # Execution count [1.04e+07]
        movslq    %r10d, %r10                                   #13.13
        subq      %rdx, %r10                                    #13.13
        cmpq      $4, %r10                                      #13.13
        jl        ..B1.24       # Prob 10%                      #13.13
                                # LOE rdx rbx rsi r10 r11 r12 r13 r14 eax ecx edi r9d
..B1.14:                        # Preds ..B1.13
                                # Execution count [1.04e+07]
        movl      %r10d, %r8d                                   #13.13
        lea       (%r14,%rdx,4), %r14                           #14.25
        andl      $-4, %r8d                                     #13.13
        addq      %r11, %rdx                                    #13.13
        movslq    %r8d, %r8                                     #13.13
        xorl      %r15d, %r15d                                  #13.13
                                # LOE rdx rbx rsi r8 r10 r11 r12 r13 r14 r15 eax ecx edi r9d
..B1.15:                        # Preds ..B1.15 ..B1.14
                                # Execution count [3.33e+11]
        vbroadcastss a(%rbx), %xmm1                             #14.36
        vmovups   4+aa(%r14,%r15,4), %xmm0                      #14.25
        vfnmadd213ps 4+a(,%rdx,4), %xmm0, %xmm1                 #14.17
        addq      $4, %r15                                      #13.13
        vmovups   %xmm1, 4+a(,%rdx,4)                           #14.17
        addq      $4, %rdx                                      #13.13
        cmpq      %r8, %r15                                     #13.13
        jb        ..B1.15       # Prob 99%                      #13.13
                                # LOE rdx rbx rsi r8 r10 r11 r12 r13 r14 r15 eax ecx edi r9d
..B1.17:                        # Preds ..B1.15 ..B1.24 ..B1.26
                                # Execution count [1.15e+07]
        lea       (,%r8,4), %r14                                #13.13
        cmpq      %r10, %r8                                     #13.13
        jae       ..B1.21       # Prob 9%                       #13.13
                                # LOE rbx rsi r8 r10 r11 r12 r13 r14 eax ecx edi r9d
..B1.18:                        # Preds ..B1.17
                                # Execution count [1.04e+07]
        movslq    %ecx, %rcx                                    #14.17
        lea       (%rsi,%r11,4), %r15                           #14.25
        lea       (,%rcx,4), %rdx                               #14.25
        lea       (%rdx,%r11,4), %rdx                           #14.17
        lea       (%r15,%rcx,4), %rcx                           #14.25
                                # LOE rdx rcx rbx rsi r8 r10 r11 r12 r13 r14 eax edi r9d
..B1.19:                        # Preds ..B1.19 ..B1.18
                                # Execution count [3.33e+11]
        vmovss    a(,%r11,4), %xmm1                             #14.36
        incq      %r8                                           #13.13
        vmovss    4+aa(%r14,%rcx), %xmm0                        #14.25
        vfnmadd213ss 4+a(%r14,%rdx), %xmm0, %xmm1               #14.17
        vmovss    %xmm1, 4+a(%r14,%rdx)                         #14.17
        addq      $4, %r14                                      #13.13
        cmpq      %r10, %r8                                     #13.13
        jb        ..B1.19       # Prob 99%                      #13.13
                                # LOE rdx rcx rbx rsi r8 r10 r11 r12 r13 r14 eax edi r9d
..B1.21:                        # Preds ..B1.19 ..B1.25 ..B1.12 ..B1.17 ..B1.3
                                # Execution count [1.15e+07]
        addq      $4, %rbx                                      #13.28
        addq      $1024, %rsi                                   #13.28
        incq      %r11                                          #13.28
        cmpl      $256, %edi                                    #12.9
        jb        ..B1.3        # Prob 99%                      #12.9
                                # LOE rbx rsi r11 r12 r13 eax edi r9d
..B1.22:                        # Preds ..B1.21
                                # Execution count [4.50e+04]
        .byte     15                                            #11.5
        .byte     31                                            #11.5
        .byte     128                                           #11.5
        .byte     0                                             #11.5
        .byte     0                                             #11.5
        .byte     0                                             #11.5
        .byte     0                                             #11.5
        incl      %eax                                          #11.5
        cmpl      $390000, %eax                                 #11.5
        jb        ..B1.2        # Prob 99%                      #11.5
                                # LOE r12 r13 eax
..B1.23:                        # Preds ..B1.22
                                # Execution count [1.17e-01]
        vzeroupper                                              #19.1
        xorl      %eax, %eax                                    #19.1
        addq      $104, %rsp                                    #19.1
        .cfi_restore 3
        popq      %rbx                                          #19.1
        .cfi_restore 15
        popq      %r15                                          #19.1
        .cfi_restore 14
        popq      %r14                                          #19.1
        movq      %rbp, %rsp                                    #19.1
        popq      %rbp                                          #19.1
        .cfi_def_cfa 7, 8
        .cfi_restore 6
        ret                                                     #19.1
        .cfi_def_cfa 6, 16
        .cfi_escape 0x10, 0x03, 0x0e, 0x38, 0x1c, 0x0d, 0x80, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xe8, 0xff, 0xff, 0xff, 0x22
        .cfi_offset 6, -16
        .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0x80, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf8, 0xff, 0xff, 0xff, 0x22
        .cfi_escape 0x10, 0x0f, 0x0e, 0x38, 0x1c, 0x0d, 0x80, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf0, 0xff, 0xff, 0xff, 0x22
                                # LOE
..B1.24:                        # Preds ..B1.13
                                # Execution count [1.04e+06]: Infreq
        xorl      %r8d, %r8d                                    #13.13
        jmp       ..B1.17       # Prob 100%                     #13.13
                                # LOE rbx rsi r8 r10 r11 r12 r13 eax ecx edi r9d
..B1.25:                        # Preds ..B1.5 ..B1.4
                                # Execution count [1.15e+06]: Infreq
        xorl      %ecx, %ecx                                    #13.13
        cmpl      $1, %r10d                                     #13.13
        jb        ..B1.21       # Prob 50%                      #13.13
                                # LOE rbx rsi r11 r12 r13 eax ecx edi r9d r10d
..B1.26:                        # Preds ..B1.25
                                # Execution count [5.77e+05]: Infreq
        movslq    %r10d, %r10                                   #13.13
        xorl      %r8d, %r8d                                    #13.13
        jmp       ..B1.17       # Prob 100%                     #13.13

This runs in 0.7s, while the gcc binary needs 5.7s.
It seems the benchmark is written badly, in a way that confuses our loop header copying. Writing

  for (int j = 0; j < LEN_2D-1; j++) {
    for (int i = j+1; i < LEN_2D; i++) {
      a[i] -= aa[j][i] * a[j];
    }
  }

fixes the vectorization. This is possibly a mistake users make as well, so it is probably worth investigating further. I am not sure how to address this most easily - we'd like to peel the last iteration of the outer loop, noting that it does nothing (see the sketches below). Maybe loop splitting can figure this out? Alternatively, loop header copying should just do its job... Hmm, actually loop-header copying does do its job, but then jump threading messes this up again (the loop header check is redundant for all but the last iteration of the outer loop). So -fno-tree-dominator-opts fixes this as well. And for some reason ch_vect thinks the loops are all do-while loops.
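To illustrate at the source level (the passes of course work on GIMPLE, so these are only sketches of the shapes involved, not the actual IL): after loop header copying the inner loop is roughly

  /* Guard produced by loop header copying; it is redundant for every
     outer iteration except the last one (j == LEN_2D-1).  */
  if (j+1 < LEN_2D) {
    int i = j+1;
    do {
      a[i] -= aa[j][i] * a[j];
      i++;
    } while (i < LEN_2D);
  }

and the peeling I have in mind would amount to

  /* All outer iterations with a non-empty inner loop.  */
  for (int j = 0; j < LEN_2D-1; j++)
    for (int i = j+1; i < LEN_2D; i++)
      a[i] -= aa[j][i] * a[j];
  /* Peeled last iteration (j == LEN_2D-1): the inner loop body never
     executes, so there is nothing to do.  */

which is exactly the form the rewritten benchmark above has and which vectorizes fine.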