typedef float real_t; #define iterations 100000 #define LEN_1D 32000 #define LEN_2D 256 real_t a[LEN_1D],b[LEN_1D],c[LEN_1D],d[LEN_1D],e[LEN_1D]; void main() { for (int nl = 0; nl < iterations; nl++) { for (int i = 1; i < LEN_1D-1; i++) { a[i] = b[i - 1] + c[i] * d[i]; b[i] = b[i + 1] - e[i] * d[i]; } } } Icc produces: ain: ..B1.1: # Preds ..B1.0 # Execution count [0.00e+00] .cfi_startproc ..___tag_value_ain.1: ..L2: #9.1 subq $136, %rsp #9.1 .cfi_def_cfa_offset 144 xorl %edx, %edx #11.5 lea 12+d(%rip), %r8 #14.38 vmovss (%r8), %xmm0 #14.38 movl $7, %edi #13.38 lea 12+e(%rip), %r9 #14.38 vmulss (%r9), %xmm0, %xmm12 #14.38 xorl %esi, %esi #13.38 lea 12+c(%rip), %r10 #13.38 vmulss (%r10), %xmm0, %xmm0 #13.38 vmovss 16(%r8), %xmm4 #14.38 movl $31977, %ecx #12.9 vmulss 16(%r9), %xmm4, %xmm14 #14.38 movl $31975, %eax #12.9 lea 24+b(%rip), %r11 #14.20 vmovss (%r11), %xmm11 #14.20 vmovss 4(%r8), %xmm6 #14.38 vmovss %xmm12, 104(%rsp) #14.38[spill] vmovss %xmm11, 8(%rsp) #14.20[spill] vmulss 4(%r9), %xmm6, %xmm12 #14.38 vmulss 4(%r10), %xmm6, %xmm11 #13.38 vmovss 127984+d(%rip), %xmm6 #14.38 vmovss 8(%r8), %xmm13 #14.38 vmovss %xmm14, 96(%rsp) #14.38[spill] vmulss 127984+e(%rip), %xmm6, %xmm14 #14.38 vmulss 8(%r9), %xmm13, %xmm1 #14.38 vmovss %xmm14, 112(%rsp) #14.38[spill] vmovss 127988+d(%rip), %xmm14 #14.38 vmovss %xmm1, 16(%rsp) #14.38[spill] vmulss 8(%r10), %xmm13, %xmm1 #13.38 vmulss 16(%r10), %xmm4, %xmm13 #13.38 vmulss 127988+e(%rip), %xmm14, %xmm4 #14.38 vmovss %xmm4, 120(%rsp) #14.38[spill] vmulss 127988+c(%rip), %xmm14, %xmm4 #13.38 vmovss -4(%r11), %xmm5 #14.20 vmovss -8(%r8), %xmm2 #14.38 vmovss 12(%r8), %xmm15 #14.38 vmovss %xmm4, 24(%rsp) #13.38[spill] vmovss 127992+d(%rip), %xmm4 #14.38 vmovss %xmm5, (%rsp) #14.20[spill] vmulss -8(%r9), %xmm2, %xmm3 #14.38 vmulss -8(%r10), %xmm2, %xmm5 #13.38 vmulss 12(%r9), %xmm15, %xmm2 #14.38 vmulss 12(%r10), %xmm15, %xmm15 #13.38 vmulss 127992+e(%rip), %xmm4, %xmm14 #14.38 vmulss 127992+c(%rip), %xmm4, %xmm4 #13.38 
vmovss -4(%r8), %xmm10 #14.38 vmulss -4(%r9), %xmm10, %xmm7 #14.38 vmulss -4(%r10), %xmm10, %xmm10 #13.38 vmovss %xmm7, 88(%rsp) #14.38[spill] vmovss %xmm4, 32(%rsp) #13.38[spill] vmovss %xmm15, 56(%rsp) #13.31[spill] vmovss %xmm14, 40(%rsp) #13.31[spill] vmovss %xmm3, 80(%rsp) #13.31[spill] vmovss -16(%r11), %xmm9 #14.20 vmovss -12(%r11), %xmm8 #14.20 vmovss -8(%r11), %xmm7 #14.20 vmovss 127984+c(%rip), %xmm4 #13.31 vmovss %xmm1, 64(%rsp) #13.31[spill] vmovss %xmm0, 48(%rsp) #13.31[spill] vmovss %xmm2, 72(%rsp) #13.31[spill] vmovss 16(%rsp), %xmm14 #13.31[spill] vmovss 8(%rsp), %xmm15 #13.31[spill] vmovss (%rsp), %xmm3 #13.31[spill] # LOE rax rcx rbx rbp rsi rdi r12 r13 r14 r15 edx xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm12 xmm13 xmm14 xmm15 ..B1.2: # Preds ..B1.10 ..B1.1 # Execution count [1.00e+05] movq %rdi, %r8 #12.9 vsubss 80(%rsp), %xmm9, %xmm0 #14.38[spill] vsubss 88(%rsp), %xmm8, %xmm1 #14.38[spill] vsubss 104(%rsp), %xmm7, %xmm2 #14.38[spill] vsubss %xmm14, %xmm15, %xmm7 #14.38 vsubss %xmm12, %xmm3, %xmm3 #14.38 vmovss 28+b(%rip), %xmm8 #14.20 vmovss 32+b(%rip), %xmm15 #14.20 vmovss %xmm0, 4+b(%rip) #14.13 vmovss %xmm1, 8+b(%rip) #14.13 vmovss %xmm2, 12+b(%rip) #14.13 vmovss %xmm3, 16+b(%rip) #14.13 vmovss %xmm7, 20+b(%rip) #14.13 vsubss 72(%rsp), %xmm8, %xmm9 #14.38[spill] vsubss 96(%rsp), %xmm15, %xmm0 #14.38[spill] vmovss %xmm9, 24+b(%rip) #14.13 vmovss %xmm0, 28+b(%rip) #14.13 # LOE rax rcx rbx rbp rsi rdi r8 r12 r13 r14 r15 edx xmm4 xmm5 xmm6 xmm10 xmm11 xmm12 xmm13 xmm14 ..B1.3: # Preds ..B1.3 ..B1.2 # Execution count [3.20e+09] vmovups 4+e(,%r8,4), %ymm1 #14.31 lea (,%r8,4), %r9 #14.13 vmovups 36+e(,%r8,4), %ymm3 #14.31 vmovups 68+e(,%r8,4), %ymm8 #14.31 vmovups 100+e(,%r8,4), %ymm15 #14.31 vmovups 4+d(,%r8,4), %ymm0 #14.38 vmovups 36+d(,%r8,4), %ymm2 #14.38 vmovups 68+d(,%r8,4), %ymm7 #14.38 vmovups 100+d(,%r8,4), %ymm9 #14.38 vfnmadd213ps 8+b(,%r8,4), %ymm0, %ymm1 #14.38 vfnmadd213ps 40+b(,%r8,4), %ymm2, %ymm3 #14.38 vfnmadd213ps 
72+b(,%r8,4), %ymm7, %ymm8 #14.38 vfnmadd213ps 104+b(,%r8,4), %ymm9, %ymm15 #14.38 vmovups %ymm1, 4+b(%r9) #14.13 vmovups %ymm3, 36+b(%r9) #14.13 vmovups %ymm8, 68+b(%r9) #14.13 vmovups %ymm15, 100+b(%r9) #14.13 addq $32, %r8 #12.9 cmpq $31975, %r8 #12.9 jb ..B1.3 # Prob 99% #12.9 # LOE rax rcx rbx rbp rsi rdi r8 r12 r13 r14 r15 edx xmm4 xmm5 xmm6 xmm10 xmm11 xmm12 xmm13 xmm14 ..B1.4: # Preds ..B1.3 # Execution count [1.00e+05] movq %rsi, %r9 #12.9 movq %rcx, %r8 #12.9 # LOE rax rcx rbx rbp rsi rdi r8 r9 r12 r13 r14 r15 edx xmm4 xmm5 xmm6 xmm10 xmm11 xmm12 xmm13 xmm14 ..B1.5: # Preds ..B1.5 ..B1.4 # Execution count [3.20e+09] vmovups 127904+e(,%r9,4), %xmm1 #14.31 vmovups 127904+d(,%r9,4), %xmm0 #14.38 vfnmadd213ps b(,%r8,4), %xmm0, %xmm1 #14.38 addq $4, %r8 #12.9 vmovups %xmm1, 127904+b(,%r9,4) #14.13 addq $4, %r9 #12.9 cmpq $20, %r9 #12.9 jb ..B1.5 # Prob 99% #12.9 # LOE rax rcx rbx rbp rsi rdi r8 r9 r12 r13 r14 r15 edx xmm4 xmm5 xmm6 xmm10 xmm11 xmm12 xmm13 xmm14 ..B1.6: # Preds ..B1.5 # Execution count [1.00e+05] vmovss 127996+b(%rip), %xmm9 #14.20 movq %rdi, %r8 #12.9 vmovss 127992+b(%rip), %xmm1 #14.20 vmovss 127988+b(%rip), %xmm2 #14.20 vaddss b(%rip), %xmm5, %xmm7 #13.38 vaddss 4+b(%rip), %xmm10, %xmm3 #13.38 vsubss 40(%rsp), %xmm9, %xmm8 #14.38[spill] vsubss 112(%rsp), %xmm2, %xmm2 #14.38[spill] vsubss 120(%rsp), %xmm1, %xmm1 #14.38[spill] vmovss %xmm7, 4+a(%rip) #13.13 vmovss 16+b(%rip), %xmm7 #13.20 vmovss %xmm3, 8+a(%rip) #13.13 vmovss 8+b(%rip), %xmm9 #13.20 vmovss %xmm8, 127992+b(%rip) #14.13 vmovss 12+b(%rip), %xmm8 #13.20 vmovss %xmm2, 127984+b(%rip) #14.13 vaddss %xmm11, %xmm8, %xmm0 #13.38 vaddss 64(%rsp), %xmm7, %xmm3 #13.38[spill] vaddss 48(%rsp), %xmm9, %xmm15 #13.38[spill] vmovss %xmm3, 20+a(%rip) #13.13 vmovss 20+b(%rip), %xmm3 #13.20 vmovss %xmm15, 12+a(%rip) #13.13 vmovss %xmm0, 16+a(%rip) #13.13 vmovss %xmm1, 127988+b(%rip) #14.13 vmovss %xmm9, (%rsp) #13.13[spill] vaddss 56(%rsp), %xmm3, %xmm15 #13.38[spill] vmovss %xmm15, 24+a(%rip) 
#13.13 vmovss 24+b(%rip), %xmm15 #13.20 vaddss %xmm13, %xmm15, %xmm0 #13.38 vmovss %xmm0, 28+a(%rip) #13.13 # LOE rax rcx rbx rbp rsi rdi r8 r12 r13 r14 r15 edx xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm10 xmm11 xmm12 xmm13 xmm14 xmm15 ..B1.7: # Preds ..B1.7 ..B1.6 # Execution count [3.20e+09] vmovups 4+c(,%r8,4), %ymm9 #13.31 lea (,%r8,4), %r9 #13.13 vmovups 4+d(,%r8,4), %ymm0 #13.38 vfmadd213ps b(,%r8,4), %ymm0, %ymm9 #13.38 vmovups 36+d(,%r8,4), %ymm0 #13.38 vmovups %ymm9, 4+a(%r9) #13.13 vmovups 36+c(,%r8,4), %ymm9 #13.31 vfmadd213ps 32+b(,%r8,4), %ymm0, %ymm9 #13.38 vmovups 68+d(,%r8,4), %ymm0 #13.38 vmovups %ymm9, 36+a(%r9) #13.13 vmovups 68+c(,%r8,4), %ymm9 #13.31 vfmadd213ps 64+b(,%r8,4), %ymm0, %ymm9 #13.38 vmovups 100+d(,%r8,4), %ymm0 #13.38 vmovups %ymm9, 68+a(%r9) #13.13 vmovups 100+c(,%r8,4), %ymm9 #13.31 vfmadd213ps 96+b(,%r8,4), %ymm0, %ymm9 #13.38 addq $32, %r8 #12.9 vmovups %ymm9, 100+a(%r9) #13.13 cmpq $31975, %r8 #12.9 jb ..B1.7 # Prob 99% #12.9 # LOE rax rcx rbx rbp rsi rdi r8 r12 r13 r14 r15 edx xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm10 xmm11 xmm12 xmm13 xmm14 xmm15 ..B1.8: # Preds ..B1.7 # Execution count [1.00e+05] movq %rsi, %r9 #12.9 movq %rax, %r8 #12.9 # LOE rax rcx rbx rbp rsi rdi r8 r9 r12 r13 r14 r15 edx xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm10 xmm11 xmm12 xmm13 xmm14 xmm15 ..B1.9: # Preds ..B1.9 ..B1.8 # Execution count [3.20e+09] vmovups 127904+c(,%r9,4), %xmm9 #13.31 vmovups 127904+d(,%r9,4), %xmm0 #13.38 vfmadd213ps b(,%r8,4), %xmm0, %xmm9 #13.38 addq $4, %r8 #12.9 vmovups %xmm9, 127904+a(,%r9,4) #13.13 addq $4, %r9 #12.9 cmpq $20, %r9 #12.9 jb ..B1.9 # Prob 99% #12.9 # LOE rax rcx rbx rbp rsi rdi r8 r9 r12 r13 r14 r15 edx xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm10 xmm11 xmm12 xmm13 xmm14 xmm15 ..B1.10: # Preds ..B1.9 # Execution count [1.07e+09] incl %edx #11.5 vmovss 127980+b(%rip), %xmm0 #13.20 vmovss (%rsp), %xmm9 #[spill] vfmadd231ss %xmm6, %xmm4, %xmm0 #13.38 cmpl $100000, %edx #11.5 jb ..B1.2 # Prob 99% #11.5 # 
LOE rax rcx rbx rbp rsi rdi r12 r13 r14 r15 edx xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm12 xmm13 xmm14 xmm15 ..B1.11: # Preds ..B1.10 # Execution count [1.00e+00] vmovss %xmm0, 127984+a(%rip) #13.13 vaddss 32(%rsp), %xmm1, %xmm1 #13.38[spill] vaddss 24(%rsp), %xmm2, %xmm2 #13.38[spill] vmovss %xmm1, 127992+a(%rip) #13.13 vmovss %xmm2, 127988+a(%rip) #13.13 vzeroupper #17.1 addq $136, %rsp #17.1 .cfi_def_cfa_offset 8 ret #17.1
Confirmed. ICC applies loop distribution but again our cost-modeling doesn't want that to happen. I suspect we want to detect extra incentives there (make dependences "good", allow interchange, etc.)
Note that after "fixing" (disabling) the costing issue we get to: Fuse partitions because they are in the same dependence scc: Part 1: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 17, 18, 20, 21 Part 2: 1, 2, 3, 7, 11, 12, 13, 14, 15, 16, 17, 18, 20, 21 Loop nest 1 not distributed. Still, a[i] = b[i - 1] can be performed separately, but second, while b[i] = b[i + 1] needs to be performed first. That means the dependence analysis interpretation needs improvement.
Note it's only the outer loop that confuses us here. With that removed we have the following because of yet another "heuristic" to disable distribution. Possible alias data dependence to break: Fuse partitions because there is no point to distribute loop: Part 1: 0, 1, 5, 9, 10, 11, 12, 13, 14, 15, 16 Part 2: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 15, 16
(In reply to Richard Biener from comment #3) > Note it's only the outer loop that confuses us here. With that removed we > have > the following because of yet another "heuristic" to disable distribution. In fact we first analyze the whole nest but then continue to look at the inner loop only, so this isn't really an issue. The fusing because of shared memory refs is only because of the double use of d[i]; b[i], b[i-1] or b[i+1] are not detected as problematic for distribution (the "same memory object" check isn't working as intended). Fuse partitions because they have shared memory refs: Part 1: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 15, 16 Part 2: 0, 1, 5, 9, 10, 11, 12, 13, 14, 15, 16 Note that the intersection of both partitions includes half of the stmts (0, 1, 5, 9, 15, 16), which would be duplicated (5 is the d[i] load), while the other half is different. To defeat the final fusing reason we need a positive motivation, like tracking whether we know a partition can or cannot be vectorized (or whether we are not sure). For the partition containing the b[i], b[i+1] dependence distance of 1 we know we cannot vectorize (with a VF > 0).