typedef float real_t; #define iterations 100000 #define LEN_1D 32000 #define LEN_2D 256 real_t a[LEN_1D],b[LEN_1D],c[LEN_1D],d[LEN_1D],e[LEN_1D], aa[LEN_2D][LEN_2D],bb[LEN_2D][LEN_2D],cc[LEN_2D][LEN_2D],tt[LEN_2D][LEN_2D]; // %2.3 real_t main(struct args_t * func_args) { // loop interchanging // imperfectly nested loops for (int nl = 0; nl < 200*(iterations/LEN_2D); nl++) { for (int i = 0; i < LEN_2D; i++) { a[i] += b[i] * c[i]; for (int j = 1; j < LEN_2D; j++) { aa[j][i] = aa[j-1][i] + bb[j][i] * a[i]; } } } } runs about 10 times faster on zen3 built by icc -O3 -ip -Ofast -g -march=core-avx2 -mtune=core-avx2 -vec s235.c main: # parameter 1: %rdi ..B1.1: # Preds ..B1.0 # Execution count [1.77e+00] .cfi_startproc ..___tag_value_main.1: ..L2: #9.1 pushq %rbp #9.1 .cfi_def_cfa_offset 16 movq %rsp, %rbp #9.1 .cfi_def_cfa 6, 16 .cfi_offset 6, -16 andq $-128, %rsp #9.1 subq $128, %rsp #9.1 movl $3, %edi #9.1 xorl %esi, %esi #9.1 call __intel_new_feature_proc_init #9.1 # LOE rbx r12 r13 r14 r15 ..B1.12: # Preds ..B1.1 # Execution count [1.77e+00] vstmxcsr (%rsp) #9.1 xorl %eax, %eax #14.5 orl $32832, (%rsp) #9.1 vldmxcsr (%rsp) #9.1 # LOE rbx r12 r13 r14 r15 eax ..B1.2: # Preds ..B1.8 ..B1.12 # Execution count [7.83e+04] xorl %edx, %edx #15.9 # LOE rdx rbx r12 r13 r14 r15 eax ..B1.3: # Preds ..B1.3 ..B1.2 # Execution count [2.00e+07] vmovups b(,%rdx,4), %ymm1 #16.21 lea (,%rdx,4), %rcx #16.13 vmovups 32+b(,%rdx,4), %ymm3 #16.21 vmovups 64+b(,%rdx,4), %ymm5 #16.21 vmovups 96+b(,%rdx,4), %ymm7 #16.21 vmovups 128+b(,%rdx,4), %ymm9 #16.21 vmovups 160+b(,%rdx,4), %ymm11 #16.21 vmovups 192+b(,%rdx,4), %ymm13 #16.21 vmovups 224+b(,%rdx,4), %ymm15 #16.21 vmovups c(,%rdx,4), %ymm0 #16.28 vmovups 32+c(,%rdx,4), %ymm2 #16.28 vmovups 64+c(,%rdx,4), %ymm4 #16.28 vmovups 96+c(,%rdx,4), %ymm6 #16.28 vmovups 128+c(,%rdx,4), %ymm8 #16.28 vmovups 160+c(,%rdx,4), %ymm10 #16.28 vmovups 192+c(,%rdx,4), %ymm12 #16.28 vmovups 224+c(,%rdx,4), %ymm14 #16.28 vfmadd213ps a(,%rdx,4), %ymm0, %ymm1 
#16.13 vfmadd213ps 32+a(,%rdx,4), %ymm2, %ymm3 #16.13 vfmadd213ps 64+a(,%rdx,4), %ymm4, %ymm5 #16.13 vfmadd213ps 96+a(,%rdx,4), %ymm6, %ymm7 #16.13 vfmadd213ps 128+a(,%rdx,4), %ymm8, %ymm9 #16.13 vfmadd213ps 160+a(,%rdx,4), %ymm10, %ymm11 #16.13 vfmadd213ps 192+a(,%rdx,4), %ymm12, %ymm13 #16.13 vfmadd213ps 224+a(,%rdx,4), %ymm14, %ymm15 #16.13 vmovups %ymm1, a(%rcx) #16.13 vmovups %ymm3, 32+a(%rcx) #16.13 vmovups %ymm5, 64+a(%rcx) #16.13 vmovups %ymm7, 96+a(%rcx) #16.13 vmovups %ymm9, 128+a(%rcx) #16.13 vmovups %ymm11, 160+a(%rcx) #16.13 vmovups %ymm13, 192+a(%rcx) #16.13 vmovups %ymm15, 224+a(%rcx) #16.13 addq $64, %rdx #15.9 cmpq $256, %rdx #15.9 jb ..B1.3 # Prob 99% #15.9 # LOE rdx rbx r12 r13 r14 r15 eax ..B1.4: # Preds ..B1.3 # Execution count [7.83e+04] xorl %ecx, %ecx #17.13 xorl %edx, %edx #17.13 # LOE rdx rbx r12 r13 r14 r15 eax ecx ..B1.5: # Preds ..B1.7 ..B1.4 # Execution count [2.00e+07] xorl %esi, %esi #15.9 # LOE rdx rbx rsi r12 r13 r14 r15 eax ecx ..B1.6: # Preds ..B1.6 ..B1.5 # Execution count [5.11e+09] vmovups a(,%rsi,4), %ymm1 #18.52 lea (%rdx,%rsi,4), %rdi #18.17 vmovups 32+a(,%rsi,4), %ymm3 #18.52 vmovups 64+a(,%rsi,4), %ymm5 #18.52 vmovups 96+a(,%rsi,4), %ymm7 #18.52 vmovups 128+a(,%rsi,4), %ymm9 #18.52 vmovups 160+a(,%rsi,4), %ymm11 #18.52 vmovups 192+a(,%rsi,4), %ymm13 #18.52 vmovups 224+a(,%rsi,4), %ymm15 #18.52 vmovups 1024+bb(%rdx,%rsi,4), %ymm0 #18.41 vmovups 1056+bb(%rdx,%rsi,4), %ymm2 #18.41 vmovups 1088+bb(%rdx,%rsi,4), %ymm4 #18.41 vmovups 1120+bb(%rdx,%rsi,4), %ymm6 #18.41 vmovups 1152+bb(%rdx,%rsi,4), %ymm8 #18.41 vmovups 1184+bb(%rdx,%rsi,4), %ymm10 #18.41 vmovups 1216+bb(%rdx,%rsi,4), %ymm12 #18.41 vmovups 1248+bb(%rdx,%rsi,4), %ymm14 #18.41 vfmadd213ps aa(%rdx,%rsi,4), %ymm0, %ymm1 #18.52 vfmadd213ps 32+aa(%rdx,%rsi,4), %ymm2, %ymm3 #18.52 vfmadd213ps 64+aa(%rdx,%rsi,4), %ymm4, %ymm5 #18.52 vfmadd213ps 96+aa(%rdx,%rsi,4), %ymm6, %ymm7 #18.52 vfmadd213ps 128+aa(%rdx,%rsi,4), %ymm8, %ymm9 #18.52 vfmadd213ps 160+aa(%rdx,%rsi,4), 
%ymm10, %ymm11 #18.52 vfmadd213ps 192+aa(%rdx,%rsi,4), %ymm12, %ymm13 #18.52 vfmadd213ps 224+aa(%rdx,%rsi,4), %ymm14, %ymm15 #18.52 vmovups %ymm1, 1024+aa(%rdi) #18.17 vmovups %ymm3, 1056+aa(%rdi) #18.17 vmovups %ymm5, 1088+aa(%rdi) #18.17 vmovups %ymm7, 1120+aa(%rdi) #18.17 vmovups %ymm9, 1152+aa(%rdi) #18.17 vmovups %ymm11, 1184+aa(%rdi) #18.17 vmovups %ymm13, 1216+aa(%rdi) #18.17 vmovups %ymm15, 1248+aa(%rdi) #18.17 addq $64, %rsi #15.9 cmpq $256, %rsi #15.9 jb ..B1.6 # Prob 99% #15.9 # LOE rdx rbx rsi r12 r13 r14 r15 eax ecx ..B1.7: # Preds ..B1.6 # Execution count [2.00e+07] incl %ecx #17.13 addq $1024, %rdx #17.13 cmpl $255, %ecx #17.13 jb ..B1.5 # Prob 99% #17.13 # LOE rdx rbx r12 r13 r14 r15 eax ecx ..B1.8: # Preds ..B1.7 # Execution count [7.83e+04] incl %eax #14.5 cmpl $78000, %eax #14.5 jb ..B1.2 # Prob 99% #14.5 # LOE rbx r12 r13 r14 r15 eax ..B1.9: # Preds ..B1.8 # Execution count [1.00e+00] vzeroupper #22.1 xorl %eax, %eax #22.1 movq %rbp, %rsp #22.1 popq %rbp #22.1 .cfi_def_cfa 7, 8 .cfi_restore 6 ret #22.1
The linterchange pass says: Consider loop interchange for loop_nest<1 - 3> Access Strides for DRs: a[i_33]: <0, 4, 0> b[i_33]: <0, 4, 0> c[i_33]: <0, 4, 0> a[i_33]: <0, 4, 0> aa[_6][i_33]: <0, 4, 1024> bb[j_34][i_33]: <0, 4, 1024> aa[j_34][i_33]: <0, 4, 1024> Loop(3) carried vars: Induction: j_34 = {1, 1}_3 Induction: ivtmp_53 = {255, 4294967295}_3 Loop(2) carried vars: Induction: i_33 = {0, 1}_2 Induction: ivtmp_51 = {256, 4294967295}_2 — and then it doesn't do anything. I suppose the best thing to do here is to first distribute the loop nest, but our cost modeling fuses the two obvious candidates: Fuse partitions because they have shared memory refs: Part 1: 0, 1, 2, 3, 4, 5, 6, 7, 19, 20, 21 Part 2: 0, 1, 2, 3, 4, 5, 6, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21 So this is a case that calls for better cost modeling there.
Another testcase: typedef float real_t; #define iterations 100000 #define LEN_1D 32000 #define LEN_2D 256 // array definitions real_t aa[LEN_2D][LEN_2D],bb[LEN_2D][LEN_2D],cc[LEN_2D][LEN_2D],tt[LEN_2D][LEN_2D]; int main(struct args_t * func_args) { // loop interchange // interchanging with one of two inner loops for (int nl = 0; nl < 100*(iterations/LEN_2D); nl++) { for (int i = 1; i < LEN_2D; i++) { for (int j = 1; j < LEN_2D; j++) { aa[j][i] = aa[j-1][i] + cc[j][i]; } for (int j = 1; j < LEN_2D; j++) { bb[j][i] = bb[j][i-1] + cc[j][i]; } } dummy(); } return aa[0][0]; }
This one runs in 7s with gcc and 0.4s with icc: typedef float real_t; #define iterations 100000 #define LEN_1D 32000 #define LEN_2D 256 // array definitions real_t aa[LEN_2D][LEN_2D],bb[LEN_2D][LEN_2D],cc[LEN_2D][LEN_2D],tt[LEN_2D][LEN_2D]; int main(struct args_t * func_args) { // loop interchange // interchanging with one of two inner loops for (int nl = 0; nl < 100*(iterations/LEN_2D); nl++) { for (int i = 1; i < LEN_2D; i++) { for (int j = 1; j < LEN_2D; j++) { aa[j][i] = aa[j-1][i] + cc[j][i]; } for (int j = 1; j < LEN_2D; j++) { bb[i][j] = bb[i-1][j] + cc[i][j]; } } dummy(); } return aa[0][0]; }
s275: typedef float real_t; #define iterations 100000 #define LEN_1D 32000 #define LEN_2D 256 // array definitions real_t a[LEN_2D],d[LEN_2D],aa[LEN_2D][LEN_2D],bb[LEN_2D][LEN_2D],cc[LEN_2D][LEN_2D],tt[LEN_2D][LEN_2D]; int main(struct args_t * func_args) { // control flow // if around inner loop, interchanging needed for (int i = 0; i < LEN_2D; i++) aa[0][i]=1; for (int nl = 0; nl < 10*(iterations/LEN_2D); nl++) { for (int i = 0; i < LEN_2D; i++) { if (aa[0][i] > (real_t)0.) { for (int j = 1; j < LEN_2D; j++) { aa[j][i] = aa[j-1][i] + bb[j][i] * cc[j][i]; } } } dummy(); } return aa[0][0]; }
s2275: typedef float real_t; #define iterations 100000 #define LEN_1D 32000 #define LEN_2D 256 // array definitions real_t a[LEN_2D],b[LEN_2D],c[LEN_2D],d[LEN_2D],aa[LEN_2D][LEN_2D],bb[LEN_2D][LEN_2D],cc[LEN_2D][LEN_2D],tt[LEN_2D][LEN_2D]; int main(struct args_t * func_args) { // loop distribution is needed to be able to interchange for (int nl = 0; nl < 100*(iterations/LEN_2D); nl++) { for (int i = 0; i < LEN_2D; i++) { for (int j = 0; j < LEN_2D; j++) { aa[j][i] = aa[j][i] + bb[j][i] * cc[j][i]; } a[i] = b[i] + c[i] * d[i]; } dummy(); } return aa[0][0]; }
(In reply to Jan Hubicka from comment #4) > s275: > typedef float real_t; > > #define iterations 100000 > #define LEN_1D 32000 > #define LEN_2D 256 > // array definitions > > real_t > a[LEN_2D],d[LEN_2D],aa[LEN_2D][LEN_2D],bb[LEN_2D][LEN_2D],cc[LEN_2D][LEN_2D], > tt[LEN_2D][LEN_2D]; > > int main(struct args_t * func_args) > { > // control flow > // if around inner loop, interchanging needed > > for (int i = 0; i < LEN_2D; i++) > aa[0][i]=1; > > for (int nl = 0; nl < 10*(iterations/LEN_2D); nl++) { > for (int i = 0; i < LEN_2D; i++) { > if (aa[0][i] > (real_t)0.) { > for (int j = 1; j < LEN_2D; j++) { > aa[j][i] = aa[j-1][i] + bb[j][i] * cc[j][i]; > } > } > } > dummy(); > } > return aa[0][0]; > } This would need to be supported by interchange itself - it's basically un-unswitching the condition, moving it inside the innermost loop and then performing interchange. Not sure how difficult it would be to handle this, but it looks like a candidate for diagnostic aka 'note: interchanging loops will improve performance' aka "fix your code".