Summary: | s235, s2233, s275, s2275 and s233 benchmarks of TSVC is vectorized better by icc than gcc (loop interchange) | ||
---|---|---|---|
Product: | gcc | Reporter: | Jan Hubicka <hubicka> |
Component: | tree-optimization | Assignee: | Not yet assigned to anyone <unassigned> |
Status: | NEW --- | ||
Severity: | normal | CC: | amker, crazylht, hjl.tools, rguenth |
Priority: | P3 | Keywords: | missed-optimization |
Version: | 11.0 | ||
Target Milestone: | --- | ||
Host: | Target: | ||
Build: | Known to work: | ||
Known to fail: | Last reconfirmed: | 2021-03-08 00:00:00 | |
Bug Depends on: | |||
Bug Blocks: | 53947 |
Description
Jan Hubicka
2021-03-05 15:40:20 UTC
linterchange says: Consider loop interchange for loop_nest<1 - 3> Access Strides for DRs: a[i_33]: <0, 4, 0> b[i_33]: <0, 4, 0> c[i_33]: <0, 4, 0> a[i_33]: <0, 4, 0> aa[_6][i_33]: <0, 4, 1024> bb[j_34][i_33]: <0, 4, 1024> aa[j_34][i_33]: <0, 4, 1024> Loop(3) carried vars: Induction: j_34 = {1, 1}_3 Induction: ivtmp_53 = {255, 4294967295}_3 Loop(2) carried vars: Induction: i_33 = {0, 1}_2 Induction: ivtmp_51 = {256, 4294967295}_2 and then doesn't do anything. I suppose the best thing to do here is to first distribute the loop nest, but our cost modeling fuses the two obvious candidates: Fuse partitions because they have shared memory refs: Part 1: 0, 1, 2, 3, 4, 5, 6, 7, 19, 20, 21 Part 2: 0, 1, 2, 3, 4, 5, 6, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21 so this is a case that asks for better cost modeling there. another testcase typedef float real_t; #define iterations 100000 #define LEN_1D 32000 #define LEN_2D 256 // array definitions real_t aa[LEN_2D][LEN_2D],bb[LEN_2D][LEN_2D],cc[LEN_2D][LEN_2D],tt[LEN_2D][LEN_2D]; int main(struct args_t * func_args) { // loop interchange // interchanging with one of two inner loops for (int nl = 0; nl < 100*(iterations/LEN_2D); nl++) { for (int i = 1; i < LEN_2D; i++) { for (int j = 1; j < LEN_2D; j++) { aa[j][i] = aa[j-1][i] + cc[j][i]; } for (int j = 1; j < LEN_2D; j++) { bb[j][i] = bb[j][i-1] + cc[j][i]; } } dummy(); } return aa[0][0]; } this one is 7s with gcc and 0.4s with icc. typedef float real_t; #define iterations 100000 #define LEN_1D 32000 #define LEN_2D 256 // array definitions real_t aa[LEN_2D][LEN_2D],bb[LEN_2D][LEN_2D],cc[LEN_2D][LEN_2D],tt[LEN_2D][LEN_2D]; int main(struct args_t * func_args) { // loop interchange // interchanging with one of two inner loops for (int nl = 0; nl < 100*(iterations/LEN_2D); nl++) { for (int i = 1; i < LEN_2D; i++) { for (int j = 1; j < LEN_2D; j++) { aa[j][i] = aa[j-1][i] + cc[j][i]; } for (int j = 1; j < LEN_2D; j++) { bb[i][j] = bb[i-1][j] + cc[i][j]; } } dummy(); } return aa[0][0]; } s275: typedef float real_t; #define iterations 100000 #define LEN_1D 32000 #define LEN_2D 256 // array definitions real_t a[LEN_2D],d[LEN_2D],aa[LEN_2D][LEN_2D],bb[LEN_2D][LEN_2D],cc[LEN_2D][LEN_2D],tt[LEN_2D][LEN_2D]; int main(struct args_t * func_args) { // control flow // if around inner loop, interchanging needed for (int i = 0; i < LEN_2D; i++) aa[0][i]=1; for (int nl = 0; nl < 10*(iterations/LEN_2D); nl++) { for (int i = 0; i < LEN_2D; i++) { if (aa[0][i] > (real_t)0.) { for (int j = 1; j < LEN_2D; j++) { aa[j][i] = aa[j-1][i] + bb[j][i] * cc[j][i]; } } } dummy(); } return aa[0][0]; } s2275: typedef float real_t; #define iterations 100000 #define LEN_1D 32000 #define LEN_2D 256 // array definitions real_t a[LEN_2D],b[LEN_2D],c[LEN_2D],d[LEN_2D],aa[LEN_2D][LEN_2D],bb[LEN_2D][LEN_2D],cc[LEN_2D][LEN_2D],tt[LEN_2D][LEN_2D]; int main(struct args_t * func_args) { // loop distribution is needed to be able to interchange for (int nl = 0; nl < 100*(iterations/LEN_2D); nl++) { for (int i = 0; i < LEN_2D; i++) { for (int j = 0; j < LEN_2D; j++) { aa[j][i] = aa[j][i] + bb[j][i] * cc[j][i]; } a[i] = b[i] + c[i] * d[i]; } dummy(); } return aa[0][0]; } (In reply to Jan Hubicka from comment #4) > s275: > typedef float real_t; > > #define iterations 100000 > #define LEN_1D 32000 > #define LEN_2D 256 > // array definitions > > real_t > a[LEN_2D],d[LEN_2D],aa[LEN_2D][LEN_2D],bb[LEN_2D][LEN_2D],cc[LEN_2D][LEN_2D], > tt[LEN_2D][LEN_2D]; > > int main(struct args_t * func_args) > { > // control flow > // if around inner loop, interchanging needed > > for (int i = 0; i < LEN_2D; i++) > aa[0][i]=1; > > for (int nl = 0; nl < 10*(iterations/LEN_2D); nl++) { > for (int i = 0; i < LEN_2D; i++) { > if (aa[0][i] > (real_t)0.) { > for (int j = 1; j < LEN_2D; j++) { > aa[j][i] = aa[j-1][i] + bb[j][i] * cc[j][i]; > } > } > } > dummy(); > } > return aa[0][0]; > } This would need to be supported by interchange itself - it's basically un-unswitching the condition, moving it inside the innermost loop and then performing interchange. Not sure how difficult it would be to handle this, but it looks like a candidate for diagnostic aka 'note: interchanging loops will improve performance' aka "fix your code". |