This is the mail archive of the
gcc-bugs@gcc.gnu.org
mailing list for the GCC project.
[Bug tree-optimization/68030] Redundant address calculations in vectorized loop
- From: "rguenther at suse dot de" <gcc-bugzilla at gcc dot gnu dot org>
- To: gcc-bugs at gcc dot gnu dot org
- Date: Tue, 10 May 2016 17:05:10 +0000
- Subject: [Bug tree-optimization/68030] Redundant address calculations in vectorized loop
- Auto-submitted: auto-generated
- References: <bug-68030-4 at http dot gcc dot gnu dot org/bugzilla/>
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=68030
--- Comment #7 from rguenther at suse dot de <rguenther at suse dot de> ---
On May 10, 2016 6:25:57 PM GMT+02:00, "amker at gcc dot gnu.org"
<gcc-bugzilla@gcc.gnu.org> wrote:
>https://gcc.gnu.org/bugzilla/show_bug.cgi?id=68030
>
>--- Comment #6 from amker at gcc dot gnu.org ---
>It's not only the vectorizer that generates code with missed CSE
>opportunities; PRE and LIM also
>do this kind of transform.
In another PR I suggested swapping LIM and PRE so that PRE can clean up after
LIM. IIRC that caused some testsuite regressions.
>Compiling the attached example with the command line below
>
>$ ./gcc -S -Ofast -march=haswell pr68030.c -o pr68030.S
>-fdump-tree-vect-details -fdump-tree-slp -fdump-tree-ivopts-details
>-fdump-tree-all -fno-tree-vectorize
>
>gives the following dump before IVOPT:
>
> <bb 2>:
> local_Filter_33 = global_Filters;
> pretmp_887 = global_Output;
> pretmp_889 = global_Input;
> goto <bb 7>;
>
> <bb 3>:
>
> <bb 4>:
> # ix_187 = PHI <_202(3), 2(7)>
> # ivtmp_1065 = PHI <ivtmp_1064(3), 512(7)>
> _154 = ix_187 + -2;
> _157 = _154 + _971;
> _158 = (long unsigned int) _157;
> _159 = _158 * 4;
> _160 = pretmp_889 + _159;
> _161 = *_160;
> _165 = *local_Filter_33;
> _166 = _161 * _165;
> _170 = ix_187 + -1;
> _173 = _170 + _971;
> _174 = (long unsigned int) _173;
> _175 = _174 * 4;
> _176 = pretmp_889 + _175;
> _177 = *_176;
> _181 = MEM[(float *)local_Filter_33 + 4B];
> _182 = _177 * _181;
> _81 = _166 + _182;
> _189 = ix_187 + _971;
> _190 = (long unsigned int) _189;
> _191 = _190 * 4;
> _192 = pretmp_889 + _191;
> _193 = *_192;
> _197 = MEM[(float *)local_Filter_33 + 8B];
> _198 = _193 * _197;
> _202 = ix_187 + 1;
> _205 = _202 + _971;
> _206 = (long unsigned int) _205;
> _207 = _206 * 4;
> _208 = pretmp_889 + _207;
> _209 = *_208;
> _213 = MEM[(float *)local_Filter_33 + 12B];
> _214 = _209 * _213;
> _218 = ix_187 + 2;
> _221 = _218 + _971;
> _222 = (long unsigned int) _221;
> _223 = _222 * 4;
> _224 = pretmp_889 + _223;
> _225 = *_224;
> _229 = MEM[(float *)local_Filter_33 + 16B];
> _230 = _225 * _229;
> _82 = _214 + _230;
> _67 = _81 + _82;
> _243 = _154 + _980;
> _244 = (long unsigned int) _243;
> _245 = _244 * 4;
> _246 = pretmp_889 + _245;
> _247 = *_246;
> _251 = MEM[(float *)local_Filter_33 + 20B];
> _252 = _247 * _251;
> _259 = _170 + _980;
> _260 = (long unsigned int) _259;
> _261 = _260 * 4;
> _262 = pretmp_889 + _261;
> _263 = *_262;
> _267 = MEM[(float *)local_Filter_33 + 24B];
> _268 = _263 * _267;
> _78 = _252 + _268;
> _275 = ix_187 + _980;
> _276 = (long unsigned int) _275;
> _277 = _276 * 4;
> _278 = pretmp_889 + _277;
> _279 = *_278;
> _283 = MEM[(float *)local_Filter_33 + 28B];
> _284 = _279 * _283;
> _72 = _198 + _284;
> _291 = _202 + _980;
> _292 = (long unsigned int) _291;
> _293 = _292 * 4;
> _294 = pretmp_889 + _293;
> _295 = *_294;
> _299 = MEM[(float *)local_Filter_33 + 32B];
> _300 = _295 * _299;
> _307 = _218 + _980;
> _308 = (long unsigned int) _307;
> _309 = _308 * 4;
> _310 = pretmp_889 + _309;
> _311 = *_310;
> _315 = MEM[(float *)local_Filter_33 + 36B];
> _316 = _311 * _315;
> _79 = _300 + _316;
> _56 = _78 + _79;
> _329 = _154 + _985;
> _330 = (long unsigned int) _329;
> _331 = _330 * 4;
> _332 = pretmp_889 + _331;
> _333 = *_332;
> _337 = MEM[(float *)local_Filter_33 + 40B];
> _338 = _333 * _337;
> _345 = _170 + _985;
> _346 = (long unsigned int) _345;
> _347 = _346 * 4;
> _348 = pretmp_889 + _347;
> _349 = *_348;
> _353 = MEM[(float *)local_Filter_33 + 44B];
> _354 = _349 * _353;
> _75 = _338 + _354;
> _361 = ix_187 + _985;
> _362 = (long unsigned int) _361;
> _363 = _362 * 4;
> _364 = pretmp_889 + _363;
> _365 = *_364;
> _369 = MEM[(float *)local_Filter_33 + 48B];
> _370 = _365 * _369;
> _377 = _202 + _985;
> _378 = (long unsigned int) _377;
> _379 = _378 * 4;
> _380 = pretmp_889 + _379;
> _381 = *_380;
> _385 = MEM[(float *)local_Filter_33 + 52B];
> _386 = _381 * _385;
> _393 = _218 + _985;
> _394 = (long unsigned int) _393;
> _395 = _394 * 4;
> _396 = pretmp_889 + _395;
> _397 = *_396;
> _401 = MEM[(float *)local_Filter_33 + 56B];
> _402 = _397 * _401;
> _76 = _386 + _402;
> _495 = _75 + _76;
> _415 = _154 + _991;
> _416 = (long unsigned int) _415;
> _417 = _416 * 4;
> _418 = pretmp_889 + _417;
> _419 = *_418;
> _423 = MEM[(float *)local_Filter_33 + 60B];
> _424 = _419 * _423;
> _431 = _170 + _991;
> _432 = (long unsigned int) _431;
> _433 = _432 * 4;
> _434 = pretmp_889 + _433;
> _435 = *_434;
> _439 = MEM[(float *)local_Filter_33 + 64B];
> _440 = _435 * _439;
> _572 = _424 + _440;
> _447 = ix_187 + _991;
> _448 = (long unsigned int) _447;
> _449 = _448 * 4;
> _450 = pretmp_889 + _449;
> _451 = *_450;
> _455 = MEM[(float *)local_Filter_33 + 68B];
> _456 = _451 * _455;
> _73 = _370 + _456;
> _65 = _72 + _73;
> _55 = _65 + _67;
> _25 = _55 + _56;
> _19 = _25 + _495;
> _463 = _202 + _991;
> _464 = (long unsigned int) _463;
> _465 = _464 * 4;
> _466 = pretmp_889 + _465;
> _467 = *_466;
> _471 = MEM[(float *)local_Filter_33 + 72B];
> _472 = _467 * _471;
> _479 = _218 + _991;
> _480 = (long unsigned int) _479;
> _481 = _480 * 4;
> _482 = pretmp_889 + _481;
> _483 = *_482;
> _487 = MEM[(float *)local_Filter_33 + 76B];
> _488 = _483 * _487;
> _556 = _472 + _488;
> _20 = _556 + _572;
> _429 = _19 + _20;
> _501 = _154 + _997;
> _502 = (long unsigned int) _501;
> _503 = _502 * 4;
> _504 = pretmp_889 + _503;
> _505 = *_504;
> _509 = MEM[(float *)local_Filter_33 + 80B];
> _510 = _505 * _509;
> _517 = _170 + _997;
> _518 = (long unsigned int) _517;
> _519 = _518 * 4;
> _520 = pretmp_889 + _519;
> _521 = *_520;
> _525 = MEM[(float *)local_Filter_33 + 84B];
> _526 = _521 * _525;
> _444 = _510 + _526;
> _533 = ix_187 + _997;
> _534 = (long unsigned int) _533;
> _535 = _534 * 4;
> _536 = pretmp_889 + _535;
> _537 = *_536;
> _541 = MEM[(float *)local_Filter_33 + 88B];
> _542 = _537 * _541;
> _549 = _202 + _997;
> _550 = (long unsigned int) _549;
> _551 = _550 * 4;
> _552 = pretmp_889 + _551;
> _553 = *_552;
> _557 = MEM[(float *)local_Filter_33 + 92B];
> _558 = _553 * _557;
> _565 = _218 + _997;
> _566 = (long unsigned int) _565;
> _567 = _566 * 4;
> _568 = pretmp_889 + _567;
> _569 = *_568;
> _573 = MEM[(float *)local_Filter_33 + 96B];
> _574 = _569 * _573;
> _445 = _558 + _574;
> _430 = _444 + _445;
> _257 = _429 + _430;
> sum_575 = _257 + _542;
> _21 = pretmp_887 + _363;
> *_21 = sum_575;
> ivtmp_1064 = ivtmp_1065 - 1;
> if (ivtmp_1064 != 0)
> goto <bb 3>;
> else
> goto <bb 5>;
>
> <bb 5>:
> ivtmp_1062 = ivtmp_1063 - 1;
> if (ivtmp_1062 != 0)
> goto <bb 6>;
> else
> goto <bb 8>;
>
> <bb 6>:
>
> <bb 7>:
> # iy_186 = PHI <_990(6), 2(2)>
> # ivtmp_1063 = PHI <ivtmp_1062(6), 512(2)>
> _970 = iy_186 + -2;
> _971 = _970 * 516;
> _979 = iy_186 + -1;
> _980 = _979 * 516;
> _985 = iy_186 * 516;
> _990 = iy_186 + 1;
> _991 = _990 * 516;
> _996 = iy_186 + 2;
> _997 = _996 * 516;
> goto <bb 4>;
>
> <bb 8>:
> return;
>
>Most memory references in <bb 4> access the same memory object,
>but
>IVOPT fails to group these IVs because PRE hoists some parts of the
>address
>computation into <bb 7>. PRE/LIM also create code that is harder to handle
>than the vectorizer's, because the CSE opportunities are hidden by re-association.
>
>I will first try to fix the vectorizer issue, since the PRE/LIM issue isn't that
>critical: it is only exposed in loops unrolled by tree cunroll
>and in
>versioned/peeled loops.