This is the mail archive of the
gcc-bugs@gcc.gnu.org
mailing list for the GCC project.
[Bug tree-optimization/68030] Redundant address calculations in vectorized loop
- From: "amker at gcc dot gnu.org" <gcc-bugzilla at gcc dot gnu dot org>
- To: gcc-bugs at gcc dot gnu dot org
- Date: Tue, 10 May 2016 16:25:57 +0000
- Subject: [Bug tree-optimization/68030] Redundant address calculations in vectorized loop
- Auto-submitted: auto-generated
- References: <bug-68030-4 at http dot gcc dot gnu dot org/bugzilla/>
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=68030
--- Comment #6 from amker at gcc dot gnu.org ---
It's not only the vectorizer that generates code with missed CSE
opportunities; PRE and LIM also perform this kind of transformation.
Compiling the attached example with the command line below
$ ./gcc -S -Ofast -march=haswell pr68030.c -o pr68030.S
-fdump-tree-vect-details -fdump-tree-slp -fdump-tree-ivopts-details
-fdump-tree-all -fno-tree-vectorize
gives the following dump before IVOPT:
<bb 2>:
local_Filter_33 = global_Filters;
pretmp_887 = global_Output;
pretmp_889 = global_Input;
goto <bb 7>;
<bb 3>:
<bb 4>:
# ix_187 = PHI <_202(3), 2(7)>
# ivtmp_1065 = PHI <ivtmp_1064(3), 512(7)>
_154 = ix_187 + -2;
_157 = _154 + _971;
_158 = (long unsigned int) _157;
_159 = _158 * 4;
_160 = pretmp_889 + _159;
_161 = *_160;
_165 = *local_Filter_33;
_166 = _161 * _165;
_170 = ix_187 + -1;
_173 = _170 + _971;
_174 = (long unsigned int) _173;
_175 = _174 * 4;
_176 = pretmp_889 + _175;
_177 = *_176;
_181 = MEM[(float *)local_Filter_33 + 4B];
_182 = _177 * _181;
_81 = _166 + _182;
_189 = ix_187 + _971;
_190 = (long unsigned int) _189;
_191 = _190 * 4;
_192 = pretmp_889 + _191;
_193 = *_192;
_197 = MEM[(float *)local_Filter_33 + 8B];
_198 = _193 * _197;
_202 = ix_187 + 1;
_205 = _202 + _971;
_206 = (long unsigned int) _205;
_207 = _206 * 4;
_208 = pretmp_889 + _207;
_209 = *_208;
_213 = MEM[(float *)local_Filter_33 + 12B];
_214 = _209 * _213;
_218 = ix_187 + 2;
_221 = _218 + _971;
_222 = (long unsigned int) _221;
_223 = _222 * 4;
_224 = pretmp_889 + _223;
_225 = *_224;
_229 = MEM[(float *)local_Filter_33 + 16B];
_230 = _225 * _229;
_82 = _214 + _230;
_67 = _81 + _82;
_243 = _154 + _980;
_244 = (long unsigned int) _243;
_245 = _244 * 4;
_246 = pretmp_889 + _245;
_247 = *_246;
_251 = MEM[(float *)local_Filter_33 + 20B];
_252 = _247 * _251;
_259 = _170 + _980;
_260 = (long unsigned int) _259;
_261 = _260 * 4;
_262 = pretmp_889 + _261;
_263 = *_262;
_267 = MEM[(float *)local_Filter_33 + 24B];
_268 = _263 * _267;
_78 = _252 + _268;
_275 = ix_187 + _980;
_276 = (long unsigned int) _275;
_277 = _276 * 4;
_278 = pretmp_889 + _277;
_279 = *_278;
_283 = MEM[(float *)local_Filter_33 + 28B];
_284 = _279 * _283;
_72 = _198 + _284;
_291 = _202 + _980;
_292 = (long unsigned int) _291;
_293 = _292 * 4;
_294 = pretmp_889 + _293;
_295 = *_294;
_299 = MEM[(float *)local_Filter_33 + 32B];
_300 = _295 * _299;
_307 = _218 + _980;
_308 = (long unsigned int) _307;
_309 = _308 * 4;
_310 = pretmp_889 + _309;
_311 = *_310;
_315 = MEM[(float *)local_Filter_33 + 36B];
_316 = _311 * _315;
_79 = _300 + _316;
_56 = _78 + _79;
_329 = _154 + _985;
_330 = (long unsigned int) _329;
_331 = _330 * 4;
_332 = pretmp_889 + _331;
_333 = *_332;
_337 = MEM[(float *)local_Filter_33 + 40B];
_338 = _333 * _337;
_345 = _170 + _985;
_346 = (long unsigned int) _345;
_347 = _346 * 4;
_348 = pretmp_889 + _347;
_349 = *_348;
_353 = MEM[(float *)local_Filter_33 + 44B];
_354 = _349 * _353;
_75 = _338 + _354;
_361 = ix_187 + _985;
_362 = (long unsigned int) _361;
_363 = _362 * 4;
_364 = pretmp_889 + _363;
_365 = *_364;
_369 = MEM[(float *)local_Filter_33 + 48B];
_370 = _365 * _369;
_377 = _202 + _985;
_378 = (long unsigned int) _377;
_379 = _378 * 4;
_380 = pretmp_889 + _379;
_381 = *_380;
_385 = MEM[(float *)local_Filter_33 + 52B];
_386 = _381 * _385;
_393 = _218 + _985;
_394 = (long unsigned int) _393;
_395 = _394 * 4;
_396 = pretmp_889 + _395;
_397 = *_396;
_401 = MEM[(float *)local_Filter_33 + 56B];
_402 = _397 * _401;
_76 = _386 + _402;
_495 = _75 + _76;
_415 = _154 + _991;
_416 = (long unsigned int) _415;
_417 = _416 * 4;
_418 = pretmp_889 + _417;
_419 = *_418;
_423 = MEM[(float *)local_Filter_33 + 60B];
_424 = _419 * _423;
_431 = _170 + _991;
_432 = (long unsigned int) _431;
_433 = _432 * 4;
_434 = pretmp_889 + _433;
_435 = *_434;
_439 = MEM[(float *)local_Filter_33 + 64B];
_440 = _435 * _439;
_572 = _424 + _440;
_447 = ix_187 + _991;
_448 = (long unsigned int) _447;
_449 = _448 * 4;
_450 = pretmp_889 + _449;
_451 = *_450;
_455 = MEM[(float *)local_Filter_33 + 68B];
_456 = _451 * _455;
_73 = _370 + _456;
_65 = _72 + _73;
_55 = _65 + _67;
_25 = _55 + _56;
_19 = _25 + _495;
_463 = _202 + _991;
_464 = (long unsigned int) _463;
_465 = _464 * 4;
_466 = pretmp_889 + _465;
_467 = *_466;
_471 = MEM[(float *)local_Filter_33 + 72B];
_472 = _467 * _471;
_479 = _218 + _991;
_480 = (long unsigned int) _479;
_481 = _480 * 4;
_482 = pretmp_889 + _481;
_483 = *_482;
_487 = MEM[(float *)local_Filter_33 + 76B];
_488 = _483 * _487;
_556 = _472 + _488;
_20 = _556 + _572;
_429 = _19 + _20;
_501 = _154 + _997;
_502 = (long unsigned int) _501;
_503 = _502 * 4;
_504 = pretmp_889 + _503;
_505 = *_504;
_509 = MEM[(float *)local_Filter_33 + 80B];
_510 = _505 * _509;
_517 = _170 + _997;
_518 = (long unsigned int) _517;
_519 = _518 * 4;
_520 = pretmp_889 + _519;
_521 = *_520;
_525 = MEM[(float *)local_Filter_33 + 84B];
_526 = _521 * _525;
_444 = _510 + _526;
_533 = ix_187 + _997;
_534 = (long unsigned int) _533;
_535 = _534 * 4;
_536 = pretmp_889 + _535;
_537 = *_536;
_541 = MEM[(float *)local_Filter_33 + 88B];
_542 = _537 * _541;
_549 = _202 + _997;
_550 = (long unsigned int) _549;
_551 = _550 * 4;
_552 = pretmp_889 + _551;
_553 = *_552;
_557 = MEM[(float *)local_Filter_33 + 92B];
_558 = _553 * _557;
_565 = _218 + _997;
_566 = (long unsigned int) _565;
_567 = _566 * 4;
_568 = pretmp_889 + _567;
_569 = *_568;
_573 = MEM[(float *)local_Filter_33 + 96B];
_574 = _569 * _573;
_445 = _558 + _574;
_430 = _444 + _445;
_257 = _429 + _430;
sum_575 = _257 + _542;
_21 = pretmp_887 + _363;
*_21 = sum_575;
ivtmp_1064 = ivtmp_1065 - 1;
if (ivtmp_1064 != 0)
goto <bb 3>;
else
goto <bb 5>;
<bb 5>:
ivtmp_1062 = ivtmp_1063 - 1;
if (ivtmp_1062 != 0)
goto <bb 6>;
else
goto <bb 8>;
<bb 6>:
<bb 7>:
# iy_186 = PHI <_990(6), 2(2)>
# ivtmp_1063 = PHI <ivtmp_1062(6), 512(2)>
_970 = iy_186 + -2;
_971 = _970 * 516;
_979 = iy_186 + -1;
_980 = _979 * 516;
_985 = iy_186 * 516;
_990 = iy_186 + 1;
_991 = _990 * 516;
_996 = iy_186 + 2;
_997 = _996 * 516;
goto <bb 4>;
<bb 8>:
return;
Most memory references in <bb 4> access the same memory object, but IVOPT
fails to group these IVs because PRE hoists parts of the address
computation into <bb 7>. Moreover, PRE/LIM produces code that is more difficult
to handle than the vectorizer's, because the CSE opportunities are hidden by
re-association.
I will first try to fix the vectorizer issue, since the PRE/LIM issue isn't
that critical: it is only exposed in loops unrolled by tree cunroll, and in
versioned/peeled loops.