GVN-PRE sucks -- well, for SPEC CFP2000 mgrid anyway


Hi,

SPEC2000's mgrid benchmark has typical Fortran code that looks
like this:

      DO I3=2,N-1
        DO I2=2,N-1
          DO I1=2,N-1
            R(I1,I2,I3) = V(I1,I2,I3)
     >      -A(0)*( U(I1,  I2,  I3  ) )
     >      -A(1)*( U(I1-1,I2,  I3  ) + U(I1+1,I2,  I3  )
     >                 +  U(I1,  I2-1,I3  ) + U(I1,  I2+1,I3  )
     >                 +  U(I1,  I2,  I3-1) + U(I1,  I2,  I3+1) )
     >      -A(2)*( U(I1-1,I2-1,I3  ) + U(I1+1,I2-1,I3  )
     >                 +  U(I1-1,I2+1,I3  ) + U(I1+1,I2+1,I3  )
     >                 +  U(I1,  I2-1,I3-1) + U(I1,  I2+1,I3-1)
     >                 +  U(I1,  I2-1,I3+1) + U(I1,  I2+1,I3+1)
     >                 +  U(I1-1,I2,  I3-1) + U(I1-1,I2,  I3+1)
     >                 +  U(I1+1,I2,  I3-1) + U(I1+1,I2,  I3+1) )
     >      -A(3)*( U(I1-1,I2-1,I3-1) + U(I1+1,I2-1,I3-1)
     >                 +  U(I1-1,I2+1,I3-1) + U(I1+1,I2+1,I3-1)
     >                 +  U(I1-1,I2-1,I3+1) + U(I1+1,I2-1,I3+1)
     >                 +  U(I1-1,I2+1,I3+1) + U(I1+1,I2+1,I3+1) )
          END DO
        END DO
      END DO

where R, V, A, and U are function arguments and I[123] are loop
counters.

When gfortran flattens the arrays it exposes a lot of redundant
address arithmetic, most of which is loop invariant (see the sketch
after the dump below).  So PRE gets to work, moves everything it can
out of the loops, and we get:

  # BLOCK 14
  # PRED: 0 [79.0%]  (false,exec)
<L28>:;
  pretmp.202<D919>_117 = (int8<D8>)2;
  pretmp.204<D921>_475 = stride.10<D458>_28 * pretmp.202<D919>_117;
  # SUCC: 1 [100.0%]  (fallthru)

  # BLOCK 1
  # PRED: 17 [100.0%]  (fallthru) 14 [100.0%]  (fallthru)
  # prephitmp.205<D922>_466 = PHI <T.95<D560>_214(17), pretmp.204<D921>_475(14)>;
  # prephitmp.203<D920>_122 = PHI <T.94<D559>_213(17), pretmp.202<D919>_117(14)>;
  # TMT.201<D892>_425 = PHI <TMT.201<D892>_502(14), TMT.201<D892>_532(17)>;
  # TMT.200<D891>_357 = PHI <TMT.200<D891>_503(14), TMT.200<D891>_531(17)>;
  # i3<D445>_262 = PHI <2(14), T.93<D558>_212(17)>;
  # count.18<D466>_261 = PHI <count.18<D466>_60(14), count.18<D466>_118(17)>;
<L2>:;
  pretmp.206<D923>_465 = pretmp.202<D919>_117;
  pretmp.208<D925>_463 = stride.8<D456>_24 * pretmp.206<D923>_465;
  pretmp.210<D927>_461 = (int8<D8>)i3<D445>_262;
  pretmp.212<D929>_453 = stride.10<D458>_28 * pretmp.210<D927>_461;
  pretmp.214<D931>_451 = pretmp.208<D925>_463 + pretmp.212<D929>_453;
  pretmp.216<D933>_449 = i3<D445>_262 - 1;
  pretmp.218<D935>_447 = (int8<D8>)pretmp.216<D933>_449;
  pretmp.220<D937>_441 = stride.10<D458>_28 * pretmp.218<D935>_447;
  pretmp.222<D939>_411 = pretmp.208<D925>_463 + pretmp.220<D937>_441;
  pretmp.224<D941>_409 = i3<D445>_262 + 1;
  pretmp.226<D943>_403 = (int8<D8>)pretmp.224<D941>_409;
  pretmp.228<D945>_401 = stride.10<D458>_28 * pretmp.226<D943>_403;
  pretmp.230<D947>_399 = pretmp.208<D925>_463 + pretmp.228<D945>_401;
  # SUCC: 2 [100.0%]  (fallthru,exec)

  # BLOCK 2
  # PRED: 16 [100.0%]  (fallthru) 1 [100.0%]  (fallthru,exec)
  # prephitmp.231<D948>_398 = PHI <T.134<D599>_313(16), pretmp.230<D947>_399(1)>;
  # prephitmp.229<D946>_400 = PHI <T.95<D560>_214(16), pretmp.228<D945>_401(1)>;
  # prephitmp.227<D944>_402 = PHI <T.94<D559>_213(16), pretmp.226<D943>_403(1)>;
  # prephitmp.225<D942>_408 = PHI <T.93<D558>_212(16), pretmp.224<D941>_409(1)>;
  # prephitmp.223<D940>_410 = PHI <T.124<D589>_289(16), pretmp.222<D939>_411(1)>;
  # prephitmp.221<D938>_440 = PHI <T.87<D552>_203(16), pretmp.220<D937>_441(1)>;
  # prephitmp.219<D936>_442 = PHI <T.86<D551>_202(16), pretmp.218<D935>_447(1)>;
  # prephitmp.217<D934>_448 = PHI <T.85<D550>_201(16), pretmp.216<D933>_449(1)>;
  # prephitmp.215<D932>_450 = PHI <T.80<D545>_193(16), pretmp.214<D931>_451(1)>;
  # prephitmp.213<D930>_452 = PHI <T.37<D502>_127(16), pretmp.212<D929>_453(1)>;
  # prephitmp.211<D928>_454 = PHI <T.36<D501>_126(16), pretmp.210<D927>_461(1)>;
  # prephitmp.209<D926>_462 = PHI <T.79<D544>_190(16), pretmp.208<D925>_463(1)>;
  # prephitmp.207<D924>_464 = PHI <T.78<D543>_189(16), pretmp.206<D923>_465(1)>;
  # TMT.201<D892>_426 = PHI <TMT.201<D892>_425(1), TMT.201<D892>_532(16)>;
  # TMT.200<D891>_358 = PHI <TMT.200<D891>_357(1), TMT.200<D891>_531(16)>;
  # i2<D447>_242 = PHI <2(1), T.77<D542>_188(16)>;
  # count.18<D466>_241 = PHI <count.18<D466>_60(1), count.18<D466>_123(16)>;
<L4>:;
  pretmp.232<D949>_397 = (int8<D8>)i2<D447>_242;
  pretmp.234<D951>_382 = stride.8<D456>_24 * pretmp.232<D949>_397;
  pretmp.236<D953>_377 = pretmp.210<D927>_461;
  pretmp.238<D955>_375 = prephitmp.205<D922>_466;
  pretmp.240<D957>_373 = pretmp.234<D951>_382 + pretmp.238<D955>_375;
  pretmp.242<D959>_371 = pretmp.206<D923>_465;
  pretmp.244<D961>_347 = pretmp.240<D957>_373 + pretmp.242<D959>_371;
  pretmp.246<D963>_345 = offset.11<D459>_34 + pretmp.244<D961>_347;
  pretmp.248<D965>_343 = i2<D447>_242 - 1;
  pretmp.250<D967>_337 = (int8<D8>)pretmp.248<D965>_343;
  pretmp.252<D969>_335 = stride.8<D456>_24 * pretmp.250<D967>_337;
  pretmp.254<D971>_333 = pretmp.238<D955>_375 + pretmp.252<D969>_335;
  pretmp.256<D973>_314 = pretmp.242<D959>_371 + pretmp.254<D971>_333;
  pretmp.258<D975>_311 = offset.11<D459>_34 + pretmp.256<D973>_314;
  pretmp.260<D977>_309 = i2<D447>_242 + 1;
  pretmp.262<D979>_307 = (int8<D8>)pretmp.260<D977>_309;
  pretmp.264<D981>_300 = stride.8<D456>_24 * pretmp.262<D979>_307;
  pretmp.266<D983>_278 = pretmp.238<D955>_375 + pretmp.264<D981>_300;
  pretmp.268<D985>_275 = pretmp.242<D959>_371 + pretmp.266<D983>_278;
  pretmp.270<D987>_273 = offset.11<D459>_34 + pretmp.268<D985>_275;
  pretmp.272<D989>_271 = pretmp.216<D933>_449;
  pretmp.274<D991>_265 = pretmp.218<D935>_447;
  pretmp.276<D993>_263 = pretmp.220<D937>_441;
  pretmp.278<D995>_259 = pretmp.234<D951>_382 + pretmp.276<D993>_263;
  pretmp.280<D997>_253 = pretmp.242<D959>_371 + pretmp.278<D995>_259;
  pretmp.282<D999>_251 = offset.11<D459>_34 + pretmp.280<D997>_253;
  pretmp.284<D1001>_249 = pretmp.224<D941>_409;
  pretmp.286<D1003>_247 = pretmp.226<D943>_403;
  pretmp.288<D1005>_239 = pretmp.228<D945>_401;
  pretmp.290<D1007>_237 = pretmp.234<D951>_382 + pretmp.288<D1005>_239;
  pretmp.292<D1009>_235 = pretmp.242<D959>_371 + pretmp.290<D1007>_237;
  pretmp.294<D1011>_230 = offset.11<D459>_34 + pretmp.292<D1009>_235;
  pretmp.296<D1013>_228 = pretmp.252<D969>_335 + pretmp.276<D993>_263;
  pretmp.298<D1015>_211 = pretmp.242<D959>_371 + pretmp.296<D1013>_228;
  pretmp.300<D1017>_205 = offset.11<D459>_34 + pretmp.298<D1015>_211;
  pretmp.302<D1019>_500 = pretmp.264<D981>_300 + pretmp.276<D993>_263;
  pretmp.304<D1021>_498 = pretmp.242<D959>_371 + pretmp.302<D1019>_500;
  pretmp.306<D1023>_496 = offset.11<D459>_34 + pretmp.304<D1021>_498;
  pretmp.308<D1025>_494 = pretmp.252<D969>_335 + pretmp.288<D1005>_239;
  pretmp.310<D1027>_492 = pretmp.242<D959>_371 + pretmp.308<D1025>_494;
  pretmp.312<D1029>_490 = offset.11<D459>_34 + pretmp.310<D1027>_492;
  pretmp.314<D1031>_488 = pretmp.264<D981>_300 + pretmp.288<D1005>_239;
  pretmp.316<D1033>_486 = pretmp.242<D959>_371 + pretmp.314<D1031>_488;
  pretmp.318<D1035>_484 = offset.11<D459>_34 + pretmp.316<D1033>_486;
  # SUCC: 3 [100.0%]  (fallthru,exec)

  # BLOCK 3
  # PRED: 15 [100.0%]  (fallthru) 2 [100.0%]  (fallthru,exec)

[ Notice how all this stuff is hoisted even out of the
  outermost loop! ]
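
For reference, the source of all those temporaries is the flattened
addressing.  Each U(I1,I2,I3) boils down to roughly the following
(a hand-written C sketch with made-up names, not gfortran's actual
lowering), and PRE factors out every shared, loop-invariant piece
of it:

  /* Hand-written sketch of the flattened element access; the names
     (offset, stride2, stride3) are invented, not gfortran's.  */
  static double load_u (const double *u, long offset,
                        long stride2, long stride3,
                        long i1, long i2, long i3)
  {
    /* Only the i1 term varies in the innermost loop; the i2 and i3
       terms (and their I2+/-1, I3+/-1 variants) are invariant there,
       which is exactly the pile of pretmp.* values above.  */
    return u[offset + i1 + i2 * stride2 + i3 * stride3];
  }

With 27 points of U read per assignment, that is a lot of invariant
address arithmetic to keep live at once.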

Needless to say, on almost any architecture this causes so much
spilling that PRE severely pessimizes the generated code.  And
indeed, on AMD64, mgrid compiled with -fno-tree-pre is almost twice
as fast as mgrid with PRE enabled.

I guess this is convincing evidence that we need to teach tree PRE
about register pressure, just like tree loop invariant code motion.
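
A first cut could be as crude as this (purely an illustration of the
idea, nothing like actual GCC code; the names and the budget are
invented):

  /* Toy illustration only: stop hoisting once the estimated number of
     values live across the loop would exceed some register budget.  */
  static int worth_hoisting (int live_hoisted_values, int target_regs)
  {
    /* Keep some registers free for the loop body itself.  */
    int budget = target_regs / 2;
    return live_hoisted_values < budget;
  }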

Gr.
Steven


