This is the mail archive of the gcc-bugs@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

[Bug tree-optimization/24653] New: [4.1 regression] EON regressed seriously on x86-64


Eon seems to be our largest regression on x86-64 relative to all previous GCCs
going back to the 3.3-hammer branch.
The slowdown is about 7% at -O2; at -O3 -ffast-math -march=k8
-funroll-all-loops with profile feedback it is already over 10%.
I've hacked the sources so that the inline decisions of 4.0 and 4.1 mostly match
(the initializer of gritIterator needs to be marked alwaysinline, but doing so
won't make the situation better) and got the following profile out of 4.1:
CPU: AMD64 processors, speed 2400.17 MHz (estimated)
Counted CPU_CLK_UNHALTED events (Cycles outside of halt state) with a unit mask
of 0x00 (No unit mask) count 500000
samples  %        symbol name
120973   12.0901  mrSurfaceList::viewingHit(ggRay3 const&, double, double,
double, mrViewingHitRecord&, ggMaterialRecord&) const
120622   12.0550  mrGrid::viewingHit(ggRay3 const&, double, double, double,
mrViewingHitRecord&, ggMaterialRecord&) const
119050   11.8979  ggSpectrum::Set(float)
82415     8.2366  mrGrid::shadowHit(ggRay3 const&, double, double, double,
double&, ggVector3&, int&, ggSpectrum&) const
43740     4.3714  ggRayXZRectangleIntersect(ggRay3 const&, float, float, float,
float, float, double, double, double&, double&,
double&)
32433     3.2414  ggRayBoxIntersect(ggRay3 const&, ggBox3 const&, double,
double, ggONB3&, ggPoint3&, double&)
28028     2.8011  mrSurfaceList::shadowHit(ggRay3 const&, double, double,
double, double&, ggVector3&, int&, ggSpectrum&) const
25568     2.5553  mrInstance::shadowHit(ggRay3 const&, double, double, double,
double&, ggVector3&, int&, ggSpectrum&) const
25541     2.5526  mrXZRectangle::shadowHit(ggRay3 const&, double, double,
double, double&, ggVector3&, int&, ggSpectrum&) const

and out of 4.0:
110943   12.0589  ggSpectrum::Set(float)
97609    10.6095  mrGrid::viewingHit(ggRay3 const&, double, double, double,
mrViewingHitRecord&, ggMaterialRecord&) const
93316    10.1429  mrSurfaceList::viewingHit(ggRay3 const&, double, double,
double, mrViewingHitRecord&, ggMaterialRecord&) const
68093     7.4013  mrGrid::shadowHit(ggRay3 const&, double, double, double,
double&, ggVector3&, int&, ggSpectrum&) const
31232     3.3947  ggRayXZRectangleIntersect(ggRay3 const&, float, float, float,
float, float, double, double, double&, double&,
double&)
30519     3.3172  mrSurfaceList::shadowHit(ggRay3 const&, double, double,
double, double&, ggVector3&, int&, ggSpectrum&) const
25570     2.7793  mrInstance::shadowHit(ggRay3 const&, double, double, double,
double&, ggVector3&, int&, ggSpectrum&) const
24977     2.7149  mrCookPixelRenderer::directLight(ggRay3 const&, double,
ggPoint3 const&, ggONB3 const&, ggPoint2 const&, ggBRD

So all the *hit functions got consistently slower.  Fortunately they all look
pretty much the same (walk data using the grid iterator), so it might have a
common cause.
I will attach oprofiled assembly of the viewingHit function. The main difference
is in the longest BB of the function, which is quite different because 4.1 SRAs
out the iterator.  -fno-tree-sra doesn't make the regression disappear.  With 4.1
and -fno-tree-sra the BB in question looks like:
  k_462 = iterator.k;
  j_463 = iterator.j;
  i_464 = iterator.i;
  D.46656_476 = p_142->e[0];
  D.46609.e[0] = D.46656_476;
  D.46657_477 = p_142->e[1];
  D.46609.e[1] = D.46657_477;
  D.46658_478 = p_142->e[2];
  D.46609.e[2] = D.46658_478;
  D.46611_481 = D.46609.e[2];
  D.46612_482 = (double) k_462;
  D.46614_484 = D.46467_416 * D.46612_482;
  e2_485 = D.46611_481 + D.46614_484;
  D.46616.e[0] = D.46656_476;
  D.46616.e[1] = D.46657_477;
  D.46616.e[2] = D.46658_478;
  D.46618_497 = D.46616.e[1];
  D.46619_498 = (double) j_463;
  D.46621_500 = D.46458_394 * D.46619_498;
  e1_501 = D.46618_497 + D.46621_500;
  D.46623.e[0] = D.46656_476;
  D.46623.e[1] = D.46657_477;
  D.46623.e[2] = D.46658_478;
  D.46625_513 = D.46623.e[0];
  D.46626_514 = (double) i_464;
  D.46628_516 = D.46449_372 * D.46626_514;
  e0_517 = D.46625_513 + D.46628_516;
  boxMin.e[0] = e0_517;
  boxMin.e[1] = e1_501;
  boxMin.e[2] = e2_485;
  D.46630.e[0] = D.46656_476;
  D.46630.e[1] = D.46657_477;
  D.46630.e[2] = D.46658_478;
  D.46632_536 = D.46630.e[2];
  D.46633_537 = k_462 + 1;
  D.46634_538 = (double) D.46633_537;
  D.46635_540 = D.46467_416 * D.46634_538;
  e2_541 = D.46632_536 + D.46635_540;
  D.46637.e[0] = D.46656_476;
  D.46637.e[1] = D.46657_477;
  D.46637.e[2] = D.46658_478;
  D.46639_553 = D.46637.e[1];
  D.46640_554 = j_463 + 1;
  D.46641_555 = (double) D.46640_554;
  D.46642_557 = D.46458_394 * D.46641_555;
  e1_558 = D.46639_553 + D.46642_557;
  D.46644.e[0] = D.46656_476;
  D.46644.e[1] = D.46657_477;
  D.46644.e[2] = D.46658_478;
  D.46646_570 = D.46644.e[0];
  D.46647_571 = i_464 + 1;
  D.46648_572 = (double) D.46647_571;
  D.46649_574 = D.46449_372 * D.46648_572;
  e0_575 = D.46646_570 + D.46649_574;
  boxMax.e[0] = e0_575;
  boxMax.e[1] = e1_558;
  boxMax.e[2] = e2_541;
  cellBox.pmin.e[2] = 0.0;
  cellBox.pmin.e[1] = 0.0;
  cellBox.pmin.e[0] = 0.0;
  cellBox.pmax.e[2] = 0.0;
  cellBox.pmax.e[1] = 0.0;
  cellBox.pmax.e[0] = 0.0;
  cellBox.pmin = boxMin;
  cellBox.pmax = boxMax;
  D.46719.e[0] = D.46538_345;
  D.46719.e[1] = D.46535_342;
  D.46719.e[2] = D.46532_339;
  o_601 = D.46719.e[0];
  D.46721.e[0] = D.46523_329;
  D.46721.e[1] = D.46521_327;
  D.46721.e[2] = D.46519_325;
  temp_612 = D.46721.e[0];
  if (temp_612 != 0.0) goto <L46>; else goto <L48>;
4.0 version is:
<L32>:;
  t1.187_430 = t1;
  iterator.tCellMax = t1.187_430;
  D.41484_448 = v_60->e[2];
  e2_449 = t1.187_430 * D.41484_448;
  D.41486_450 = v_60->e[1];
  e1_451 = t1.187_430 * D.41486_450;
  D.41488_452 = v_60->e[0];
  e0_453 = t1.187_430 * D.41488_452;
  D.41499_462 = p_74->e[2];
  p$e$2_464 = e2_449 + D.41499_462;
  D.41502_465 = p_74->e[1];
  p$e$1_467 = e1_451 + D.41502_465;
  D.41505_468 = p_74->e[0];
  p$e$0_470 = e0_453 + D.41505_468;
  D.41425_480 = iterator.iGrid;
  this_481 = &D.41425_480->gridBox;
  p_484 = &this_481->pmin;
  SR.507_487 = p_484->e[0];
  D.41429_493 = p$e$0_470 - SR.507_487;
  D.41430_495 = D.41425_480->xDimension;
  D.41431_496 = D.41429_493 / D.41430_495;
  D.41432_497 = (int) D.41431_496;
  iterator.i = D.41432_497;
  D.41425_502 = iterator.iGrid;
  this_503 = &D.41425_502->gridBox;
  p_506 = &this_503->pmin;
  SR.509_510 = p_506->e[1];
  D.41438_515 = p$e$1_467 - SR.509_510;
  D.41439_517 = D.41425_502->yDimension;
  D.41440_518 = D.41438_515 / D.41439_517;
  D.41441_519 = (int) D.41440_518;
  iterator.j = D.41441_519;
  D.41425_524 = iterator.iGrid;
  this_525 = &D.41425_524->gridBox;
  p_528 = &this_525->pmin;
  SR.511_533 = p_528->e[2];
  D.41447_537 = p$e$2_464 - SR.511_533;
  D.41448_539 = D.41425_524->zDimension;
  D.41449_540 = D.41447_537 / D.41448_539;
  D.41450_541 = (int) D.41449_540;
  iterator.k = D.41450_541;
  D.41451_543 = iterator.i;
  D.41425_544 = iterator.iGrid;
  D.41452_545 = D.41425_544->nx;
  if (D.41451_543 >= D.41452_545) goto <L45>; else goto <L46>;

What seems strange is that the [2] in the 4.0 version just gets used in arithmetic,
while 4.1 copies it around for some reason I don't follow (yet).

Honza


-- 
           Summary: [4.1 regression] EON regressed seriously on x86-64
           Product: gcc
           Version: 4.1.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: tree-optimization
        AssignedTo: unassigned at gcc dot gnu dot org
        ReportedBy: hubicka at gcc dot gnu dot org
 GCC build triplet: x86_64-gnu-linux
  GCC host triplet: x86_64-gnu-linux
GCC target triplet: x86_64-gnu-linux


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=24653


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]