This is the mail archive of the
gcc-bugs@gcc.gnu.org
mailing list for the GCC project.
[Bug tree-optimization/24653] New: [4.1 regression] EON regressed seriously on x86-64
- From: "hubicka at gcc dot gnu dot org" <gcc-bugzilla at gcc dot gnu dot org>
- To: gcc-bugs at gcc dot gnu dot org
- Date: 3 Nov 2005 11:26:20 -0000
- Subject: [Bug tree-optimization/24653] New: [4.1 regression] EON regressed seriously on x86-64
- Reply-to: gcc-bugzilla at gcc dot gnu dot org
Eon seems to be our largest regression on x86-64 relative to all previous GCCs
up to 3.3-hammer branch.
The slowdown is about 7% at -O2; at -O3 -ffast-math -march=k8
-funroll-all-loops with profile feedback it is already over 10%.
I've hacked the sources so the inline decisions of 4.0 and 4.1 mostly match
(initializer of gritIterator needs to be marked alwaysinline, but doing so
won't make the situation better) and got the following profile out of 4.1:
CPU: AMD64 processors, speed 2400.17 MHz (estimated)
Counted CPU_CLK_UNHALTED events (Cycles outside of halt state) with a unit mask
of 0x00 (No unit mask) count 500000
samples % symbol name
120973 12.0901 mrSurfaceList::viewingHit(ggRay3 const&, double, double,
double, mrViewingHitRecord&, ggMaterialRecord&) const
120622 12.0550 mrGrid::viewingHit(ggRay3 const&, double, double, double,
mrViewingHitRecord&, ggMaterialRecord&) const
119050 11.8979 ggSpectrum::Set(float)
82415 8.2366 mrGrid::shadowHit(ggRay3 const&, double, double, double,
double&, ggVector3&, int&, ggSpectrum&) const
43740 4.3714 ggRayXZRectangleIntersect(ggRay3 const&, float, float, float,
float, float, double, double, double&, double&,
double&)
32433 3.2414 ggRayBoxIntersect(ggRay3 const&, ggBox3 const&, double,
double, ggONB3&, ggPoint3&, double&)
28028 2.8011 mrSurfaceList::shadowHit(ggRay3 const&, double, double,
double, double&, ggVector3&, int&, ggSpectrum&) const
25568 2.5553 mrInstance::shadowHit(ggRay3 const&, double, double, double,
double&, ggVector3&, int&, ggSpectrum&) const
25541 2.5526 mrXZRectangle::shadowHit(ggRay3 const&, double, double,
double, double&, ggVector3&, int&, ggSpectrum&) const
and out of 4.0:
110943 12.0589 ggSpectrum::Set(float)
97609 10.6095 mrGrid::viewingHit(ggRay3 const&, double, double, double,
mrViewingHitRecord&, ggMaterialRecord&) const
93316 10.1429 mrSurfaceList::viewingHit(ggRay3 const&, double, double,
double, mrViewingHitRecord&, ggMaterialRecord&) const
68093 7.4013 mrGrid::shadowHit(ggRay3 const&, double, double, double,
double&, ggVector3&, int&, ggSpectrum&) const
31232 3.3947 ggRayXZRectangleIntersect(ggRay3 const&, float, float, float,
float, float, double, double, double&, double&,
double&)
30519 3.3172 mrSurfaceList::shadowHit(ggRay3 const&, double, double,
double, double&, ggVector3&, int&, ggSpectrum&) const
25570 2.7793 mrInstance::shadowHit(ggRay3 const&, double, double, double,
double&, ggVector3&, int&, ggSpectrum&) const
24977 2.7149 mrCookPixelRenderer::directLight(ggRay3 const&, double,
ggPoint3 const&, ggONB3 const&, ggPoint2 const&, ggBRD
So all the *hit functions got consistently slower. Fortunately they all look
pretty much the same (walking data using the grid iterator), so the slowdown
might have a common cause.
I will attach the oprofiled assembly of the viewingHit function. The main
difference is in the longest BB of the function, which is quite different
because 4.1 SRAs out the iterator. -fno-tree-sra won't make the regression
disappear. With 4.1 and -fno-tree-sra the BB in question looks like:
k_462 = iterator.k;
j_463 = iterator.j;
i_464 = iterator.i;
D.46656_476 = p_142->e[0];
D.46609.e[0] = D.46656_476;
D.46657_477 = p_142->e[1];
D.46609.e[1] = D.46657_477;
D.46658_478 = p_142->e[2];
D.46609.e[2] = D.46658_478;
D.46611_481 = D.46609.e[2];
D.46612_482 = (double) k_462;
D.46614_484 = D.46467_416 * D.46612_482;
e2_485 = D.46611_481 + D.46614_484;
D.46616.e[0] = D.46656_476;
D.46616.e[1] = D.46657_477;
D.46616.e[2] = D.46658_478;
D.46618_497 = D.46616.e[1];
D.46619_498 = (double) j_463;
D.46621_500 = D.46458_394 * D.46619_498;
e1_501 = D.46618_497 + D.46621_500;
D.46623.e[0] = D.46656_476;
D.46623.e[1] = D.46657_477;
D.46623.e[2] = D.46658_478;
D.46625_513 = D.46623.e[0];
D.46626_514 = (double) i_464;
D.46628_516 = D.46449_372 * D.46626_514;
e0_517 = D.46625_513 + D.46628_516;
boxMin.e[0] = e0_517;
boxMin.e[1] = e1_501;
boxMin.e[2] = e2_485;
D.46630.e[0] = D.46656_476;
D.46630.e[1] = D.46657_477;
D.46630.e[2] = D.46658_478;
D.46632_536 = D.46630.e[2];
D.46633_537 = k_462 + 1;
D.46634_538 = (double) D.46633_537;
D.46635_540 = D.46467_416 * D.46634_538;
e2_541 = D.46632_536 + D.46635_540;
D.46637.e[0] = D.46656_476;
D.46637.e[1] = D.46657_477;
D.46637.e[2] = D.46658_478;
D.46639_553 = D.46637.e[1];
D.46640_554 = j_463 + 1;
D.46641_555 = (double) D.46640_554;
D.46642_557 = D.46458_394 * D.46641_555;
e1_558 = D.46639_553 + D.46642_557;
D.46644.e[0] = D.46656_476;
D.46644.e[1] = D.46657_477;
D.46644.e[2] = D.46658_478;
D.46646_570 = D.46644.e[0];
D.46647_571 = i_464 + 1;
D.46648_572 = (double) D.46647_571;
D.46649_574 = D.46449_372 * D.46648_572;
e0_575 = D.46646_570 + D.46649_574;
boxMax.e[0] = e0_575;
boxMax.e[1] = e1_558;
boxMax.e[2] = e2_541;
cellBox.pmin.e[2] = 0.0;
cellBox.pmin.e[1] = 0.0;
cellBox.pmin.e[0] = 0.0;
cellBox.pmax.e[2] = 0.0;
cellBox.pmax.e[1] = 0.0;
cellBox.pmax.e[0] = 0.0;
cellBox.pmin = boxMin;
cellBox.pmax = boxMax;
D.46719.e[0] = D.46538_345;
D.46719.e[1] = D.46535_342;
D.46719.e[2] = D.46532_339;
o_601 = D.46719.e[0];
D.46721.e[0] = D.46523_329;
D.46721.e[1] = D.46521_327;
D.46721.e[2] = D.46519_325;
temp_612 = D.46721.e[0];
if (temp_612 != 0.0) goto <L46>; else goto <L48>;
4.0 version is:
<L32>:;
t1.187_430 = t1;
iterator.tCellMax = t1.187_430;
D.41484_448 = v_60->e[2];
e2_449 = t1.187_430 * D.41484_448;
D.41486_450 = v_60->e[1];
e1_451 = t1.187_430 * D.41486_450;
D.41488_452 = v_60->e[0];
e0_453 = t1.187_430 * D.41488_452;
D.41499_462 = p_74->e[2];
p$e$2_464 = e2_449 + D.41499_462;
D.41502_465 = p_74->e[1];
p$e$1_467 = e1_451 + D.41502_465;
D.41505_468 = p_74->e[0];
p$e$0_470 = e0_453 + D.41505_468;
D.41425_480 = iterator.iGrid;
this_481 = &D.41425_480->gridBox;
p_484 = &this_481->pmin;
SR.507_487 = p_484->e[0];
D.41429_493 = p$e$0_470 - SR.507_487;
D.41430_495 = D.41425_480->xDimension;
D.41431_496 = D.41429_493 / D.41430_495;
D.41432_497 = (int) D.41431_496;
iterator.i = D.41432_497;
D.41425_502 = iterator.iGrid;
this_503 = &D.41425_502->gridBox;
p_506 = &this_503->pmin;
SR.509_510 = p_506->e[1];
D.41438_515 = p$e$1_467 - SR.509_510;
D.41439_517 = D.41425_502->yDimension;
D.41440_518 = D.41438_515 / D.41439_517;
D.41441_519 = (int) D.41440_518;
iterator.j = D.41441_519;
D.41425_524 = iterator.iGrid;
this_525 = &D.41425_524->gridBox;
p_528 = &this_525->pmin;
SR.511_533 = p_528->e[2];
D.41447_537 = p$e$2_464 - SR.511_533;
D.41448_539 = D.41425_524->zDimension;
D.41449_540 = D.41447_537 / D.41448_539;
D.41450_541 = (int) D.41449_540;
iterator.k = D.41450_541;
D.41451_543 = iterator.i;
D.41425_544 = iterator.iGrid;
D.41452_545 = D.41425_544->nx;
if (D.41451_543 >= D.41452_545) goto <L45>; else goto <L46>;
What seems strange is that the [2] in the 4.0 version just gets used in
arithmetic, while 4.1 copies it around for some reason I don't follow (yet).
Honza
--
Summary: [4.1 regression] EON regressed seriously on x86-64
Product: gcc
Version: 4.1.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: tree-optimization
AssignedTo: unassigned at gcc dot gnu dot org
ReportedBy: hubicka at gcc dot gnu dot org
GCC build triplet: x86_64-gnu-linux
GCC host triplet: x86_64-gnu-linux
GCC target triplet: x86_64-gnu-linux
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=24653