This is the mail archive of the
gcc@gcc.gnu.org
mailing list for the GCC project.
mainline performance regression (observed on SPARC)
- From: Dan Nicolaescu <dann at ics dot uci dot edu>
- To: gcc at gcc dot gnu dot org
- Date: Fri, 24 May 2002 11:36:48 -0700
- Subject: mainline performance regression (observed on SPARC)
Comparing the performance of g++-3.1 and mainline on the oopack
benchmark from
http://www.coyotegulch.com/reviews/intel_comp/icc-6.0_gcc-3.0.4_benchmarks.tar.gz
on a sparc-sun-solaris2.7 machine with an Ultra-IIi processor
reveals that there's a regression in performance for the mainline for
the "Complex" benchmark.
I don't know whether this is SPARC-specific; I haven't tried it on any
other system.
OOPACK:
g++-3.1
Seconds Mflops
Test Iterations C OOP C OOP Ratio
---- ---------- ----------- ----------- -----
Max 100000 3.1 3.4 32.5 29.2 1.1
Matrix 1000 3.8 4.0 66.0 63.0 1.0
Complex 100000 8.8 30.8 91.4 26.0 3.5
Iterator 100000 2.4 2.4 83.3 82.6 1.0
mainline g++
Seconds Mflops
Test Iterations C OOP C OOP Ratio
---- ---------- ----------- ----------- -----
Max 100000 3.0 3.7 33.0 26.7 1.2
Matrix 1000 3.8 3.8 66.5 65.6 1.0
Complex 100000 8.7 41.6 91.5 19.2 4.8
Iterator 100000 2.4 2.4 83.0 82.6 1.0
The assembly for the offending function is shown below, with the output
of the two compilers side by side. The difference is just one extra "add"
instruction and a different schedule.
The code was compiled with: -O3 -mcpu=ultrasparc
_ZNK16ComplexBenchmark9oop_styleEv: _ZNK16ComplexBenchmark9oop_styleEv:
.LLFB19: .LLFB19:
!#PROLOGUE# 0 !#PROLOGUE# 0
save %sp, -208, %sp save %sp, -208, %sp
.LLCFI10: .LLCFI12:
!#PROLOGUE# 1 !#PROLOGUE# 1
sethi %hi(.LLC37), %g1 sethi %hi(.LLC36), %i2
sethi %hi(.LLC36), %i1 sethi %hi(.LLC37), %g1
ldd [%g1+%lo(.LLC37)], %f0 sethi %hi(Y), %i1
sethi %hi(Y), %i0 sethi %hi(X), %i0
sethi %hi(X), %g1 ldd [%i2+%lo(.LLC36)], %f0
ldd [%i1+%lo(.LLC36)], %f2 or %i1, %lo(Y), %o7
or %i0, %lo(Y), %o7 or %i0, %lo(X), %l0
or %g1, %lo(X), %l0 std %f0, [%fp-32]
std %f0, [%fp-24] ldd [%g1+%lo(.LLC37)], %f0
mov 0, %i0 mov 0, %g1
std %f2, [%fp-32] std %f0, [%fp-24]
ldd [%fp-32], %o0 ldd [%fp-32], %o4
ldd [%fp-24], %o2 ldd [%fp-24], %o2
.LL184: .LL185:
sll %i0, 4, %i1 sll %g1, 4, %i5
std %o0, [%fp-96] std %o4, [%fp-96]
add %i0, 1, %i0 add %l0, %i5, %i2
std %o2, [%fp-88] add %i5, %l0, %i4
add %l0, %i1, %i3 add %o7, %i5, %l1
add %i1, %l0, %g1 add %g1, 1, %g1
ldd [%i3], %i4 cmp %g1, 999
cmp %i0, 999 ldd [%i2], %i0
ldd [%fp-96], %f28 std %o2, [%fp-88]
std %i4, [%fp-112] ldd [%l1], %o0
add %o7, %i1, %i4 ldd [%i4+8], %i2
ldd [%g1+8], %i2 add %i5, %o7, %i4
add %i1, %o7, %g1 std %i0, [%fp-112]
ldd [%i4], %o4 std %i2, [%fp-104]
std %i2, [%fp-104] ldd [%fp-96], %f28
add %o7, %i1, %i2 ldd [%i4+8], %i0
ldd [%fp-112], %f8 std %o0, [%fp-64]
ldd [%fp-88], %f30 std %i0, [%fp-56]
ldd [%fp-104], %f14 ldd [%fp-88], %f24
fmuld %f28, %f8, %f26 ldd [%fp-112], %f26
fmuld %f30, %f8, %f24 ldd [%fp-104], %f8
ldd [%g1+8], %i4 fmuld %f28, %f26, %f20
fmuld %f30, %f14, %f12 fmuld %f24, %f8, %f22
std %o4, [%fp-64] fmuld %f28, %f8, %f16
fmuld %f28, %f14, %f22 ldd [%fp-64], %f12
ldd [%fp-64], %f18 fmuld %f24, %f26, %f18
std %i4, [%fp-56] ldd [%fp-56], %f6
fsubd %f26, %f12, %f20 fsubd %f20, %f22, %f14
ldd [%fp-56], %f6 faddd %f16, %f18, %f10
faddd %f22, %f24, %f16 faddd %f12, %f14, %f4
faddd %f18, %f20, %f10 std %f4, [%fp-48]
std %f10, [%fp-48] faddd %f6, %f10, %f2
ldd [%fp-48], %i4 std %f2, [%fp-40]
faddd %f6, %f16, %f4 ldd [%fp-48], %i0
std %f4, [%fp-40] std %i0, [%l1]
std %i4, [%i2] ldd [%fp-40], %i2
ldd [%fp-40], %i2 ble,pt %icc, .LL185
ble,pt %icc, .LL184 std %i2, [%i4+8]
std %i2, [%g1+8] nop
nop return %i7+8
return %i7+8 nop
nop
Any idea what the cause of this is?
The code is kind of silly: it moves values back and forth between the
integer and FP registers, probably an artifact of the SPARC calling
conventions (passing doubles in the integer registers). But shouldn't
inlining get rid of this?