This is the mail archive of the
gcc@gcc.gnu.org
mailing list for the GCC project.
loop unrolling problems
- To: egcs at egcs dot cygnus dot com
- Subject: loop unrolling problems
- From: Dan Nicolaescu <dann at godzilla dot ics dot uci dot edu>
- Date: 01 Jul 1999 17:23:25 -0700
Hi!
This is for:
gcc -v
Reading specs from /home/dann/build/egcs-2906/lib/gcc-lib/sparc-sun-solaris2.5.1/gcc-2.95/specs
gcc version gcc-2.95 19990629 (prerelease
The following simple code:
int
main()
{
register int k;
float x[1002], y[1002];
for (k=1; k<=999; k++){
x[k] = y[k+1] - y[k];
}
}
when compiled with: gcc -save-temps -O3 -fstrict-aliasing -funroll-all-loops
generates the following sparc assembly (just the loop):
(the comments are mine)
.LL6:
ld [%g1], %f2 ! load y[k]
add %o7, 4, %g2
ld [%g1+4], %f4 ! load y[k+1]
add %o7, 8, %g3
fsubs %f4, %f2, %f4
add %o7, 12, %i0
add %o7, 16, %i1
add %o7, 20, %i2
add %o7, 24, %i3
st %f4, [%o7+%g4] ! store x[k]
ld [%g1+4], %f2 ! load y[k+1] again!!!
add %o7, 28, %i4
ld [%g1+8], %f3
add %o7, 32, %i5
fsubs %f3, %f2, %f3
add %o0, 9, %o0
add %o7, 36, %o7
cmp %o0, 999
st %f3, [%g2+%g4]
[snip]
so you would think that this program has to do 1000 loads, but it
actualy does 1998...
I thought that gcc might not know that the y[k] store cannot clobber
x[k] in this case so I tried the equivalent fortran program (with g77):
PROGRAM TEST
REAL X(1002)
REAL Y(1002)
INTEGER K
DO 9, K=1,999
X(K)=Y(K+1)-Y(K)
9 CONTINUE
END
.LL6:
ld [%o0-4], %f2
addcc %o3, -9, %o3
ld [%o1], %f3
fsubs %f3, %f2, %f3
st %f3, [%o2]
ld [%o0], %f3
ld [%o1+4], %f2
fsubs %f2, %f3, %f2
st %f2, [%o2+4]
ld [%o0+4], %f2
ld [%o1+8], %f3
fsubs %f3, %f2, %f3
st %f3, [%o2+8]
[snip]
as you can see, again, the same problem.
This kind of loop appears quite frequently in fortran codes, so I bet
the fortran quys won't be too happy about this...
Well then let's hold gcc's hand and tell it that the 2 arrays cannot
overlap, using restrict (if my understanding of how restrict works is
correct)
void
test (float * __restrict__ x, float * __restrict__ y)
{
register int k;
for (k=1; k<=999; k++){
x[k] = y[k+1] - y[k];
}
}
int
main()
{
register int k;
float x[1002], y[1002];
test (x,y);
}
.LL6:
ld [%g1], %f3
add %g4, 4, %g2
ld [%g4+%o7], %f2
add %g4, 8, %g3
fsubs %f3, %f2, %f2
add %g4, 12, %i0
add %g4, 16, %i1
add %g4, 20, %i2
add %g4, 24, %i3
st %f2, [%g4+%o0]
ld [%g1+4], %f3
add %g4, 28, %i4
ld [%g2+%o7], %f2
add %g4, 32, %i5
fsubs %f3, %f2, %f2
add %g4, 36, %g4
addcc %o1, -9, %o1
st %f2, [%g2+%o0]
ld [%g1+8], %f3
ld [%g3+%o7], %f2
fsubs %f3, %f2, %f2
st %f2, [%g3+%o0]
ld [%g1+12], %f3
ld [%i0+%o7], %f2
fsubs %f3, %f2, %f2
st %f2, [%i0+%o0]
ld [%g1+16], %f3
ld [%i1+%o7], %f2
fsubs %f3, %f2, %f2
st %f2, [%i1+%o0]
ld [%g1+20], %f3
ld [%i2+%o7], %f2
fsubs %f3, %f2, %f2
st %f2, [%i2+%o0]
ld [%g1+24], %f3
ld [%i3+%o7], %f2
fsubs %f3, %f2, %f2
st %f2, [%i3+%o0]
ld [%g1+28], %f3
ld [%i4+%o7], %f2
fsubs %f3, %f2, %f2
st %f2, [%i4+%o0]
ld [%g1+32], %f3
ld [%i5+%o7], %f2
add %g1, 36, %g1
fsubs %f3, %f2, %f2
bpos .LL6
st %f2, [%i5+%o0]
ret
restore
the same problem, too many loads
It looks like when it unrolls the loop gcc cannot tell that some of
the array elements are the same, so it does not need to load them
again...
The weird thing is if I unroll the loop by hand:
int
main()
{
register int k;
double x[1002], y[1002];
for (k=1; k<=999; k+=2){
x[k] = y[k+1] - y[k];
x[k + 1] = y[k + 2] - y[k + 1];
}
}
I still get too many loads.
But if I make the 2 arrays globals, the the generated code is the following:
.LL6:
ld [%o0+%g1], %f4
add %o0, 8, %i2
ld [%o7+%g1], %f2
add %o7, 8, %g3
ld [%o1], %f7
fsubs %f4, %f2, %f2
ld [%o1+8], %f3
add %o0, 16, %i4
ld [%i2+%g1], %f6
add %o7, 16, %g2
fsubs %f7, %f4, %f7
ld [%g3+%g1], %f8
st %f2, [%o7+%g4]
ld [%o1+16], %f5
add %o0, 24, %i5
fsubs %f3, %f6, %f3
ld [%i4+%g1], %f9
st %f7, [%o0+%g4]
ld [%g2+%g1], %f4
add %o7, 24, %i1
fsubs %f6, %f8, %f8
ld [%o1+24], %f7
add %o0, 32, %i3
ld [%i5+%g1], %f11
add %o7, 32, %i0
fsubs %f9, %f4, %f4
st %f8, [%g3+%g4]
ld [%i1+%g1], %f6
st %f3, [%i2+%g4]
ld [%i3+%g1], %f10
fsubs %f5, %f9, %f5
st %f4, [%g2+%g4]
ld [%i0+%g1], %f3
add %o2, 10, %o2
ld [%o1+32], %f2
fsubs %f11, %f6, %f6
st %f5, [%i4+%g4]
add %o1, 40, %o1
add %o0, 40, %o0
add %o7, 40, %o7
fsubs %f7, %f11, %f7
st %f6, [%i1+%g4]
cmp %o2, 999
fsubs %f10, %f3, %f3
st %f7, [%i5+%g4]
fsubs %f2, %f10, %f2
st %f3, [%i0+%g4]
ble .LL6
st %f2, [%i3+%g4]
ret
restore
so now there are less loads...
A very useful tool to use when playing with this is "ifreq" from the
SHADE/Spixtoools package available for download from Sun, it can count
the number and type of instructions executed by certain function.
--dan