This is the mail archive of the gcc@gcc.gnu.org mailing list for the GCC project.

Index Nav:	[Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav:	[Date Prev] [Date Next]	[Thread Prev] [Thread Next]

loop unrolling problems

To: egcs at egcs dot cygnus dot com
Subject: loop unrolling problems
From: Dan Nicolaescu <dann at godzilla dot ics dot uci dot edu>
Date: 01 Jul 1999 17:23:25 -0700

Hi!

This is for:

gcc -v
Reading specs from /home/dann/build/egcs-2906/lib/gcc-lib/sparc-sun-solaris2.5.1/gcc-2.95/specs
gcc version gcc-2.95 19990629 (prerelease


The following simple code: 

int
main()
{
  register int    k;
  float  x[1002], y[1002];
  
  for (k=1; k<=999; k++){
    x[k] = y[k+1] - y[k];
  }
  
}

when compiled with: gcc -save-temps -O3 -fstrict-aliasing -funroll-all-loops


generates the following sparc assembly (just the loop): 
(the comments are mine)


.LL6:
	ld	[%g1], %f2      ! load y[k]
	add	%o7, 4, %g2
	ld	[%g1+4], %f4    ! load y[k+1]
	add	%o7, 8, %g3
	fsubs	%f4, %f2, %f4
	add	%o7, 12, %i0
	add	%o7, 16, %i1
	add	%o7, 20, %i2
	add	%o7, 24, %i3
	st	%f4, [%o7+%g4]  ! store x[k]
	ld	[%g1+4], %f2    ! load y[k+1] again!!!
	add	%o7, 28, %i4
	ld	[%g1+8], %f3
	add	%o7, 32, %i5
	fsubs	%f3, %f2, %f3
	add	%o0, 9, %o0
	add	%o7, 36, %o7
	cmp	%o0, 999
	st	%f3, [%g2+%g4]
[snip]

so you would think that this program has to do 1000 loads, but it
actualy does 1998...


I thought that gcc might not know that the y[k] store cannot clobber
x[k] in this case so I tried the equivalent fortran program (with g77):

      PROGRAM TEST
      
      REAL X(1002)
      REAL Y(1002)
      INTEGER K

      DO 9, K=1,999
         X(K)=Y(K+1)-Y(K)
 9    CONTINUE

      END


.LL6:
        ld      [%o0-4], %f2
        addcc   %o3, -9, %o3
        ld      [%o1], %f3
        fsubs   %f3, %f2, %f3
        st      %f3, [%o2]
        ld      [%o0], %f3
        ld      [%o1+4], %f2
        fsubs   %f2, %f3, %f2
        st      %f2, [%o2+4]
        ld      [%o0+4], %f2
        ld      [%o1+8], %f3
        fsubs   %f3, %f2, %f3
        st      %f3, [%o2+8]
[snip]

as you can see, again, the same problem. 

This kind of loop appears quite frequently in fortran codes, so I bet
the fortran quys won't be too happy about this...


Well then let's hold gcc's hand and tell it that the 2 arrays cannot
overlap, using restrict (if my understanding of how restrict works is
correct)

void
test (float * __restrict__ x, float * __restrict__ y)
{
  register int    k;

  for (k=1; k<=999; k++){
    x[k] = y[k+1] - y[k];
  }
  
}


int
main()
{
  register int    k;
  float  x[1002], y[1002];

  test (x,y);
  
}

.LL6:
	ld	[%g1], %f3
	add	%g4, 4, %g2
	ld	[%g4+%o7], %f2
	add	%g4, 8, %g3
	fsubs	%f3, %f2, %f2
	add	%g4, 12, %i0
	add	%g4, 16, %i1
	add	%g4, 20, %i2
	add	%g4, 24, %i3
	st	%f2, [%g4+%o0]
	ld	[%g1+4], %f3
	add	%g4, 28, %i4
	ld	[%g2+%o7], %f2
	add	%g4, 32, %i5
	fsubs	%f3, %f2, %f2
	add	%g4, 36, %g4
	addcc	%o1, -9, %o1
	st	%f2, [%g2+%o0]
	ld	[%g1+8], %f3
	ld	[%g3+%o7], %f2
	fsubs	%f3, %f2, %f2
	st	%f2, [%g3+%o0]
	ld	[%g1+12], %f3
	ld	[%i0+%o7], %f2
	fsubs	%f3, %f2, %f2
	st	%f2, [%i0+%o0]
	ld	[%g1+16], %f3
	ld	[%i1+%o7], %f2
	fsubs	%f3, %f2, %f2
	st	%f2, [%i1+%o0]
	ld	[%g1+20], %f3
	ld	[%i2+%o7], %f2
	fsubs	%f3, %f2, %f2
	st	%f2, [%i2+%o0]
	ld	[%g1+24], %f3
	ld	[%i3+%o7], %f2
	fsubs	%f3, %f2, %f2
	st	%f2, [%i3+%o0]
	ld	[%g1+28], %f3
	ld	[%i4+%o7], %f2
	fsubs	%f3, %f2, %f2
	st	%f2, [%i4+%o0]
	ld	[%g1+32], %f3
	ld	[%i5+%o7], %f2
	add	%g1, 36, %g1
	fsubs	%f3, %f2, %f2
	bpos	.LL6
	st	%f2, [%i5+%o0]
	ret
	restore

the same problem, too many loads

It looks like when it unrolls the loop gcc cannot tell that some of
the array elements are the same, so it does not need to load them
again...

The weird thing is if I unroll the loop by hand: 


int
main()
{
  
  register int    k;
  double  x[1002], y[1002];    
  for (k=1; k<=999; k+=2){
    x[k] = y[k+1] - y[k];
    x[k + 1] = y[k + 2] - y[k + 1];
  }
}

I still get too many loads. 

But if I make the 2 arrays globals, the the generated code is the following:


.LL6:
	ld	[%o0+%g1], %f4
	add	%o0, 8, %i2
	ld	[%o7+%g1], %f2
	add	%o7, 8, %g3
	ld	[%o1], %f7
	fsubs	%f4, %f2, %f2
	ld	[%o1+8], %f3
	add	%o0, 16, %i4
	ld	[%i2+%g1], %f6
	add	%o7, 16, %g2
	fsubs	%f7, %f4, %f7
	ld	[%g3+%g1], %f8
	st	%f2, [%o7+%g4]
	ld	[%o1+16], %f5
	add	%o0, 24, %i5
	fsubs	%f3, %f6, %f3
	ld	[%i4+%g1], %f9
	st	%f7, [%o0+%g4]
	ld	[%g2+%g1], %f4
	add	%o7, 24, %i1
	fsubs	%f6, %f8, %f8
	ld	[%o1+24], %f7
	add	%o0, 32, %i3
	ld	[%i5+%g1], %f11
	add	%o7, 32, %i0
	fsubs	%f9, %f4, %f4
	st	%f8, [%g3+%g4]
	ld	[%i1+%g1], %f6
	st	%f3, [%i2+%g4]
	ld	[%i3+%g1], %f10
	fsubs	%f5, %f9, %f5
	st	%f4, [%g2+%g4]
	ld	[%i0+%g1], %f3
	add	%o2, 10, %o2
	ld	[%o1+32], %f2
	fsubs	%f11, %f6, %f6
	st	%f5, [%i4+%g4]
	add	%o1, 40, %o1
	add	%o0, 40, %o0
	add	%o7, 40, %o7
	fsubs	%f7, %f11, %f7
	st	%f6, [%i1+%g4]
	cmp	%o2, 999
	fsubs	%f10, %f3, %f3
	st	%f7, [%i5+%g4]
	fsubs	%f2, %f10, %f2
	st	%f3, [%i0+%g4]
	ble	.LL6
	st	%f2, [%i3+%g4]
	ret
	restore

so now there are less loads...


A very useful tool to use when playing with this is "ifreq" from the
SHADE/Spixtoools package available for download from Sun, it can count
the number and type of instructions executed by certain function.

--dan

Follow-Ups:
- Re: loop unrolling problems
  - From: Jeffrey A Law

Index Nav:	[Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav:	[Date Prev] [Date Next]	[Thread Prev] [Thread Next]