This is the mail archive of the gcc@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]

Re: temporal optimisation in loops


On Tue, Jul 03, 2001 at 04:29:59PM +0100, Vincent Penquerc'h wrote:
> > the small tests we've done work terrible
> 
> I take you meant that these tests work fine, right ?

Yes it's just a mistake :-)

Here are some examples:

We tranform every data reference in a polyhedral representation.
>From the analysis of this representation we propose a transformation. 
The validation is based on the analysis of the data dependence vectors.
If the transformed dependence vectors respects the old ones, 
then we validate the transformation and generate new indices and new bounds 
for the loop nest.


Consider the following loop nest:

$ cat ./s10.c

int main() 
{
  int i,j,k,N;
  int A[1001][1000];
  N = 500;

  for( i = 1; i <= N; i++)
    for(j = 1; j<=N; j++)
      for(k = 1; k<=N; k++)
	A[i+N][j+k-1] = i*N + i;

  /*
  for(i=1;i<10;i++)
    {
      for(j=1;j<5;j++)
	printf("A[%d][%d] = %d | " , i+N,j,A[i+N][j] );
      printf("\n");
    }	    
  */
  return 0;
}

and the transformed loop :

$ cat ./os10.c

#define MAX(A,B) (A)>(B)?(A):(B)
#define MIN(A,B) (A)>(B)?(B):(A)
int main() 
{
  int i,j,k,N;
  int r;
  int A[1001][1000];
  N = 500;
 
  for( i = 1; i <= N; i++)
    for(j = 2; j<= i + N; j++)
    {
      r = MIN((j-1),N);
      for(k = MAX(1,(-i+j)); k<=r; k++)
	A[i+N][j-1] = i*N + i;
    }

  /*
    for(i=1;i<10;i++)
    {
      for(j=1;j<5;j++)
	printf("A[%d][%d] = %d | " , i+N,j,A[i+N][j] );
      printf("\n");
    }	    
  */
  return 0;
}


Now let's evaluate the speed:

$ gcc -o s10 s10.c
$ gcc -o os10 os10.c

$ cat /proc/cpuinfo 
processor       : 0
vendor_id       : CyrixInstead
cpu family      : 6
model           : 1
model name      : 6x86MX 2x Core/Bus Clock
stepping        : 3
cpu MHz         : 150.346
fdiv_bug        : no
hlt_bug         : no
sep_bug         : no
f00f_bug        : no
coma_bug        : yes
fpu             : yes
fpu_exception   : yes
cpuid level     : 1
wp              : yes
flags           : fpu de tsc msr cx8 mtrr cmov mmx
bogomips        : 299.83

$ time ./s10
36.69user 0.10system 0:39.21elapsed 93%CPU (0avgtext+0avgdata 0maxresident)k
0inputs+0outputs (69major+500minor)pagefaults 0swaps

$ time ./os10
16.41user 0.04system 0:16.85elapsed 97%CPU (0avgtext+0avgdata 0maxresident)k
0inputs+0outputs (69major+501minor)pagefaults 0swaps

$ gcc -O2 -o s10 s10.c
$ gcc -O2 -o os10 os10.c

$ time ./os10
1.22user 0.03system 0:01.25elapsed 99%CPU (0avgtext+0avgdata 0maxresident)k
0inputs+0outputs (69major+501minor)pagefaults 0swaps

$ time ./s10
3.00user 0.02system 0:03.06elapsed 98%CPU (0avgtext+0avgdata 0maxresident)k
0inputs+0outputs (69major+501minor)pagefaults 0swaps

$ gcc --version
3.0

 


Another example:

    for(i=1; i<=N; i++)
	for(j=1; j<=M; j++)
	    for(k=1; k<=i; k++)
		for(l=1; l<=k; l++)
		{
		    A[i][j] = B[k][j] + B[i][k];
		    C[k][l] = E[l][k] + A[i][l];
		    E[i][k] = C[i][j] + D[k][l];
		}

and its transformed form:

    for(i=1; i<=M; i++)
	for(j=1; j<=N; j++)
	    for(k=j; k<=N; k++)
		for(l=1; l<=j; l++)
		{
		    A[k][i] = B[j][i] + B[k][j];
		    C[j][l] = E[l][j] + A[k][l];
		    E[k][j] = C[k][i] + D[j][l];
		}

 


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]