This is the mail archive of the
gcc-help@gcc.gnu.org
mailing list for the GCC project.
Re: better load/store scheduling
- From: Sergei Organov <osv at javad dot com>
- To: gcc-help at gcc dot gnu dot org
- Date: Thu, 01 Mar 2007 14:22:10 +0300
- Subject: Re: better load/store scheduling
- References: <96CDC40E4321F84FA0FB83A1EF2A422864B93F@Hermes.shaktisystems.com>
"Ben Cheng" <bccheng@peakstreaminc.com> writes:
> I am trying to tune the performance of hand-unrolled code. I was
> wondering what cmd-line options should I specify in order to get h[i+1]
> loaded before the store to g[i]:
>
>
> Code:
>
> void foo(int * __restrict g, int * __restrict h)
> {
> int i;
> for (i = 0; i < 4096; i+=2) {
> g[i] = h[i] + 10;
> g[i+1] = h[i+1] + 10;
> }
> }
Use temporaries, so that both loads appear in the source before either store:
/*
 * Add 10 to each of the first 4096 ints of h and write the results to the
 * matching slots of g, handling two elements per pass.
 *
 * Both source elements are read into locals before either result is
 * written, so the h[i+1] load already precedes the g[i] store in the
 * source order.
 */
void foo(int * __restrict g, int * __restrict h)
{
    int idx = 0;

    while (idx < 4096) {
        /* Loads first ... */
        const int even = h[idx];
        const int odd = h[idx + 1];

        /* ... then the stores. */
        g[idx] = even + 10;
        g[idx + 1] = odd + 10;

        idx += 2;
    }
}
>
> Command line:
>
> gcc-4.0.2 -O3 loop.c -fargument-noalias-global -fstrict-aliasing -S
> loop.s
>
> Assembly code of the loop body:
>
> .L2:
> leal 0(,%ebx,4), %eax
> leal (%eax,%esi), %ecx
> leal (%edi,%eax), %eax
> movl -8(%ecx), %edx // = h[i]
> addl $10, %edx // + 10
> movl %edx, -8(%eax) // g[i] =
> movl -4(%ecx), %edx // = h[i+1]
> addl $10, %edx // + 10
> movl %edx, -4(%eax) // g[i+1] =
> addl $2, %ebx
> cmpl $4098, %ebx
> jne .L2
With gcc 4.0.4, it gives:
.L2:
leal 0(,%ebx,4), %edx
addl $2, %ebx
leal (%esi,%edx), %eax
addl %edi, %edx
movl -4(%eax), %ecx
movl -8(%eax), %eax
addl $10, %ecx
addl $10, %eax
cmpl $4098, %ebx
movl %eax, -8(%edx)
movl %ecx, -4(%edx)
jne .L2
With gcc 4.1.2, it gives:
.L2:
movl -4(%ebx,%ecx,4), %eax
movl -8(%ebx,%ecx,4), %edx
addl $10, %eax
addl $10, %edx
movl %edx, -8(%esi,%ecx,4)
movl %eax, -4(%esi,%ecx,4)
addl $2, %ecx
cmpl $4098, %ecx
jne .L2
-- Sergei.