This is the mail archive of the
gcc@gcc.gnu.org
mailing list for the GCC project.
Re: rtlopt loop unroller question
- From: Yossi Markovich <YOSSIM at il dot ibm dot com>
- To: Zdenek Dvorak <rakdver at atrey dot karlin dot mff dot cuni dot cz>
- Cc: "gcc gnu" <gcc at gcc dot gnu dot org>, Ayal Zaks <ZAKS at il dot ibm dot com>, Dorit Naishlos <DORIT at il dot ibm dot com>
- Date: Sun, 26 Oct 2003 12:26:56 +0200
- Subject: Re: rtlopt loop unroller question
Hello,
>> However, if we complicate the example a little bit:
>>
>> {
>> int A[N];
>> int B[N];
>> int C[N];
>> int i;
>>
>> for (i=0; i<N; i++)
>> A[i]=B[i]+C[i];
>> return A;
>> }
>>
>> we still get the inefficient addressing calculations.
>> Using static variables instead of local ones yields
>> much better code with or without your patch.
>I am not sure what you refer to here (could you please demonstrate on
>assembler)?
For the above code we get the following assembly code:
_foo:
lis r3,0xfffe
stmw r17,-60(r1)
ori r2,r3,10976
li r0,1250
stwux r1,r1,r2
mtctr r0
addi r9,r1,24
addis r12,r9,0x1
L5:
addi r8,r12,4
addi r7,r12,8
addi r6,r12,12
addi r5,r12,16
addi r4,r12,20
addi r3,r12,24
addi r2,r12,28
lwz r19,-25528(r12)
lwz r17,14472(r12)
lwz r18,14472(r8)
lwz r10,14472(r7)
add r24,r19,r17
lwz r25,-25528(r8)
addis r19,r12,0xffff
lwz r26,-25528(r7)
addis r17,r2,0xffff
lwz r27,-25528(r6)
add r25,r25,r18
lwz r20,14472(r6)
add r26,r26,r10
lwz r28,-25528(r5)
addis r10,r8,0xffff
lwz r21,14472(r5)
addis r8,r7,0xffff
lwz r29,-25528(r4)
addis r7,r6,0xffff
lwz r22,14472(r4)
addis r6,r5,0xffff
lwz r11,-25528(r3)
addis r5,r4,0xffff
lwz r23,14472(r3)
add r27,r27,r20
lwz r0,-25528(r2)
add r28,r28,r21
lwz r9,14472(r2)
add r29,r29,r22
add r11,r11,r23
addis r4,r3,0xffff
add r18,r0,r9
stw r24,8(r19)
addi r12,r12,32
stw r25,8(r10)
stw r26,8(r8)
stw r27,8(r7)
stw r28,8(r6)
stw r29,8(r5)
stw r11,8(r4)
stw r18,8(r17)
bdnz L5
addi r3,r1,32
lwz r1,0(r1)
lmw r17,-60(r1)
blr
.align 2
.globl _main
Here we still get the inefficient addressing calculations
(even after applying your first patch).
Notice that when the arrays are declared static,
we get the following assembly code,
in which the inefficient calculations are gone (besides the redundant adds):
_foo:
mflr r3
bcl 20,31,L1$pb
L1$pb:
stmw r28,-16(r1)
li r0,2500
mflr r31
mtctr r0
addis r2,r31,ha16(_A.0-L1$pb)
addis r11,r31,ha16(_B.1-L1$pb)
addis r9,r31,ha16(_C.2-L1$pb)
la r30,lo16(_A.0-L1$pb)(r2)
stw r3,8(r1)
la r2,lo16(_B.1-L1$pb)(r11)
la r12,lo16(_C.2-L1$pb)(r9)
li r28,-4
L5:
addi r8,r28,4
addi r7,r8,4
lwzx r29,r8,r12
addi r6,r7,4
lwzx r10,r8,r2
addi r28,r6,4
lwzx r4,r6,r12
lwzx r11,r7,r2
add r10,r10,r29
lwzx r3,r7,r12
lwzx r9,r6,r2
lwzx r0,r28,r2
add r11,r11,r3
lwzx r5,r28,r12
add r29,r9,r4
stwx r10,r8,r30
add r4,r0,r5
stwx r11,r7,r30
stwx r29,r6,r30
stwx r4,r28,r30
bdnz L5
lwz r12,8(r1)
mr r3,r30
lmw r28,-16(r1)
mtlr r12
blr