This is the mail archive of the
gcc-bugs@gcc.gnu.org
mailing list for the GCC project.
[Bug middle-end/42505] New: loop canonicalization causes a lot of unnecessary temporary variables
- From: "sliao at google dot com" <gcc-bugzilla at gcc dot gnu dot org>
- To: gcc-bugs at gcc dot gnu dot org
- Date: 25 Dec 2009 18:45:16 -0000
- Subject: [Bug middle-end/42505] New: loop canonicalization causes a lot of unnecessary temporary variables
- Reply-to: gcc-bugzilla at gcc dot gnu dot org
This regression was caused by loop canonicalization.
The following example:
struct A {
int f1;
int f2;
};
int func(int c);
int test(struct A* src, struct A* dst, int count)
{
while (count--) {
if (!func(src->f2)) {
return 0;
}
*dst++ = *src++;
}
return 1;
}
gcc 4.2.1 compiles this to 40 bytes, gcc 4.4.0 to 48 bytes:
gcc 4.2.1 output:
test:
push {r4, r5, r6, lr}
mov r4, r0
mov r5, r1
mov r6, r2
b .L2
.L3:
ldr r0, [r4, #4]
bl func
cmp r0, #0
beq .L6
mov r3, r5
mov r2, r4
ldmia r2!, {r0, r1}
stmia r3!, {r0, r1}
mov r5, r3
mov r4, r2
.L2:
sub r6, r6, #1
bcs .L3
mov r0, #1
.L6:
@ sp needed for prologue
pop {r4, r5, r6, pc}
gcc 4.4.0 output:
push {r4, r5, r6, r7, lr} // note r7 is clobbered
sub sp, sp, #12 // why does anything need to be stored on the stack?
mov r7, r0
str r1, [sp, #4] // why store r1 onto stack?
mov r6, r2
mov r5, #0
b .L2
.L5:
add r4, r7, r5
ldr r0, [r4, #4]
bl func
sub r6, r6, #1
cmp r0, #0
beq .L4
ldr r1, [sp, #4] // load from stack
add r3, r1, r5
add r5, r5, #8
ldmia r4!, {r1, r2}
stmia r3!, {r1, r2}
.L2:
cmp r6, #0
bne .L5
mov r0, #1
.L4:
add sp, sp, #12
@ sp needed for prologue
pop {r4, r5, r6, r7, pc}
This is caused by the loop canonicalization pass (pass_iv_optimize) that was
added in gcc 4.4.
Final GIMPLE form in gcc 4.2.1 compiler:
test (src, dst, count)
{
int a;
int D.1545;
<bb 2>:
goto <bb 6> (<L3>);
<L0>:;
a = func (MEM[base: src, offset: 4]);
if (a == 0) goto <L8>; else goto <L2>;
<L8>:;
D.1545 = 0;
goto <bb 8> (<L5>);
<L2>:;
MEM[base: dst] = MEM[base: src];
dst = dst + 8B;
src = src + 8B;
<L3>:;
count = count - 1;
if (count != -1) goto <L0>; else goto <L9>;
<L9>:;
D.1545 = 1;
<L5>:;
return D.1545;
}
The final GIMPLE in gcc 4.4:
test (struct A * src, struct A * dst, int count)
{
unsigned int ivtmp.22; // induction variables introduced by pass_iv_optimize
unsigned int ivtmp.19;
int a;
int D.1274;
<bb 2>:
ivtmp.22 = (unsigned int) count; // copy of count; count itself is no longer used
ivtmp.19 = 0;
goto <bb 6>;
<bb 3>:
a = func (MEM[base: src + ivtmp.19, offset: 4]);
ivtmp.22 = ivtmp.22 - 1;
if (a == 0)
goto <bb 4>;
else
goto <bb 5>;
<bb 4>:
D.1274 = 0;
goto <bb 8>;
<bb 5>:
MEM[base: dst, index: ivtmp.19] = MEM[base: src, index: ivtmp.19];
ivtmp.19 = ivtmp.19 + 8;
<bb 6>:
if (ivtmp.22 != 0)
goto <bb 3>;
else
goto <bb 7>;
<bb 7>:
D.1274 = 1;
<bb 8>:
return D.1274;
}
The subsequent RTL passes cannot optimize away these temporary induction
variables, so they are spilled to the stack, which causes a lot of other
inefficiencies.
The main question is how to fix this. There are three ways:
1) turn off loop canonicalization for -Os
2) optimize the extra variable in the GIMPLE passes
3) optimize the extra variable in the RTL passes
--
Summary: loop canonicalization causes a lot of unnecessary
temporary variables
Product: gcc
Version: 4.4.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: middle-end
AssignedTo: unassigned at gcc dot gnu dot org
ReportedBy: sliao at google dot com
GCC build triplet: i686-linux
GCC host triplet: i686-linux
GCC target triplet: arm-eabi
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=42505