[Bug regression/39838] New: [4.3/4.4/4.5 regression] unoptimal code for two simple loops
alexvod at google dot com
gcc-bugzilla@gcc.gnu.org
Tue Apr 21 17:49:00 GMT 2009
The following code:
struct A
{
int count;
int *data;
};
void func(int, int);
void test (struct A* p, const void **ptrArray, int count)
{
int i, j;
for (i = 0; i < p->count; i++)
{
for (j = 0; j < count; j++)
{
func (p->data[i], p->data[i + 1]);
}
}
}
is compiled to 50 bytes by GCC 4.2.1 and to 56 bytes by GCC 4.4.0 (and also by
GCC 4.3.1) on ARM in Thumb mode.
GCC 4.2.1 (with -march=armv5te -mthumb -mthumb-interwork -fpic -Os):
test:
push {r4, r5, r6, r7, lr}
sub sp, sp, #12
mov r7, r0
mov r5, #0
str r2, [sp, #4]
b .L2
.L3:
ldr r3, [r7, #4]
add r4, r4, #1
ldr r0, [r3, r6]
add r3, r6, r3
ldr r1, [r3, #4]
bl func
.L5:
ldr r3, [sp, #4]
cmp r4, r3
blt .L3
add r5, r5, #1
.L2:
ldr r3, [r7]
cmp r5, r3
bge .L6
lsl r6, r5, #2
mov r4, #0
b .L5
.L6:
add sp, sp, #12
@ sp needed for prologue
pop {r4, r5, r6, r7, pc}
GCC 4.4.0:
test:
push {r4, r5, r6, r7, lr}
sub sp, sp, #12
mov r4, r0
str r2, [sp, #4]
mov r7, #4 // doesn't exist in 4.2.1
mov r5, #0
b .L2
.L3:
ldr r3, [r4, #4]
ldr r2, [sp]
ldr r1, [r3, r7]
ldr r0, [r3, r2]
bl func
add r6, r6, #1
.L5:
ldr r3, [sp, #4]
cmp r6, r3
blt .L3
add r5, r5, #1
add r7, r7, #4 // doesn't exist in 4.2.1
.L2:
ldr r3, [r4]
cmp r5, r3
bge .L6
lsl r2, r5, #2
str r2, [sp] // doesn't exist in 4.2.1
mov r6, #0
b .L5
.L6:
add sp, sp, #12
@ sp needed for prologue
pop {r4, r5, r6, r7, pc}
Changing -Os to -O2 produces even worse code (GCC 4.2.1: 50->64 bytes, GCC 4.4.0: 56->74 bytes; the difference grows from +6 to +10 bytes).
Bisection on trunk shows that the change was introduced by
http://gcc.gnu.org/viewcvs?view=rev&revision=125755 which was the merge of the
pointer_plus branch (therefore adding Andrew Pinski to CC).
It reproduces on x86 as well:
GCC 4.2.4 with -m32 -O2:
test:
pushl %ebp
movl %esp, %ebp
pushl %edi
pushl %esi
pushl %ebx
subl $12, %esp
movl 8(%ebp), %edi
movl $0, -16(%ebp)
movl (%edi), %edx
testl %edx, %edx
jle .L8
.L4:
movl 16(%ebp), %eax
testl %eax, %eax
jle .L6
movl -16(%ebp), %esi
xorl %ebx, %ebx
sall $2, %esi
.p2align 4,,7
.L5:
movl 4(%edi), %eax
addl $1, %ebx
movl 4(%esi,%eax), %edx
movl %edx, 4(%esp)
movl (%eax,%esi), %eax
movl %eax, (%esp)
call func
cmpl 16(%ebp), %ebx
jne .L5
.L6:
addl $1, -16(%ebp)
movl -16(%ebp), %eax
cmpl %eax, (%edi)
jg .L4
.L8:
addl $12, %esp
popl %ebx
popl %esi
popl %edi
popl %ebp
ret
GCC 4.4.0 (with the same options):
test:
pushl %ebp
movl %esp, %ebp
pushl %edi
pushl %esi
movl $4, %esi
pushl %ebx
subl $44, %esp
movl 8(%ebp), %edi
movl $0, -28(%ebp)
movl (%edi), %edx
testl %edx, %edx
jle .L6
.p2align 4,,7
.p2align 3
.L3:
movl 16(%ebp), %eax
testl %eax, %eax
jle .L5
movl -28(%ebp), %ecx
movl %edi, %eax
xorl %ebx, %ebx
sall $2, %ecx
movl %ecx, %edi
movl %eax, %ecx
.p2align 4,,7
.p2align 3
.L4:
movl 4(%ecx), %eax
addl $1, %ebx
movl %ecx, -32(%ebp)
movl (%eax,%esi), %edx
movl %edx, 4(%esp)
movl (%eax,%edi), %eax
movl %eax, (%esp)
call func
movl -32(%ebp), %ecx
cmpl %ebx, 16(%ebp)
jg .L4
movl %ecx, %edi
.L5:
addl $1, -28(%ebp)
addl $4, %esi
movl -28(%ebp), %eax
cmpl %eax, (%edi)
jg .L3
.L6:
addl $44, %esp
popl %ebx
popl %esi
popl %edi
popl %ebp
ret
Some statistics by instruction (1.s is the GCC 4.2.4 output, 2.s is the GCC 4.4.0 output):
$ cat 1.s|grep -v '[.:]'|awk '{print $1}'|sort|uniq -c|sort -g
1 call
1 ret
1 sall
1 subl
1 xorl
2 cmpl
2 testl
3 addl
4 popl
4 pushl
12 movl
$ cat 2.s|grep -v '[.:]'|awk '{print $1}'|sort|uniq -c|sort -g
1 call
1 ret
1 sall
1 subl
1 xorl
2 cmpl
2 testl
4 addl
4 popl
4 pushl
19 movl
Going from 12 to 19 movl instructions is not very good.
--
Summary: [4.3/4.4/4.5 regression] unoptimal code for two simple
loops
Product: gcc
Version: 4.4.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: regression
AssignedTo: unassigned at gcc dot gnu dot org
ReportedBy: alexvod at google dot com
GCC build triplet: x86_64-unknown-linux-gnu
GCC host triplet: x86_64-unknown-linux-gnu
GCC target triplet: x86_64-unknown-linux-gnu
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=39838
More information about the Gcc-bugs
mailing list