[Bug rtl-optimization/36712] New: Inefficient loop unrolling
bmei at broadcom dot com
gcc-bugzilla@gcc.gnu.org
Thu Jul 3 09:07:00 GMT 2008
Loop unrolling generates far worse code compared with manually unrolled code.
In the following code, the first version is GCC-unrolled and the second is
manually unrolled.
The GCC-unrolled version mainly suffers from two issues. First, the
load/store offsets are held in registers, so extra ADD instructions are
needed to advance the offset for each unrolled copy of the loop body. In
contrast, the manually unrolled code uses immediate offsets efficiently and
only needs one ADD to adjust the base register at the end. Second, the alias
(dependence) analysis is overly conservative: the LOAD instruction of the
next unrolled iteration cannot be moved ahead of the previous STORE
instruction even though they clearly do not alias. I suspect the alias
analysis failure is related to the first issue, i.e. how base and offset
addresses are handled. The .sched2 file shows that the first loop body
requires 57 cycles whereas the second one takes 50 cycles for arm9 (56
cycles vs. 34 cycles for XScale). It becomes even worse for our VLIW port
due to the longer latency of MUL and load instructions and the inability to
fill all slots (120 cycles vs. 20 cycles).
tst.c:
/*
 * Reproducer (rolled form): multiply each of the first 64 elements of
 * b_inout by s, storing the product back in place.
 *
 * The loop is written as a plain counted loop; in this report it is compiled
 * with -funroll-loops so that GCC performs the unrolling itself.
 *
 * s        multiplier
 * b_inout  array read and written in place (indices 0..63)
 * out      not referenced by the body
 */
void Unroll( short s, int * restrict b_inout, int *restrict out)
{
int i;
/* Fixed trip count of 64; one element per iteration. */
for (i=0; i<64; i++)
{
b_inout[i] = b_inout[i] * s;
}
}
arm-elf-gcc tst.c -O2 -std=c99 -S -v -fdump-tree-all -da -mcpu=arm9
-funroll-loops
@ Compiler output for Unroll(); the loop body at .L2 handles eight elements
@ per pass.
@ Register roles as used below:
@   r4 = copy of r1 taken on entry (array base; presumably b_inout per the
@        ARM calling convention)
@   r6 = r0 shifted left then arithmetically right by 16 (sign-extended
@        low halfword; presumably the short argument s)
@   r5 = running byte offset: starts at 0, stepped by 32, loop ends at 256
@   r0-r3, ip = temporaries (loaded values, products, next offsets)
Unroll:
@ args = 0, pretend = 0, frame = 0
@ frame_needed = 0, uses_anonymous_args = 0
@ link register save eliminated.
mov r0, r0, asl #16
stmfd sp!, {r4, r5, r6}
mov r4, r1
mov r6, r0, asr #16
mov r5, #0
.L2:
@ Element at offset r5. Every access uses a register offset, so each
@ following element needs its own ADD to form the next offset.
ldr r1, [r4, r5]
add ip, r5, #4
mul r0, r6, r1
str r0, [r4, r5]
@ Element at offset r5 + 4 (ip).
ldr r3, [r4, ip]
add r0, ip, #4
mul r2, r6, r3
str r2, [r4, ip]
@ Element at offset r5 + 8 (r0 = ip + 4).
ldr r1, [r4, r0]
add ip, r5, #12
mul r3, r6, r1
str r3, [r4, r0]
@ Element at offset r5 + 12 (ip).
ldr r2, [r4, ip]
add r1, r5, #16
mul r3, r6, r2
str r3, [r4, ip]
@ Element at offset r5 + 16 (r1).
ldr r0, [r4, r1]
add ip, r5, #20
mul r3, r6, r0
str r3, [r4, r1]
@ Element at offset r5 + 20 (ip).
ldr r2, [r4, ip]
add r1, r5, #24
mul r0, r6, r2
str r0, [r4, ip]
@ Element at offset r5 + 24 (r1).
ldr r3, [r4, r1]
add ip, r5, #28
mul r0, r6, r3
str r0, [r4, r1]
@ Element at offset r5 + 28 (ip). r5 is advanced by 32 and compared with
@ 256 before the final store, which still addresses through ip.
ldr r2, [r4, ip]
add r5, r5, #32
mul r3, r6, r2
cmp r5, #256
str r3, [r4, ip]
bne .L2
ldmfd sp!, {r4, r5, r6}
bx lr
.size Unroll, .-Unroll
tst2.c:
/*
 * Reproducer (hand-unrolled form): multiply each of the first 64 elements of
 * b_inout by s, storing the product back in place.
 *
 * The loop header has no increment expression; instead the body repeats the
 * multiply-and-store statement eight times, bumping i after each one, so
 * every pass of the loop handles eight consecutive elements.
 *
 * s        multiplier
 * b_inout  array read and written in place (indices 0..63)
 * out      not referenced by the body
 */
void ManualUnroll( short s, int * restrict b_inout, int *restrict out)
{
int i;
/* i advances by 8 per pass via the eight i++ statements below. */
for (i=0; i<64;)
{
b_inout[i] = b_inout[i] * s;
i++;
b_inout[i] = b_inout[i] * s;
i++;
b_inout[i] = b_inout[i] * s;
i++;
b_inout[i] = b_inout[i] * s;
i++;
b_inout[i] = b_inout[i] * s;
i++;
b_inout[i] = b_inout[i] * s;
i++;
b_inout[i] = b_inout[i] * s;
i++;
b_inout[i] = b_inout[i] * s;
i++;
}
}
arm-elf-gcc tst2.c -O2 -std=c99 -S -mcpu=arm9
@ Compiler output for ManualUnroll(); the loop body at .L7 handles eight
@ elements per pass.
@ Register roles as used below:
@   sl = copy of r1 taken on entry, advanced by 32 each pass (walking
@        pointer; presumably b_inout per the ARM calling convention)
@   fp = r1 + 256 (end pointer the loop compares sl against)
@   r9 = r0 shifted left then arithmetically right by 16 (sign-extended
@        low halfword; presumably the short argument s)
@   r0-r8, ip = loaded values and products
ManualUnroll:
@ args = 0, pretend = 0, frame = 0
@ frame_needed = 0, uses_anonymous_args = 0
@ link register save eliminated.
mov r0, r0, asl #16
stmfd sp!, {r4, r5, r6, r7, r8, r9, sl, fp}
mov sl, r1
mov r9, r0, asr #16
add fp, r1, #256
.L7:
@ Load phase: eight words at sl+0 .. sl+28. The first five use immediate
@ offsets; the last three come from one ldmia based at sl+20.
ldr r3, [sl, #0]
ldr r2, [sl, #4]
ldr r1, [sl, #8]
ldr r0, [sl, #12]
ldr ip, [sl, #16]
add r4, sl, #20
ldmia r4, {r4, r5, r6} @ phole ldm
@ Multiply phase: each loaded word times r9.
mul r7, r9, r3
mul r8, r9, r2
mul r3, r9, r1
mul r2, r9, r0
mul r1, r9, ip
mul r0, r9, r4
mul ip, r9, r5
mul r4, r9, r6
@ Store phase: products written back to sl+0 .. sl+28 (first two via stmia).
stmia sl, {r7, r8} @ phole stm
str r3, [sl, #8]
str r2, [sl, #12]
str r1, [sl, #16]
str r0, [sl, #20]
str ip, [sl, #24]
str r4, [sl, #28]
@ One pointer bump per pass, then compare against the end pointer.
add sl, sl, #32
cmp sl, fp
bne .L7
ldmfd sp!, {r4, r5, r6, r7, r8, r9, sl, fp}
bx lr
.size ManualUnroll, .-ManualUnroll
.ident "GCC: (GNU) 4.4.0 20080530 (experimental)"
My ARM compiler is built with the following configuration:
CC="gcc -m32 -static" CFLAGS="-g"
RANLIB_FOR_TARGET="/home/aashley/work/sourceware/install/bin/arm-elf-ranlib"
AR_FOR_TARGET="/home/aashley/work/sourceware/install/bin/arm-elf-ar"
AS_FOR_TARGET="/home/aashley/work/sourceware/install/bin/arm-elf-as"
LD_FOR_TARGET="/home/aashley/work/sourceware/install/bin/arm-elf-ld"
../src/configure --prefix=/home/bmei/work/trunck-arm --enable-languages=c
--disable-nls --target=arm-elf --disable-shared
--with-mpfr=/projects/firepath/tools/team/packages/x86_64-rhel3-32/mpfr/2.3.0
--with-gmp=/projects/firepath/tools/team/packages/x86_64-rhel3-32/gmp/4.2.2
--disable-libssp
--
Summary: Inefficient loop unrolling
Product: gcc
Version: 4.4.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: rtl-optimization
AssignedTo: unassigned at gcc dot gnu dot org
ReportedBy: bmei at broadcom dot com
GCC target triplet: arm-elf-gcc
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=36712
More information about the Gcc-bugs
mailing list