[Bug rtl-optimization/36712] New: Inefficient loop unrolling

bmei at broadcom dot com gcc-bugzilla@gcc.gnu.org
Thu Jul 3 09:07:00 GMT 2008


Loop unrolling generates far worse code compared with manually unrolled code. 
In the following code, the first version is GCC-unrolled and the second is 
manually unrolled. 

The GCC-unrolled version mainly suffers from two issues. First, the
load/store offsets are registers, so extra ADD instructions are needed to
advance the offset for each unrolled copy. In contrast, the manually
unrolled code makes efficient use of immediate offsets and only needs one
ADD to adjust the base register at the end. Second, the alias (dependence)
analysis is overly conservative. The LOAD instruction of the next unrolled
iteration cannot be moved ahead of the previous STORE instruction even
though they clearly do not alias. I suspect the failure of alias analysis
is related to the first issue of handling base and offset addresses. The
.sched2 file shows that the first loop body requires 57 cycles whereas the
second one takes 50 cycles for arm9 (56 cycles vs 34 cycles for Xscale).
It becomes even worse for our VLIW port due to the longer latency of MUL
and Load instructions and the inability to fill all slots (120 cycles vs.
20 cycles).

tst.c
/*
 * Reproducer for the compiler-unrolled case.
 *
 * Multiplies each of the first 64 ints of b_inout by s, in place.
 * The loop is written rolled (one element per iteration).
 * The parameter `out` is never referenced in the body.
 */
void Unroll( short s, int * restrict b_inout, int *restrict out)
{
        int i;
        for (i=0; i<64; i++)
        {
                /* Read-modify-write of a single element. */
                b_inout[i] = b_inout[i] * s;
        }
}
arm-elf-gcc tst.c -O2  -std=c99 -S  -v -fdump-tree-all  -da  -mcpu=arm9
-funroll-loops
Unroll:
        @ -- Lines beginning "@ --" are reviewer annotations added to this
        @ -- listing; they are not part of the compiler output.
        @ -- Register roles as used below:
        @ --   r6 = low 16 bits of r0, sign-extended (the multiplier)
        @ --   r4 = copy of r1 (base address for every load and store)
        @ --   r5 = byte offset of the current group of 8 words
        @ --   ip, r0, r1 = temporaries holding r5 + 4, r5 + 8, ... r5 + 28
        @ args = 0, pretend = 0, frame = 0
        @ frame_needed = 0, uses_anonymous_args = 0
        @ link register save eliminated.
        @ -- Shift left 16 then arithmetic shift right 16 (below) sign-extends
        @ -- the low halfword of r0.
        mov     r0, r0, asl #16
        stmfd   sp!, {r4, r5, r6}
        mov     r4, r1
        mov     r6, r0, asr #16
        @ -- Byte offset starts at 0.
        mov     r5, #0
.L2:
        @ -- One pass handles 8 words. Each word is load / multiply / store,
        @ -- and each load sits after the preceding word's store. The address
        @ -- is always [r4, <register>], so an ADD is issued per word to
        @ -- materialise the next offset.
        @ -- word at offset r5
        ldr     r1, [r4, r5]
        add     ip, r5, #4
        mul     r0, r6, r1
        str     r0, [r4, r5]
        @ -- word at offset r5 + 4 (ip)
        ldr     r3, [r4, ip]
        @ -- r0 = ip + 4, i.e. r5 + 8
        add     r0, ip, #4
        mul     r2, r6, r3
        str     r2, [r4, ip]
        @ -- word at offset r5 + 8 (r0)
        ldr     r1, [r4, r0]
        add     ip, r5, #12
        mul     r3, r6, r1
        str     r3, [r4, r0]
        @ -- word at offset r5 + 12 (ip)
        ldr     r2, [r4, ip]
        add     r1, r5, #16
        mul     r3, r6, r2
        str     r3, [r4, ip]
        @ -- word at offset r5 + 16 (r1)
        ldr     r0, [r4, r1]
        add     ip, r5, #20
        mul     r3, r6, r0
        str     r3, [r4, r1]
        @ -- word at offset r5 + 20 (ip)
        ldr     r2, [r4, ip]
        add     r1, r5, #24
        mul     r0, r6, r2
        str     r0, [r4, ip]
        @ -- word at offset r5 + 24 (r1)
        ldr     r3, [r4, r1]
        add     ip, r5, #28
        mul     r0, r6, r3
        str     r0, [r4, r1]
        @ -- word at offset r5 + 28 (ip)
        ldr     r2, [r4, ip]
        @ -- Advance the group offset by 32 bytes (8 words).
        add     r5, r5, #32
        mul     r3, r6, r2
        @ -- Loop ends when the offset reaches 256 bytes (64 words).
        cmp     r5, #256
        @ -- ip still holds the old r5 + 28, so this stores the 8th word.
        str     r3, [r4, ip]
        bne     .L2
        @ -- Restore the registers pushed on entry and return.
        ldmfd   sp!, {r4, r5, r6}
        bx      lr
        .size   Unroll, .-Unroll


tst2.c:
/*
 * Reproducer for the hand-unrolled case.
 *
 * Multiplies each of the first 64 ints of b_inout by s, in place.
 * The multiply statement is written out eight times per pass, each
 * followed by i++, so i advances by 8 per pass of the for loop.
 * The parameter `out` is never referenced in the body.
 */
void ManualUnroll( short s, int * restrict b_inout, int *restrict out)
{
        int i;
        /* No increment clause: i is advanced inside the body. */
        for (i=0; i<64;)
        {
                b_inout[i] = b_inout[i] * s;
                i++;
                b_inout[i] = b_inout[i] * s;
                i++;
                b_inout[i] = b_inout[i] * s;
                i++;
                b_inout[i] = b_inout[i] * s;
                i++;
                b_inout[i] = b_inout[i] * s;
                i++;
                b_inout[i] = b_inout[i] * s;
                i++;
                b_inout[i] = b_inout[i] * s;
                i++;
                b_inout[i] = b_inout[i] * s;
                i++;
        }
}

arm-elf-gcc tst2.c -O2  -std=c99 -S  -mcpu=arm9

ManualUnroll:
        @ -- Lines beginning "@ --" are reviewer annotations added to this
        @ -- listing; they are not part of the compiler output.
        @ -- Register roles as used below:
        @ --   r9 = low 16 bits of r0, sign-extended (the multiplier)
        @ --   sl = running base pointer, starts as a copy of r1
        @ --   fp = r1 + 256, the value of sl at which the loop stops
        @ args = 0, pretend = 0, frame = 0
        @ frame_needed = 0, uses_anonymous_args = 0
        @ link register save eliminated.
        @ -- Shift left 16 then arithmetic shift right 16 (below) sign-extends
        @ -- the low halfword of r0.
        mov     r0, r0, asl #16
        stmfd   sp!, {r4, r5, r6, r7, r8, r9, sl, fp}
        mov     sl, r1
        mov     r9, r0, asr #16
        @ -- End pointer: 256 bytes (64 words) past the start.
        add     fp, r1, #256
.L7:
        @ -- All 8 loads of the pass are issued first, using immediate
        @ -- offsets from sl (no per-word ADD of an offset register).
        ldr     r3, [sl, #0]
        ldr     r2, [sl, #4]
        ldr     r1, [sl, #8]
        ldr     r0, [sl, #12]
        ldr     ip, [sl, #16]
        @ -- r4 = sl + 20; the ldmia then loads the words at sl + 20, + 24
        @ -- and + 28 into r4, r5, r6.
        add     r4, sl, #20
        ldmia   r4, {r4, r5, r6}        @ phole ldm
        @ -- Eight multiplies by r9, one per loaded word.
        mul     r7, r9, r3
        mul     r8, r9, r2
        mul     r3, r9, r1
        mul     r2, r9, r0
        mul     r1, r9, ip
        mul     r0, r9, r4
        mul     ip, r9, r5
        mul     r4, r9, r6
        @ -- Stores follow all the loads: words 0 and 1 via stmia, the
        @ -- remaining six with immediate offsets 8 .. 28.
        stmia   sl, {r7, r8}    @ phole stm
        str     r3, [sl, #8]
        str     r2, [sl, #12]
        str     r1, [sl, #16]
        str     r0, [sl, #20]
        str     ip, [sl, #24]
        str     r4, [sl, #28]
        @ -- Single pointer bump per pass (32 bytes), then compare with the
        @ -- end pointer.
        add     sl, sl, #32
        cmp     sl, fp
        bne     .L7
        @ -- Restore the registers pushed on entry and return.
        ldmfd   sp!, {r4, r5, r6, r7, r8, r9, sl, fp}
        bx      lr
        .size   ManualUnroll, .-ManualUnroll
        .ident  "GCC: (GNU) 4.4.0 20080530 (experimental)"


My ARM compiler is built with the following configuration:
CC="gcc -m32 -static" CFLAGS="-g"
RANLIB_FOR_TARGET="/home/aashley/work/sourceware/install/bin/arm-elf-ranlib"
AR_FOR_TARGET="/home/aashley/work/sourceware/install/bin/arm-elf-ar"
AS_FOR_TARGET="/home/aashley/work/sourceware/install/bin/arm-elf-as"
LD_FOR_TARGET="/home/aashley/work/sourceware/install/bin/arm-elf-ld"
../src/configure --prefix=/home/bmei/work/trunck-arm --enable-languages=c
--disable-nls --target=arm-elf  --disable-shared
--with-mpfr=/projects/firepath/tools/team/packages/x86_64-rhel3-32/mpfr/2.3.0
--with-gmp=/projects/firepath/tools/team/packages/x86_64-rhel3-32/gmp/4.2.2
--disable-libssp


-- 
           Summary: Inefficient loop unrolling
           Product: gcc
           Version: 4.4.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: rtl-optimization
        AssignedTo: unassigned at gcc dot gnu dot org
        ReportedBy: bmei at broadcom dot com
GCC target triplet: arm-elf-gcc


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=36712



More information about the Gcc-bugs mailing list