This is the mail archive of the gcc@gcc.gnu.org mailing list for the GCC project.

Index Nav:	[Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav:	[Date Prev] [Date Next]	[Thread Prev] [Thread Next]
Other format:	[Raw text]

Inefficient loop unrolling.

From: "Bingfeng Mei" <bmei at broadcom dot com>
To: gcc at gcc dot gnu dot org
Date: Wed, 2 Jul 2008 04:13:32 -0700
Subject: Inefficient loop unrolling.

Hello,
I am looking at GCC's loop unrolling and find it quite inefficient
compared with a manually unrolled loop, even for a very simple loop. Below
are a simple loop and its manually unrolled version. I didn't
apply any tricks to the manually unrolled one; it is an exact replication of
the original loop body. I expected that with -funroll-loops the first version
would produce code of similar quality to the second one. However,
compiled for the ARM target with mainline GCC, the two functions produce very
different results.

The GCC-unrolled version mainly suffers from two issues. First, the
load/store offsets are registers, so extra ADD instructions are needed to
increment the offset across iterations. In contrast, the manually unrolled code
makes efficient use of immediate offsets and needs only one ADD to
adjust the base register at the end. Second, the alias (dependence) analysis
is overly conservative. The LOAD instruction of the next unrolled iteration
cannot be moved across the previous STORE instruction even though they clearly
do not alias. I suspect the failure of alias analysis is related to the
first issue of handling base and offset addresses. The .sched2 file shows
that the first loop body requires 57 cycles whereas the second one takes
50 cycles for arm9 (56 cycles vs. 34 cycles for XScale). It becomes even
worse for our VLIW port due to the longer latency of MUL and load
instructions and the inability to fill all slots (120 cycles vs. 20
cycles).

By analyzing the compilation phases, I believe that if loop unrolling happened
at the tree level, or if we had an optimizing pass like "ivopts" after
loop unrolling at the RTL level, GCC could produce far more efficient
loop-unrolled code. The "ivopts" pass really does a wonderful job of
optimizing induction variables. Strangely, I found some unrolling
functions at the tree level, but there is no independent tree-level loop
unrolling pass except "cunroll", which does complete unrolling. What
prevents such a tree-level unrolling pass? Or are there any suggestions for
improving the existing RTL-level unrolling? Thanks in advance.

Cheers,
Bingfeng Mei
Broadcom UK


/*
 * Unroll - multiply each of the first 64 elements of b_inout by s, in place.
 *
 * s        multiplier applied to every element
 * b_inout  array that is both read and written; indices 0..63 are accessed
 * out      not referenced anywhere in the body
 *
 * The loop is left rolled on purpose: this is the variant the compiler is
 * asked to unroll itself.
 */
void Unroll( short s, int * restrict b_inout, int *restrict out)
{
        int i;
	/* One element per iteration, 64 iterations in total. */
	for (i=0; i<64; i++)
	{
		b_inout[i] = b_inout[i] * s;
	}
}


/*
 * ManualUnroll - multiply each of the first 64 elements of b_inout by s, in
 * place, with the loop body replicated eight times by hand.
 *
 * s        multiplier applied to every element
 * b_inout  array that is both read and written; indices 0..63 are accessed
 * out      not referenced anywhere in the body
 *
 * The for-statement has no increment clause; i is advanced after each of the
 * eight copies, so every pass through the loop handles eight consecutive
 * elements and the loop runs eight times.
 */
void ManualUnroll( short s, int * restrict b_inout, int *restrict out)
{
        int i;
	for (i=0; i<64;)
	{
		/* Eight identical copies of the rolled loop body follow. */
		b_inout[i] = b_inout[i] * s;
                i++;
		b_inout[i] = b_inout[i] * s;
                i++;
		b_inout[i] = b_inout[i] * s;
                i++;
		b_inout[i] = b_inout[i] * s;
                i++;
		b_inout[i] = b_inout[i] * s;
                i++;
		b_inout[i] = b_inout[i] * s;
                i++;
		b_inout[i] = b_inout[i] * s;
                i++;
		b_inout[i] = b_inout[i] * s;
                i++;
	}
}


arm-elf-gcc tst2.c -O2  -std=c99 -S  -v -fdump-tree-all  -da  -mcpu=arm9
-funroll-loops
@ Unroll: compiler-generated ARM code for the rolled C loop.
@ Register roles as used below:
@   r6 = low 16 bits of incoming r0, sign-extended (asl #16 then asr #16);
@        presumably the C argument `s` -- confirm against the ARM calling convention
@   r4 = copy of incoming r1, base address for every load/store;
@        presumably the C argument `b_inout` -- confirm
@   r5 = byte offset from r4, starts at 0 and grows by 32 per pass
@   ip, r0, r1 = per-copy offsets (r5 + 4, r5 + 8, ... r5 + 28)
@ Every unrolled copy is: load [r4 + offset], multiply by r6, store back to
@ the same address, with a separate add computing the next offset register.
Unroll:
	@ args = 0, pretend = 0, frame = 0
	@ frame_needed = 0, uses_anonymous_args = 0
	@ link register save eliminated.
	mov	r0, r0, asl #16
	stmfd	sp!, {r4, r5, r6}
	mov	r4, r1
	mov	r6, r0, asr #16
	mov	r5, #0
.L2:
	@ copy 0: offset r5
	ldr	r1, [r4, r5]
	add	ip, r5, #4
	mul	r0, r6, r1
	str	r0, [r4, r5]
	@ copy 1: offset ip = r5 + 4
	ldr	r3, [r4, ip]
	add	r0, ip, #4
	mul	r2, r6, r3
	str	r2, [r4, ip]
	@ copy 2: offset r0 = r5 + 8
	ldr	r1, [r4, r0]
	add	ip, r5, #12
	mul	r3, r6, r1
	str	r3, [r4, r0]
	@ copy 3: offset ip = r5 + 12
	ldr	r2, [r4, ip]
	add	r1, r5, #16
	mul	r3, r6, r2
	str	r3, [r4, ip]
	@ copy 4: offset r1 = r5 + 16
	ldr	r0, [r4, r1]
	add	ip, r5, #20
	mul	r3, r6, r0
	str	r3, [r4, r1]
	@ copy 5: offset ip = r5 + 20
	ldr	r2, [r4, ip]
	add	r1, r5, #24
	mul	r0, r6, r2
	str	r0, [r4, ip]
	@ copy 6: offset r1 = r5 + 24
	ldr	r3, [r4, r1]
	add	ip, r5, #28
	mul	r0, r6, r3
	str	r0, [r4, r1]
	@ copy 7: offset ip = r5 + 28; r5 is advanced by 32 before the final
	@ store, which still uses the old offset held in ip
	ldr	r2, [r4, ip]
	add	r5, r5, #32
	mul	r3, r6, r2
	@ loop ends once the byte offset reaches 256
	cmp	r5, #256
	str	r3, [r4, ip]
	bne	.L2
	ldmfd	sp!, {r4, r5, r6}
	bx	lr
	.size	Unroll, .-Unroll

arm-elf-gcc tst2.c -O2  -std=c99 -S  -v -fdump-tree-all  -da  -mcpu=arm9

@ ManualUnroll: compiler-generated ARM code for the hand-unrolled C loop.
@ Register roles as used below:
@   r9 = low 16 bits of incoming r0, sign-extended (asl #16 then asr #16);
@        presumably the C argument `s` -- confirm against the ARM calling convention
@   sl = running pointer, starts as incoming r1 and advances 32 bytes per pass;
@        presumably the C argument `b_inout` -- confirm
@   fp = incoming r1 + 256, the value of sl at which the loop stops
@ Each pass loads eight words from sl+0 .. sl+28, multiplies each by r9, and
@ stores the eight products back to the same eight addresses.
ManualUnroll:
	@ args = 0, pretend = 0, frame = 0
	@ frame_needed = 0, uses_anonymous_args = 0
	@ link register save eliminated.
	mov	r0, r0, asl #16
	stmfd	sp!, {r4, r5, r6, r7, r8, r9, sl, fp}
	mov	sl, r1
	mov	r9, r0, asr #16
	add	fp, r1, #256
.L7:
	@ loads: words at sl+0 .. sl+16 via immediate offsets
	ldr	r3, [sl, #0]
	ldr	r2, [sl, #4]
	ldr	r1, [sl, #8]
	ldr	r0, [sl, #12]
	ldr	ip, [sl, #16]
	@ words at sl+20, sl+24, sl+28 via one load-multiple from r4 = sl + 20
	@ NOTE(review): "phole" presumably marks a peephole-combined ldm/stm -- confirm
	add	r4, sl, #20
	ldmia	r4, {r4, r5, r6}	@ phole ldm
	@ all eight multiplies are issued after all eight loads
	mul	r7, r9, r3
	mul	r8, r9, r2
	mul	r3, r9, r1
	mul	r2, r9, r0
	mul	r1, r9, ip
	mul	r0, r9, r4
	mul	ip, r9, r5
	mul	r4, r9, r6
	@ stores: products 0 and 1 to sl+0 / sl+4 via store-multiple, the
	@ remaining six via immediate offsets 8 .. 28
	stmia	sl, {r7, r8}	@ phole stm
	str	r3, [sl, #8]
	str	r2, [sl, #12]
	str	r1, [sl, #16]
	str	r0, [sl, #20]
	str	ip, [sl, #24]
	str	r4, [sl, #28]
	@ single pointer bump per pass, then compare against the end pointer
	add	sl, sl, #32
	cmp	sl, fp
	bne	.L7
	ldmfd	sp!, {r4, r5, r6, r7, r8, r9, sl, fp}
	bx	lr
	.size	ManualUnroll, .-ManualUnroll
	.ident	"GCC: (GNU) 4.4.0 20080530 (experimental)"

Follow-Ups:
- Re: Inefficient loop unrolling.
  - From: Richard Guenther
- Re: Inefficient loop unrolling.
  - From: Steven Bosscher

Index Nav:	[Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav:	[Date Prev] [Date Next]	[Thread Prev] [Thread Next]