Bug 24815 - loop unrolling ends up with too much reg+index addressing
Summary: loop unrolling ends up with too much reg+index addressing
Status: NEW
Alias: None
Product: gcc
Classification: Unclassified
Component: rtl-optimization
Version: 4.1.0
Importance: P3 normal
Target Milestone: ---
Assignee: Not yet assigned to anyone
URL:
Keywords: missed-optimization
Depends on: 50749 19078
Blocks: 29842
Reported: 2005-11-11 21:01 UTC by Jorn Wolfgang Rennecke
Modified: 2021-12-19 00:38 UTC
CC: 1 user

See Also:
Host:
Target: sh*-*-*
Build:
Known to work:
Known to fail:
Last reconfirmed: 2012-07-22 00:00:00


Attachments
test case (242 bytes, text/plain)
2005-11-11 21:40 UTC, Jorn Wolfgang Rennecke

Description Jorn Wolfgang Rennecke 2005-11-11 21:01:49 UTC
 
Comment 1 Andrew Pinski 2005-11-11 21:06:42 UTC
Try -fsplit-ivs-in-unroller 
Comment 2 Jorn Wolfgang Rennecke 2005-11-11 21:40:49 UTC
Created attachment 10222
test case

This testcase, compiled with -O2 -funroll-loops, shows numerous reg+index
addresses being used in the inner loop, which requires additional addition
instructions.  (And since INDEX_REG_CLASS is CLASS_LIKELY_SPILLED_P, it also
requires extra reg-reg copies.)

When there is a reg+index address giv with one of the registers being loop
invariant, unrolling should perform strength reduction on the sum so that
reg+offset addressing can be used.

E.g.:

L2:
        mov     r6,r3
        mov.l   @r3+,r0
        shll2   r0
        mov.l   @(r0,r8),r9
        mov     r7,r0
        mov.l   r9,@(r0,r5)
        mov     r7,r2
        add     #4,r2
        mov.l   @(4,r6),r0
        shll2   r0
        mov.l   @(r0,r8),r1
        mov     r2,r0
        mov.l   r1,@(r0,r5)
        add     #4,r2
        mov.l   @(4,r3),r0
        shll2   r0
        mov.l   @(r0,r8),r9
        mov     r2,r0
        mov.l   r9,@(r0,r5)
        add     #12,r7
        mov.l   @(12,r6),r0
        shll2   r0
        mov.l   @(r0,r8),r3
        mov     r7,r0
        mov.l   r3,@(r0,r5)
        add     #4,r7
        mov.l   @(16,r6),r0
        shll2   r0
        mov.l   @(r0,r8),r1
        mov     r7,r0
        mov.l   r1,@(r0,r5)
        add     #4,r7
        mov.l   @(20,r6),r0
        shll2   r0
        mov.l   @(r0,r8),r2
        mov     r7,r0
        mov.l   r2,@(r0,r5)
        add     #4,r7
        mov.l   @(24,r6),r0
        shll2   r0
        mov.l   @(r0,r8),r9
        mov     r7,r0
        mov.l   r9,@(r0,r5)
        add     #4,r7
        mov.l   @(28,r6),r0
        shll2   r0
        mov.l   @(r0,r8),r3
        mov     r7,r0
        mov.l   r3,@(r0,r5)
        add     #4,r7
        mov.l   @(32,r6),r0
        shll2   r0
        mov.l   @(r0,r8),r1
        mov     r7,r0
        mov.l   r1,@(r0,r5)
        add     #4,r7
        mov.l   @(36,r6),r0
        shll2   r0
        mov.l   @(r0,r8),r2
        mov     r7,r0
        mov.l   r2,@(r0,r5)
        add     #4,r7
        mov.l   @(40,r6),r0
        shll2   r0
        mov.l   @(r0,r8),r9
        mov     r7,r0
        mov.l   r9,@(r0,r5)
        add     #4,r7
        mov.l   @(44,r6),r0
        shll2   r0
        mov.l   @(r0,r8),r3
        mov     r7,r0
        mov.l   r3,@(r0,r5)
        add     #48,r6
        add     #4,r7

can be changed into:

        add r5,r7
L2:
        mov.l   @r6+,r0
        shll2   r0
        mov.l   @(r0,r8),r9
        mov.l   r9,@r7
        mov.l   @r6+,r0
        shll2   r0
        mov.l   @(r0,r8),r1
        mov.l   r1,@(4,r7)
        mov.l   @r6+,r0
        shll2   r0
        mov.l   @(r0,r8),r9
        mov.l   r9,@(8,r7)
        mov.l   @r6+,r0
        shll2   r0
        mov.l   @(r0,r8),r3
        mov.l   r3,@(12,r7)
        mov.l   @r6+,r0
        shll2   r0
        mov.l   @(r0,r8),r1
        mov.l   r1,@(16,r7)
        mov.l   @r6+,r0
        shll2   r0
        mov.l   @(r0,r8),r2
        mov.l   r2,@(20,r7)
        mov.l   @r6+,r0
        shll2   r0
        mov.l   @(r0,r8),r9
        mov.l   r9,@(24,r7)
        mov.l   @r6+,r0
        shll2   r0
        mov.l   @(r0,r8),r3
        mov.l   r3,@(28,r7)
        mov.l   @r6+,r0
        shll2   r0
        mov.l   @(r0,r8),r1
        mov.l   r1,@(32,r7)
        mov.l   @r6+,r0
        shll2   r0
        mov.l   @(r0,r8),r2
        mov.l   r2,@(36,r7)
        mov.l   @r6+,r0
        shll2   r0
        mov.l   @(r0,r8),r9
        mov.l   r9,@(40,r7)
        mov.l   @r6+,r0
        shll2   r0
        mov.l   @(r0,r8),r3
        mov.l   r3,@(44,r7)
        add     #48,r7
Beyond the reduced instruction count, the reduced use of r0 also makes this
code simpler to schedule.
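
To illustrate the requested transformation at the source level, here is a
rough sketch (hypothetical code; dst, src, idx and the unroll factor of 2
are placeholders, not taken from the attached test case):

void unrolled_naive (int *dst, const int *src, const int *idx, int n)
{
  /* What the unroller produces today, conceptually: each copy of the
     body addresses dst as base + offset register, i.e. a reg+index
     access that on SH needs the offset moved into r0, plus an add
     per copy.  (n is assumed even for brevity.)  */
  for (int i = 0; i < n; i += 2)
    {
      dst[i]     = src[idx[i]];
      dst[i + 1] = src[idx[i + 1]];
    }
}

void unrolled_reduced (int *dst, const int *src, const int *idx, int n)
{
  /* After strength-reducing the sum dst + i: the loop-invariant base
     is folded into a running pointer d, so the copies can use
     reg+offset addressing (d[0], d[1]) with a single pointer
     increment per unrolled body.  */
  int *d = dst;
  for (int i = 0; i < n; i += 2, d += 2)
    {
      d[0] = src[idx[i]];
      d[1] = src[idx[i + 1]];
    }
}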
Comment 3 Jorn Wolfgang Rennecke 2005-11-11 21:48:22 UTC
(In reply to comment #1)
> Try -fsplit-ivs-in-unroller 
> 

No, that doesn't help.
Comment 4 Steven Bosscher 2006-04-24 22:37:46 UTC
I'm sure Zdenek would argue that strength reduction is not the unroller's task (and fwiw I agree), and Joern would argue that it is.  Either way, this is a valid bug report.
Comment 5 Oleg Endo 2012-07-22 16:25:55 UTC
As of rev 189746 I was able to reproduce the problem with the following reduced test case:

extern int tbl[1000];

void f (int* b, const int* a)
{
  for (int i = 0; i < 998; i++)
    b[i] = a[tbl[i]];
}

... compiled with '-O2 -m4-single -ml' (no loop unrolling):

        mov.l   .L6,r3          ! 66	movsi_ie/1	[length = 2]
        mov     #0,r7           ! 40	movsi_ie/3	[length = 2]
        mov.w   .L7,r2          ! 70	*movhi/1	[length = 2]
        .align 2
.L3:
        mov     r7,r0           ! 77	movsi_ie/2	[length = 2]
        mov.l   @(r0,r3),r1     ! 46	movsi_ie/7	[length = 2]
        dt      r2              ! 71	dect	[length = 2]
        shll2   r1              ! 47	ashlsi3_std/3	[length = 2]
        mov     r1,r0           ! 78	movsi_ie/2	[length = 2]
        mov.l   @(r0,r5),r1     ! 49	movsi_ie/7	[length = 2]
        mov     r7,r0           ! 79	movsi_ie/2	[length = 2]
        add     #4,r7           ! 51	*addsi3_compact	[length = 2]
        bf/s    .L3             ! 72	branch_false	[length = 2]
        mov.l   r1,@(r0,r4)     ! 50	movsi_ie/11	[length = 2]
        rts
        nop                     ! 83	*return_i	[length = 4]
        .align 1
.L7:
        .short	998
.L8:
        .align 2
.L6:
        .long	_tbl


... which would be better as:
        mov.l   .L6,r3
        mov.w   .L7,r2
.L3:
        mov.l   @r3+,r0
        dt      r2
        shll2   r0
        mov.l   @(r0,r5),r1
        mov.l   r1,@r4
        bf/s    .L3
        add     #4,r4

        rts
        nop

With loop unrolling enabled, the output looks similar to the code in comment #2.
It seems that this issue also depends on the auto-inc-dec related PR 50749.
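
For reference, the desired code above corresponds to rewriting the induction
variables at the source level roughly as follows (a sketch only; the compiler
would be expected to derive this form itself via ivopts and auto-inc-dec):

extern int tbl[1000];

void f (int* b, const int* a)
{
  /* tbl is walked with a post-incremented pointer (the @r+ load),
     a[] keeps the one unavoidable reg+index load, and b is advanced
     by 4 bytes per iteration instead of re-deriving b + i.  */
  const int* t = tbl;
  int n = 998;
  do
    *b++ = a[*t++];
  while (--n);
}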