Bug 55190 - ivopts causes loop setup bloat
Summary: ivopts causes loop setup bloat
Status: ASSIGNED
Alias: None
Product: gcc
Classification: Unclassified
Component: rtl-optimization (show other bugs)
Version: 4.8.0
: P3 enhancement
Target Milestone: ---
Assignee: bin cheng
URL:
Keywords:
Depends on:
Blocks:
 
Reported: 2012-11-03 12:16 UTC by Oleg Endo
Modified: 2024-04-24 06:34 UTC (History)
6 users (show)

See Also:
Host:
Target: sh*-*-* arm*-*-* powerpc*-*-*
Build:
Known to work:
Known to fail: 4.7.3, 4.8.0, 4.9.0
Last reconfirmed: 2013-02-16 00:00:00


Attachments

Note You need to log in before you can comment on or make changes to this bug.
Description Oleg Endo 2012-11-03 12:16:18 UTC
The following code:

struct X
{
  int a, b, c, d, e;
};

int test (X* x, unsigned int c)
{
  int s = 0;
  unsigned int i;
  for (i = 0; i < c; ++i)
    s += x[i].b;
  return s;
}

results in:
        tst     r5,r5
        bt/s    .L4
        mov     r5,r1
        shll2   r1
        add     r5,r1
        mov.l   .L9,r2
        shll2   r1
        add     #-20,r1
        shlr2   r1
        mul.l   r2,r1
        mov.l   .L10,r2
        add     #4,r4
        mov     #0,r0
        sts     macl,r1
        and     r2,r1
        add     #1,r1
.L3:
        mov.l   @r4,r2
        dt      r1
        add     #20,r4
        bf/s    .L3
        add     r2,r0
        rts
        nop
.L4:
        rts
        mov     #0,r0
.L11:
        .align 2
.L9:
        .long   214748365
.L10:
        .long   1073741823

In such cases, the loop counter setup code that seems to be produced by ivopts can be left out, which would result in something like:

        tst     r5,r5
        bt/s    .L15
        add     #4,r4
        mov     #0,r0
.L14:
        mov.l   @r4,r1
        dt      r5
        add     #20,r4
        bf/s    .L14
        add     r1,r0
        rts
        nop
.L15:
        rts
        mov     #0,r0
Comment 1 Oleg Endo 2013-02-16 11:12:47 UTC
As of rev. 196091 this problem still exists.
Comment 2 Oleg Endo 2013-06-17 00:13:42 UTC
Looking at a simpler case (with -O2) ....

int test_0 (int* x, int y)
{
  int sum = 0;

  for (int i = 0; i < y; ++i)
    sum += x[i];

  return sum;
}

        cmp/pl  r5
        bf/s    .L6
        mov     #0,r0

        shll2   r5
        add     #-4,r5
        shlr2   r5
        add     #1,r5    // r5 = ((((unsigned int)y << 2) - 4) >> 2) + 1

        .align 2
.L3:
        mov.l   @r4+,r1
        dt      r5
        bf/s    .L3
        add     r1,r0
.L6:
        rts
        nop

In this case, if 'y' initially has the value '0x7FFFFFFF' the resulting loop count is truncated to '0x3FFFFFFF'.  This is sort of OK, since the resulting address would overflow and that is undefined behavior.
On the other hand, if an unlucky address for 'x' is passed in the first place, the resulting address might overflow much earlier than that.  Thus the loop counter fiddling seems pointless.

The tree-ssa-ivopts pass converts the loop to this:

Replacing exit test: if (y_3(D) > i_11)
int test_0(int*, int) (int * x, int y)
{
  unsigned int ivtmp.6;
  int i;
  int sum;
  unsigned int i.0;
  unsigned int _1;
  void * _2;
  int _9;
  unsigned int _19;
  unsigned int _20;
  unsigned int _21;

  <bb 2>:
  if (y_3(D) > 0)
    goto <bb 3>;
  else
    goto <bb 7>;

  <bb 3>:
  ivtmp.6_12 = (unsigned int) x_6(D);
  _1 = (unsigned int) y_3(D);
  _21 = _1 * 4;
  _20 = (unsigned int) x_6(D);
  _19 = _20 + _21;

  <bb 4>:
  # sum_16 = PHI <sum_10(6), 0(3)>
  # ivtmp.6_15 = PHI <ivtmp.6_14(6), ivtmp.6_12(3)>
  _2 = (void *) ivtmp.6_15;
  _9 = MEM[base: _2, offset: 0B];
  sum_10 = _9 + sum_16;
  ivtmp.6_14 = ivtmp.6_15 + 4;
  if (ivtmp.6_14 != _19)
    goto <bb 6>;
  else
    goto <bb 5>;

  <bb 5>:
  # sum_18 = PHI <sum_10(4)>
  goto <bb 7>;

  <bb 6>:
  goto <bb 4>;

  <bb 7>:
  # sum_13 = PHI <sum_18(5), 0(2)>
  return sum_13;

}

... which uses address '(x + y * 4)' as the loop exit test.

It is expanded to RTL as '(x + (y << 2))'


;; Generating RTL for gimple basic block 3

;; ivtmp.6_12 = (unsigned int) x_6(D);

(insn 38 37 0 (set (reg:SI 190 [ ivtmp.6 ])
        (reg/v/f:SI 194 [ x ])) -1
     (nil))

;; _19 = ivtmp.6_12 + _21;

(insn 39 38 40 (set (reg:SI 196 [ D.1617 ])
        (ashift:SI (reg/v:SI 195 [ y ])
            (const_int 2 [0x2]))) -1
     (nil))

(insn 40 39 0 (set (reg:SI 191 [ D.1617 ])
        (plus:SI (reg:SI 190 [ ivtmp.6 ])
            (reg:SI 196 [ D.1617 ]))) -1
     (nil))

... and remains until the loop2_doloop RTL pass, which converts the whole thing into a decrement-and-test loop and adds the other loop counter modifications:

Analyzing operand (reg:SI 191 [ D.1617 ]) of insn (insn 45 44 46 4 (set (reg:SI 147 t)
        (eq:SI (reg:SI 190 [ ivtmp.6 ])
            (reg:SI 191 [ D.1617 ]))) sh_tmp.cpp:5 17 {cmpeqsi_t}
     (expr_list:REG_DEAD (reg:SI 191 [ D.1617 ])
        (nil)))
  invariant (reg:SI 191 [ D.1617 ]) (in SI)
;; improved upper bound by one.
;; Determined upper bound -2.
Loop 1 is simple:
  simple exit 4 -> 7
  number of iterations: (lshiftrt:SI (plus:SI (minus:SI (reg:SI 191 [ D.1617 ])
            (reg:SI 190 [ ivtmp.6 ]))
        (const_int -4 [0xfffffffffffffffc]))
    (const_int 2 [0x2]))
  upper bound: 1073741822
  realistic bound: -1


The code in loop-iv.c works out the correct loop count if it gets the actual loop count upper bound instead of the truncated address upper bound if the tree-ssa-ivopts pass is turned off via -fno-ivopts.

I have also tried out the same code on ARM:

        cmp     r1, #0
        ble     .L4
        mov     r3, r0
        add     r1, r0, r1, asl #2
        mov     r0, #0
.L3:
        ldr     r2, [r3], #4
        cmp     r3, r1
        add     r0, r0, r2
        bne     .L3
        bx      lr
.L4:
        mov     r0, #0
        bx      lr

Since there is no doloop pattern on ARM, the code is left as it was output by the tree-ssa-ivopts pass, i.e. the exit test uses 'x + (y << 2)'.

So this doesn't seem to be a SH only issue.  However, I'm not sure whether this is more a problem of tree-ssa-ivopts or loop-iv.
If the tree-ssa-ivopts pass left the loop counter alone, the doloop pass would pick it up and the upper bound calculations in this case would go away.  However, targets that can do better without doloop (such as ARM) would probably suffer.  So probably it would be better to handle this overflow case in loop-iv.
Comment 3 bin.cheng 2013-09-30 07:15:57 UTC
ARM can benefit from doloop structure too, but it is implemented in different way. ARM backend defines special addsi_compare pattern and let combine pass combine decrement and comparison instruction, thus saving the comparison instruction.

IVOPT can be improved to select two iv candidates for the example loop, with auto-increment one for the memory access and decrement one for loop exit check.  This is especially good for target supports both doloop and auto-increment instructions like ARM and SH.

BUT most hand-written loops have incremental basic iv, so IVOPT depends on previous pass ivcanon to rewrite it into decremental iv, like below:

for (i = 0; i < 100; i++)
  //loop body

---->
for (i = 100; i > 0; i--)
  //modified loop body

Unfortunately, ivcanon pass only do such loop transformation for loop which iterates constant number times.

It seems difficult for RTL loop passes to revert decision made by IVOPT, so I think it should be done in GIMPLE IVOPT. I will give it a try.

Thanks.
Comment 4 Oleg Endo 2013-10-03 10:00:47 UTC
(In reply to bin.cheng from comment #3)
> ARM can benefit from doloop structure too, but it is implemented in
> different way. ARM backend defines special addsi_compare pattern and let
> combine pass combine decrement and comparison instruction, thus saving the
> comparison instruction.
> 
> IVOPT can be improved to select two iv candidates for the example loop, with
> auto-increment one for the memory access and decrement one for loop exit
> check.  This is especially good for target supports both doloop and
> auto-increment instructions like ARM and SH.

Yes, probably addressing mode selection for memories inside the loop can be covered somehow by ivopt.  However, this PR is about the calculations of the loop counter and the handling of overflows.

> 
> BUT most hand-written loops have incremental basic iv, so IVOPT depends on
> previous pass ivcanon to rewrite it into decremental iv, like below:
> 
> for (i = 0; i < 100; i++)
>   //loop body
> 
> ---->
> for (i = 100; i > 0; i--)
>   //modified loop body
> 
> Unfortunately, ivcanon pass only do such loop transformation for loop which
> iterates constant number times.
> 
> It seems difficult for RTL loop passes to revert decision made by IVOPT, so
> I think it should be done in GIMPLE IVOPT. I will give it a try.

I think this is a another story of missed ivopt opportunities.  Maybe open a new PR for this?
Comment 5 Oleg Endo 2014-07-31 18:03:52 UTC
As of r213381 the issue still persists.
Comment 6 Oleg Endo 2015-04-04 11:46:00 UTC
Another example:

const char* test (const char* s0, int c, int* rout)
{
  int r = 0;

  for (int i = 0; i < c; ++i)
//    r += s0[i];
    r += *s0++;

  *rout = r;
  return s0;
}

On SH compiles to:

_test:
        cmp/pl	r5
        bf	.L4
        mov	r4,r3       // r3 = s0
        add	r5,r4       // r4 = s0 + c
        mov	r4,r1       // r1 = s0 + c
        mov	#0,r2
        sub	r3,r1       // r1 = (s0 + c) - s0 = c  (d'uh)
	.align 2
.L3:
        mov.b   @r3+,r7
        dt      r1
        bf/s    .L3
        add     r7,r2
.L2:
        mov.l   r2,@r6
        rts	
        mov     r4,r0
	.align 1
.L4:
        bra     .L2
        mov     #0,r2
Comment 7 bin cheng 2015-04-07 01:42:10 UTC
I saw similar cases on arm.  If it's fine I would like to revisit this in the coming stage1.
Comment 8 Oleg Endo 2015-04-07 07:27:08 UTC
I've adjusted the title, since the issues here are not SH specific.  There is also a similar PR 62233.
Comment 9 Oleg Endo 2016-02-22 13:56:55 UTC
As of r233601 the issue still persists.
Comment 10 Alan Modra 2016-08-15 08:51:07 UTC
Seen also on powerpc.  Even for simple testcases like
int b[30];
void
fn1 (unsigned int n)
{
  unsigned int i;
  for (i = 0; i < n; i++)
    b[i] = 42;
}

I see for -m32
fn1:
        cmpwi 0,3,0
        beq 0,.L1
        slwi 9,3,2      ####
        lis 10,b-4@ha
        addi 9,9,-4     ####
        la 10,b-4@l(10)
        srwi 9,9,2      ####
        li 8,42
        addi 9,9,1      ####
        mtctr 9
        .p2align 4,,15
.L3:
        stwu 8,4(10)
        bdnz .L3
.L1:
        blr
All of the insns marked #### are redundant.

For -m64 this loop doesn't even use bdnz
Loop 1 is simple:
  simple exit 4 -> 5
  infinite if: (expr_list:REG_DEP_TRUE (ne:SI (and:DI (plus:DI (minus:DI (ashift:DI (reg:DI 189)
                        (const_int 2 [0x2]))
                    (reg:DI 179 [ ivtmp.7 ]))
                (symbol_ref:DI ("b") [flags 0x80]  <var_decl 0x7fe7521de870 b>))
            (const_int 3 [0x3]))
        (const_int 0 [0]))
    (nil))
  number of iterations: (lshiftrt:DI (plus:DI (minus:DI (reg:DI 185 [ _18 ])
            (reg:DI 179 [ ivtmp.7 ]))
        (const_int -4 [0xfffffffffffffffc]))
    (const_int 2 [0x2]))
  upper bound: 29
  likely upper bound: 29
  realistic bound: -1
Doloop: Possible infinite iteration case.
Doloop: The loop is not suitable.
Comment 11 Oleg Endo 2016-08-15 11:34:55 UTC
And still on SH, but now with AMS optimization:

char*
test_func_00 (char* p, int c, int x)
{
  do
  {
    *--p = (char)x;
    *--p = (char)x;
    *--p = (char)x;
  } while (--c);
  return p;
}


Results in:
        mov     r5,r7
        mov.l   .L6,r1
        add     r7,r7
        mov     r7,r2
        add     r5,r2
        mul.l   r1,r2
        mov     r4,r3
        exts.b  r6,r6
        add     #-3,r3
        sts     macl,r0
        .align 2
.L2:
        mov     r3,r2
        add     #3,r2
        mov.b   r6,@-r2
        dt      r0
        mov.b   r6,@-r2
        mov.b   r6,@r3
        mov     r2,r3
        bf/s    .L2
        add     #-4,r3

        add     r5,r7
        mov     r4,r0
        rts
        sub     r7,r0
.L7:
        .align 2
.L6:
        .long   -1431655765


And with -fno-ivopts we're getting something what one would expect:
        exts.b  r6,r6
        mov     r4,r1
        mov     r5,r2
        .align 2
.L2:
        mov.b   r6,@-r1
        dt      r2
        mov.b   r6,@-r1
        bf/s    .L2
        mov.b   r6,@-r1

        mov     r5,r0
        shll2   r0
        sub     r0,r5
        mov     r4,r0
        rts
        add     r5,r0
Comment 12 Oleg Endo 2023-07-07 06:49:19 UTC
(In reply to Oleg Endo from comment #0)
> The following code:
> 
> struct X
> {
>   int a, b, c, d, e;
> };
> 
> int test (X* x, unsigned int c)
> {
>   int s = 0;
>   unsigned int i;
>   for (i = 0; i < c; ++i)
>     s += x[i].b;
>   return s;
> }
> 
> results in:
>         tst     r5,r5
>         bt/s    .L4
>         mov     r5,r1
>         shll2   r1
>         add     r5,r1
>         mov.l   .L9,r2
>         shll2   r1
>         add     #-20,r1
>         shlr2   r1
>         mul.l   r2,r1
>         mov.l   .L10,r2
>         add     #4,r4
>         mov     #0,r0
>         sts     macl,r1
>         and     r2,r1
>         add     #1,r1
> .L3:
>         mov.l   @r4,r2
>         dt      r1
>         add     #20,r4
>         bf/s    .L3
>         add     r2,r0
>         rts
>         nop
> .L4:
>         rts
>         mov     #0,r0
> .L11:
>         .align 2
> .L9:
>         .long   214748365
> .L10:
>         .long   1073741823
> 

As of GCC 13, this still produces the same code with loop header bloat.