Bug 39838 - [4.8/4.9/5/6 regression] unoptimal code for two simple loops
Summary: [4.8/4.9/5/6 regression] unoptimal code for two simple loops
Status: ASSIGNED
Alias: None
Product: gcc
Classification: Unclassified
Component: middle-end (show other bugs)
Version: 4.4.0
: P2 normal
Target Milestone: 4.8.5
Assignee: Zdenek Dvorak
URL:
Keywords: missed-optimization
Depends on:
Blocks: 16996
  Show dependency treegraph
 
Reported: 2009-04-21 17:49 UTC by Alexander Vodomerov
Modified: 2014-12-19 13:26 UTC (History)
7 users (show)

See Also:
Host: x86_64-unknown-linux-gnu
Target: x86_64-unknown-linux-gnu
Build: x86_64-unknown-linux-gnu
Known to work:
Known to fail: 4.7.0
Last reconfirmed: 2010-01-02 00:58:34


Attachments

Note You need to log in before you can comment on or make changes to this bug.
Description Alexander Vodomerov 2009-04-21 17:49:06 UTC
The following code:

struct A
{
  int count;
  int *data;
};
void func(int, int);
void test (struct A* p, const void **ptrArray, int count)
{
  int i, j;
  for (i = 0; i < p->count; i++)
    {
      for (j = 0; j < count; j++)
        {
          func (p->data[i], p->data[i + 1]);
        }
    }
}

is compiled to 50 bytes by GCC 4.2.1 and to 56 bytes by GCC 4.4.0 (and GCC 4.3.1 also) on ARM in thumb mode

GCC 4.2.1 (with -march=armv5te -mthumb -mthumb-interwork -fpic -Os)
test:
        push    {r4, r5, r6, r7, lr}
        sub     sp, sp, #12
        mov     r7, r0
        mov     r5, #0
        str     r2, [sp, #4]
        b       .L2
.L3:
        ldr     r3, [r7, #4]
        add     r4, r4, #1
        ldr     r0, [r3, r6]
        add     r3, r6, r3
        ldr     r1, [r3, #4]
        bl      func
.L5:
        ldr     r3, [sp, #4]
        cmp     r4, r3
        blt     .L3
        add     r5, r5, #1
.L2:
        ldr     r3, [r7]
        cmp     r5, r3
        bge     .L6
        lsl     r6, r5, #2
        mov     r4, #0
        b       .L5
.L6:
        add     sp, sp, #12
        @ sp needed for prologue
        pop     {r4, r5, r6, r7, pc}

GCC 4.4.0:
test:
        push    {r4, r5, r6, r7, lr}
        sub     sp, sp, #12
        mov     r4, r0
        str     r2, [sp, #4]
        mov     r7, #4          // doesn't exist in 4.2.1
        mov     r5, #0
        b       .L2
.L3:
        ldr     r3, [r4, #4]
        ldr     r2, [sp]
        ldr     r1, [r3, r7]
        ldr     r0, [r3, r2]
        bl      func
        add     r6, r6, #1
.L5:
        ldr     r3, [sp, #4]
        cmp     r6, r3
        blt     .L3
        add     r5, r5, #1
        add     r7, r7, #4      // doesn't exist in 4.2.1
.L2:
        ldr     r3, [r4]
        cmp     r5, r3
        bge     .L6
        lsl     r2, r5, #2
        str     r2, [sp]        // doesn't exist in 4.2.1
        mov     r6, #0
        b       .L5
.L6:
        add     sp, sp, #12
        @ sp needed for prologue
        pop     {r4, r5, r6, r7, pc}

Changing -Os to -O2 produces even worse code (50->64, 56->74, +6 -> +10).

Bisection on trunk shows that it was changed by http://gcc.gnu.org/viewcvs?view=rev&revision=125755 which was a merge of pointer_plus branch (therefore adding Andrew Pinski in cc).

It also reproduces on x86 as well:
GCC 4.2.4 with -m32 -O2:
test:
        pushl   %ebp
        movl    %esp, %ebp
        pushl   %edi
        pushl   %esi
        pushl   %ebx
        subl    $12, %esp
        movl    8(%ebp), %edi
        movl    $0, -16(%ebp)
        movl    (%edi), %edx
        testl   %edx, %edx
        jle     .L8
.L4:
        movl    16(%ebp), %eax
        testl   %eax, %eax
        jle     .L6
        movl    -16(%ebp), %esi
        xorl    %ebx, %ebx
        sall    $2, %esi
        .p2align 4,,7
.L5:
        movl    4(%edi), %eax
        addl    $1, %ebx
        movl    4(%esi,%eax), %edx
        movl    %edx, 4(%esp)
        movl    (%eax,%esi), %eax
        movl    %eax, (%esp)
        call    func
        cmpl    16(%ebp), %ebx
        jne     .L5
.L6:
        addl    $1, -16(%ebp)
        movl    -16(%ebp), %eax
        cmpl    %eax, (%edi)
        jg      .L4
.L8:
        addl    $12, %esp
        popl    %ebx
        popl    %esi
        popl    %edi
        popl    %ebp
        ret

GCC 4.4.0 (with the same options):
test:
        pushl   %ebp
        movl    %esp, %ebp
        pushl   %edi
        pushl   %esi
        movl    $4, %esi
        pushl   %ebx
        subl    $44, %esp
        movl    8(%ebp), %edi
        movl    $0, -28(%ebp)
        movl    (%edi), %edx
        testl   %edx, %edx
        jle     .L6
        .p2align 4,,7
        .p2align 3
.L3:
        movl    16(%ebp), %eax
        testl   %eax, %eax
        jle     .L5
        movl    -28(%ebp), %ecx
        movl    %edi, %eax
        xorl    %ebx, %ebx
        sall    $2, %ecx
        movl    %ecx, %edi
        movl    %eax, %ecx
        .p2align 4,,7
        .p2align 3
.L4:
        movl    4(%ecx), %eax
        addl    $1, %ebx
        movl    %ecx, -32(%ebp)
        movl    (%eax,%esi), %edx
        movl    %edx, 4(%esp)
        movl    (%eax,%edi), %eax
        movl    %eax, (%esp)
        call    func
        movl    -32(%ebp), %ecx
        cmpl    %ebx, 16(%ebp)
        jg      .L4
        movl    %ecx, %edi
.L5:
        addl    $1, -28(%ebp)
        addl    $4, %esi
        movl    -28(%ebp), %eax
        cmpl    %eax, (%edi)
        jg      .L3
.L6:
        addl    $44, %esp
        popl    %ebx
        popl    %esi
        popl    %edi
        popl    %ebp
        ret

Some stat by instructions:
$ cat 1.s|grep -v '[.:]'|awk '{print $1}'|sort|uniq -c|sort -g
      1 call
      1 ret
      1 sall
      1 subl
      1 xorl
      2 cmpl
      2 testl
      3 addl
      4 popl
      4 pushl
     12 movl

$ cat 2.s|grep -v '[.:]'|awk '{print $1}'|sort|uniq -c|sort -g
      1 call
      1 ret
      1 sall
      1 subl
      1 xorl
      2 cmpl
      2 testl
      4 addl
      4 popl
      4 pushl
     19 movl

12->19 movl's is not very good.
Comment 1 Andrew Pinski 2009-04-21 17:56:52 UTC
This is IV-opts messing way up as far as I can tell.  Pointer Plus just helped out PRE and code motion passes which confuses the hell out of IV-opts.
Comment 2 Alexander Vodomerov 2009-04-21 18:37:59 UTC
(In reply to comment #1)
> This is IV-opts messing way up as far as I can tell.  Pointer Plus just helped
> out PRE and code motion passes which confuses the hell out of IV-opts.
> 
I tried to use -fno-ivopts flag, but it doesn't have any effect on this.
Comment 3 Andrew Pinski 2009-06-22 05:43:49 UTC
Confirmed.
Comment 4 Richard Biener 2009-08-04 12:30:13 UTC
GCC 4.3.4 is being released, adjusting target milestone.
Comment 5 Steven Bosscher 2010-01-02 00:52:44 UTC
From "GCC: (GNU) 4.5.0 20090808 (experimental) [trunk revision 150579]":

      1 @
      1 bl
      1 pop
      1 push
      1 sub
      2 cmp
      2 str
      4 add
      4 mov
      7 ldr

test:
        push    {r4, r5, r6, r7, lr}
        sub     sp, sp, #12
        mov     r4, r0
        str     r2, [sp, #4]
        mov     r6, #0
        mov     r7, #0
        b       .L2
.L3:
        ldr     r3, [r4, #4]
        ldr     r2, [sp]
        ldr     r0, [r3, r6]
        ldr     r1, [r3, r2]
        bl      func
        add     r5, r5, #1
.L5:
        ldr     r3, [sp, #4]
        cmp     r5, r3
        blt     .L3
        ldr     r6, [sp]
        add     r7, r7, #1
.L2:
        ldr     r3, [r4]
        cmp     r7, r3
        bge     .L1
        add     r2, r6, #4
        str     r2, [sp]
        mov     r5, #0
        b       .L5
.L1:
        add     sp, sp, #12
        @ sp needed for prologue
        pop     {r4, r5, r6, r7, pc}
Comment 6 Steven Bosscher 2010-01-02 00:57:25 UTC
On x86 -O2:

      1 call
      1 leal
      1 ret
      1 subl
      2 cmpl
      2 testl
      2 xorl
      3 addl
      4 popl
      4 pushl
     20 movl

test:
	pushl	%ebp
	movl	%esp, %ebp
	pushl	%edi
	pushl	%esi
	xorl	%esi, %esi
	pushl	%ebx
	subl	$44, %esp
	movl	8(%ebp), %ecx
	movl	$0, -28(%ebp)
	movl	(%ecx), %edx
	testl	%edx, %edx
	jle	.L1
	.p2align 4,,7
	.p2align 3
.L8:
	movl	16(%ebp), %eax
	leal	4(%esi), %edi
	testl	%eax, %eax
	jle	.L6
	movl	%edi, %eax
	xorl	%ebx, %ebx
	movl	%ecx, %edi
	movl	%eax, %ecx
	.p2align 4,,7
	.p2align 3
.L4:
	movl	4(%edi), %eax
	addl	$1, %ebx
	movl	(%eax,%ecx), %edx
	movl	%edx, 4(%esp)
	movl	(%eax,%esi), %eax
	movl	%ecx, -32(%ebp)
	movl	%eax, (%esp)
	call	func
	movl	-32(%ebp), %ecx
	cmpl	%ebx, 16(%ebp)
	jg	.L4
	movl	%ecx, %eax
	movl	%edi, %ecx
	movl	%eax, %edi
.L6:
	addl	$1, -28(%ebp)
	movl	%edi, %esi
	movl	-28(%ebp), %eax
	cmpl	%eax, (%ecx)
	jg	.L8
.L1:
	addl	$44, %esp
	popl	%ebx
	popl	%esi
	popl	%edi
	popl	%ebp
	ret
Comment 7 Richard Biener 2010-05-22 18:13:37 UTC
GCC 4.3.5 is being released, adjusting target milestone.
Comment 8 Jeffrey A. Law 2011-01-13 15:17:50 UTC
I'm by no means an expert on the current IV code, so please take this with a grain of salt.

If we look at the .ivcanon dump we have the two following critical blocks:

  # BLOCK 3 freq:9100
  # PRED: 4 [91.0%]  (true,exec)
  # VUSE <.MEM_21>
  D.1969_8 = p_4(D)->data;
  D.1972_11 = D.1969_8 + D.1971_10;
  # VUSE <.MEM_21>
  D.1973_12 = *D.1972_11;
  D.1977_17 = D.1969_8 + D.1976_16;
  # VUSE <.MEM_21>
  D.1978_18 = *D.1977_17;
  # .MEM_24 = VDEF <.MEM_21>
  func (D.1973_12, D.1978_18);
  j_19 = j_2 + 1;
  # SUCC: 4 [100.0%]  (fallthru,exec)

  # BLOCK 4 freq:10000
  # PRED: 7 [100.0%]  (fallthru,exec) 3 [100.0%]  (fallthru,exec)
  # j_2 = PHI <0(7), j_19(3)>
  # .MEM_21 = PHI <.MEM_22(7), .MEM_24(3)>
  if (j_2 < count_7(D))
    goto <bb 3>;
  else
    goto <bb 5>;
  # SUCC: 3 [91.0%]  (true,exec) 5 [9.0%]  (false,exec)

  # BLOCK 5 freq:900
  # PRED: 4 [9.0%]  (false,exec)
  i_20 = i_1 + 1;
  # SUCC: 6 [100.0%]  (fallthru,exec)

  # BLOCK 6 freq:989
  # PRED: 2 [100.0%]  (fallthru,exec) 5 [100.0%]  (fallthru,exec)
  # i_1 = PHI <0(2), i_20(5)>
  # .MEM_22 = PHI <.MEM_23(D)(2), .MEM_21(5)>
  # VUSE <.MEM_22>
  D.1979_5 = p_4(D)->count;
  if (i_1 < D.1979_5)
    goto <bb 7>;
  else
    goto <bb 8>;
  # SUCC: 7 [91.0%]  (true,exec) 8 [9.0%]  (false,exec)

  # BLOCK 7 freq:900
  # PRED: 6 [91.0%]  (true,exec)
  i.0_9 = (unsigned int) i_1;
  D.1971_10 = i.0_9 * 4;
  D.1975_15 = i.0_9 + 1;
  D.1976_16 = D.1975_15 * 4;
  goto <bb 4>;
  # SUCC: 4 [100.0%]  (fallthru,exec)

Note carefully how we use D1971_10 and D1976_16 to build addresses for the two memory references in block #3 (p->data[i] and p->data[i+1] respectively).

After IVopts we have:


  # BLOCK 3 freq:9100
  # PRED: 4 [91.0%]  (true,exec)
  # VUSE <.MEM_21>
  D.1969_8 = p_4(D)->data;
  D.1972_11 = D.1969_8 + D.1971_10;
  # VUSE <.MEM_21>
  D.1973_12 = *D.1972_11;
  D.1977_17 = D.1969_8 + D.1976_16;
  # VUSE <.MEM_21>
  D.1978_18 = *D.1977_17;
  # .MEM_24 = VDEF <.MEM_21>
  func (D.1973_12, D.1978_18);
  j_19 = j_2 + 1;
  # SUCC: 4 [100.0%]  (fallthru,exec)
  
  # BLOCK 4 freq:10000
  # PRED: 7 [100.0%]  (fallthru,exec) 3 [100.0%]  (fallthru,exec)
  # j_2 = PHI <0(7), j_19(3)>
  # .MEM_21 = PHI <.MEM_22(7), .MEM_24(3)>
  if (j_2 < count_7(D))
    goto <bb 3>;
  else
    goto <bb 5>;
  # SUCC: 3 [91.0%]  (true,exec) 5 [9.0%]  (false,exec)
  
  # BLOCK 5 freq:900
  # PRED: 4 [9.0%]  (false,exec)
  i_20 = i_1 + 1;
  ivtmp.12_14 = ivtmp.12_13 + 4;
  # SUCC: 6 [100.0%]  (fallthru,exec)

  # BLOCK 6 freq:989
  # PRED: 2 [100.0%]  (fallthru,exec) 5 [100.0%]  (fallthru,exec)
  # i_1 = PHI <0(2), i_20(5)>
  # .MEM_22 = PHI <.MEM_23(D)(2), .MEM_21(5)>
  # ivtmp.12_13 = PHI <4(2), ivtmp.12_14(5)>
  # VUSE <.MEM_22>
  D.1979_5 = p_4(D)->count;
  if (i_1 < D.1979_5)
    goto <bb 7>;
  else
    goto <bb 8>;
  # SUCC: 7 [91.0%]  (true,exec) 8 [9.0%]  (false,exec)

  # BLOCK 7 freq:900
  # PRED: 6 [91.0%]  (true,exec) 
  D.1992_6 = (unsigned int) i_1;
  D.1993_26 = D.1992_6 * 4; 
  D.1971_10 = D.1993_26;
  D.1976_16 = ivtmp.12_13; 
  goto <bb 4>; 
  # SUCC: 4 [100.0%]  (fallthru,exec)

UGH.  Everything involving ivtmp.12 is a waste of time.  We really just need to realize that D1976_16 is D1971_10 + 4 which avoids all the nonsense with ivtmp.12 and I *think* would restore the quality of this code.   I don't know enough about the current ivopts code to prototype this and verify that such a change would restore the quality of this code.

Zdenek, can you take a look?
Comment 9 Zdenek Dvorak 2011-01-17 11:04:22 UTC
> UGH.  Everything involving ivtmp.12 is a waste of time.  We really just need to
> realize that D1976_16 is D1971_10 + 4 which avoids all the nonsense with
> ivtmp.12 and I *think* would restore the quality of this code.   I don't know
> enough about the current ivopts code to prototype this and verify that such a
> change would restore the quality of this code.

actually, ivopts know that D1976_16 is D1971_10 + 4; however, since both

D1971_10 = ivtmp.12 - 4;
and
D1971_10 = i << 2;

have the same complexity, it arbitrarily decides to use the latter.  The problem is in ivopts deciding to create ivtmp.12 at all.  However, this will be somewhat hard to avoid, since locally, replacing D1976_16 (= i << 2 + 4) by a new iv is a correct decision (it replaces addition and shift by a single addition).

Ideally, ivopts should recognize that D1976_16 and D1971_10 are used for memory addressing and use the appropriate addressing modes ([D.1969_8 + 4*i + 4] instead of [D.1972_11]).  However, that also fails since ivopts only recognize the memory references whose addresses are affine ivs (which is not the case here, as we cannot prove that D.1969_8 is invariant in the loop).
Comment 10 Richard Biener 2011-06-27 12:12:58 UTC
4.3 branch is being closed, moving to 4.4.7 target.
Comment 11 Richard Biener 2012-01-16 13:53:30 UTC
Re-confirmed for trunk.  IV optimization does not seem to be code-size
aware with -Os.
Comment 12 Jakub Jelinek 2012-03-13 12:45:41 UTC
4.4 branch is being closed, moving to 4.5.4 target.
Comment 13 Richard Biener 2012-07-02 11:41:56 UTC
The 4.5 branch is being closed, adjusting target milestone.
Comment 14 Jakub Jelinek 2013-04-12 15:16:06 UTC
GCC 4.6.4 has been released and the branch has been closed.
Comment 15 bin.cheng 2013-12-13 10:23:58 UTC
The situation gets a little bit better on 4_9 trunk.  The Os assembly code on cortex-m0 (thumb1 as reported) is like:
test:
	push	{r0, r1, r2, r4, r5, r6, r7, lr}
	mov	r6, r0
	mov	r4, #0
	str	r2, [sp, #4]
.L2:
	ldr	r2, [r6]
	cmp	r4, r2
	bge	.L7
	mov	r5, #0
	lsl	r7, r4, #2
	add	r2, r7, #4   <----move to before XXX 
	str	r2, [sp]     <----spill
.L3:
	ldr	r3, [sp, #4]
	cmp	r5, r3
	bge	.L8
	ldr	r3, [r6, #4]
	ldr	r2, [sp]     <----spill
	ldr	r0, [r3, r7]
	ldr	r1, [r3, r2] <----XXX
	bl	func
	add	r5, r5, #1
	b	.L3
.L8:
	add	r4, r4, #1
	b	.L2
.L7:
	@ sp needed
	pop	{r0, r1, r2, r4, r5, r6, r7, pc}
	.size	test, .-test

IVOPT chooses the original biv for all uses in outer loop, regression comes from long live range of "r2" and the corresponding spill.
Then I realized that GCC IVOPT computes iv (for non-linear uses) at original place, we may be able to teach IVOPT to compute the iv just before it's used in order to shrink live range of iv.  The patch I had at http://gcc.gnu.org/ml/gcc-patches/2013-11/msg00535.html is similar to this, only it computes iv uses at appropriate place for outside loop iv uses.

But this idea won't help this specific case because LIM will hoist all the computation to basic block .L2 after IVOPT.
Comment 16 bin.cheng 2013-12-13 11:00:53 UTC
For optimization level O2, the dump before IVOPT is like:

  <bb 2>:
  _21 = p_6(D)->count;
  if (_21 > 0)
    goto <bb 3>;
  else
    goto <bb 11>;

  <bb 3>:

  <bb 4>:
  # i_26 = PHI <i_20(10), 0(3)>
  if (count_8(D) > 0)
    goto <bb 5>;
  else
    goto <bb 9>;

  <bb 5>:
  pretmp_23 = (sizetype) i_26;
  pretmp_32 = pretmp_23 + 1;
  pretmp_33 = pretmp_32 * 4;
  pretmp_34 = pretmp_23 * 4;

  <bb 6>:
  # j_27 = PHI <j_19(7), 0(5)>
  _9 = p_6(D)->data;
  _13 = _9 + pretmp_33;
  _14 = *_13;
  _16 = _9 + pretmp_34;
  _17 = *_16;
  func (_17, _14);
  j_19 = j_27 + 1;
  if (count_8(D) > j_19)
    goto <bb 7>;
  else
    goto <bb 8>;

  <bb 7>:
  goto <bb 6>;

  <bb 8>:

  <bb 9>:
  i_20 = i_26 + 1;
  _7 = p_6(D)->count;
  if (_7 > i_20)
    goto <bb 10>;
  else
    goto <bb 11>;

  <bb 10>:
  goto <bb 4>;

  <bb 11>:
  return;

There might be two issues that block IVOPT choosing the biv(i) for pretmp_33 and pretmp_34:
1) on some target (like ARM), "i << 2 + 4" can be done in one instruction, if the cost is same as simple shift or plus, then overall cost of biv(i) is lower than the two candidate iv sets.  GCC doesn't do such check in get_computation_cost_at for now.
2) there is CSE opportunity between computation of pretmp_33 and pretmp_34, for example they can be computed as below:
   pretmp_33 = i << 2
   pretmp_34 = pretmp_33 + 4
but GCC IVOPT is insensitive to such CSE opportunities between different iv uses.  I guess this isn't easy because unless the two uses are very close in code (like this one), such CSE may avail to nothing.

These kind tweaks on cost are tricky(and most probably has no overall benefit) because the cost IVOPT computed from RTL is far from precise to do such fine granularity tuning.

Another point, as Zdenek pointed out, IVOPT doesn't know that pretmp_33/pretmp_34 are going to be used in memory accesses, which means some of address computation can be embedded by appropriate addressing mode.  In other words, computation of pretmp_33/pretmp_34 shouldn't be honored when computing overall cost and choosing iv candidates set.  Since "_9 + pretmp_33/pretmp_34" is not affine iv, the only way to handle this issue is to lower both memory accesses before IVOPT, into some code like below:

  <bb 5>:
  pretmp_23 = (sizetype) i_26;
  pretmp_32 = pretmp_23 + 1;

  <bb 6>:
  # j_27 = PHI <j_19(7), 0(5)>
  _9 = p_6(D)->data;
  _14 = MEM[_9 + pretmp_32 << 2];
  _17 = MEM[_9 + pretmp_23 << 2];
  func (_17, _14);
  j_19 = j_27 + 1;
  if (count_8(D) > j_19)
    goto <bb 7>;
  else
    goto <bb 8>;

With this code, the iv uses are biv(i), pretmp_23(i_26) and pretmp_32(i_26+1), and IVOPT won't even add the annoying candidate.
Comment 17 Richard Biener 2014-06-12 13:42:43 UTC
The 4.7 branch is being closed, moving target milestone to 4.8.4.
Comment 18 Jakub Jelinek 2014-12-19 13:26:01 UTC
GCC 4.8.4 has been released.