Summary: | [6 regression] suboptimal code for two simple loops | ||
---|---|---|---|
Product: | gcc | Reporter: | Alexander Vodomerov <alexvod> |
Component: | middle-end | Assignee: | Not yet assigned to anyone <unassigned> |
Status: | RESOLVED FIXED | ||
Severity: | normal | CC: | amker.cheng, dushistov, gcc-bugs, hubicka, jakub, jeffreyalaw, paulo, pinskia, vries |
Priority: | P2 | Keywords: | missed-optimization |
Version: | 4.4.0 | ||
Target Milestone: | 7.4 | ||
Host: | x86_64-unknown-linux-gnu | Target: | x86_64-unknown-linux-gnu |
Build: | x86_64-unknown-linux-gnu | Known to work: | |
Known to fail: | 4.7.0 | Last reconfirmed: | 2010-01-02 00:58:34 |
Bug Depends on: | |||
Bug Blocks: | 16996 |
Description
Alexander Vodomerov
2009-04-21 17:49:06 UTC
This is IV-opts messing way up as far as I can tell. Pointer Plus just helped out PRE and code motion passes which confuses the hell out of IV-opts. (In reply to comment #1) > This is IV-opts messing way up as far as I can tell. Pointer Plus just helped > out PRE and code motion passes which confuses the hell out of IV-opts. > I tried to use -fno-ivopts flag, but it doesn't have any effect on this. Confirmed. GCC 4.3.4 is being released, adjusting target milestone. From "GCC: (GNU) 4.5.0 20090808 (experimental) [trunk revision 150579]": 1 @ 1 bl 1 pop 1 push 1 sub 2 cmp 2 str 4 add 4 mov 7 ldr test: push {r4, r5, r6, r7, lr} sub sp, sp, #12 mov r4, r0 str r2, [sp, #4] mov r6, #0 mov r7, #0 b .L2 .L3: ldr r3, [r4, #4] ldr r2, [sp] ldr r0, [r3, r6] ldr r1, [r3, r2] bl func add r5, r5, #1 .L5: ldr r3, [sp, #4] cmp r5, r3 blt .L3 ldr r6, [sp] add r7, r7, #1 .L2: ldr r3, [r4] cmp r7, r3 bge .L1 add r2, r6, #4 str r2, [sp] mov r5, #0 b .L5 .L1: add sp, sp, #12 @ sp needed for prologue pop {r4, r5, r6, r7, pc} On x86 -O2: 1 call 1 leal 1 ret 1 subl 2 cmpl 2 testl 2 xorl 3 addl 4 popl 4 pushl 20 movl test: pushl %ebp movl %esp, %ebp pushl %edi pushl %esi xorl %esi, %esi pushl %ebx subl $44, %esp movl 8(%ebp), %ecx movl $0, -28(%ebp) movl (%ecx), %edx testl %edx, %edx jle .L1 .p2align 4,,7 .p2align 3 .L8: movl 16(%ebp), %eax leal 4(%esi), %edi testl %eax, %eax jle .L6 movl %edi, %eax xorl %ebx, %ebx movl %ecx, %edi movl %eax, %ecx .p2align 4,,7 .p2align 3 .L4: movl 4(%edi), %eax addl $1, %ebx movl (%eax,%ecx), %edx movl %edx, 4(%esp) movl (%eax,%esi), %eax movl %ecx, -32(%ebp) movl %eax, (%esp) call func movl -32(%ebp), %ecx cmpl %ebx, 16(%ebp) jg .L4 movl %ecx, %eax movl %edi, %ecx movl %eax, %edi .L6: addl $1, -28(%ebp) movl %edi, %esi movl -28(%ebp), %eax cmpl %eax, (%ecx) jg .L8 .L1: addl $44, %esp popl %ebx popl %esi popl %edi popl %ebp ret GCC 4.3.5 is being released, adjusting target milestone. 
I'm by no means an expert on the current IV code, so please take this with a grain of salt. If we look at the .ivcanon dump we have the two following critical blocks: # BLOCK 3 freq:9100 # PRED: 4 [91.0%] (true,exec) # VUSE <.MEM_21> D.1969_8 = p_4(D)->data; D.1972_11 = D.1969_8 + D.1971_10; # VUSE <.MEM_21> D.1973_12 = *D.1972_11; D.1977_17 = D.1969_8 + D.1976_16; # VUSE <.MEM_21> D.1978_18 = *D.1977_17; # .MEM_24 = VDEF <.MEM_21> func (D.1973_12, D.1978_18); j_19 = j_2 + 1; # SUCC: 4 [100.0%] (fallthru,exec) # BLOCK 4 freq:10000 # PRED: 7 [100.0%] (fallthru,exec) 3 [100.0%] (fallthru,exec) # j_2 = PHI <0(7), j_19(3)> # .MEM_21 = PHI <.MEM_22(7), .MEM_24(3)> if (j_2 < count_7(D)) goto <bb 3>; else goto <bb 5>; # SUCC: 3 [91.0%] (true,exec) 5 [9.0%] (false,exec) # BLOCK 5 freq:900 # PRED: 4 [9.0%] (false,exec) i_20 = i_1 + 1; # SUCC: 6 [100.0%] (fallthru,exec) # BLOCK 6 freq:989 # PRED: 2 [100.0%] (fallthru,exec) 5 [100.0%] (fallthru,exec) # i_1 = PHI <0(2), i_20(5)> # .MEM_22 = PHI <.MEM_23(D)(2), .MEM_21(5)> # VUSE <.MEM_22> D.1979_5 = p_4(D)->count; if (i_1 < D.1979_5) goto <bb 7>; else goto <bb 8>; # SUCC: 7 [91.0%] (true,exec) 8 [9.0%] (false,exec) # BLOCK 7 freq:900 # PRED: 6 [91.0%] (true,exec) i.0_9 = (unsigned int) i_1; D.1971_10 = i.0_9 * 4; D.1975_15 = i.0_9 + 1; D.1976_16 = D.1975_15 * 4; goto <bb 4>; # SUCC: 4 [100.0%] (fallthru,exec) Note carefully how we use D1971_10 and D1976_16 to build addresses for the two memory references in block #3 (p->data[i] and p->data[i+1] respectively). 
After IVopts we have: # BLOCK 3 freq:9100 # PRED: 4 [91.0%] (true,exec) # VUSE <.MEM_21> D.1969_8 = p_4(D)->data; D.1972_11 = D.1969_8 + D.1971_10; # VUSE <.MEM_21> D.1973_12 = *D.1972_11; D.1977_17 = D.1969_8 + D.1976_16; # VUSE <.MEM_21> D.1978_18 = *D.1977_17; # .MEM_24 = VDEF <.MEM_21> func (D.1973_12, D.1978_18); j_19 = j_2 + 1; # SUCC: 4 [100.0%] (fallthru,exec) # BLOCK 4 freq:10000 # PRED: 7 [100.0%] (fallthru,exec) 3 [100.0%] (fallthru,exec) # j_2 = PHI <0(7), j_19(3)> # .MEM_21 = PHI <.MEM_22(7), .MEM_24(3)> if (j_2 < count_7(D)) goto <bb 3>; else goto <bb 5>; # SUCC: 3 [91.0%] (true,exec) 5 [9.0%] (false,exec) # BLOCK 5 freq:900 # PRED: 4 [9.0%] (false,exec) i_20 = i_1 + 1; ivtmp.12_14 = ivtmp.12_13 + 4; # SUCC: 6 [100.0%] (fallthru,exec) # BLOCK 6 freq:989 # PRED: 2 [100.0%] (fallthru,exec) 5 [100.0%] (fallthru,exec) # i_1 = PHI <0(2), i_20(5)> # .MEM_22 = PHI <.MEM_23(D)(2), .MEM_21(5)> # ivtmp.12_13 = PHI <4(2), ivtmp.12_14(5)> # VUSE <.MEM_22> D.1979_5 = p_4(D)->count; if (i_1 < D.1979_5) goto <bb 7>; else goto <bb 8>; # SUCC: 7 [91.0%] (true,exec) 8 [9.0%] (false,exec) # BLOCK 7 freq:900 # PRED: 6 [91.0%] (true,exec) D.1992_6 = (unsigned int) i_1; D.1993_26 = D.1992_6 * 4; D.1971_10 = D.1993_26; D.1976_16 = ivtmp.12_13; goto <bb 4>; # SUCC: 4 [100.0%] (fallthru,exec) UGH. Everything involving ivtmp.12 is a waste of time. We really just need to realize that D1976_16 is D1971_10 + 4 which avoids all the nonsense with ivtmp.12 and I *think* would restore the quality of this code. I don't know enough about the current ivopts code to prototype this and verify that such a change would restore the quality of this code. Zdenek, can you take a look? > UGH. Everything involving ivtmp.12 is a waste of time. We really just need to
> realize that D1976_16 is D1971_10 + 4 which avoids all the nonsense with
> ivtmp.12 and I *think* would restore the quality of this code. I don't know
> enough about the current ivopts code to prototype this and verify that such a
> change would restore the quality of this code.
actually, ivopts knows that D1976_16 is D1971_10 + 4; however, since both
D1971_10 = ivtmp.12 - 4;
and
D1971_10 = i << 2;
have the same complexity, it arbitrarily decides to use the latter. The problem is in ivopts deciding to create ivtmp.12 at all. However, this will be somewhat hard to avoid, since locally, replacing D1976_16 (= i << 2 + 4) with a new iv is a correct decision (it replaces an addition and a shift with a single addition).
Ideally, ivopts should recognize that D1976_16 and D1971_10 are used for memory addressing and use the appropriate addressing modes ([D.1969_8 + 4*i + 4] instead of [D.1972_11]). However, that also fails since ivopts only recognize the memory references whose addresses are affine ivs (which is not the case here, as we cannot prove that D.1969_8 is invariant in the loop).
4.3 branch is being closed, moving to 4.4.7 target. Re-confirmed for trunk. IV optimization does not seem to be code-size aware with -Os. 4.4 branch is being closed, moving to 4.5.4 target. The 4.5 branch is being closed, adjusting target milestone. GCC 4.6.4 has been released and the branch has been closed. The situation gets a little bit better on 4_9 trunk. The Os assembly code on cortex-m0 (thumb1 as reported) is like: test: push {r0, r1, r2, r4, r5, r6, r7, lr} mov r6, r0 mov r4, #0 str r2, [sp, #4] .L2: ldr r2, [r6] cmp r4, r2 bge .L7 mov r5, #0 lsl r7, r4, #2 add r2, r7, #4 <----move to before XXX str r2, [sp] <----spill .L3: ldr r3, [sp, #4] cmp r5, r3 bge .L8 ldr r3, [r6, #4] ldr r2, [sp] <----spill ldr r0, [r3, r7] ldr r1, [r3, r2] <----XXX bl func add r5, r5, #1 b .L3 .L8: add r4, r4, #1 b .L2 .L7: @ sp needed pop {r0, r1, r2, r4, r5, r6, r7, pc} .size test, .-test IVOPT chooses the original biv for all uses in outer loop, regression comes from long live range of "r2" and the corresponding spill. Then I realized that GCC IVOPT computes iv (for non-linear uses) at original place, we may be able to teach IVOPT to compute the iv just before it's used in order to shrink live range of iv. The patch I had at http://gcc.gnu.org/ml/gcc-patches/2013-11/msg00535.html is similar to this, only it computes iv uses at appropriate place for outside loop iv uses. But this idea won't help this specific case because LIM will hoist all the computation to basic block .L2 after IVOPT. 
For optimization level O2, the dump before IVOPT is like: <bb 2>: _21 = p_6(D)->count; if (_21 > 0) goto <bb 3>; else goto <bb 11>; <bb 3>: <bb 4>: # i_26 = PHI <i_20(10), 0(3)> if (count_8(D) > 0) goto <bb 5>; else goto <bb 9>; <bb 5>: pretmp_23 = (sizetype) i_26; pretmp_32 = pretmp_23 + 1; pretmp_33 = pretmp_32 * 4; pretmp_34 = pretmp_23 * 4; <bb 6>: # j_27 = PHI <j_19(7), 0(5)> _9 = p_6(D)->data; _13 = _9 + pretmp_33; _14 = *_13; _16 = _9 + pretmp_34; _17 = *_16; func (_17, _14); j_19 = j_27 + 1; if (count_8(D) > j_19) goto <bb 7>; else goto <bb 8>; <bb 7>: goto <bb 6>; <bb 8>: <bb 9>: i_20 = i_26 + 1; _7 = p_6(D)->count; if (_7 > i_20) goto <bb 10>; else goto <bb 11>; <bb 10>: goto <bb 4>; <bb 11>: return; There might be two issues that block IVOPT choosing the biv(i) for pretmp_33 and pretmp_34: 1) on some target (like ARM), "i << 2 + 4" can be done in one instruction, if the cost is same as simple shift or plus, then overall cost of biv(i) is lower than the two candidate iv sets. GCC doesn't do such check in get_computation_cost_at for now. 2) there is CSE opportunity between computation of pretmp_33 and pretmp_34, for example they can be computed as below: pretmp_33 = i << 2 pretmp_34 = pretmp_33 + 4 but GCC IVOPT is insensitive to such CSE opportunities between different iv uses. I guess this isn't easy because unless the two uses are very close in code (like this one), such CSE may avail to nothing. These kind tweaks on cost are tricky(and most probably has no overall benefit) because the cost IVOPT computed from RTL is far from precise to do such fine granularity tuning. Another point, as Zdenek pointed out, IVOPT doesn't know that pretmp_33/pretmp_34 are going to be used in memory accesses, which means some of address computation can be embedded by appropriate addressing mode. In other words, computation of pretmp_33/pretmp_34 shouldn't be honored when computing overall cost and choosing iv candidates set. 
Since "_9 + pretmp_33/pretmp_34" is not affine iv, the only way to handle this issue is to lower both memory accesses before IVOPT, into some code like below: <bb 5>: pretmp_23 = (sizetype) i_26; pretmp_32 = pretmp_23 + 1; <bb 6>: # j_27 = PHI <j_19(7), 0(5)> _9 = p_6(D)->data; _14 = MEM[_9 + pretmp_32 << 2]; _17 = MEM[_9 + pretmp_23 << 2]; func (_17, _14); j_19 = j_27 + 1; if (count_8(D) > j_19) goto <bb 7>; else goto <bb 8>; With this code, the iv uses are biv(i), pretmp_23(i_26) and pretmp_32(i_26+1), and IVOPT won't even add the annoying candidate. The 4.7 branch is being closed, moving target milestone to 4.8.4. GCC 4.8.4 has been released. The gcc-4_8-branch is being closed, re-targeting regressions to 4.9.3. GCC 4.9.3 has been released. GCC 4.9 branch is being closed Unasigning Zdenek. He is busy by his math :) on x86 we now generate wit -Os: test: .LFB0: .cfi_startproc pushl %ebp .cfi_def_cfa_offset 8 .cfi_offset 5, -8 movl %esp, %ebp .cfi_def_cfa_register 5 pushl %edi pushl %esi pushl %ebx .cfi_offset 7, -12 .cfi_offset 6, -16 .cfi_offset 3, -20 xorl %ebx, %ebx subl $28, %esp movl 8(%ebp), %eax .L2: cmpl %ebx, (%eax) jle .L1 leal 4(,%ebx,4), %edi xorl %esi, %esi leal -4(%edi), %ecx .L5: cmpl 16(%ebp), %esi jge .L8 movl 4(%eax), %edx movl %eax, -32(%ebp) incl %esi pushl %eax pushl %eax movl %ecx, -28(%ebp) pushl (%edx,%edi) pushl (%edx,%ecx) call func addl $16, %esp movl -32(%ebp), %eax movl -28(%ebp), %ecx jmp .L5 .L8: incl %ebx jmp .L2 .L1: leal -12(%ebp), %esp popl %ebx .cfi_restore 3 popl %esi .cfi_restore 6 popl %edi .cfi_restore 7 popl %ebp .cfi_restore 5 .cfi_def_cfa 4, 4 ret .cfi_endproc This is 79 bytes, while code quoted is 88 bytes. 
and -O2: test: .LFB0: .cfi_startproc pushl %ebp pushl %edi xorl %ebp, %ebp pushl %esi pushl %ebx subl $28, %esp movl 48(%esp), %esi movl (%esi), %ecx testl %ecx, %ecx jle .L1 .p2align 4,,10 .p2align 3 .L2: movl 56(%esp), %edx leal 1(%ebp), %eax movl %eax, 12(%esp) testl %edx, %edx jle .L6 leal 0(,%eax,4), %ebx xorl %ebp, %ebp leal -4(%ebx), %edi .p2align 4,,10 .p2align 3 .L4: movl 4(%esi), %edx subl $8, %esp addl $1, %ebp pushl (%edx,%ebx) pushl (%edx,%edi) call func addl $16, %esp cmpl %ebp, 56(%esp) jne .L4 .L6: movl 12(%esp), %ebp cmpl %ebp, (%esi) jg .L2 .L1: addl $28, %esp popl %ebx popl %esi popl %edi popl %ebp ret .LFE0: which is 100 bytes. So it seems that -O2 code got bigger and -Os code smaller I can also confirm Os is fixed on trunk @244877 using reported command line, while O2 goes up to 76 now. (In reply to amker from comment #23) > I can also confirm Os is fixed on trunk @244877 using reported command line, > while O2 goes up to 76 now. on arm (with -march=armv5te -mthumb -mthumb-interwork -fpic -Os). I don't think the codesize going up with -O2 is a significant issue here. It looks like the regression with -Os is fixed. We can also see that we're doing less memory traffic. I think we should mark this as resolved for gcc-7. GCC 5 branch is being closed GCC 6 branch is being closed, fixed in 7.x. |