This is the mail archive of the
gcc-bugs@gcc.gnu.org
mailing list for the GCC project.
[Bug tree-optimization/55906] New: suboptimal code generated for post-inc on Thumb1
- From: "amker.cheng at gmail dot com" <gcc-bugzilla at gcc dot gnu dot org>
- To: gcc-bugs at gcc dot gnu dot org
- Date: Tue, 08 Jan 2013 05:55:51 +0000
- Subject: [Bug tree-optimization/55906] New: suboptimal code generated for post-inc on Thumb1
- Auto-submitted: auto-generated
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=55906
Bug #: 55906
Summary: suboptimal code generated for post-inc on Thumb1
Classification: Unclassified
Product: gcc
Version: 4.8.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: tree-optimization
AssignedTo: unassigned@gcc.gnu.org
ReportedBy: amker.cheng@gmail.com
For the program below:
int
ffs(int word)
{
int i;
if (!word)
return 0;
i = 0;
for (;;)
{
if (((1 << i++) & word) != 0)
return i;
}
}
The 164t.optimized dump looks like this:
ffs (int word)
{
int i;
int _6;
int _7;
<bb 2>:
if (word_3(D) == 0)
goto <bb 6>;
else
goto <bb 3>;
<bb 3>:
<bb 4>:
# i_1 = PHI <0(3), i_5(5)>
i_5 = i_1 + 1;
_6 = word_3(D) >> i_1;
_7 = _6 & 1;
if (_7 != 0)
goto <bb 6>;
else
goto <bb 5>;
<bb 5>:
goto <bb 4>;
<bb 6>:
# i_2 = PHI <0(2), i_5(4)>
return i_2;
}
GCC increments i (producing i_5) before i_1 is used, so the live ranges of i_5
and i_1 overlap and they are placed in different partitions, as shown in the
expanded RTL:
2: r115:SI=r0:SI
3: NOTE_INSN_FUNCTION_BEG
9: pc={(r115:SI==0)?L33:pc}
REG_BR_PROB 0xf3c
10: NOTE_INSN_BASIC_BLOCK 4
4: r110:SI=0
18: L18:
11: NOTE_INSN_BASIC_BLOCK 5
12: r111:SI=r110:SI+0x1 <-----i_5/i_1 in different pseudos
13: r116:SI=r115:SI>>r110:SI
14: r118:SI=0x1
15: r117:SI=r116:SI&r118:SI
REG_EQUAL r116:SI&0x1
16: pc={(r117:SI!=0)?L21:pc}
REG_BR_PROB 0x384
17: NOTE_INSN_BASIC_BLOCK 6
5: r110:SI=r111:SI
19: pc=L18
20: barrier
33: L33:
32: NOTE_INSN_BASIC_BLOCK 7
6: r111:SI=0
21: L21:
22: NOTE_INSN_BASIC_BLOCK 8
23: r114:SI=r111:SI
27: r0:SI=r114:SI
30: use r0:SI
Finally, suboptimal code is generated:
ffs:
mov r3, #0
push {r4, lr}
cmp r0, r3
beq .L2
mov r2, r3
mov r1, #1
.L3:
mov r4, r0
asr r4, r4, r2
add r3, r2, #1
tst r4, r1
bne .L2
mov r2, r3
b .L3
.L2:
mov r0, r3
@ sp needed
pop {r4, pc}
GCC 4.6, by contrast, generates better code:
ffs:
push {lr}
sub r3, r0, #0
beq .L2
mov r3, #0
mov r2, #1
.L3:
mov r1, r0
asr r1, r1, r3
add r3, r3, #1
tst r1, r2
beq .L3
.L2:
mov r0, r3
@ sp needed for prologue
pop {pc}
The command line is:
arm-none-eabi-gcc -mthumb -mcpu=cortex-m0 -Os -S ffs.c -o ffs.S
The same problem exists when optimizing with "-O2".