[Bug rtl-optimization/54910] New: ARM: Missed optimization of very simple ctz function
linux at horizon dot com
gcc-bugzilla@gcc.gnu.org
Fri Oct 12 14:03:00 GMT 2012
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=54910
Bug #: 54910
Summary: ARM: Missed optimization of very simple ctz function
Classification: Unclassified
Product: gcc
Version: 4.7.2
Status: UNCONFIRMED
Severity: minor
Priority: P3
Component: rtl-optimization
AssignedTo: unassigned@gcc.gnu.org
ReportedBy: linux@horizon.com
Host: i386
Target: arm-linux-gnueabi
Given the following function:
/*
 * Number of trailing zero bits in x.
 *
 * Works as a three-step binary search (16, 8, then 4 bits) followed by a
 * 16-entry table lookup on the top nibble.  Each step tests whether any
 * set bit survives the left shift; if so, x is shifted up by that amount
 * and `bit` is lowered by the same amount.
 *
 * The shift amounts and the `x >> 28` table index assume that unsigned is
 * 32 bits wide.  For x == 0 no step fires, so the result is 28 + 4 = 32.
 */
unsigned __attribute__((const))
ctz(unsigned x)
{
/* ctz_table[n] = trailing zero count of the 4-bit value n (4 when n == 0). */
static unsigned char const ctz_table[16] = {
4, 0, 1, 0, 2, 0, 1, 0,
3, 0, 1, 0, 2, 0, 1, 0
};
/* Bit offset contributed by the shifts; starts at the base of the top nibble. */
int bit = 28;
/* Anything in the low 16 bits?  Move it to the top half. */
if (x << 16) x <<= 16, bit -= 16;
/* Anything below the top byte?  Move it up by 8. */
if (x << 8) x <<= 8, bit -= 8;
/* Anything below the top nibble?  Move it up by 4. */
if (x << 4) x <<= 4, bit -= 4;
/* The top nibble now holds the lowest non-zero nibble of the input (or 0). */
return bit + ctz_table[x >> 28];
}
And the command line:
arm-linux-gnueabi-gcc-4.7 -W -Wall -O2 -mcpu=arm7tdmi -mthumb-interwork -marm
-S baz.c
I get the following ARM code (-O2, -mthumb-interwork):
.align 2
.global ctz
.type ctz, %function
ctz:
@ Function supports interworking.
@ args = 0, pretend = 0, frame = 0
@ frame_needed = 0, uses_anonymous_args = 0
@ link register save eliminated.
movs r3, r0, asl #16
moveq r3, r0
movne r2, #12
moveq r2, #28
movs r1, r3, asl #8
movne r3, r1
subne r2, r2, #8
movs r1, r3, asl #4
movne r3, r1
ldr r1, .L18
ldrb r0, [r1, r3, lsr #28] @ zero_extendqisi2
subne r2, r2, #4
add r0, r0, r2
bx lr
.L19:
.align 2
.L18:
.word .LANCHOR0
.size ctz, .-ctz
.section .rodata
.align 2
.LANCHOR0 = . + 0
.type ctz_table.4122, %object
.size ctz_table.4122, 16
ctz_table.4122:
.byte 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0
.ident "GCC: (Debian 4.7.2-1) 4.7.2"
What strikes me as strange about this code is that it uses a 4-byte pointer
at .L18 to access a 16-byte table at .LANCHOR0. Why the heck not just put
the table at .L18 directly and replace the ldr with an adr? That would save
both space and time.
The Thumb code is similar, but also fails to eliminate the link register save,
despite the fact that this is an extremely simple leaf function:
.align 2
.global ctz
.code 16
.thumb_func
.type ctz, %function
ctz:
push {lr}
lsl r3, r0, #16
mov r2, #12
cmp r3, #0
bne .L8
mov r3, r0
mov r2, #28
.L8:
lsl r1, r3, #8
beq .L9
sub r2, r2, #8
mov r3, r1
.L9:
lsl r1, r3, #4
beq .L10
sub r2, r2, #4
mov r3, r1
.L10:
ldr r1, .L18
lsr r3, r3, #28
ldrb r0, [r1, r3]
@ sp needed for prologue
add r0, r0, r2
pop {r1}
bx r1
More information about the Gcc-bugs
mailing list