[Bug rtl-optimization/54910] New: ARM: Missed optimization of very simple ctz function

linux at horizon dot com gcc-bugzilla@gcc.gnu.org
Fri Oct 12 14:03:00 GMT 2012


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=54910

             Bug #: 54910
           Summary: ARM: Missed optimization of very simple ctz function
    Classification: Unclassified
           Product: gcc
           Version: 4.7.2
            Status: UNCONFIRMED
          Severity: minor
          Priority: P3
         Component: rtl-optimization
        AssignedTo: unassigned@gcc.gnu.org
        ReportedBy: linux@horizon.com
              Host: i386
            Target: arm-linux-gnueabi


Given the following function:

/*
 * Number of trailing zero bits in x.
 *
 * Narrows down the position of the lowest set bit with three shift
 * tests (16, 8, 4) and resolves the last four bits by table lookup.
 * Each test asks whether anything is left after discarding the top
 * k bits (x << k != 0); if so, a set bit lies below them, so x is
 * shifted up by k and the running count is lowered by k.
 *
 * The shift amounts and the x >> 28 table index assume that unsigned
 * is 32 bits wide.  For x == 0 no shift is taken and the result is
 * 28 + ctz_table[0] = 32.
 */
unsigned __attribute__((const))
ctz(unsigned x)
{
    /* ctz_table[n]: trailing zero count of the 4-bit value n (4 for n == 0). */
    static unsigned char const ctz_table[16] = {
        4, 0, 1, 0,  2, 0, 1, 0,
        3, 0, 1, 0,  2, 0, 1, 0
    };
    /* Bit position of the lowest bit of the top nibble when nothing is shifted. */
    int bit = 28;

    /* Set bit somewhere in the low 16 bits: move them to the top half. */
    if (x << 16)  x <<= 16, bit -= 16;
    /* Set bit below the top byte: move it up by another 8. */
    if (x <<  8)  x <<=  8, bit -=  8;
    /* Set bit below the top nibble: move it up by another 4. */
    if (x <<  4)  x <<=  4, bit -=  4;
    /* The lowest set bit (if any) is now in the top nibble; look it up. */
    return bit + ctz_table[x >> 28];
}
And the command line:

arm-linux-gnueabi-gcc-4.7 -W -Wall -O2 -mcpu=arm7tdmi -mthumb-interwork -marm -S baz.c

I get the following ARM code (-O2, -mthumb-interwork):

    @ ARM-mode compiler output for ctz().
    @ Register use as visible below: r0 = x on entry and the result on
    @ return, r3 = the (possibly shifted) x, r2 = the running 'bit'
    @ value, r1 = scratch / table address.
    .align    2
    .global    ctz
    .type    ctz, %function
ctz:
    @ Function supports interworking.
    @ args = 0, pretend = 0, frame = 0
    @ frame_needed = 0, uses_anonymous_args = 0
    @ link register save eliminated.
    @ Step 1: r3 = x << 16 and set flags; if that is zero keep the
    @ unshifted x.  r2 becomes 28 - 16 = 12 when shifted, else 28.
    movs    r3, r0, asl #16
    moveq    r3, r0
    movne    r2, #12
    moveq    r2, #28
    @ Step 2: test r3 << 8; when non-zero commit the shift and drop
    @ the count by 8.
    movs    r1, r3, asl #8
    movne    r3, r1
    subne    r2, r2, #8
    @ Step 3: test r3 << 4; when non-zero commit the shift.  The
    @ matching count adjustment is the subne a few lines further down.
    movs    r1, r3, asl #4
    movne    r3, r1
    @ Fetch the table address from the literal pool word at .L18, then
    @ load the table byte indexed by the top nibble of r3.
    ldr    r1, .L18
    ldrb    r0, [r1, r3, lsr #28]    @ zero_extendqisi2
    @ Still predicated on the flags from the 'asl #4' test: none of the
    @ instructions in between carries the flag-setting 's' suffix.
    subne    r2, r2, #4
    @ Result = table entry + running count; return to the caller.
    add    r0, r0, r2
    bx    lr
.L19:
    .align    2
    @ Literal pool: one word holding the address of .LANCHOR0.
.L18:
    .word    .LANCHOR0
    .size    ctz, .-ctz
    .section    .rodata
    .align    2
    @ .LANCHOR0 is defined at the current location, which is where the
    @ 16-byte lookup table below starts.
.LANCHOR0 = . + 0
    .type    ctz_table.4122, %object
    .size    ctz_table.4122, 16
ctz_table.4122:
    .byte    4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0
    .ident    "GCC: (Debian 4.7.2-1) 4.7.2"


What strikes me as strange about this code is that it uses a 4-byte pointer
at .L18 to access a 16-byte table at .LANCHOR0.  Why the heck not just put
the table at .L18 directly and replace the ldr with an adr?  That would save
both space and time.


The Thumb code is similar, but also fails to eliminate the link register save,
despite the fact that this is an extremely simple leaf function:

    @ Thumb compiler output for ctz().
    @ Register use as visible below: r0 = x on entry and the result on
    @ return, r3 = the (possibly shifted) x, r2 = the running 'bit'
    @ value, r1 = scratch / table address / return address.
    @ The literal pool word .L18 referenced below is not part of this
    @ excerpt.
    .align    2
    .global    ctz
    .code    16
    .thumb_func
    .type    ctz, %function
ctz:
    @ lr is pushed here and popped into r1 at the end, although the
    @ body contains no call instruction.
    push    {lr}
    @ Step 1: r3 = x << 16 with r2 = 12 (28 - 16) set speculatively;
    @ if the shifted value is zero, fall through and restore
    @ r3 = x, r2 = 28.
    lsl    r3, r0, #16
    mov    r2, #12
    cmp    r3, #0
    bne    .L8
    mov    r3, r0
    mov    r2, #28
.L8:
    @ Step 2: r1 = r3 << 8.  The beq tests the flags left by the lsl
    @ itself (no separate cmp); when non-zero, commit the shift and
    @ drop the count by 8.
    lsl    r1, r3, #8
    beq    .L9
    sub    r2, r2, #8
    mov    r3, r1
.L9:
    @ Step 3: same pattern with a shift of 4.
    lsl    r1, r3, #4
    beq    .L10
    sub    r2, r2, #4
    mov    r3, r1
.L10:
    @ Load the table address from .L18, index it with the top nibble
    @ of r3, and add the running count to the table byte.
    ldr    r1, .L18
    lsr    r3, r3, #28
    ldrb    r0, [r1, r3]
    @ sp needed for prologue
    add    r0, r0, r2
    @ Return through r1, which receives the lr value pushed on entry.
    pop    {r1}
    bx    r1



More information about the Gcc-bugs mailing list