Bug 40416 - unnecessary register spill
Summary: unnecessary register spill
Status: RESOLVED FIXED
Alias: None
Product: gcc
Classification: Unclassified
Component: target (show other bugs)
Version: 4.5.0
Importance: P3 normal
Target Milestone: 4.5.0
Assignee: Not yet assigned to anyone
URL: http://gcc.gnu.org/ml/gcc-patches/200...
Keywords:
Depends on:
Blocks: 16996
  Show dependency tree / graph
 
Reported: 2009-06-11 14:26 UTC by Carrot
Modified: 2009-06-30 08:21 UTC (History)
2 users (show)

See Also:
Host: i686-linux
Target: arm-eabi
Build: i686-linux
Known to work:
Known to fail:
Last reconfirmed: 2009-06-15 09:14:05


Attachments
test case (247 bytes, application/octet-stream)
2009-06-11 14:34 UTC, Carrot
Details
preprocessed test case (3.02 KB, application/octet-stream)
2009-06-15 02:26 UTC, Carrot
Details

Note You need to log in before you can comment on or make changes to this bug.
Description Carrot 2009-06-11 14:26:29 UTC
Compiling the attached source code with options -O2 -Os -mthumb -fpic produces an unnecessary register spill.
Comment 1 Carrot 2009-06-11 14:34:18 UTC
Created attachment 17983 [details]
test case

The spill occurs around the first loop:

        push    {r4, r5, r6, r7, lr}
        sub     sp, sp, #12
        .loc 1 5 0
        str     r2, [sp, #4]          // A
        .loc 1 6 0
        add     r6, r1, r2
        mov     r4, r0
        .loc 1 8 0
        b       .L2
.L5:
        .loc 1 10 0
        mov     r7, #0
        ldrsh   r5, [r4, r7]
        .loc 1 12 0
        cmp     r2, r5
        bge     .L3
        .loc 1 14 0
        ldrb    r7, [r1]
        strb    r7, [r1, r2]
        .loc 1 15 0
        strh    r2, [r4]
        .loc 1 16 0
        lsl     r1, r2, #1
        sub     r2, r5, r2
        strh    r2, [r1, r4]
.L6:
        .loc 1 5 0
        ldr     r5, [sp, #4]     //   B
        lsl     r4, r5, #1
        add     r0, r0, r4
        b       .L4
.L3:
        .loc 1 19 0
        lsl     r7, r5, #1
        mov     ip, r7
        add     r4, r4, ip
        .loc 1 20 0
        add     r1, r1, r5
        .loc 1 21 0
        sub     r2, r2, r5
.L2:
        .loc 1 8 0
        cmp     r2, #0
        bgt     .L5
        b       .L6
.L4:
        .loc 1 30 0
        mov     r1, #0


The spill occurs at instruction A and the reload at instruction B.

The spilled value is x. The source code computes next_runs and next_alpha before the while loop and preserves them through the loop body. But the generated code preserves next_alpha, the original runs and the original x through the loop body and computes next_runs after the loop. This requires an extra register and results in a register spill.
Comment 2 Ramana Radhakrishnan 2009-06-12 12:54:07 UTC
(In reply to comment #1)
> Created an attachment (id=17983) [edit]
> test case
> 

Your attachment didn't have #include <stdint.h>. Please try to supply preprocessed input that is self-contained; adding a -I from a build directory can be rather painful. Thanks.

> The spilling is occurred around the first loop:
> 
>         push    {r4, r5, r6, r7, lr}
>         sub     sp, sp, #12
>         .loc 1 5 0
>         str     r2, [sp, #4]          // A
>         .loc 1 6 0
>         add     r6, r1, r2
>         mov     r4, r0
>         .loc 1 8 0
>         b       .L2
> .L5:
>         .loc 1 10 0
>         mov     r7, #0
>         ldrsh   r5, [r4, r7]
>         .loc 1 12 0
>         cmp     r2, r5
>         bge     .L3
>         .loc 1 14 0
>         ldrb    r7, [r1]
>         strb    r7, [r1, r2]
>         .loc 1 15 0
>         strh    r2, [r4]
>         .loc 1 16 0
>         lsl     r1, r2, #1
>         sub     r2, r5, r2
>         strh    r2, [r1, r4]
> .L6:
>         .loc 1 5 0
>         ldr     r5, [sp, #4]     //   B
>         lsl     r4, r5, #1
>         add     r0, r0, r4
>         b       .L4
> .L3:
>         .loc 1 19 0
>         lsl     r7, r5, #1
>         mov     ip, r7
>         add     r4, r4, ip
>         .loc 1 20 0
>         add     r1, r1, r5
>         .loc 1 21 0
>         sub     r2, r2, r5
> .L2:
>         .loc 1 8 0
>         cmp     r2, #0
>         bgt     .L5
>         b       .L6
> .L4:
>         .loc 1 30 0
>         mov     r1, #0
> 
> 
> The spilling is occurred at instruction A and reload at instruction B.
> 
> The spilled value is x. The source code computes next_runs and next_alpha
> before while loop and preserve them through the loop body. But the generated
> code preserve next_alpha, original runs and original x through the loop body
> and compute next_runs after the loop. This caused an extra usage of register
> and results in a register spilling.
> 

Could you say what you'd like the code to be? I don't see an option other than spilling one of the values here.
Comment 3 Carrot 2009-06-15 02:26:49 UTC
Created attachment 17998 [details]
preprocessed test case

A possible code sequence without spilling is:

        push    {r4, r5, r6, r7, lr}
        add     r6, r1, r2
        mov     r4, r0
        lsl     r7, r2, 1     // New
        add     r0, r0, r7    // New
        .loc 1 8 0
        b       .L2
.L5:
        .loc 1 10 0
        mov     r7, #0
        ldrsh   r5, [r4, r7]
        .loc 1 12 0
        cmp     r2, r5
        bge     .L3
        .loc 1 14 0
        ldrb    r7, [r1]
        strb    r7, [r1, r2]
        .loc 1 15 0
        strh    r2, [r4]
        .loc 1 16 0
        lsl     r1, r2, #1
        sub     r2, r5, r2
        strh    r2, [r1, r4]
.L6:
        .loc 1 5 0
        b       .L4
.L3:
        .loc 1 19 0
        lsl     r7, r5, #1
        mov     ip, r7
        add     r4, r4, ip
        .loc 1 20 0
        add     r1, r1, r5
        .loc 1 21 0
        sub     r2, r2, r5
.L2:
        .loc 1 8 0
        cmp     r2, #0
        bgt     .L5
        b       .L6
.L4:
        .loc 1 30 0
        mov     r1, #0
Comment 4 Carrot 2009-06-15 02:32:20 UTC
In the source code, only two extra variables next_runs and next_alpha need to be preserved through the while loop.

But in the gcc-generated code, three variables are kept live through the first loop: next_alpha, the original runs and the original x. The expression (next_runs = runs + x) is moved after the loop. This keeps an extra variable live through the loop and results in a register spill.

The expression is moved by the tree-ssa-sink pass. Daniel Berlin has confirmed that it is a bug in this pass.

******** From Daniel **************
This looks like a bug, i think i know what causes it.
When I wrote this pass, i forgot to make this check:

 /* It doesn't make sense to move to a dominator that post-dominates
        frombb, because it means we've just moved it into a path that always
        executes if frombb executes, instead of reducing the number of
        executions .  */

     if (dominated_by_p (CDI_POST_DOMINATORS, frombb, commondom))

happen regardless of whether it is a single use statement or not.
So it will sink single use statements even if it's just moving them to
places that aren't executed less frequently.

Add that check (changing commondom to sinkbb) and it should stop moving it.
*********** End From Daniel ****************

I will send the patch later.
Comment 5 Steven Bosscher 2009-06-22 16:32:12 UTC
Did that patch go in already?
Comment 7 Steven Bosscher 2009-06-30 08:21:18 UTC
Please adjust the test case:

/* { dg-options "-O2 -Os -fdump-tree-sink-stats" } */

This makes no sense: -Os implies -O2.  So it should be:

/* { dg-options "-Os -fdump-tree-sink-stats" } */