[Bug rtl-optimization/98782] IRA artificially creating spills due to BB frequencies

Fri Jan 22 10:12:30 GMT 2021

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98782

Feng Xue <fxue at os dot amperecomputing.com> changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
                 CC|                            |fxue at os dot amperecomputing.com

--- Comment #1 from Feng Xue <fxue at os dot amperecomputing.com> ---
The value "foo + 1024" is spilled in both cases, but in different ways. In the
bad case, it is spilled outside the loop and only reloaded inside it. In the good
case, the spill/reload pair occurs around the call to "bar", which suggests that
the extra cost of using caller-saved registers might also be taken into account.
It seems that IRA has two different mechanisms for handling spilling.

[Bad case]
// "Bad case" code for foo: the loop bound (x19 + 1024) is stored to [sp, 64]
// once before the loop and reloaded from the stack on BOTH loop paths
// (.L5 call path and .L2 no-call path).
//
// Register / stack-slot roles visible in this listing:
//   x19          = copy of incoming x1; advanced by one byte per iteration
//   w1, w2, w3   = w0*753, w0*753 + 7, w0*5271 (computed once, before the loop)
//   w20/w21/w22  = accumulators, initialised to 760 / 753 / 0
//   [sp, 76]     = incoming w0
//   [sp, 64]     = x19 + 1024 (loop bound)
//   [sp, 56..63] = w1, w2 around the call;  [sp, 72] = w3 around the call
foo:
        // Prologue: allocate an 80-byte frame and save x29/x30 and x19-x22.
        stp     x29, x30, [sp, -80]!
        mov     w5, 753
        mov     x29, sp
        stp     x19, x20, [sp, 16]
        mov     x19, x1
        mul     w1, w0, w5
        stp     x21, x22, [sp, 32]
        mov     w22, 5271
        add     w2, w1, 7
        mov     w21, w5
        mul     w3, w0, w22
        mov     w20, 760
        mov     w22, 0
        // Incoming w0 is stored once here and reloaded before every call to bar.
        str     w0, [sp, 76]
        add     x0, x19, 1024
        str     x0, [sp, 64]      // Spill (foo + 1024)
        .p2align 3,,7
.L5:
        // Loop head: branch to the no-call path (.L2) when the byte at [x19] is zero.
        ldrb    w0, [x19]
        cbz     w0, .L2
        // Call path: w0-w3 are caller-saved under AAPCS64, so w1-w3 are stored
        // before the call and reloaded after it (presumably they are bar's
        // arguments -- bar's prototype is not shown in this listing).
        ldr     w0, [sp, 76]
        stp     w1, w2, [sp, 56]
        str     w3, [sp, 72]
        bl      bar
        // Pre-indexed load: x19 += 1, then w0 = byte at [x19].
        ldrb    w0, [x19, 1]!
        ldp     w1, w2, [sp, 56]
        add     w21, w21, w0
        ldr     w3, [sp, 72]
        mul     w20, w20, w0
        ldr     x0, [sp, 64]     // Reload (foo + 1024)
        add     w22, w22, w20
        // Continue looping until x19 reaches the loop bound.
        cmp     x19, x0
        bne     .L5
        b       .L4
        .p2align 2,,3
.L2:
        // No-call path: same accumulator updates, but the loop bound is still
        // reloaded from the stack even though no call clobbered any register.
        ldrb    w0, [x19, 1]!
        add     w21, w21, w0
        mul     w20, w20, w0
        ldr     x0, [sp, 64]     // Reload (foo + 1024)
        add     w22, w22, w20
        cmp     x0, x19
        bne     .L5
.L4:
        // Result: w0 = w20 + w21 + w22; then restore saved registers and pop the frame.
        add     w0, w20, w21
        add     w0, w0, w22
        ldp     x19, x20, [sp, 16]
        ldp     x21, x22, [sp, 32]
        ldp     x29, x30, [sp], 80
        ret

[Good case]
// "Good case" code for foo: the loop bound (x1 + 1024) lives in x7 and is
// stored/reloaded only around the call to bar; the no-call path (.L2) does
// not touch the stack at all.
//
// Register / stack-slot roles visible in this listing:
//   x19          = copy of incoming x1; advanced by one byte per iteration
//   x7           = x1 + 1024 (loop bound)
//   w2, w3       = w0*753 + 7, w0*5271 (computed once, before the loop)
//   w20/w22/w21  = accumulators, initialised to 760 / 753 / 0
//   [sp, 72]     = w0*753, [sp, 76] = incoming w0 (both stored once, before the loop)
//   [sp, 56..63] = w2, w3 around the call;  [sp, 64] = x7 around the call
foo:
        // Prologue: allocate an 80-byte frame and save x29/x30 and x19-x22.
        stp     x29, x30, [sp, -80]!
        mov     w5, 753
        add     x7, x1, 1024
        mul     w2, w0, w5
        mov     x29, sp
        stp     x21, x22, [sp, 32]
        mov     w21, 5271
        mov     w22, w5
        stp     x19, x20, [sp, 16]
        mov     x19, x1
        mul     w3, w0, w21
        // Stores w0*753 at [sp, 72] and the incoming w0 at [sp, 76], once, outside the loop.
        stp     w2, w0, [sp, 72]  // Spill x(%w0)
        add     w2, w2, 7         // t2(%w2)
        mov     w21, 0
        mov     w20, 760
        .p2align 3,,7
.L5:
        // Loop head: branch to the no-call path (.L2) when the byte at [x19] is zero.
        ldrb    w0, [x19]
        cbz     w0, .L2
        // Call path: w1 = w0*753 and w0 = incoming w0 are reloaded from the
        // slots written before the loop; w2, w3 and x7 are caller-saved under
        // AAPCS64, so they are stored here, inside the loop, before each call.
        ldp     w1, w0, [sp, 72]  // Reload x
        stp     w2, w3, [sp, 56]  // Spill t2
        str     x7, [sp, 64]      // Spill (foo + 1024)
        bl      bar
        // Pre-indexed load: x19 += 1, then w0 = byte at [x19].
        ldrb    w0, [x19, 1]!
        ldr     x7, [sp, 64]      // Reload (foo + 1024)
        add     w22, w22, w0
        ldp     w2, w3, [sp, 56]  // Reload t2
        mul     w20, w20, w0
        add     w21, w21, w20
        // Continue looping until x19 reaches the loop bound.
        cmp     x19, x7
        bne     .L5
        b       .L4
        .p2align 2,,3
.L2:
        // No-call path: same accumulator updates; the loop bound is compared
        // straight from x7 with no stack access.
        ldrb    w0, [x19, 1]!
        add     w22, w22, w0
        mul     w20, w20, w0
        add     w21, w21, w20
        cmp     x7, x19
        bne     .L5
.L4:
        // Result: w0 = w20 + w22 + w21; then restore saved registers and pop the frame.
        add     w0, w20, w22
        add     w0, w0, w21
        ldp     x19, x20, [sp, 16]
        ldp     x21, x22, [sp, 32]
        ldp     x29, x30, [sp], 80
        ret

Even in the good case, we could expect better spill/reload generation. As the
comments above show, "x" and "t2" are similar (both are loop invariant) but are
handled differently. Spilling "t2" inside the loop is worse than spilling it
outside the loop, which is what IRA does for "x".

Both issues could be related to the same underlying cause.