[Bug rtl-optimization/11261] Weak code generated for JPEG compression

olegendo at gcc dot gnu.org gcc-bugzilla@gcc.gnu.org
Sun Jul 22 14:54:00 GMT 2012


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=11261

Oleg Endo <olegendo at gcc dot gnu.org> changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
                 CC|                            |olegendo at gcc dot gnu.org

--- Comment #6 from Oleg Endo <olegendo at gcc dot gnu.org> 2012-07-22 14:54:11 UTC ---
As of rev 189746 the issue is still present.
With '-m4-single -O2 -mpretend-cmove' (sched1, i.e. -fschedule-insns, is
disabled by default) the code generated for the file jidctflt.c from the CSiBE
set contains the following sequence:

        ...
        sts     fpul,r0
        ftrc    fr7,fpul
        add     #4,r0
        shad    r6,r0
        and     r7,r0
        mov.b   @(r0,r2),r0
        mov.b   r0,@(1,r1)
        sts     fpul,r0
        ftrc    fr1,fpul
        fmov    fr5,fr1
        fadd    fr2,fr1
        fsub    fr2,fr5
        add     #4,r0
        shad    r6,r0
        and     r7,r0
        mov.b   @(r0,r2),r0
        mov.b   r0,@(6,r1)
        sts     fpul,r0
        ftrc    fr6,fpul
        add     #4,r0
        shad    r6,r0
        and     r7,r0
        mov.b   @(r0,r2),r0
        mov.b   r0,@(2,r1)
        sts     fpul,r0
        ftrc    fr1,fpul
        add     #4,r0
        shad    r6,r0
        and     r7,r0
        mov.b   @(r0,r2),r0
        mov.b   r0,@(5,r1)
        sts     fpul,r0
        ....

With '-m4-single -O2 -mpretend-cmove -fschedule-insns' it seems to be better.
The address index is calculated in registers other than R0 and then copied to
R0, as described in comment #5:

        ftrc    fr7,fpul
        mov     r11,r0
        sts     fpul,r5
        fadd    fr3,fr1
        mov.b   @(r0,r2),r0
        add     #4,r4
        fmov    fr10,fr0
        fmac    fr0,fr11,fr2
        shad    r6,r4
        mov.b   r0,@(7,r1)
        and     r7,r4
        ftrc    fr1,fpul
        mov     r4,r0
        fadd    fr3,fr2
        mov.b   @(r0,r2),r0
        add     #4,r5
        sts     fpul,r4
        fsub    fr3,fr6
        shad    r6,r5
        mov.b   r0,@(1,r1)
        and     r7,r5
        fmov    fr5,fr1
        ftrc    fr6,fpul
        mov     r5,r0
        sts     fpul,r11
        add     #4,r4
        mov.b   @(r0,r2),r0
        fadd    fr2,fr1
        shad    r6,r4
        fsub    fr2,fr5
        mov.b   r0,@(6,r1)
        and     r7,r4
        mov     r4,r0
        ftrc    fr1,fpul
        mov.b   @(r0,r2),r0
        add     #4,r11
        sts     fpul,r4
        shad    r6,r11
        mov.b   r0,@(2,r1)

Thus, for this case it might be beneficial to enable sched1 again.
However, looking at some recent sched1 vs. no-sched1 runtime numbers in PR
22553, sched1 doesn't seem to make a big difference on average.



More information about the Gcc-bugs mailing list