This is the mail archive of the gcc-bugs@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

optimization/10469: [3.3, 3.4] constant V4SF loads get moved inside loop


>Number:         10469
>Category:       optimization
>Synopsis:       [3.3, 3.4] constant V4SF loads get moved inside loop
>Confidential:   no
>Severity:       serious
>Priority:       medium
>Responsible:    unassigned
>State:          open
>Class:          pessimizes-code
>Submitter-Id:   net
>Arrival-Date:   Wed Apr 23 20:46:00 UTC 2003
>Closed-Date:
>Last-Modified:
>Originator:     Richard Guenther
>Release:        gcc-3.3 (GCC) 3.3 20030423 (prerelease), gcc-3.4 (GCC) 3.4 20030422 (experimental)
>Organization:
>Environment:
ia32 (sse2), powerpc (altivec)
>Description:
For the following code, while good() creates perfectly optimal code for the loop, bad() moves the fv initialization inside the loop body as can be seen from the asm snippets below:

typedef float v4sf __attribute__((mode(V4SF)));

void good(float *r, float f, int cnt)
{
        float fv[4] __attribute__((aligned(__alignof__(v4sf)))) = { f, f, f, f };
        while (cnt--) {
                *(v4sf *)r = *(v4sf *)fv;
                r += 4;
        }
}

void bad(float *r, float f, int cnt)
{
        v4sf fv = { f, f, f, f };
        while (cnt--) {
                *(v4sf *)r = fv;
                r += 4;
        }
}

powerpc asm, generated with gcc-3.3 -O2 -S -fverbose-asm -maltivec simd.c:

good:
        cmpwi 0,4,0      #  cnt
        stwu 1,-32(1)
        addi 4,4,-1      #  cnt,  cnt
        stfs 1,20(1)     #  fv,  f
        stfs 1,8(1)      #  fv,  f
        stfs 1,12(1)     #  fv,  f
        stfs 1,16(1)     #  fv,  f
        beq- 0,.L7
        addi 4,4,1       #  cnt
        addi 9,1,8
        mtctr 4
        lvx 0,0,9
.L8:
        stvx 0,0,3       # * r
        addi 3,3,16      #  r,  r
        bdnz .L8

bad:
        stwu 1,-32(1)
        cmpwi 0,4,0      #  cnt
        addi 4,4,-1      #  cnt,  cnt
        stfs 1,8(1)
        lwz 9,8(1)
        mr 10,9
        mr 11,9
        mr 12,9
        beq- 0,.L15
        addi 4,4,1       #  cnt
        mtctr 4
.L16:
        addi 8,1,16
        stw 9,0(8)       #  fv
        stw 10,4(8)      #  fv
        stw 11,8(8)      #  fv
        stw 12,12(8)     #  fv
        lvx 0,0,8
        stvx 0,0,3       # * r
        addi 3,3,16      #  r,  r
        bdnz .L16

For ia32 similar things happen; it is not as bad, but the load of fv is still done inside the loop:

good:
        ...
        movaps  -24(%ebp), %xmm0
.L5:
        subl    $1, %edx        #  cnt
        movaps  %xmm0, (%ecx)   # * r
        addl    $16, %ecx       #  r
        cmpl    $-1, %edx       #  cnt
        jne     .L5

bad:
        ...
.L12:
        movaps  -24(%ebp), %xmm0        #  fv
        subl    $1, %eax        #  cnt
        movaps  %xmm0, (%edx)   # * r
        addl    $16, %edx       #  r
        cmpl    $-1, %eax       #  cnt
        jne     .L12

For gcc 3.4 similar things happen (only tested on ia32):

good:
        movaps  -24(%ebp), %xmm0        #, tmp62
.L4:
        subl    $1, %edx        #, cnt
        movaps  %xmm0, (%ecx)   # tmp62,* r
        addl    $16, %ecx       #, r
        cmpl    $-1, %edx       #, cnt
        jne     .L4     #,

bad:
        jmp     .L14    #
.L15:
        movaps  -24(%ebp), %xmm0        # fv,
        movaps  %xmm0, (%edx)   #,* r
        addl    $16, %edx       #, r
.L14:
        subl    $1, %eax        #, cnt
        cmpl    $-1, %eax       #, cnt
        jne     .L15    #,

So the more natural way to write the code pessimizes it without apparent reason.
>How-To-Repeat:
Compile the testcase with SSE2 or Altivec support on ia32/powerpc.
>Fix:
A workaround is to use a temporary array, as given in the example.
>Release-Note:
>Audit-Trail:
>Unformatted:


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]