[Bug target/38824] New: [4.4 regression] performance regression of sse code from 4.2/4.3

Tue Jan 13 11:25:00 GMT 2009

The following code shows a performance regression from gcc-4.2 to gcc-4.3 and
4.4 (20090111) on an Intel Core 2 using the x86_64 architecture:

/*
 * Reproducer: adds the scalar f to the floats in in[] and writes the sums
 * to out[], four floats per loop iteration using SSE intrinsics.
 *
 *   out  destination buffer
 *   in   source buffer
 *   f    value added to every element
 *   n    element count; divided by 4 to get the iteration count, so any
 *        remainder of n % 4 elements is not processed
 *
 * NOTE(review): _mm_load_ps/_mm_store_ps compile to movaps in the listings
 * of this report, so in and out are assumed 16-byte aligned; confirm
 * against the caller.
 * NOTE(review): the loop body runs before n is tested. For n < 4 the
 * division yields 0 and --n wraps around instead of ending the loop;
 * presumably the benchmark always passes n >= 4. The code is left exactly
 * as reported because the assembly listings were generated from it.
 */
void bench_1(float * out, float * in, float f, unsigned int n)
{
    n /= 4;                              /* iterations, 4 floats each */
    __m128 scalar = _mm_set_ps1(f);      /* f in all four lanes */
    do
    {
        __m128 arg = _mm_load_ps(in);
        __m128 result = _mm_add_ps(arg, scalar);
        _mm_store_ps(out, result);
        in += 4;
        out += 4;
    }
    while (--n);                         /* stops when the count reaches 0 */
}

Results from running the function 100000000 times, measured with performance
counters (requires a patched kernel), compiled with -O3 -mfpmath=sse -msse:
gcc-4.2: 1946256122 cycles, 8394301290 instructions, 5005 branch misses
gcc-4.3: 2191990305 cycles, 7658465214 instructions, 3442 branch misses
gcc-4.4: 2532778908 cycles, 7462359830 instructions, 8593402 branch misses

Although the instruction count decreases, the number of cycles spent in the
function increases. Also, gcc-4.4 shows a huge number of branch misses.

the generated code is

gcc-4.2:
# gcc-4.2 output for bench_1 (GAS, AT&T syntax: op src, dst).
# Registers as used below: %rdi = out, %rsi = in, %xmm0 = f, %edx = n.
# Loop shape: both pointers are bumped and a 32-bit counter counts down.
.globl _Z7bench_1PfS_fj
        .type   _Z7bench_1PfS_fj, @function
_Z7bench_1PfS_fj:
.LFB2695:
        movaps  %xmm0, %xmm2
        shrl    $2, %edx                # n /= 4
        shufps  $0, %xmm2, %xmm2        # copy lane 0 (f) into all four lanes
        movaps  %xmm2, %xmm1            # xmm1 = scalar, unchanged inside the loop
        .p2align 4,,7
.L15:
        movaps  (%rsi), %xmm0           # arg = 4 floats at in
        addq    $16, %rsi               # in += 4 floats (16 bytes)
        addps   %xmm1, %xmm0            # result = arg + scalar
        movaps  %xmm0, (%rdi)           # store result at out
        addq    $16, %rdi               # out += 4 floats (16 bytes)
        subl    $1, %edx                # --n, sets ZF for the branch
        jne     .L15                    # repeat while n != 0
        rep ; ret
.LFE2695:
        .size   _Z7bench_1PfS_fj, .-_Z7bench_1PfS_fj
        .align 2
        .p2align 4,,15

gcc-4.3:
# gcc-4.3 output for bench_1 (GAS, AT&T syntax: op src, dst).
# Registers as used below: %rdi = out, %rsi = in, %xmm0 = f, %edx = n.
# Loop shape: one byte offset in %rax indexes both buffers and is compared
# against an end offset precomputed in %rdx.
.globl _Z7bench_1PfS_fj
        .type   _Z7bench_1PfS_fj, @function
_Z7bench_1PfS_fj:
.LFB2563:
        movaps  %xmm0, %xmm2
        shrl    $2, %edx                # n /= 4
        subl    $1, %edx                # n - 1 in 32 bits
        xorl    %eax, %eax              # byte offset = 0
        shufps  $0, %xmm2, %xmm2        # copy lane 0 (f) into all four lanes
        mov     %edx, %edx              # 32-bit move: zero-extends n - 1 into %rdx
        addq    $1, %rdx                # iteration count as a 64-bit value
        salq    $4, %rdx                # end offset = iterations * 16 bytes
        movaps  %xmm2, %xmm1            # xmm1 = scalar, unchanged inside the loop
        .p2align 4,,10
        .p2align 3
.L17:
        movaps  (%rsi,%rax), %xmm0      # arg = 4 floats at in + offset
        addps   %xmm1, %xmm0            # result = arg + scalar
        movaps  %xmm0, (%rdi,%rax)      # store result at out + offset
        addq    $16, %rax               # offset += 16 bytes
        cmpq    %rdx, %rax              # reached the end offset?
        jne     .L17                    # no: next iteration
        rep
        ret
.LFE2563:
        .size   _Z7bench_1PfS_fj, .-_Z7bench_1PfS_fj
        .p2align 4,,15

gcc-4.4:
# gcc-4.4 output for bench_1 (GAS, AT&T syntax: op src, dst).
# Registers as used below: %rdi = out, %rsi = in, %xmm0 = f, %edx = n.
# Loop shape: same offset/end-offset scheme as the gcc-4.3 listing, but the
# scalar stays in %xmm0 and is copied to %xmm1 on every iteration, and the
# load is folded into addps as a memory operand.
.globl _Z7bench_1PfS_fj
        .type   _Z7bench_1PfS_fj, @function
_Z7bench_1PfS_fj:
.LFB2489:
        .cfi_startproc
        .cfi_personality 0x3,__gxx_personality_v0
        shrl    $2, %edx                # n /= 4
        shufps  $0, %xmm0, %xmm0        # copy lane 0 (f) into all four lanes, in place
        subl    $1, %edx                # n - 1; the 32-bit write also zeroes the upper half of %rdx
        xorl    %eax, %eax              # byte offset = 0
        addq    $1, %rdx                # iteration count as a 64-bit value
        salq    $4, %rdx                # end offset = iterations * 16 bytes
        .p2align 4,,10
        .p2align 3
.L17:
        movaps  %xmm0, %xmm1            # fresh copy of the scalar each iteration
        addps   (%rsi,%rax), %xmm1      # result = scalar + 4 floats at in + offset
        movaps  %xmm1, (%rdi,%rax)      # store result at out + offset
        addq    $16, %rax               # offset += 16 bytes
        cmpq    %rdx, %rax              # reached the end offset?
        jne     .L17                    # no: next iteration
        rep
        ret
        .cfi_endproc
.LFE2489:
        .size   _Z7bench_1PfS_fj, .-_Z7bench_1PfS_fj
        .p2align 4,,15

-- 
           Summary: [4.4 regression] performance regression of sse code from
                    4.2/4.3
           Product: gcc
           Version: 4.4.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: target
        AssignedTo: unassigned at gcc dot gnu dot org
        ReportedBy: tim at klingt dot org

http://gcc.gnu.org/bugzilla/show_bug.cgi?id=38824