[Bug target/38824] New: [4.4 regression] performance regression of sse code from 4.2/4.3
tim at klingt dot org
gcc-bugzilla@gcc.gnu.org
Tue Jan 13 11:25:00 GMT 2009
The following code shows a performance regression from gcc-4.2 to gcc-4.3 and
4.4 (20090111) on an Intel Core 2 using the x86_64 architecture:
/*
 * Benchmark kernel: adds the scalar f to the floats read from in and writes
 * the sums to out, four floats per loop iteration using SSE intrinsics.
 *
 * out, in : advanced by 4 floats per iteration; accessed with the aligned
 *           load/store intrinsics, so both must be 16-byte aligned.
 * f       : value added to every element.
 * n       : divided by 4 to obtain the iteration count, i.e. it is treated
 *           as a number of floats; any remainder of n / 4 is not processed.
 *
 * NOTE(review): the loop is a do/while with a pre-decrement on an unsigned
 * counter. For n < 4 the division yields 0, the body still runs once, and
 * --n then wraps around instead of terminating the loop.
 */
void bench_1(float * out, float * in, float f, unsigned int n)
{
// Convert n into the number of 4-float iterations.
n /= 4;
// Vector with f replicated in all four lanes.
__m128 scalar = _mm_set_ps1(f);
do
{
// Aligned load of four floats, add f to each, aligned store.
__m128 arg = _mm_load_ps(in);
__m128 result = _mm_add_ps(arg, scalar);
_mm_store_ps(out, result);
// Step both pointers past the four floats just handled.
in += 4;
out += 4;
}
// Pre-decrement: the body has already executed before the first test.
while (--n);
}
Results from running the function 100000000 times, measured with performance
counters (requires a patched kernel); compiled with -O3 -mfpmath=sse -msse:
gcc-4.2: 1946256122 cycles, 8394301290 instructions, 5005 branch misses
gcc-4.3: 2191990305 cycles, 7658465214 instructions, 3442 branch misses
gcc-4.4: 2532778908 cycles, 7462359830 instructions, 8593402 branch misses
Although the instruction count decreases, the number of cycles spent in the
function increases. gcc-4.4 also shows a huge number of branch misses.
the generated code is
gcc-4.2:
# gcc-4.2 output for bench_1(float *out, float *in, float f, unsigned int n).
# GAS / AT&T syntax (source operand first, destination last), x86-64.
# Register roles as used below (System V AMD64 argument order):
#   %rdi = out, %rsi = in, %xmm0 = f, %edx = n
#   %xmm1 = f replicated in all four lanes (loop-invariant)
# Loop shape: two pointer increments plus a down-counter in %edx.
.globl _Z7bench_1PfS_fj
.type _Z7bench_1PfS_fj, @function
_Z7bench_1PfS_fj:
.LFB2695:
# Copy f so it can be broadcast without touching %xmm0.
movaps %xmm0, %xmm2
# n /= 4 (unsigned shift).
shrl $2, %edx
# Replicate lane 0 of %xmm2 into all four lanes.
shufps $0, %xmm2, %xmm2
# Keep the broadcast value in %xmm1 for the loop.
movaps %xmm2, %xmm1
# Align the loop head to 16 bytes, but only if that costs at most 7 padding bytes.
.p2align 4,,7
.L15:
# Aligned load of four floats from in.
movaps (%rsi), %xmm0
# in += 4 floats (16 bytes).
addq $16, %rsi
# Add f to each of the four lanes.
addps %xmm1, %xmm0
# Aligned store of the four sums to out.
movaps %xmm0, (%rdi)
# out += 4 floats (16 bytes).
addq $16, %rdi
# --n; the flags from this subtraction drive the branch below.
subl $1, %edx
# Loop while the counter is non-zero.
jne .L15
# `rep` prefix followed by `ret` (';' separates two statements here).
# NOTE(review): presumably the "rep ret" idiom for AMD branch predictors - confirm.
rep ; ret
.LFE2695:
# Symbol size = current location minus the function's start label.
.size _Z7bench_1PfS_fj, .-_Z7bench_1PfS_fj
# Alignment directives that trail the function in the pasted output; whatever
# they align is not part of this listing.
.align 2
.p2align 4,,15
gcc-4.3:
# gcc-4.3 output for bench_1(float *out, float *in, float f, unsigned int n).
# GAS / AT&T syntax (source operand first, destination last), x86-64.
# Register roles as used below (System V AMD64 argument order):
#   %rdi = out, %rsi = in, %xmm0 = f, %edx = n
#   %rax = running byte offset shared by both pointers
#   %rdx = end byte offset (iteration count * 16)
#   %xmm1 = f replicated in all four lanes (loop-invariant)
# Loop shape: one indexed offset compared against a precomputed end value.
.globl _Z7bench_1PfS_fj
.type _Z7bench_1PfS_fj, @function
_Z7bench_1PfS_fj:
.LFB2563:
# Copy f so it can be broadcast without touching %xmm0.
movaps %xmm0, %xmm2
# n /= 4 (unsigned shift).
shrl $2, %edx
# 32-bit (n/4 - 1); wraps if n/4 was 0, mirroring the do/while on an unsigned counter.
subl $1, %edx
# Byte offset starts at 0.
xorl %eax, %eax
# Replicate lane 0 of %xmm2 into all four lanes.
shufps $0, %xmm2, %xmm2
# 32-bit self-move: zero-extends %edx into the full %rdx.
mov %edx, %edx
# 64-bit iteration count = zero-extended (n/4 - 1) + 1.
addq $1, %rdx
# Scale by 16 bytes per iteration to get the end offset.
salq $4, %rdx
# Keep the broadcast value in %xmm1 for the loop.
movaps %xmm2, %xmm1
# Align the loop head to 16 bytes if that costs at most 10 padding bytes,
# otherwise fall back to 8-byte alignment.
.p2align 4,,10
.p2align 3
.L17:
# Aligned load of four floats from in + offset.
movaps (%rsi,%rax), %xmm0
# Add f to each of the four lanes.
addps %xmm1, %xmm0
# Aligned store of the four sums to out + offset.
movaps %xmm0, (%rdi,%rax)
# Advance the shared offset by 4 floats (16 bytes).
addq $16, %rax
# Reached the end offset?
cmpq %rdx, %rax
# Loop until offset == end.
jne .L17
# `rep` prefix followed by `ret`.
# NOTE(review): presumably the "rep ret" idiom for AMD branch predictors - confirm.
rep
ret
.LFE2563:
# Symbol size = current location minus the function's start label.
.size _Z7bench_1PfS_fj, .-_Z7bench_1PfS_fj
# Alignment directive that trails the function in the pasted output; whatever
# it aligns is not part of this listing.
.p2align 4,,15
gcc-4.4:
# gcc-4.4 (20090111) output for bench_1(float *out, float *in, float f, unsigned int n).
# GAS / AT&T syntax (source operand first, destination last), x86-64.
# Register roles as used below (System V AMD64 argument order):
#   %rdi = out, %rsi = in, %xmm0 = f (broadcast in place), %edx = n
#   %rax = running byte offset shared by both pointers
#   %rdx = end byte offset (iteration count * 16)
#   %xmm1 = per-iteration working register
# Loop shape: the broadcast value is copied into %xmm1 on every iteration and
# the load from in is folded into addps as a memory operand.
.globl _Z7bench_1PfS_fj
.type _Z7bench_1PfS_fj, @function
_Z7bench_1PfS_fj:
.LFB2489:
# Start of call-frame (unwind) information for this function.
.cfi_startproc
# Names __gxx_personality_v0 as the personality routine, with pointer encoding 0x3.
.cfi_personality 0x3,__gxx_personality_v0
# n /= 4 (unsigned shift).
shrl $2, %edx
# Replicate lane 0 of %xmm0 into all four lanes, in place.
shufps $0, %xmm0, %xmm0
# 32-bit (n/4 - 1); wraps if n/4 was 0. The 32-bit write also zeroes the upper
# half of %rdx, so no separate zero-extension appears here.
subl $1, %edx
# Byte offset starts at 0.
xorl %eax, %eax
# 64-bit iteration count = zero-extended (n/4 - 1) + 1.
addq $1, %rdx
# Scale by 16 bytes per iteration to get the end offset.
salq $4, %rdx
# Align the loop head to 16 bytes if that costs at most 10 padding bytes,
# otherwise fall back to 8-byte alignment.
.p2align 4,,10
.p2align 3
.L17:
# Fresh copy of the broadcast f for this iteration.
movaps %xmm0, %xmm1
# Add four floats read from in + offset (memory operand) to the copy.
addps (%rsi,%rax), %xmm1
# Aligned store of the four sums to out + offset.
movaps %xmm1, (%rdi,%rax)
# Advance the shared offset by 4 floats (16 bytes).
addq $16, %rax
# Reached the end offset?
cmpq %rdx, %rax
# Loop until offset == end.
jne .L17
# `rep` prefix followed by `ret`.
# NOTE(review): presumably the "rep ret" idiom for AMD branch predictors - confirm.
rep
ret
# End of call-frame information for this function.
.cfi_endproc
.LFE2489:
# Symbol size = current location minus the function's start label.
.size _Z7bench_1PfS_fj, .-_Z7bench_1PfS_fj
# Alignment directive that trails the function in the pasted output; whatever
# it aligns is not part of this listing.
.p2align 4,,15
--
Summary: [4.4 regression] performance regression of sse code from
4.2/4.3
Product: gcc
Version: 4.4.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: target
AssignedTo: unassigned at gcc dot gnu dot org
ReportedBy: tim at klingt dot org
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=38824
More information about the Gcc-bugs
mailing list