[Bug target/38134] [4.4 Regression] speed regression with inline-asm sse code
tim at klingt dot org
gcc-bugzilla@gcc.gnu.org
Mon Nov 17 18:50:00 GMT 2008
------- Comment #9 from tim at klingt dot org 2008-11-17 18:49 -------
I have updated the test program and attached the preprocessed sources for
gcc 4.3 and 4.4.
The loop prefix (the code emitted just before the loop) contains:
4.4 (9 invariant loads, one store of a generated constant to the stack):
pxor %xmm5, %xmm5
xorl %eax, %eax
movdqa %xmm5, %xmm0
xorl %edx, %edx
pcmpeqd %xmm5, %xmm0
movaps .LC2(%rip), %xmm14
psrld $31, %xmm0
movdqa .LC3(%rip), %xmm13
pslld $31, %xmm0
movaps .LC4(%rip), %xmm12
movaps .LC5(%rip), %xmm11
movaps .LC6(%rip), %xmm10
movaps .LC7(%rip), %xmm9
movaps .LC8(%rip), %xmm8
movaps .LC9(%rip), %xmm7
movaps .LC16(%rip), %xmm6
movdqa %xmm0, -24(%rsp)
4.3 (8 invariant loads, one generated constant kept in a register):
pxor %xmm6, %xmm6
xorl %edx, %edx
movdqa %xmm6, %xmm0
xorl %eax, %eax
pcmpeqd %xmm6, %xmm0
movaps .LC9(%rip), %xmm15
psrld $31, %xmm0
movaps .LC10(%rip), %xmm14
pslld $31, %xmm0
movaps .LC11(%rip), %xmm13
movaps .LC12(%rip), %xmm12
movaps .LC13(%rip), %xmm11
movdqa .LC14(%rip), %xmm10
movaps .LC15(%rip), %xmm9
movaps .LC16(%rip), %xmm8
movdqa %xmm0, %xmm7
The loop body:
4.3 (7 loads from memory; 2 of the loads are used by the very next
instruction, the others are used later):
.L48:
movaps in(%rax), %xmm2
movaps .LC2(%rip), %xmm0
movdqa %xmm2, %xmm5
movdqa .LC3(%rip), %xmm4
pand %xmm7, %xmm5
movaps .LC4(%rip), %xmm1
addl $4, %edx
#APP
# 324 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/sincosf4.h" 1
xorps %xmm5, %xmm2
# 0 "" 2
#NO_APP
mulps %xmm2, %xmm0
movaps %xmm2, %xmm3
#APP
# 327 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/sincosf4.h" 1
cvttps2dq %xmm0, %xmm0
# 0 "" 2
#NO_APP
pand %xmm0, %xmm4
paddd %xmm0, %xmm4
#APP
# 330 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/sincosf4.h" 1
cvtdq2ps %xmm4, %xmm0
# 0 "" 2
#NO_APP
pand %xmm10, %xmm4
mulps %xmm0, %xmm1
psrld $1, %xmm4
subps %xmm1, %xmm3
movaps .LC5(%rip), %xmm1
mulps %xmm0, %xmm1
mulps .LC6(%rip), %xmm0
subps %xmm1, %xmm3
subps %xmm0, %xmm3
movaps .LC7(%rip), %xmm0
movaps %xmm3, %xmm1
cmpltps %xmm2, %xmm0
mulps %xmm3, %xmm1
movaps %xmm0, %xmm2
movaps .LC8(%rip), %xmm0
mulps %xmm1, %xmm0
addps %xmm15, %xmm0
mulps %xmm1, %xmm0
addps %xmm14, %xmm0
mulps %xmm1, %xmm0
addps %xmm13, %xmm0
mulps %xmm1, %xmm0
addps %xmm12, %xmm0
mulps %xmm1, %xmm0
addps %xmm11, %xmm0
mulps %xmm1, %xmm0
mulps %xmm3, %xmm0
addps %xmm3, %xmm0
#APP
# 341 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/sincosf4.h" 1
andps %xmm2, %xmm0
# 0 "" 2
# 342 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/sincosf4.h" 1
andnps %xmm3, %xmm2
# 0 "" 2
#NO_APP
movaps %xmm8, %xmm3
#APP
# 343 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/sincosf4.h" 1
orps %xmm2, %xmm0
# 0 "" 2
#NO_APP
movdqa %xmm6, %xmm2
movaps %xmm0, %xmm1
psubd %xmm4, %xmm2
addps %xmm9, %xmm1
divps %xmm1, %xmm3
movaps %xmm3, %xmm1
#APP
# 145 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/simdconst.h" 1
andps %xmm2, %xmm1
# 0 "" 2
# 146 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/simdconst.h" 1
andnps %xmm0, %xmm2
# 0 "" 2
# 147 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/simdconst.h" 1
orps %xmm2, %xmm1
# 0 "" 2
# 348 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/sincosf4.h" 1
xorps %xmm5, %xmm1
# 0 "" 2
#NO_APP
movaps %xmm1, out(%rax)
addq $16, %rax
cmpl %edi, %edx
jne .L48
4.4 (6 loads from memory; 5 loads are used directly as memory operands of instructions):
.L54:
movaps in(%rax), %xmm2
movdqa -24(%rsp), %xmm3
addl $4, %edx
pand %xmm2, %xmm3
#APP
# 324 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/sincosf4.h" 1
xorps %xmm3, %xmm2
# 0 "" 2
#NO_APP
movaps %xmm2, %xmm4
movaps %xmm2, %xmm15
mulps %xmm14, %xmm4
#APP
# 327 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/sincosf4.h" 1
cvttps2dq %xmm4, %xmm4
# 0 "" 2
#NO_APP
movdqa %xmm4, %xmm0
pand %xmm13, %xmm0
paddd %xmm0, %xmm4
#APP
# 330 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/sincosf4.h" 1
cvtdq2ps %xmm4, %xmm0
# 0 "" 2
#NO_APP
pand .LC14(%rip), %xmm4
movaps %xmm0, %xmm1
psrld $1, %xmm4
mulps %xmm12, %xmm1
subps %xmm1, %xmm15
movaps %xmm15, %xmm1
movaps %xmm0, %xmm15
mulps %xmm10, %xmm0
mulps %xmm11, %xmm15
subps %xmm15, %xmm1
movaps %xmm9, %xmm15
subps %xmm0, %xmm1
cmpltps %xmm2, %xmm15
movaps %xmm1, %xmm0
movaps %xmm15, %xmm2
mulps %xmm1, %xmm0
movaps %xmm0, %xmm15
mulps %xmm8, %xmm15
addps %xmm7, %xmm15
mulps %xmm0, %xmm15
addps .LC10(%rip), %xmm15
mulps %xmm0, %xmm15
addps .LC11(%rip), %xmm15
mulps %xmm0, %xmm15
addps .LC12(%rip), %xmm15
mulps %xmm0, %xmm15
addps .LC13(%rip), %xmm15
mulps %xmm15, %xmm0
mulps %xmm1, %xmm0
addps %xmm1, %xmm0
#APP
# 341 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/sincosf4.h" 1
andps %xmm2, %xmm0
# 0 "" 2
# 342 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/sincosf4.h" 1
andnps %xmm1, %xmm2
# 0 "" 2
#NO_APP
movdqa %xmm5, %xmm1
psubd %xmm4, %xmm1
movdqa %xmm1, %xmm4
movaps .LC15(%rip), %xmm1
#APP
# 343 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/sincosf4.h" 1
orps %xmm2, %xmm0
# 0 "" 2
#NO_APP
movaps %xmm6, %xmm2
addps %xmm0, %xmm1
divps %xmm1, %xmm2
movaps %xmm2, %xmm1
#APP
# 145 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/simdconst.h" 1
andps %xmm4, %xmm1
# 0 "" 2
# 146 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/simdconst.h" 1
andnps %xmm0, %xmm4
# 0 "" 2
#NO_APP
movaps %xmm1, %xmm0
#APP
# 147 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/simdconst.h" 1
orps %xmm4, %xmm0
# 0 "" 2
# 348 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/sincosf4.h" 1
xorps %xmm3, %xmm0
# 0 "" 2
#NO_APP
movaps %xmm0, out(%rax)
addq $16, %rax
cmpl %edi, %edx
jne .L54
Hope this helps.
--
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=38134
More information about the Gcc-bugs
mailing list