[Bug target/38134] [4.4 Regression] speed regression with inline-asm sse code

tim at klingt dot org gcc-bugzilla@gcc.gnu.org
Mon Nov 17 18:50:00 GMT 2008



------- Comment #9 from tim at klingt dot org  2008-11-17 18:49 -------
I have updated the test program and attached the preprocessed sources produced
with gcc 4.3 and gcc 4.4.

The loop prefix contains:
4.4 (9 loop-invariant constant loads; the one generated constant is stored to the stack):
        pxor    %xmm5, %xmm5
        xorl    %eax, %eax
        movdqa  %xmm5, %xmm0
        xorl    %edx, %edx
        pcmpeqd %xmm5, %xmm0
        movaps  .LC2(%rip), %xmm14
        psrld   $31, %xmm0
        movdqa  .LC3(%rip), %xmm13
        pslld   $31, %xmm0
        movaps  .LC4(%rip), %xmm12
        movaps  .LC5(%rip), %xmm11
        movaps  .LC6(%rip), %xmm10
        movaps  .LC7(%rip), %xmm9
        movaps  .LC8(%rip), %xmm8
        movaps  .LC9(%rip), %xmm7
        movaps  .LC16(%rip), %xmm6
        movdqa  %xmm0, -24(%rsp)

4.3 (8 loop-invariant constant loads; the one generated constant is kept in a register):
        pxor    %xmm6, %xmm6
        xorl    %edx, %edx
        movdqa  %xmm6, %xmm0
        xorl    %eax, %eax
        pcmpeqd %xmm6, %xmm0
        movaps  .LC9(%rip), %xmm15
        psrld   $31, %xmm0
        movaps  .LC10(%rip), %xmm14
        pslld   $31, %xmm0
        movaps  .LC11(%rip), %xmm13
        movaps  .LC12(%rip), %xmm12
        movaps  .LC13(%rip), %xmm11
        movdqa  .LC14(%rip), %xmm10
        movaps  .LC15(%rip), %xmm9
        movaps  .LC16(%rip), %xmm8
        movdqa  %xmm0, %xmm7




Loop body:
4.3 (7 constant loads from memory inside the loop, .LC2 to .LC8; 2 of them are
used by the very next instruction, the others are used later):
.L48:
        movaps  in(%rax), %xmm2
        movaps  .LC2(%rip), %xmm0
        movdqa  %xmm2, %xmm5
        movdqa  .LC3(%rip), %xmm4
        pand    %xmm7, %xmm5
        movaps  .LC4(%rip), %xmm1
        addl    $4, %edx
#APP
# 324 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/sincosf4.h" 1
        xorps %xmm5, %xmm2
# 0 "" 2
#NO_APP
        mulps   %xmm2, %xmm0
        movaps  %xmm2, %xmm3
#APP
# 327 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/sincosf4.h" 1
        cvttps2dq %xmm0, %xmm0
# 0 "" 2
#NO_APP
        pand    %xmm0, %xmm4
        paddd   %xmm0, %xmm4
#APP
# 330 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/sincosf4.h" 1
        cvtdq2ps  %xmm4, %xmm0
# 0 "" 2
#NO_APP
        pand    %xmm10, %xmm4
        mulps   %xmm0, %xmm1
        psrld   $1, %xmm4
        subps   %xmm1, %xmm3
        movaps  .LC5(%rip), %xmm1
        mulps   %xmm0, %xmm1
        mulps   .LC6(%rip), %xmm0
        subps   %xmm1, %xmm3
        subps   %xmm0, %xmm3
        movaps  .LC7(%rip), %xmm0
        movaps  %xmm3, %xmm1
        cmpltps %xmm2, %xmm0
        mulps   %xmm3, %xmm1
        movaps  %xmm0, %xmm2
        movaps  .LC8(%rip), %xmm0
        mulps   %xmm1, %xmm0
        addps   %xmm15, %xmm0
        mulps   %xmm1, %xmm0
        addps   %xmm14, %xmm0
        mulps   %xmm1, %xmm0
        addps   %xmm13, %xmm0
        mulps   %xmm1, %xmm0
        addps   %xmm12, %xmm0
        mulps   %xmm1, %xmm0
        addps   %xmm11, %xmm0
        mulps   %xmm1, %xmm0
        mulps   %xmm3, %xmm0
        addps   %xmm3, %xmm0
#APP
# 341 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/sincosf4.h" 1
        andps %xmm2, %xmm0
# 0 "" 2
# 342 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/sincosf4.h" 1
        andnps %xmm3, %xmm2
# 0 "" 2
#NO_APP
        movaps  %xmm8, %xmm3
#APP
# 343 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/sincosf4.h" 1
        orps  %xmm2, %xmm0
# 0 "" 2
#NO_APP
        movdqa  %xmm6, %xmm2
        movaps  %xmm0, %xmm1
        psubd   %xmm4, %xmm2
        addps   %xmm9, %xmm1
        divps   %xmm1, %xmm3
        movaps  %xmm3, %xmm1
#APP
# 145 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/simdconst.h" 1
        andps %xmm2, %xmm1
# 0 "" 2
# 146 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/simdconst.h" 1
        andnps %xmm0, %xmm2
# 0 "" 2
# 147 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/simdconst.h" 1
        orps  %xmm2, %xmm1
# 0 "" 2
# 348 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/sincosf4.h" 1
        xorps %xmm5, %xmm1
# 0 "" 2
#NO_APP
        movaps  %xmm1, out(%rax)
        addq    $16, %rax
        cmpl    %edi, %edx
        jne     .L48


4.4 (6 constant loads from memory inside the loop, .LC10 to .LC15; 5 of them are used directly as memory operands of the consuming instruction):
.L54:
        movaps  in(%rax), %xmm2
        movdqa  -24(%rsp), %xmm3
        addl    $4, %edx
        pand    %xmm2, %xmm3
#APP
# 324 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/sincosf4.h" 1
        xorps %xmm3, %xmm2
# 0 "" 2
#NO_APP
        movaps  %xmm2, %xmm4
        movaps  %xmm2, %xmm15
        mulps   %xmm14, %xmm4
#APP
# 327 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/sincosf4.h" 1
        cvttps2dq %xmm4, %xmm4
# 0 "" 2
#NO_APP
        movdqa  %xmm4, %xmm0
        pand    %xmm13, %xmm0
        paddd   %xmm0, %xmm4
#APP
# 330 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/sincosf4.h" 1
        cvtdq2ps  %xmm4, %xmm0
# 0 "" 2
#NO_APP
        pand    .LC14(%rip), %xmm4
        movaps  %xmm0, %xmm1
        psrld   $1, %xmm4
        mulps   %xmm12, %xmm1
        subps   %xmm1, %xmm15
        movaps  %xmm15, %xmm1
        movaps  %xmm0, %xmm15
        mulps   %xmm10, %xmm0
        mulps   %xmm11, %xmm15
        subps   %xmm15, %xmm1
        movaps  %xmm9, %xmm15
        subps   %xmm0, %xmm1
        cmpltps %xmm2, %xmm15
        movaps  %xmm1, %xmm0
        movaps  %xmm15, %xmm2
        mulps   %xmm1, %xmm0
        movaps  %xmm0, %xmm15
        mulps   %xmm8, %xmm15
        addps   %xmm7, %xmm15
        mulps   %xmm0, %xmm15
        addps   .LC10(%rip), %xmm15
        mulps   %xmm0, %xmm15
        addps   .LC11(%rip), %xmm15
        mulps   %xmm0, %xmm15
        addps   .LC12(%rip), %xmm15
        mulps   %xmm0, %xmm15
        addps   .LC13(%rip), %xmm15
        mulps   %xmm15, %xmm0
        mulps   %xmm1, %xmm0
        addps   %xmm1, %xmm0
#APP
# 341 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/sincosf4.h" 1
        andps %xmm2, %xmm0
# 0 "" 2
# 342 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/sincosf4.h" 1
        andnps %xmm1, %xmm2
# 0 "" 2
#NO_APP
        movdqa  %xmm5, %xmm1
        psubd   %xmm4, %xmm1
        movdqa  %xmm1, %xmm4
        movaps  .LC15(%rip), %xmm1
#APP
# 343 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/sincosf4.h" 1
        orps  %xmm2, %xmm0
# 0 "" 2
#NO_APP
        movaps  %xmm6, %xmm2
        addps   %xmm0, %xmm1
        divps   %xmm1, %xmm2
        movaps  %xmm2, %xmm1
#APP
# 145 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/simdconst.h" 1
        andps %xmm4, %xmm1
# 0 "" 2
# 146 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/simdconst.h" 1
        andnps %xmm0, %xmm4
# 0 "" 2
#NO_APP
        movaps  %xmm1, %xmm0
#APP
# 147 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/simdconst.h" 1
        orps  %xmm4, %xmm0
# 0 "" 2
# 348 "benchmarks/../source/dsp/../../libs/libsimdmath/lib/sincosf4.h" 1
        xorps %xmm3, %xmm0
# 0 "" 2
#NO_APP
        movaps  %xmm0, out(%rax)
        addq    $16, %rax
        cmpl    %edi, %edx
        jne     .L54

Hope this helps.


-- 


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=38134



More information about the Gcc-bugs mailing list