[Bug rtl-optimization/29881] New: inefficient/incorrect xmm registers usage
spd at poczta dot fm
gcc-bugzilla@gcc.gnu.org
Fri Nov 17 22:43:00 GMT 2006
Hello!
I'm writing fast approximations of some complex math functions. I'm trying to
keep them as low-level as possible, while staying clear of hand-written
assembly. I often check the generated assembly to verify its quality. During
one of these checkups I found that gcc (I checked 3.4.5/win32 and 4.0/4.1/fc5)
doesn't make good use of the xmm registers when compiling mixed SSE/SSE2 code.
Example code:
#include <emmintrin.h>

typedef union {
    __m128i i;
    __m128 f;
} __vec128;

void array_sample_fun(__m128 *dst, const __m128 *src, int length) {
    __m128 af = _mm_set1_ps(1.20f);
    __m128 bf = _mm_set1_ps(2.88f);
    __m128 cf = _mm_set1_ps(-2.44f);
    __m128 df = _mm_set1_ps(4.06f);
    __m128 ef = _mm_set1_ps(-12.04f);
    __m128i mask = _mm_set1_epi32(0xff << 23);
    __m128i bias = _mm_set1_epi32(0x7f << 23);
    while (length-- != 0) {
        __vec128 vec;
        vec.f = *src++;
        __m128 arg = _mm_cvtepi32_ps(
            _mm_srai_epi32(_mm_sub_epi32(_mm_and_si128(vec.i, mask), bias), 23));
        vec.i = _mm_or_si128(_mm_andnot_si128(mask, vec.i), bias);
        *dst++ = _mm_add_ps(arg,
            _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(
                _mm_add_ps(_mm_mul_ps(af, vec.f), bf), vec.f), cf), vec.f), df),
                vec.f), ef));
    }
}
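For reference, here is my reading of what the mask/bias arithmetic computes,
as a scalar sketch (my interpretation, not part of the report): each input is
split into its IEEE-754 exponent e and a mantissa m rescaled into [1, 2), the
usual ingredients of a fast log2-style polynomial approximation.

    #include <assert.h>
    #include <stdint.h>
    #include <string.h>

    /* Scalar model of the vector bit trick: x == m * 2^e with m in [1, 2),
       so e plus a polynomial in m approximates log2(x). */
    static float extract_exponent(float x) {
        uint32_t bits;
        memcpy(&bits, &x, sizeof bits);        /* well-defined type pun */
        /* (bits & mask) - bias, then arithmetic shift right by 23 */
        return (float)((int32_t)((bits & (0xffu << 23)) - (0x7fu << 23)) >> 23);
    }

    static float normalize_mantissa(float x) {
        uint32_t bits;
        float m;
        memcpy(&bits, &x, sizeof bits);
        bits = (bits & ~(0xffu << 23)) | (0x7fu << 23);  /* force exponent 0 */
        memcpy(&m, &bits, sizeof m);
        return m;                               /* m in [1, 2) */
    }

For example, 8.0f yields e = 3 and m = 1.0, since 8 = 1.0 * 2^3.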
The main loop of the function looks like this:
.L4:
        movaps  (%rsi), %xmm0
        addl    $1, %eax
        movdqa  %xmm4, %xmm2
        addq    $16, %rsi
        movaps  %xmm0, -24(%rsp)
        movdqa  -24(%rsp), %xmm0
        pandn   %xmm0, %xmm2
        movdqa  %xmm0, %xmm1
        movdqa  %xmm2, %xmm0
        pand    %xmm4, %xmm1
        por     %xmm3, %xmm0
        psubd   %xmm3, %xmm1
        psrad   $23, %xmm1
        cvtdq2ps %xmm1, %xmm1
        movdqa  %xmm0, -24(%rsp)
        movaps  %xmm9, %xmm0
        movaps  -24(%rsp), %xmm2
        mulps   %xmm2, %xmm0
        addps   %xmm8, %xmm0
        mulps   %xmm2, %xmm0
        addps   %xmm7, %xmm0
        mulps   %xmm2, %xmm0
        addps   %xmm6, %xmm0
        mulps   %xmm2, %xmm0
        addps   %xmm5, %xmm0
        addps   %xmm0, %xmm1
        movaps  %xmm1, (%rdi)
        addq    $16, %rdi
        cmpl    %edx, %eax
        jne     .L4
As you can see, whenever I access a float vector from the integer SSE2 unit,
the compiler saves the register to the stack with movaps just to load it back
into the same register with movdqa one instruction later - which is completely
unnecessary and inefficient. The same happens when the SSE unit accesses an
integer value. Is this behavior 'patchable'?
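For what it's worth, one way to express the reinterpretation without the
union is the cast intrinsics, assuming a GCC recent enough to provide
_mm_castps_si128/_mm_castsi128_ps in emmintrin.h. This is only a sketch of a
possible workaround on my part, not a confirmed fix for the register
shuffling:

    #include <emmintrin.h>
    #include <assert.h>   /* for the usage check below */
    #include <math.h>

    /* Same computation as array_sample_fun, but the float<->integer
       reinterpretation is stated directly, so no stack slot is implied
       by a union access. */
    void array_sample_fun_cast(__m128 *dst, const __m128 *src, int length) {
        const __m128 af = _mm_set1_ps(1.20f);
        const __m128 bf = _mm_set1_ps(2.88f);
        const __m128 cf = _mm_set1_ps(-2.44f);
        const __m128 df = _mm_set1_ps(4.06f);
        const __m128 ef = _mm_set1_ps(-12.04f);
        const __m128i mask = _mm_set1_epi32(0xff << 23);
        const __m128i bias = _mm_set1_epi32(0x7f << 23);
        while (length-- != 0) {
            /* Reinterpret the float lanes as integers in-register. */
            __m128i vi = _mm_castps_si128(*src++);
            __m128 arg = _mm_cvtepi32_ps(
                _mm_srai_epi32(_mm_sub_epi32(_mm_and_si128(vi, mask), bias), 23));
            __m128 vf = _mm_castsi128_ps(
                _mm_or_si128(_mm_andnot_si128(mask, vi), bias));
            /* Horner evaluation, same order as the original. */
            __m128 poly = _mm_add_ps(_mm_mul_ps(af, vf), bf);
            poly = _mm_add_ps(_mm_mul_ps(poly, vf), cf);
            poly = _mm_add_ps(_mm_mul_ps(poly, vf), df);
            poly = _mm_add_ps(_mm_mul_ps(poly, vf), ef);
            *dst++ = _mm_add_ps(arg, poly);
        }
    }

For an input of 1.0f the exponent term is 0 and the polynomial collapses to
af + bf + cf + df + ef = -6.34, which gives a quick sanity check.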
--
Summary: inefficient/incorrect xmm registers usage
Product: gcc
Version: 4.1.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: rtl-optimization
AssignedTo: unassigned at gcc dot gnu dot org
ReportedBy: spd at poczta dot fm
GCC build triplet: x86_64-unknown-linux-gnu
GCC host triplet: x86_64-unknown-linux-gnu
GCC target triplet: x86_64-unknown-linux-gnu
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=29881