[Bug rtl-optimization/29881] union causes inefficient code

pinskia at gcc dot gnu dot org gcc-bugzilla@gcc.gnu.org
Sat Nov 18 00:14:00 GMT 2006



------- Comment #1 from pinskia at gcc dot gnu dot org  2006-11-18 00:14 -------
I think the problem here is the use of unions.

Here is how I would have written this code (without using unions in fact):

/*
 * For each of the `length` vectors read from src, write one result vector
 * to dst.  Per 32-bit float lane the result is
 *
 *     exponent + ((((af*m + bf)*m + cf)*m + df)*m + ef)
 *
 * where `exponent` is the lane's 8-bit exponent field minus the bias 0x7f,
 * converted to float, and `m` is the lane with its exponent field replaced
 * by the bias (all other bits kept).
 *
 * The (__m128i) / (__m128) casts are GNU vector-extension casts that
 * reinterpret the bits without converting values; they are used here
 * instead of a union on purpose.
 *
 * A zero or negative length processes nothing.
 */
void array_sample_fun(__m128 *dst, const __m128  *src, int length) {
        __m128 af = _mm_set1_ps(1.20f);
        __m128 bf = _mm_set1_ps(2.88f);
        __m128 cf = _mm_set1_ps(-2.44f);
        __m128 df = _mm_set1_ps(4.06f);
        __m128 ef = _mm_set1_ps(-12.04f);

        /* Bit 23 upward holds the 8-bit exponent field of each float lane. */
        __m128i mask = _mm_set1_epi32(0xff << 23);
        __m128i bias = _mm_set1_epi32(0x7f << 23);

        /* `> 0` rather than `!= 0`: a negative count must not run the loop. */
        for (; length > 0; length--) {
                __m128i bits = (__m128i)(*src++);

                /* Exponent field minus bias, shifted down and made a float. */
                __m128i expo = _mm_sub_epi32(_mm_and_si128(bits, mask), bias);
                __m128 arg = _mm_cvtepi32_ps(_mm_srai_epi32(expo, 23));

                /* Input with its exponent field overwritten by the bias. */
                __m128 vec = (__m128)_mm_or_si128(_mm_andnot_si128(mask, bits),
                                                  bias);

                /* Horner evaluation, same operation order as a nested call. */
                __m128 poly = _mm_add_ps(_mm_mul_ps(af, vec), bf);
                poly = _mm_add_ps(_mm_mul_ps(poly, vec), cf);
                poly = _mm_add_ps(_mm_mul_ps(poly, vec), df);
                poly = _mm_add_ps(_mm_mul_ps(poly, vec), ef);

                *dst++ = _mm_add_ps(arg, poly);
        }
}


----------------------
The above gives good results on 32-bit:
.L4:
        movaps  (%eax), %xmm0
        movdqa  %xmm4, %xmm1
        addl    $1, %ecx
        addl    $16, %eax
        movdqa  %xmm0, %xmm2
        pandn   %xmm0, %xmm1
        movaps  .LC0, %xmm0
        por     %xmm3, %xmm1
        pand    %xmm4, %xmm2
        psubd   %xmm3, %xmm2
        mulps   %xmm1, %xmm0
        psrad   $23, %xmm2
        cvtdq2ps        %xmm2, %xmm2
        addps   .LC1, %xmm0
        mulps   %xmm1, %xmm0
        addps   %xmm7, %xmm0
        mulps   %xmm1, %xmm0
        addps   %xmm6, %xmm0
        mulps   %xmm1, %xmm0
        addps   %xmm5, %xmm0
        addps   %xmm0, %xmm2
        movaps  %xmm2, (%edx)
        addl    $16, %edx
        cmpl    %ebx, %ecx
        jne     .L4


By contrast, your original example on 32-bit has a store to the stack.

So this is just a case where unions cause a missed optimization.


-- 

pinskia at gcc dot gnu dot org changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
  GCC build triplet| x86_64-unknown-linux-gnu   |x86_64-unknown-linux-gnu
   GCC host triplet| x86_64-unknown-linux-gnu   |x86_64-unknown-linux-gnu
 GCC target triplet| x86_64-unknown-linux-gnu   |x86_64-unknown-linux-gnu
            Summary|inefficient/incorrect xmm   |union causes inefficient
                   |registers usage             |code


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=29881



More information about the Gcc-bugs mailing list