Bug 29881 - union causes inefficient code
Summary: union causes inefficient code
Status: RESOLVED FIXED
Alias: None
Product: gcc
Classification: Unclassified
Component: target
Version: 4.1.0
Importance: P3 enhancement
Target Milestone: 4.8.5
Assignee: Not yet assigned to anyone
URL:
Keywords: missed-optimization
Depends on:
Blocks:
 
Reported: 2006-11-17 22:43 UTC by David Pająk
Modified: 2021-06-08 09:26 UTC
CC List: 2 users

See Also:
Host:
Target: i?86-*-* x86_64-*-*
Build:
Known to work: 4.8.5
Known to fail:
Last reconfirmed: 2006-11-18 11:18:36


Attachments
Example code (254 bytes, text/plain)
2007-08-07 14:18 UTC, José Fonseca

Description David Pająk 2006-11-17 22:43:07 UTC
Hello!

I'm writing fast approximations of some complex math functions. I'm trying to write them as low-level as possible, though without going all the way to hand-written assembly, and I often check the generated assembly to verify its quality. During one of these checks I found that gcc (I checked 3.4.5/win32 and 4.0/4.1/fc5) doesn't make good use of the xmm registers in mixed SSE/SSE2 code.

example code:

#include <emmintrin.h>

typedef union {
        __m128i i;
        __m128 f;
} __vec128;

void array_sample_fun(__m128 *dst, const __m128  *src, int length) {
        __m128 af = _mm_set1_ps(1.20f);
        __m128 bf = _mm_set1_ps(2.88f);
        __m128 cf = _mm_set1_ps(-2.44f);
        __m128 df = _mm_set1_ps(4.06f);
        __m128 ef = _mm_set1_ps(-12.04f);

        __m128i mask = _mm_set1_epi32(0xff << 23);
        __m128i bias = _mm_set1_epi32(0x7f << 23);

        while (length-- != 0) {
                __vec128 vec;

                vec.f = *src++;
                __m128 arg = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_sub_epi32(_mm_and_si128(vec.i, mask), bias), 23));
                vec.i = _mm_or_si128(_mm_andnot_si128(mask, vec.i), bias);
                *dst++ = _mm_add_ps(arg, _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_add_ps(
                        _mm_mul_ps(af, vec.f), bf), vec.f), cf), vec.f), df), vec.f), ef));
        }
}

the main loop of the function looks like this:

.L4:
        movaps  (%rsi), %xmm0
        addl    $1, %eax
        movdqa  %xmm4, %xmm2
        addq    $16, %rsi
        movaps  %xmm0, -24(%rsp)
        movdqa  -24(%rsp), %xmm0
        pandn   %xmm0, %xmm2
        movdqa  %xmm0, %xmm1
        movdqa  %xmm2, %xmm0
        pand    %xmm4, %xmm1
        por     %xmm3, %xmm0
        psubd   %xmm3, %xmm1
        psrad   $23, %xmm1
        cvtdq2ps        %xmm1, %xmm1
        movdqa  %xmm0, -24(%rsp)
        movaps  %xmm9, %xmm0
        movaps  -24(%rsp), %xmm2
        mulps   %xmm2, %xmm0
        addps   %xmm8, %xmm0
        mulps   %xmm2, %xmm0
        addps   %xmm7, %xmm0
        mulps   %xmm2, %xmm0
        addps   %xmm6, %xmm0
        mulps   %xmm2, %xmm0
        addps   %xmm5, %xmm0
        addps   %xmm0, %xmm1
        movaps  %xmm1, (%rdi)
        addq    $16, %rdi
        cmpl    %edx, %eax
        jne     .L4

As you can see, whenever I try to access a float vector from the integer SSE2 unit, the compiler saves the register to the stack with movaps only to load it back into the same register with movdqa one instruction later, which is completely unnecessary and inefficient. The same happens when accessing an integer value from the SSE unit. Is this behavior 'patchable'?
Comment 1 Andrew Pinski 2006-11-18 00:14:13 UTC
The problem here, I think, is the union.

Here is how I would have written this code (without using unions in fact):

void array_sample_fun(__m128 *dst, const __m128  *src, int length) {
        __m128 af = _mm_set1_ps(1.20f);
        __m128 bf = _mm_set1_ps(2.88f);
        __m128 cf = _mm_set1_ps(-2.44f);
        __m128 df = _mm_set1_ps(4.06f);
        __m128 ef = _mm_set1_ps(-12.04f);

        __m128i mask = _mm_set1_epi32(0xff << 23);
        __m128i bias = _mm_set1_epi32(0x7f << 23);
        __m128i t;
         
        while (length-- != 0) {
                __m128 vec;

                vec = (*src++);
                __m128 arg = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_sub_epi32(_mm_and_si128((__m128i)vec, mask), bias), 23));
                vec = (__m128)_mm_or_si128(_mm_andnot_si128(mask, (__m128i)vec), bias);
                *dst++ = _mm_add_ps(arg, _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_add_ps(
                        _mm_mul_ps(af, vec), bf), vec), cf), vec), df), vec), ef));
        }
}


----------------------
The above gives good results for 32-bit:
.L4:
        movaps  (%eax), %xmm0
        movdqa  %xmm4, %xmm1
        addl    $1, %ecx
        addl    $16, %eax
        movdqa  %xmm0, %xmm2
        pandn   %xmm0, %xmm1
        movaps  .LC0, %xmm0
        por     %xmm3, %xmm1
        pand    %xmm4, %xmm2
        psubd   %xmm3, %xmm2
        mulps   %xmm1, %xmm0
        psrad   $23, %xmm2
        cvtdq2ps        %xmm2, %xmm2
        addps   .LC1, %xmm0
        mulps   %xmm1, %xmm0
        addps   %xmm7, %xmm0
        mulps   %xmm1, %xmm0
        addps   %xmm6, %xmm0
        mulps   %xmm1, %xmm0
        addps   %xmm5, %xmm0
        addps   %xmm0, %xmm2
        movaps  %xmm2, (%edx)
        addl    $16, %edx
        cmpl    %ebx, %ecx
        jne     .L4


Your original example on 32 bits, by contrast, has a store to the stack.

So this is just a case where unions cause a missed optimization.
Comment 2 David Pająk 2006-11-18 10:25:58 UTC
(In reply to comment #1)
> The problem here, I think, is the union. Here is how I would have written this code (without using unions):
> [...]
> So this is just a case where unions cause a missed optimization.

Thanks for the hint! The assembly looks OK now; I didn't expect that this kind of casting would work (it doesn't in the MSVC compilers). Anyway, it would be nice if the compiler detected such an optimization opportunity (the union case) as valid.
Comment 3 Andrew Pinski 2006-11-18 11:17:56 UTC
(In reply to comment #2)
> Thanks for the hint! The assembly looks OK now; I didn't expect that this kind
> of casting would work (it doesn't in the MSVC compilers). Anyway, it would be
> nice if the compiler detected such an optimization opportunity (the union case)
> as valid.

It is a documented extension in GCC; it is also a standard feature of the AltiVec (VMX, PowerPC) and SPU compilers, which is why it is included in GCC.

http://gcc.gnu.org/onlinedocs/gcc-4.1.1/gcc/Vector-Extensions.html

It is possible to cast from one vector type to another, provided they are of the same size (in fact, you can also cast vectors to and from other datatypes of the same size).
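
To illustrate the extension described above, here is a minimal sketch (the helper name and the choice of intrinsics are mine, not from the report): it applies an integer bit mask to a float vector by casting between the two vector types, with no union or memory round trip.

#include <emmintrin.h>

/* Hypothetical helper: AND an integer mask into the bits of a float vector
   by casting between __m128 and __m128i (both 128 bits wide), as allowed by
   the GCC vector extension -- no union and no stack spill needed. */
static __m128 and_bits(__m128 x, __m128i mask) {
        __m128i bits = (__m128i)x;          /* reinterpret, same size */
        bits = _mm_and_si128(bits, mask);   /* integer-domain AND */
        return (__m128)bits;                /* reinterpret back to floats */
}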
Comment 4 Andrew Pinski 2006-11-18 11:18:36 UTC
Confirmed.
Comment 5 José Fonseca 2007-08-07 14:01:10 UTC
Note that this problem is actually more general. I bumped into it when using a very common pattern in MMX/SSE2 programming: a union of a vector type and an array of integers:

union I16x8 {
	__m128i m;
	short v[8];
};

For example, this code:

#include <emmintrin.h>

union I16x8 {
	__m128i m;
	short v[8];
};

void test(I16x8 *p) {
	I16x8 a, c;
	a = *p;
	c.m = _mm_add_epi16(a.m, a.m);
	*p = c;
}

Generates unnecessary copying in the body of the function:

	movl	8(%ebp), %edx
	movl	(%edx), %eax
	movl	%eax, -24(%ebp)
	movl	4(%edx), %eax
	movl	%eax, -20(%ebp)
	movl	8(%edx), %eax
	movl	%eax, -16(%ebp)
	movl	12(%edx), %eax
	movl	%eax, -12(%ebp)
	movdqa	-24(%ebp), %xmm0
	paddw	%xmm0, %xmm0
	movdqa	%xmm0, -40(%ebp)
	movl	-40(%ebp), %eax
	movl	%eax, (%edx)
	movl	-36(%ebp), %eax
	movl	%eax, 4(%edx)
	movl	-32(%ebp), %eax
	movl	%eax, 8(%edx)
	movl	-28(%ebp), %eax
	movl	%eax, 12(%edx)

What is stranger is that eliminating the array member of the union, as follows,

union I16x8 {
	__m128i m;
};

Also generates *exactly* the same redundant code:

	movl	8(%ebp), %edx
	movl	(%edx), %eax
	movl	%eax, -24(%ebp)
	movl	4(%edx), %eax
	movl	%eax, -20(%ebp)
	movl	8(%edx), %eax
	movl	%eax, -16(%ebp)
	movl	12(%edx), %eax
	movl	%eax, -12(%ebp)
	movdqa	-24(%ebp), %xmm0
	paddw	%xmm0, %xmm0
	movdqa	%xmm0, -40(%ebp)
	movl	-40(%ebp), %eax
	movl	%eax, (%edx)
	movl	-36(%ebp), %eax
	movl	%eax, 4(%edx)
	movl	-32(%ebp), %eax
	movl	%eax, 8(%edx)
	movl	-28(%ebp), %eax
	movl	%eax, 12(%edx)

However, overriding the assignment operator as follows:

union I16x8 {
	__m128i m;
	short v[8];

	I16x8 & operator =(I16x8 &o) {
		m = o.m;
		return *this;
	}
};

Generates the right assembly code for the function above:

	movl	8(%ebp), %eax
	movdqa	(%eax), %xmm0
	paddw	%xmm0, %xmm0
	movdqa	%xmm0, (%eax)

Also strange is that a dummy structure, as follows:

struct I16x8 {
	__m128i m;
};

Also generates the right code (exactly as above):

	movl	8(%ebp), %eax
	movdqa	(%eax), %xmm0
	paddw	%xmm0, %xmm0
	movdqa	%xmm0, (%eax)

A union of a vector type with an array of integers is an example used in almost every SIMD intrinsics tutorial out there. This bug was causing gcc to perform poorly on my code compared with the Microsoft Visual C++ and Intel C++ compilers, but after working around it, gcc generated faster code than both.
Comment 6 José Fonseca 2007-08-07 14:18:28 UTC
Created attachment 14031
Example code

This is the source code for my examples above. To get the assembly, run:

gcc -S -DCASE=0 -O3 -msse2 -o sse2-union-0.s sse2-union.cpp
gcc -S -DCASE=1 -O3 -msse2 -o sse2-union-1.s sse2-union.cpp
gcc -S -DCASE=2 -O3 -msse2 -o sse2-union-2.s sse2-union.cpp
gcc -S -DCASE=3 -O3 -msse2 -o sse2-union-3.s sse2-union.cpp

This was run on (gcc -v):

Using built-in specs.
Target: i486-linux-gnu
Configured with: ../src/configure -v --enable-languages=c,c++,fortran,objc,obj-c++,treelang --prefix=/usr --enable-shared --with-system-zlib --libexecdir=/usr/lib --without-included-gettext --enable-threads=posix --enable-nls --with-gxx-include-dir=/usr/include/c++/4.1.3 --program-suffix=-4.1 --enable-__cxa_atexit --enable-clocale=gnu --enable-libstdcxx-debug --enable-mpfr --enable-checking=release i486-linux-gnu
Thread model: posix
gcc version 4.1.3 20070718 (prerelease) (Debian 4.1.2-14)

But I actually first discovered this in an unofficial build of gcc-4.2 for MinGW.
Comment 7 Andrew Pinski 2007-11-03 17:57:41 UTC
This is a target issue: the SSE registers are not used when doing copies, so
;; a = *p
is expanded as a sequence of loads/stores.
Comment 8 Andrew Pinski 2021-06-08 09:26:08 UTC
This has been fixed for a long time now, at least since GCC 4.8.5.


  a$m_4 = MEM[(const union I16x8 &)p_2(D)];
  _5 = VIEW_CONVERT_EXPR<__v8hi>(a$m_4);
  _6 = __builtin_ia32_paddw128 (_5, _5);
  _7 = VIEW_CONVERT_EXPR<__m128i>(_6);
  MEM[(union I16x8 *)p_2(D)] = _7;


It was fixed either when SRA was improved or when MEM_REF was added, but in any case it has been fixed for over 8 years now.