Hello! I'm writing fast approximations of some complex math functions. I'm trying to keep them as low-level as possible, though still far from hand-written assembly, and I often check the generated assembly to verify its quality. During one of these checks I found that gcc (I checked 3.4.5/win32 and 4.0/4.1/fc5) doesn't make good use of the xmm registers when compiling mixed SSE/SSE2 code. Example code:

#include <emmintrin.h>

typedef union {
    __m128i i;
    __m128 f;
} __vec128;

void array_sample_fun(__m128 *dst, const __m128 *src, int length) {
    __m128 af = _mm_set1_ps(1.20f);
    __m128 bf = _mm_set1_ps(2.88f);
    __m128 cf = _mm_set1_ps(-2.44f);
    __m128 df = _mm_set1_ps(4.06f);
    __m128 ef = _mm_set1_ps(-12.04f);

    __m128i mask = _mm_set1_epi32(0xff << 23);
    __m128i bias = _mm_set1_epi32(0x7f << 23);

    while (length-- != 0) {
        __vec128 vec;

        vec.f = *src++;
        __m128 arg = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_sub_epi32(_mm_and_si128(vec.i, mask), bias), 23));
        vec.i = _mm_or_si128(_mm_andnot_si128(mask, vec.i), bias);
        *dst++ = _mm_add_ps(arg,
            _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_add_ps(
                _mm_mul_ps(af, vec.f), bf), vec.f), cf), vec.f), df), vec.f), ef));
    }
}

The main loop of the function looks like this:

.L4:
        movaps  (%rsi), %xmm0
        addl    $1, %eax
        movdqa  %xmm4, %xmm2
        addq    $16, %rsi
        movaps  %xmm0, -24(%rsp)
        movdqa  -24(%rsp), %xmm0
        pandn   %xmm0, %xmm2
        movdqa  %xmm0, %xmm1
        movdqa  %xmm2, %xmm0
        pand    %xmm4, %xmm1
        por     %xmm3, %xmm0
        psubd   %xmm3, %xmm1
        psrad   $23, %xmm1
        cvtdq2ps %xmm1, %xmm1
        movdqa  %xmm0, -24(%rsp)
        movaps  %xmm9, %xmm0
        movaps  -24(%rsp), %xmm2
        mulps   %xmm2, %xmm0
        addps   %xmm8, %xmm0
        mulps   %xmm2, %xmm0
        addps   %xmm7, %xmm0
        mulps   %xmm2, %xmm0
        addps   %xmm6, %xmm0
        mulps   %xmm2, %xmm0
        addps   %xmm5, %xmm0
        addps   %xmm0, %xmm1
        movaps  %xmm1, (%rdi)
        addq    $16, %rdi
        cmpl    %edx, %eax
        jne     .L4

As you can see, whenever I try to access a float vector from the integer SSE2 unit, the compiler spills the register to the stack with movaps only to load it back into the same register with movdqa one instruction later - which is totally unnecessary and inefficient! The same goes for accessing an integer value from the SSE unit. Is this behavior 'patchable'?
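For reference, here is a scalar sketch of what each lane of the loop computes as I read it (just an illustration of the idea, not code I ship): split x into its unbiased exponent e and its mantissa m forced into [1,2), then evaluate the degree-4 polynomial in m and add e - the usual shape of a log2-style approximation.

#include <stdint.h>
#include <string.h>

static float sample_fun_scalar(float x) {
    uint32_t u;
    memcpy(&u, &x, sizeof u);                                  /* bit-copy the float */
    int e = ((int)(u & (0xffu << 23)) - (0x7f << 23)) >> 23;   /* unbiased exponent (arithmetic shift assumed, as with psrad) */
    uint32_t mu = (u & ~(0xffu << 23)) | (0x7fu << 23);        /* exponent field forced to 127, so m lands in [1,2) */
    float m;
    memcpy(&m, &mu, sizeof m);
    /* same Horner chain as the vector code: ((((af*m + bf)*m + cf)*m + df)*m + ef) */
    return (float)e + ((((1.20f * m + 2.88f) * m - 2.44f) * m + 4.06f) * m - 12.04f);
}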
The problem here, I think, is the union. Here is how I would have written this code (without using a union at all):

void array_sample_fun(__m128 *dst, const __m128 *src, int length) {
    __m128 af = _mm_set1_ps(1.20f);
    __m128 bf = _mm_set1_ps(2.88f);
    __m128 cf = _mm_set1_ps(-2.44f);
    __m128 df = _mm_set1_ps(4.06f);
    __m128 ef = _mm_set1_ps(-12.04f);

    __m128i mask = _mm_set1_epi32(0xff << 23);
    __m128i bias = _mm_set1_epi32(0x7f << 23);
    __m128i t;

    while (length-- != 0) {
        __m128 vec;

        vec = *src++;
        __m128 arg = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_sub_epi32(_mm_and_si128((__m128i)vec, mask), bias), 23));
        vec = (__m128)_mm_or_si128(_mm_andnot_si128(mask, (__m128i)vec), bias);
        *dst++ = _mm_add_ps(arg,
            _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_add_ps(
                _mm_mul_ps(af, vec), bf), vec), cf), vec), df), vec), ef));
    }
}

----------------------
The above gives good results for 32-bit:

.L4:
        movaps  (%eax), %xmm0
        movdqa  %xmm4, %xmm1
        addl    $1, %ecx
        addl    $16, %eax
        movdqa  %xmm0, %xmm2
        pandn   %xmm0, %xmm1
        movaps  .LC0, %xmm0
        por     %xmm3, %xmm1
        pand    %xmm4, %xmm2
        psubd   %xmm3, %xmm2
        mulps   %xmm1, %xmm0
        psrad   $23, %xmm2
        cvtdq2ps %xmm2, %xmm2
        addps   .LC1, %xmm0
        mulps   %xmm1, %xmm0
        addps   %xmm7, %xmm0
        mulps   %xmm1, %xmm0
        addps   %xmm6, %xmm0
        mulps   %xmm1, %xmm0
        addps   %xmm5, %xmm0
        addps   %xmm0, %xmm2
        movaps  %xmm2, (%edx)
        addl    $16, %edx
        cmpl    %ebx, %ecx
        jne     .L4

while your original example on 32-bit still has a store to the stack. So this is just a case where the union causes a missed optimization.
(In reply to comment #1)
> So this is just a case where the union causes a missed optimization.

Thanks for the hint! The assembly looks OK now - I didn't expect that this kind of casting would work (it doesn't in the MSVC compilers). Anyway, it would be nice if the compiler detected such an optimization opportunity (the union case) as a valid one.
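For completeness, the same reinterpretation can also be written with the cast intrinsics from emmintrin.h, which compile to no instructions; as far as I can tell they are accepted by recent MSVC and ICC as well, though their availability on older compiler versions is an assumption on my part. A drop-in sketch for the first lines of the loop body above (the polynomial part is unchanged):

        vec = *src++;
        __m128i bits = _mm_castps_si128(vec);   /* reinterpret the float bits as integers */
        __m128 arg = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_sub_epi32(_mm_and_si128(bits, mask), bias), 23));
        vec = _mm_castsi128_ps(_mm_or_si128(_mm_andnot_si128(mask, bits), bias));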
(In reply to comment #2)
> Thanks for the hint! The assembly looks OK now - I didn't expect that this
> kind of casting would work (it doesn't in the MSVC compilers). Anyway, it
> would be nice if the compiler detected such an optimization opportunity
> (the union case) as a valid one.

It is a documented extension for GCC, though it is a standard feature of the AltiVec (VMX, PowerPC) and SPU compilers, which is why it is included in GCC:

http://gcc.gnu.org/onlinedocs/gcc-4.1.1/gcc/Vector-Extensions.html

"It is possible to cast from one vector type to another, provided they are of the same size (in fact, you can also cast vectors to and from other datatypes of the same size)."
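A minimal illustration of that rule (the typedef and function names here are just for the example):

#include <emmintrin.h>

typedef int v4si __attribute__ ((vector_size (16)));

__m128i as_m128i(v4si x) {
    return (__m128i)x;   /* allowed: both vector types are 16 bytes wide */
}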
Confirmed.
Note that this problem is actually more general. I bumped into it while using a very common pattern in MMX/SSE2 programming: a union between a vector type and an array of integers:

union I16x8 {
    __m128i m;
    short v[8];
};

For example, this code:

#include <emmintrin.h>

union I16x8 {
    __m128i m;
    short v[8];
};

void test(I16x8 *p) {
    I16x8 a, c;
    a = *p;
    c.m = _mm_add_epi16(a.m, a.m);
    *p = c;
}

generates unnecessary copying in the body of the function:

        movl    8(%ebp), %edx
        movl    (%edx), %eax
        movl    %eax, -24(%ebp)
        movl    4(%edx), %eax
        movl    %eax, -20(%ebp)
        movl    8(%edx), %eax
        movl    %eax, -16(%ebp)
        movl    12(%edx), %eax
        movl    %eax, -12(%ebp)
        movdqa  -24(%ebp), %xmm0
        paddw   %xmm0, %xmm0
        movdqa  %xmm0, -40(%ebp)
        movl    -40(%ebp), %eax
        movl    %eax, (%edx)
        movl    -36(%ebp), %eax
        movl    %eax, 4(%edx)
        movl    -32(%ebp), %eax
        movl    %eax, 8(%edx)
        movl    -28(%ebp), %eax
        movl    %eax, 12(%edx)

Stranger still, eliminating the array member of the union, as in

union I16x8 {
    __m128i m;
};

also generates *exactly* the same redundant code as above.

However, defining the assignment operator explicitly, as in

union I16x8 {
    __m128i m;
    short v[8];
    I16x8 & operator =(I16x8 &o) { m = o.m; return *this; }
};

generates the right assembly for the function above:

        movl    8(%ebp), %eax
        movdqa  (%eax), %xmm0
        paddw   %xmm0, %xmm0
        movdqa  %xmm0, (%eax)

Also strange: a plain structure, as in

struct I16x8 {
    __m128i m;
};

also generates the right code, exactly as above.

The union of a vector type with an array of integers is an example used in almost every SIMD intrinsics tutorial out there. This bug was causing gcc to perform poorly with my code compared to the Microsoft Visual C++ and Intel C++ compilers, but after working around it, gcc generated faster code than both.
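For per-element access without the union, something like the following also keeps everything in XMM registers (just a sketch assuming plain SSE2, with a made-up function name; the extract/insert intrinsics stand in for the v[8] accesses and the +1 is only there to show a lane write):

#include <emmintrin.h>

static void test_no_union(__m128i *p) {
    __m128i a = _mm_load_si128(p);            /* whole-vector load stays in an XMM register */
    __m128i c = _mm_add_epi16(a, a);          /* lane-wise 16-bit add                        */
    int lane0 = _mm_extract_epi16(c, 0);      /* read one 16-bit lane                        */
    c = _mm_insert_epi16(c, lane0 + 1, 0);    /* write one 16-bit lane back                  */
    _mm_store_si128(p, c);                    /* whole-vector store                          */
}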
Created attachment 14031 [details]
Example code

This is the source code for my example above. To get the assembly, run:

gcc -S -DCASE=0 -O3 -msse2 -o sse2-union-0.s sse2-union.cpp
gcc -S -DCASE=1 -O3 -msse2 -o sse2-union-1.s sse2-union.cpp
gcc -S -DCASE=2 -O3 -msse2 -o sse2-union-2.s sse2-union.cpp
gcc -S -DCASE=3 -O3 -msse2 -o sse2-union-3.s sse2-union.cpp

This was run on (gcc -v):

Using built-in specs.
Target: i486-linux-gnu
Configured with: ../src/configure -v --enable-languages=c,c++,fortran,objc,obj-c++,treelang --prefix=/usr --enable-shared --with-system-zlib --libexecdir=/usr/lib --without-included-gettext --enable-threads=posix --enable-nls --with-gxx-include-dir=/usr/include/c++/4.1.3 --program-suffix=-4.1 --enable-__cxa_atexit --enable-clocale=gnu --enable-libstdcxx-debug --enable-mpfr --enable-checking=release i486-linux-gnu
Thread model: posix
gcc version 4.1.3 20070718 (prerelease) (Debian 4.1.2-14)

But I actually first discovered this in an unofficial build of gcc-4.2 for MinGW.
This is a target issue - the SSE registers are not used when doing copies. The copy

;; a = *p

is expanded as a sequence of scalar loads/stores.
Been fixed for a long time now, at least since GCC 4.8.5:

a$m_4 = MEM[(const union I16x8 &)p_2(D)];
_5 = VIEW_CONVERT_EXPR<__v8hi>(a$m_4);
_6 = __builtin_ia32_paddw128 (_5, _5);
_7 = VIEW_CONVERT_EXPR<__m128i>(_6);
MEM[(union I16x8 *)p_2(D)] = _7;

It was fixed either when SRA was improved or when MEM_REF was added; either way, it has been fixed for over 8 years now.