This is the mail archive of the
gcc-bugs@gcc.gnu.org
mailing list for the GCC project.
[Bug tree-optimization/58095] SIMD code requiring auxiliary array for best optimization
- From: "siavashserver at gmail dot com" <gcc-bugzilla at gcc dot gnu dot org>
- To: gcc-bugs at gcc dot gnu dot org
- Date: Wed, 07 Aug 2013 05:13:29 +0000
- Subject: [Bug tree-optimization/58095] SIMD code requiring auxiliary array for best optimization
- Auto-submitted: auto-generated
- References: <bug-58095-4 at http dot gcc dot gnu dot org/bugzilla/>
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=58095
--- Comment #3 from Siavash Eliasi <siavashserver at gmail dot com> ---
I did an experiment using raw float data types instead of the __m128 data
type. This time GCC, Clang and ICC were all able to generate the desired code,
even without using the __restrict__ keyword, though the source is a little
dirtier (pointer arithmetic).
Perhaps not most, but I'm sure that new video decoders/encoders, game engines
and similar applications use __m128 data types directly instead of float data
types, because (1) it guarantees the data is 16-byte aligned, (2) it removes the
need to manually load/store data between memory and XMM/YMM registers, (3) it
makes the source code smaller and easier to maintain, and (4) it yields much
cleaner and smaller generated code.
In conclusion, I don't think the issue that I and other people are facing is
related to not using the __restrict__ keyword: the compilers fail to generate
optimal code when facing __m128 data types regardless. The one exception is that
ICC is able to generate optimal code when __m128 data types and the
__restrict__ keyword are combined.
Here is what I have tried:
#include <xmmintrin.h>
/*
 * fooFloat: c[k] = a[k] + d[k] * b[k] for k in [0, size), processed as
 * 8 SSE vectors (32 floats) per loop iteration.
 *
 * Parameters:
 *   a, b, d - input streams of `size` floats each
 *   c       - output stream of `size` floats
 *   size    - element count; assumed to be a multiple of 32 (no scalar
 *             tail loop is emitted)
 *
 * Preconditions: all four pointers must be 16-byte aligned, as required
 * by _mm_load_ps/_mm_store_ps (aligned load/store fault otherwise).
 *
 * Bug fix: the original indexed &a[i*32 + k] while stepping i by 32, so
 * consecutive iterations addressed blocks 1024 elements apart — skipping
 * 992 of every 1024 floats and running far past `size`. With i stepped
 * by 32 and `i < size` counting elements, the base must be i itself.
 */
void fooFloat(float* a, float* b, float* d, float* c, unsigned int size)
{
    for (unsigned int i = 0; i < size; i += 32)
    {
        __m128 ax[8], bx[8], cx[8], dx[8];

        /* Load one 32-float block (8 vectors) from each input stream. */
        for (unsigned int j = 0; j < 8; ++j)
        {
            ax[j] = _mm_load_ps(&a[i + j * 4]);
            bx[j] = _mm_load_ps(&b[i + j * 4]);
            dx[j] = _mm_load_ps(&d[i + j * 4]);
        }

        /* cx = ax + dx * bx, vector by vector. */
        for (unsigned int j = 0; j < 8; ++j)
            cx[j] = _mm_add_ps(ax[j], _mm_mul_ps(dx[j], bx[j]));

        /* Store the 32-float result block. */
        for (unsigned int j = 0; j < 8; ++j)
            _mm_store_ps(&c[i + j * 4], cx[j]);
    }
}
And here is its output using GCC 4.8.1 with -O2:
# GCC 4.8.1 -O2 output (x86-64 System V ABI, Intel syntax) for fooFloat above.
# Incoming args: rdi = a, rsi = b, rdx = d, rcx = c, r8d = size.
# Per iteration, r14d/r13d/r12d/ebp/ebx/r11d/r10d/r9d hold the 8 float-element
# indices of the current block; each memory operand is base_reg + index*4 bytes.
# NOTE(review): this listing faithfully reflects the posted source, including
# its i*32 indexing — eax advances by 1024 (= 32*32) elements per pass.
fooFloat(float*, float*, float*, float*, unsigned int):
push r15                                # save callee-saved regs used as index temps
xor r15d, r15d                          # r15d = i = 0
test r8d, r8d                           # size == 0?
mov eax, 4                              # eax = i*32 + 4 (index chain seed, vector 1)
push r14
push r13
push r12
push rbp
push rbx
je .L15                                 # empty input: restore regs and return
.L19:                                   # main loop: 8 vectors (32 floats) per pass
lea r12d, [rax+4]                       # indices of vectors 2..7, derived from eax:
lea ebp, [rax+8]                        #   r12 = +8, rbp = +12, rbx = +16,
lea ebx, [rax+12]                       #   r11 = +20, r10 = +24, r9 = +28
lea r11d, [rax+16]                      #   (all relative to i*32; lea leaves flags alone)
lea r10d, [rax+20]
lea r9d, [rax+24]
mov r14d, r15d                          # r14d = i (before the step below)
mov r13d, eax                           # r13d = i*32 + 4 (vector 1)
add r15d, 32                            # i += 32
sal r14d, 5                             # r14d = i*32 (vector 0)
movaps xmm6, XMMWORD PTR [rdx+r13*4]    # xmm6 = dx[1]
add eax, 1024                           # advance index chain by 32*32 elements
cmp r8d, r15d                           # loop test computed early; ja consumes it below
movaps xmm7, XMMWORD PTR [rdx+r14*4]    # xmm7 = dx[0]
mulps xmm6, XMMWORD PTR [rsi+r13*4]     # xmm6 = dx[1] * bx[1]
movaps xmm5, XMMWORD PTR [rdx+r12*4]    # xmm5 = dx[2]
mulps xmm7, XMMWORD PTR [rsi+r14*4]     # xmm7 = dx[0] * bx[0]
movaps xmm4, XMMWORD PTR [rdx+rbp*4]    # xmm4 = dx[3]
mulps xmm5, XMMWORD PTR [rsi+r12*4]     # xmm5 = dx[2] * bx[2]
movaps xmm3, XMMWORD PTR [rdx+rbx*4]    # xmm3 = dx[4]
mulps xmm4, XMMWORD PTR [rsi+rbp*4]     # xmm4 = dx[3] * bx[3]
movaps xmm2, XMMWORD PTR [rdx+r11*4]    # xmm2 = dx[5]
mulps xmm3, XMMWORD PTR [rsi+rbx*4]     # xmm3 = dx[4] * bx[4]
movaps xmm1, XMMWORD PTR [rdx+r10*4]    # xmm1 = dx[6]
mulps xmm2, XMMWORD PTR [rsi+r11*4]     # xmm2 = dx[5] * bx[5]
movaps xmm0, XMMWORD PTR [rdx+r9*4]     # xmm0 = dx[7]
mulps xmm1, XMMWORD PTR [rsi+r10*4]     # xmm1 = dx[6] * bx[6]
addps xmm7, XMMWORD PTR [rdi+r14*4]     # xmm7 += ax[0]
mulps xmm0, XMMWORD PTR [rsi+r9*4]      # xmm0 = dx[7] * bx[7]
addps xmm6, XMMWORD PTR [rdi+r13*4]     # xmm6 += ax[1]
addps xmm5, XMMWORD PTR [rdi+r12*4]     # xmm5 += ax[2]
addps xmm4, XMMWORD PTR [rdi+rbp*4]     # xmm4 += ax[3]
addps xmm3, XMMWORD PTR [rdi+rbx*4]     # xmm3 += ax[4]
addps xmm2, XMMWORD PTR [rdi+r11*4]     # xmm2 += ax[5]
addps xmm1, XMMWORD PTR [rdi+r10*4]     # xmm1 += ax[6]
addps xmm0, XMMWORD PTR [rdi+r9*4]      # xmm0 += ax[7]
movaps XMMWORD PTR [rcx+r14*4], xmm7    # store result block, vectors 0..7
movaps XMMWORD PTR [rcx+r13*4], xmm6
movaps XMMWORD PTR [rcx+r12*4], xmm5
movaps XMMWORD PTR [rcx+rbp*4], xmm4
movaps XMMWORD PTR [rcx+rbx*4], xmm3
movaps XMMWORD PTR [rcx+r11*4], xmm2
movaps XMMWORD PTR [rcx+r10*4], xmm1
movaps XMMWORD PTR [rcx+r9*4], xmm0
ja .L19                                 # unsigned: loop while size > i
.L15:
pop rbx                                 # restore callee-saved regs (reverse order)
pop rbp
pop r12
pop r13
pop r14
pop r15
ret