Created attachment 30621 [details] Source code and its generated asm code. Hello. I have noticed a strange behavior when trying to write SIMD code using the provided SSE intrinsics. It looks like GCC is not able to generate/optimize the same code for function (foo) as it does for function (bar). I was wondering how I can achieve the same generated code for function (foo) without going to the trouble of defining and using an auxiliary array as in function (bar). I've tried using the __restrict__ keyword on the input data (foo2), but GCC still generates the same code as for function (foo). ICC and Clang also generate the same code and fail to optimize. Something strange I've noticed is that GCC 4.4.7 generates the desired code for function (foo), but fails to do so for functions (foo2) and (bar). Newer versions generate exactly the same code for functions (foo) and (foo2), and the desired code for function (bar). The attached output was generated by GCC 4.8.1 using the -O2 optimization level. I've used the online GCC compiler from: http://gcc.godbolt.org/
>I've tried using __restrict__ keyword for input data (foo2), I think you want __restrict__ inside of the [].
(In reply to Andrew Pinski from comment #1) > >I've tried using __restrict__ keyword for input data (foo2), > > I think you want __restrict__ inside of the []. Do you mind pasting the modified source code and generated asm code please?
I did an experiment using raw float data types instead of the __m128 data type. This time GCC, Clang and ICC were able to generate the desired code, even without using the __restrict__ keyword, though it was a little dirtier (pointer arithmetic). Maybe not most, but I'm sure that many new video decoders/encoders, game engines and similar applications use __m128 data types directly instead of float data types, because (1) it guarantees the data is 16-byte aligned, (2) it removes the need to manually load/store data from memory to XMM/YMM registers, (3) it makes the source code smaller and easier to maintain and (4) it yields much cleaner and smaller generated code. In conclusion, I don't think the issue me and other people are facing is related to not using the __restrict__ keyword. All compilers fail to generate optimal code when facing __m128 data types. However, as an exception, ICC is able to generate optimal code when __m128 data types and the __restrict__ keyword are mixed. Here is what I have tried: #include <xmmintrin.h> void fooFloat(float* a, float* b, float* d, float* c, unsigned int size) { for (unsigned int i = 0; i < size; i+=32) { __m128 ax[8], bx[8], cx[8], dx[8]; ax[0] = _mm_load_ps(&a[i*32+0]); ax[1] = _mm_load_ps(&a[i*32+4]); ax[2] = _mm_load_ps(&a[i*32+8]); ax[3] = _mm_load_ps(&a[i*32+12]); ax[4] = _mm_load_ps(&a[i*32+16]); ax[5] = _mm_load_ps(&a[i*32+20]); ax[6] = _mm_load_ps(&a[i*32+24]); ax[7] = _mm_load_ps(&a[i*32+28]); bx[0] = _mm_load_ps(&b[i*32+0]); bx[1] = _mm_load_ps(&b[i*32+4]); bx[2] = _mm_load_ps(&b[i*32+8]); bx[3] = _mm_load_ps(&b[i*32+12]); bx[4] = _mm_load_ps(&b[i*32+16]); bx[5] = _mm_load_ps(&b[i*32+20]); bx[6] = _mm_load_ps(&b[i*32+24]); bx[7] = _mm_load_ps(&b[i*32+28]); dx[0] = _mm_load_ps(&d[i*32+0]); dx[1] = _mm_load_ps(&d[i*32+4]); dx[2] = _mm_load_ps(&d[i*32+8]); dx[3] = _mm_load_ps(&d[i*32+12]); dx[4] = _mm_load_ps(&d[i*32+16]); dx[5] = _mm_load_ps(&d[i*32+20]); dx[6] = _mm_load_ps(&d[i*32+24]); dx[7] = _mm_load_ps(&d[i*32+28]); cx[0] = _mm_add_ps(ax[0], 
_mm_mul_ps(dx[0], bx[0])); cx[1] = _mm_add_ps(ax[1], _mm_mul_ps(dx[1], bx[1])); cx[2] = _mm_add_ps(ax[2], _mm_mul_ps(dx[2], bx[2])); cx[3] = _mm_add_ps(ax[3], _mm_mul_ps(dx[3], bx[3])); cx[4] = _mm_add_ps(ax[4], _mm_mul_ps(dx[4], bx[4])); cx[5] = _mm_add_ps(ax[5], _mm_mul_ps(dx[5], bx[5])); cx[6] = _mm_add_ps(ax[6], _mm_mul_ps(dx[6], bx[6])); cx[7] = _mm_add_ps(ax[7], _mm_mul_ps(dx[7], bx[7])); _mm_store_ps(&c[i*32+0], cx[0]); _mm_store_ps(&c[i*32+4], cx[1]); _mm_store_ps(&c[i*32+8], cx[2]); _mm_store_ps(&c[i*32+12], cx[3]); _mm_store_ps(&c[i*32+16], cx[4]); _mm_store_ps(&c[i*32+20], cx[5]); _mm_store_ps(&c[i*32+24], cx[6]); _mm_store_ps(&c[i*32+28], cx[7]); } } And its output using GCC 4.8.1 -O2 : fooFloat(float*, float*, float*, float*, unsigned int): push r15 xor r15d, r15d test r8d, r8d mov eax, 4 push r14 push r13 push r12 push rbp push rbx je .L15 .L19: lea r12d, [rax+4] lea ebp, [rax+8] lea ebx, [rax+12] lea r11d, [rax+16] lea r10d, [rax+20] lea r9d, [rax+24] mov r14d, r15d mov r13d, eax add r15d, 32 sal r14d, 5 movaps xmm6, XMMWORD PTR [rdx+r13*4] add eax, 1024 cmp r8d, r15d movaps xmm7, XMMWORD PTR [rdx+r14*4] mulps xmm6, XMMWORD PTR [rsi+r13*4] movaps xmm5, XMMWORD PTR [rdx+r12*4] mulps xmm7, XMMWORD PTR [rsi+r14*4] movaps xmm4, XMMWORD PTR [rdx+rbp*4] mulps xmm5, XMMWORD PTR [rsi+r12*4] movaps xmm3, XMMWORD PTR [rdx+rbx*4] mulps xmm4, XMMWORD PTR [rsi+rbp*4] movaps xmm2, XMMWORD PTR [rdx+r11*4] mulps xmm3, XMMWORD PTR [rsi+rbx*4] movaps xmm1, XMMWORD PTR [rdx+r10*4] mulps xmm2, XMMWORD PTR [rsi+r11*4] movaps xmm0, XMMWORD PTR [rdx+r9*4] mulps xmm1, XMMWORD PTR [rsi+r10*4] addps xmm7, XMMWORD PTR [rdi+r14*4] mulps xmm0, XMMWORD PTR [rsi+r9*4] addps xmm6, XMMWORD PTR [rdi+r13*4] addps xmm5, XMMWORD PTR [rdi+r12*4] addps xmm4, XMMWORD PTR [rdi+rbp*4] addps xmm3, XMMWORD PTR [rdi+rbx*4] addps xmm2, XMMWORD PTR [rdi+r11*4] addps xmm1, XMMWORD PTR [rdi+r10*4] addps xmm0, XMMWORD PTR [rdi+r9*4] movaps XMMWORD PTR [rcx+r14*4], xmm7 movaps XMMWORD PTR 
[rcx+r13*4], xmm6 movaps XMMWORD PTR [rcx+r12*4], xmm5 movaps XMMWORD PTR [rcx+rbp*4], xmm4 movaps XMMWORD PTR [rcx+rbx*4], xmm3 movaps XMMWORD PTR [rcx+r11*4], xmm2 movaps XMMWORD PTR [rcx+r10*4], xmm1 movaps XMMWORD PTR [rcx+r9*4], xmm0 ja .L19 .L15: pop rbx pop rbp pop r12 pop r13 pop r14 pop r15 ret
In the end, here is what I really like GCC to generate for me. Same output as function (bar) for function (foo) when using GCC with -O3 -march=core2 switches: #include <xmmintrin.h> #define BATCHSIZE 8 void foo(__m128 a[][BATCHSIZE], __m128 b[][BATCHSIZE], __m128 d[][BATCHSIZE], __m128 c[][BATCHSIZE], unsigned int size) { for (unsigned int i = 0; i < size; i++) { for (unsigned int j=0; j<BATCHSIZE; j++) { c[i][j] = _mm_add_ps(a[i][j], _mm_mul_ps(d[i][j], b[i][j])); } } } void bar(__m128 a[][BATCHSIZE], __m128 b[][BATCHSIZE], __m128 d[][BATCHSIZE], __m128 c[][BATCHSIZE], unsigned int size) { for (unsigned int i = 0; i < size; i++) { __m128 cx[BATCHSIZE]; for (unsigned int j=0; j<BATCHSIZE; j++) { cx[j] = _mm_add_ps(a[i][j], _mm_mul_ps(d[i][j], b[i][j])); } for (unsigned int j=0; j<BATCHSIZE; j++) { c[i][j] = cx[j]; } } } Generated asm code: foo(float __vector (*) [8], float __vector (*) [8], float __vector (*) [8], float __vector (*) [8], unsigned int): test r8d, r8d je .L1 xor eax, eax .L4: movaps xmm0, XMMWORD PTR [rdx] add eax, 1 sub rsi, -128 sub rdx, -128 sub rdi, -128 sub rcx, -128 mulps xmm0, XMMWORD PTR [rsi-128] addps xmm0, XMMWORD PTR [rdi-128] movaps XMMWORD PTR [rcx-128], xmm0 movaps xmm0, XMMWORD PTR [rdx-112] mulps xmm0, XMMWORD PTR [rsi-112] addps xmm0, XMMWORD PTR [rdi-112] movaps XMMWORD PTR [rcx-112], xmm0 movaps xmm0, XMMWORD PTR [rdx-96] mulps xmm0, XMMWORD PTR [rsi-96] addps xmm0, XMMWORD PTR [rdi-96] movaps XMMWORD PTR [rcx-96], xmm0 movaps xmm0, XMMWORD PTR [rdx-80] mulps xmm0, XMMWORD PTR [rsi-80] addps xmm0, XMMWORD PTR [rdi-80] movaps XMMWORD PTR [rcx-80], xmm0 movaps xmm0, XMMWORD PTR [rdx-64] mulps xmm0, XMMWORD PTR [rsi-64] addps xmm0, XMMWORD PTR [rdi-64] movaps XMMWORD PTR [rcx-64], xmm0 movaps xmm0, XMMWORD PTR [rdx-48] mulps xmm0, XMMWORD PTR [rsi-48] addps xmm0, XMMWORD PTR [rdi-48] movaps XMMWORD PTR [rcx-48], xmm0 movaps xmm0, XMMWORD PTR [rdx-32] mulps xmm0, XMMWORD PTR [rsi-32] addps xmm0, XMMWORD PTR [rdi-32] movaps XMMWORD PTR 
[rcx-32], xmm0 movaps xmm0, XMMWORD PTR [rdx-16] mulps xmm0, XMMWORD PTR [rsi-16] addps xmm0, XMMWORD PTR [rdi-16] movaps XMMWORD PTR [rcx-16], xmm0 cmp eax, r8d jne .L4 .L1: rep; ret bar(float __vector (*) [8], float __vector (*) [8], float __vector (*) [8], float __vector (*) [8], unsigned int): test r8d, r8d je .L6 xor eax, eax .L9: movaps xmm7, XMMWORD PTR [rdx] add eax, 1 sub rsi, -128 movaps xmm6, XMMWORD PTR [rdx+16] sub rdi, -128 sub rdx, -128 movaps xmm5, XMMWORD PTR [rdx-96] sub rcx, -128 movaps xmm4, XMMWORD PTR [rdx-80] movaps xmm3, XMMWORD PTR [rdx-64] movaps xmm2, XMMWORD PTR [rdx-48] movaps xmm1, XMMWORD PTR [rdx-32] movaps xmm0, XMMWORD PTR [rdx-16] mulps xmm7, XMMWORD PTR [rsi-128] mulps xmm6, XMMWORD PTR [rsi-112] mulps xmm5, XMMWORD PTR [rsi-96] mulps xmm4, XMMWORD PTR [rsi-80] mulps xmm3, XMMWORD PTR [rsi-64] mulps xmm2, XMMWORD PTR [rsi-48] mulps xmm1, XMMWORD PTR [rsi-32] mulps xmm0, XMMWORD PTR [rsi-16] addps xmm7, XMMWORD PTR [rdi-128] addps xmm6, XMMWORD PTR [rdi-112] addps xmm5, XMMWORD PTR [rdi-96] addps xmm4, XMMWORD PTR [rdi-80] addps xmm3, XMMWORD PTR [rdi-64] addps xmm2, XMMWORD PTR [rdi-48] addps xmm1, XMMWORD PTR [rdi-32] addps xmm0, XMMWORD PTR [rdi-16] movaps XMMWORD PTR [rcx-128], xmm7 movaps XMMWORD PTR [rcx-112], xmm6 movaps XMMWORD PTR [rcx-96], xmm5 movaps XMMWORD PTR [rcx-80], xmm4 movaps XMMWORD PTR [rcx-64], xmm3 movaps XMMWORD PTR [rcx-48], xmm2 movaps XMMWORD PTR [rcx-32], xmm1 movaps XMMWORD PTR [rcx-16], xmm0 cmp eax, r8d jne .L9 .L6: rep; ret