Created attachment 30621 [details] Source code and its generated asm code. Hello. I have noticed a strange behavior when trying to write SIMD code using the provided SSE intrinsics. It looks like GCC is not able to generate/optimize the same code for function (foo) as it does for function (bar). I was wondering how I can achieve the same generated code for function (foo) without going to the trouble of defining and using an auxiliary array as in function (bar). I've tried using the __restrict__ keyword on the input data (foo2), but GCC still generates the same code as for function (foo). ICC and Clang also generate the same code and fail to optimize. Something strange I've noticed is that GCC 4.4.7 generates the desired code for function (foo), but fails to do so for functions (foo2) and (bar). Newer versions generate exactly the same code for functions (foo) and (foo2), and the desired code for function (bar). The attached output was generated by GCC 4.8.1 using the -O2 optimization level. I've used the online GCC compiler from: http://gcc.godbolt.org/
>I've tried using __restrict__ keyword for input data (foo2), I think you want __restrict__ inside of the [].
(In reply to Andrew Pinski from comment #1) > >I've tried using __restrict__ keyword for input data (foo2), > > I think you want __restrict__ inside of the []. Do you mind pasting the modified source code and generated asm code please?
I did an experiment using raw float data types instead of the __m128 data type. This time GCC, Clang and ICC were able to generate the desired code, even without using the __restrict__ keyword, though it was a little dirtier (pointer arithmetic). Maybe not most, but I'm sure that many new video decoders/encoders, game engines and similar applications use __m128 data types directly instead of float data types, because (1) it guarantees the data is 16-byte aligned, (2) it removes the need to manually load/store data from memory to XMM/YMM registers, (3) it makes the source code smaller and easier to maintain and (4) it yields much cleaner and smaller generated code. In conclusion, I don't think the issue me and other people are facing is related to not using the __restrict__ keyword. All compilers fail to generate optimal code when facing __m128 data types. However, as an exception, ICC is able to generate optimal code when __m128 data types and the __restrict__ keyword are mixed. Here is what I have tried: #include <xmmintrin.h> void fooFloat(float* a, float* b, float* d, float* c, unsigned int size) { for (unsigned int i = 0; i < size; i+=32) { __m128 ax[8], bx[8], cx[8], dx[8]; ax[0] = _mm_load_ps(&a[i*32+0]); ax[1] = _mm_load_ps(&a[i*32+4]); ax[2] = _mm_load_ps(&a[i*32+8]); ax[3] = _mm_load_ps(&a[i*32+12]); ax[4] = _mm_load_ps(&a[i*32+16]); ax[5] = _mm_load_ps(&a[i*32+20]); ax[6] = _mm_load_ps(&a[i*32+24]); ax[7] = _mm_load_ps(&a[i*32+28]); bx[0] = _mm_load_ps(&b[i*32+0]); bx[1] = _mm_load_ps(&b[i*32+4]); bx[2] = _mm_load_ps(&b[i*32+8]); bx[3] = _mm_load_ps(&b[i*32+12]); bx[4] = _mm_load_ps(&b[i*32+16]); bx[5] = _mm_load_ps(&b[i*32+20]); bx[6] = _mm_load_ps(&b[i*32+24]); bx[7] = _mm_load_ps(&b[i*32+28]); dx[0] = _mm_load_ps(&d[i*32+0]); dx[1] = _mm_load_ps(&d[i*32+4]); dx[2] = _mm_load_ps(&d[i*32+8]); dx[3] = _mm_load_ps(&d[i*32+12]); dx[4] = _mm_load_ps(&d[i*32+16]); dx[5] = _mm_load_ps(&d[i*32+20]); dx[6] = _mm_load_ps(&d[i*32+24]); dx[7] = _mm_load_ps(&d[i*32+28]); cx[0] = _mm_add_ps(ax[0], 
_mm_mul_ps(dx[0], bx[0])); cx[1] = _mm_add_ps(ax[1], _mm_mul_ps(dx[1], bx[1])); cx[2] = _mm_add_ps(ax[2], _mm_mul_ps(dx[2], bx[2])); cx[3] = _mm_add_ps(ax[3], _mm_mul_ps(dx[3], bx[3])); cx[4] = _mm_add_ps(ax[4], _mm_mul_ps(dx[4], bx[4])); cx[5] = _mm_add_ps(ax[5], _mm_mul_ps(dx[5], bx[5])); cx[6] = _mm_add_ps(ax[6], _mm_mul_ps(dx[6], bx[6])); cx[7] = _mm_add_ps(ax[7], _mm_mul_ps(dx[7], bx[7])); _mm_store_ps(&c[i*32+0], cx[0]); _mm_store_ps(&c[i*32+4], cx[1]); _mm_store_ps(&c[i*32+8], cx[2]); _mm_store_ps(&c[i*32+12], cx[3]); _mm_store_ps(&c[i*32+16], cx[4]); _mm_store_ps(&c[i*32+20], cx[5]); _mm_store_ps(&c[i*32+24], cx[6]); _mm_store_ps(&c[i*32+28], cx[7]); } } And its output using GCC 4.8.1 -O2 : fooFloat(float*, float*, float*, float*, unsigned int): push r15 xor r15d, r15d test r8d, r8d mov eax, 4 push r14 push r13 push r12 push rbp push rbx je .L15 .L19: lea r12d, [rax+4] lea ebp, [rax+8] lea ebx, [rax+12] lea r11d, [rax+16] lea r10d, [rax+20] lea r9d, [rax+24] mov r14d, r15d mov r13d, eax add r15d, 32 sal r14d, 5 movaps xmm6, XMMWORD PTR [rdx+r13*4] add eax, 1024 cmp r8d, r15d movaps xmm7, XMMWORD PTR [rdx+r14*4] mulps xmm6, XMMWORD PTR [rsi+r13*4] movaps xmm5, XMMWORD PTR [rdx+r12*4] mulps xmm7, XMMWORD PTR [rsi+r14*4] movaps xmm4, XMMWORD PTR [rdx+rbp*4] mulps xmm5, XMMWORD PTR [rsi+r12*4] movaps xmm3, XMMWORD PTR [rdx+rbx*4] mulps xmm4, XMMWORD PTR [rsi+rbp*4] movaps xmm2, XMMWORD PTR [rdx+r11*4] mulps xmm3, XMMWORD PTR [rsi+rbx*4] movaps xmm1, XMMWORD PTR [rdx+r10*4] mulps xmm2, XMMWORD PTR [rsi+r11*4] movaps xmm0, XMMWORD PTR [rdx+r9*4] mulps xmm1, XMMWORD PTR [rsi+r10*4] addps xmm7, XMMWORD PTR [rdi+r14*4] mulps xmm0, XMMWORD PTR [rsi+r9*4] addps xmm6, XMMWORD PTR [rdi+r13*4] addps xmm5, XMMWORD PTR [rdi+r12*4] addps xmm4, XMMWORD PTR [rdi+rbp*4] addps xmm3, XMMWORD PTR [rdi+rbx*4] addps xmm2, XMMWORD PTR [rdi+r11*4] addps xmm1, XMMWORD PTR [rdi+r10*4] addps xmm0, XMMWORD PTR [rdi+r9*4] movaps XMMWORD PTR [rcx+r14*4], xmm7 movaps XMMWORD PTR 
[rcx+r13*4], xmm6 movaps XMMWORD PTR [rcx+r12*4], xmm5 movaps XMMWORD PTR [rcx+rbp*4], xmm4 movaps XMMWORD PTR [rcx+rbx*4], xmm3 movaps XMMWORD PTR [rcx+r11*4], xmm2 movaps XMMWORD PTR [rcx+r10*4], xmm1 movaps XMMWORD PTR [rcx+r9*4], xmm0 ja .L19 .L15: pop rbx pop rbp pop r12 pop r13 pop r14 pop r15 ret
In the end, here is what I really like GCC to generate for me. Same output as function (bar) for function (foo) when using GCC with -O3 -march=core2 switches: #include <xmmintrin.h> #define BATCHSIZE 8 void foo(__m128 a[][BATCHSIZE], __m128 b[][BATCHSIZE], __m128 d[][BATCHSIZE], __m128 c[][BATCHSIZE], unsigned int size) { for (unsigned int i = 0; i < size; i++) { for (unsigned int j=0; j<BATCHSIZE; j++) { c[i][j] = _mm_add_ps(a[i][j], _mm_mul_ps(d[i][j], b[i][j])); } } } void bar(__m128 a[][BATCHSIZE], __m128 b[][BATCHSIZE], __m128 d[][BATCHSIZE], __m128 c[][BATCHSIZE], unsigned int size) { for (unsigned int i = 0; i < size; i++) { __m128 cx[BATCHSIZE]; for (unsigned int j=0; j<BATCHSIZE; j++) { cx[j] = _mm_add_ps(a[i][j], _mm_mul_ps(d[i][j], b[i][j])); } for (unsigned int j=0; j<BATCHSIZE; j++) { c[i][j] = cx[j]; } } } Generated asm code: foo(float __vector (*) [8], float __vector (*) [8], float __vector (*) [8], float __vector (*) [8], unsigned int): test r8d, r8d je .L1 xor eax, eax .L4: movaps xmm0, XMMWORD PTR [rdx] add eax, 1 sub rsi, -128 sub rdx, -128 sub rdi, -128 sub rcx, -128 mulps xmm0, XMMWORD PTR [rsi-128] addps xmm0, XMMWORD PTR [rdi-128] movaps XMMWORD PTR [rcx-128], xmm0 movaps xmm0, XMMWORD PTR [rdx-112] mulps xmm0, XMMWORD PTR [rsi-112] addps xmm0, XMMWORD PTR [rdi-112] movaps XMMWORD PTR [rcx-112], xmm0 movaps xmm0, XMMWORD PTR [rdx-96] mulps xmm0, XMMWORD PTR [rsi-96] addps xmm0, XMMWORD PTR [rdi-96] movaps XMMWORD PTR [rcx-96], xmm0 movaps xmm0, XMMWORD PTR [rdx-80] mulps xmm0, XMMWORD PTR [rsi-80] addps xmm0, XMMWORD PTR [rdi-80] movaps XMMWORD PTR [rcx-80], xmm0 movaps xmm0, XMMWORD PTR [rdx-64] mulps xmm0, XMMWORD PTR [rsi-64] addps xmm0, XMMWORD PTR [rdi-64] movaps XMMWORD PTR [rcx-64], xmm0 movaps xmm0, XMMWORD PTR [rdx-48] mulps xmm0, XMMWORD PTR [rsi-48] addps xmm0, XMMWORD PTR [rdi-48] movaps XMMWORD PTR [rcx-48], xmm0 movaps xmm0, XMMWORD PTR [rdx-32] mulps xmm0, XMMWORD PTR [rsi-32] addps xmm0, XMMWORD PTR [rdi-32] movaps XMMWORD PTR 
[rcx-32], xmm0 movaps xmm0, XMMWORD PTR [rdx-16] mulps xmm0, XMMWORD PTR [rsi-16] addps xmm0, XMMWORD PTR [rdi-16] movaps XMMWORD PTR [rcx-16], xmm0 cmp eax, r8d jne .L4 .L1: rep; ret bar(float __vector (*) [8], float __vector (*) [8], float __vector (*) [8], float __vector (*) [8], unsigned int): test r8d, r8d je .L6 xor eax, eax .L9: movaps xmm7, XMMWORD PTR [rdx] add eax, 1 sub rsi, -128 movaps xmm6, XMMWORD PTR [rdx+16] sub rdi, -128 sub rdx, -128 movaps xmm5, XMMWORD PTR [rdx-96] sub rcx, -128 movaps xmm4, XMMWORD PTR [rdx-80] movaps xmm3, XMMWORD PTR [rdx-64] movaps xmm2, XMMWORD PTR [rdx-48] movaps xmm1, XMMWORD PTR [rdx-32] movaps xmm0, XMMWORD PTR [rdx-16] mulps xmm7, XMMWORD PTR [rsi-128] mulps xmm6, XMMWORD PTR [rsi-112] mulps xmm5, XMMWORD PTR [rsi-96] mulps xmm4, XMMWORD PTR [rsi-80] mulps xmm3, XMMWORD PTR [rsi-64] mulps xmm2, XMMWORD PTR [rsi-48] mulps xmm1, XMMWORD PTR [rsi-32] mulps xmm0, XMMWORD PTR [rsi-16] addps xmm7, XMMWORD PTR [rdi-128] addps xmm6, XMMWORD PTR [rdi-112] addps xmm5, XMMWORD PTR [rdi-96] addps xmm4, XMMWORD PTR [rdi-80] addps xmm3, XMMWORD PTR [rdi-64] addps xmm2, XMMWORD PTR [rdi-48] addps xmm1, XMMWORD PTR [rdi-32] addps xmm0, XMMWORD PTR [rdi-16] movaps XMMWORD PTR [rcx-128], xmm7 movaps XMMWORD PTR [rcx-112], xmm6 movaps XMMWORD PTR [rcx-96], xmm5 movaps XMMWORD PTR [rcx-80], xmm4 movaps XMMWORD PTR [rcx-64], xmm3 movaps XMMWORD PTR [rcx-48], xmm2 movaps XMMWORD PTR [rcx-32], xmm1 movaps XMMWORD PTR [rcx-16], xmm0 cmp eax, r8d jne .L9 .L6: rep; ret