typedef int v4si __attribute__((vector_size(16)));

v4si foo (int i, int j, int k, int l)
{
  return (v4si) { i, j, k, l };
}

produces

        movl    %edx, -12(%rsp)
        movd    -12(%rsp), %xmm1
        movl    %ecx, -12(%rsp)
        movd    -12(%rsp), %xmm2
        movl    %edi, -12(%rsp)
        movd    -12(%rsp), %xmm0
        movl    %esi, -12(%rsp)
        movd    -12(%rsp), %xmm3
        punpckldq       %xmm2, %xmm1
        punpckldq       %xmm3, %xmm0
        punpcklqdq      %xmm1, %xmm0
        ret

Since we spill everything to the stack anyway we could just as well use a
single vector load, thus something like

        movl    %edi, -24(%rsp)
        movl    %esi, -20(%rsp)
        movl    %edx, -16(%rsp)
        movl    %ecx, -12(%rsp)
        movdqu  -24(%rsp), %xmm0
        ret
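For reference, a minimal intrinsics sketch of that store-then-load idea
(foo_sketch and the temporary array are only an illustration of the desired
instruction sequence, not a proposed implementation):

#include <emmintrin.h>

__m128i foo_sketch (int i, int j, int k, int l)
{
  int tmp[4] = { i, j, k, l };                      /* four scalar stores */
  return _mm_loadu_si128 ((const __m128i *) tmp);   /* one vector load */
}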
typedef int v4si __attribute__((vector_size(16)));

v4si bar (int *i, int *j, int *k, int *l)
{
  return (v4si) { *i, *j, *k, *l };
}

looks reasonable (no spills at least, just a stray move for the return value):

        movd    (%rsi), %xmm0
        movd    (%rdi), %xmm3
        movd    (%rcx), %xmm1
        movd    (%rdx), %xmm2
        punpckldq       %xmm0, %xmm3
        punpckldq       %xmm1, %xmm2
        movdqa  %xmm3, %xmm0
        punpcklqdq      %xmm2, %xmm0

With -mavx2 we get

        vmovd   (%rdx), %xmm2
        vmovd   (%rdi), %xmm3
        vpinsrd $1, (%rcx), %xmm2, %xmm1
        vpinsrd $1, (%rsi), %xmm3, %xmm0
        vpunpcklqdq     %xmm1, %xmm0, %xmm0
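For comparison, the same construction written directly with intrinsics, which
is essentially what the vpinsrd-based AVX2 sequence above implements
(bar_sketch is only an illustration):

#include <emmintrin.h>

__m128i bar_sketch (int *i, int *j, int *k, int *l)
{
  /* _mm_set_epi32 lists elements from the highest lane down,
     so lane 0 ends up holding *i.  */
  return _mm_set_epi32 (*l, *k, *j, *i);
}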
typedef unsigned char v16qi __attribute__((vector_size(16)));

v16qi baz (int i0, int i1, int i2, int i3,
           int i10, int i11, int i12, int i13,
           int i20, int i21, int i22, int i23,
           int i30, int i31, int i32, int i33)
{
  return (v16qi) { i0, i1, i2, i3, i10, i11, i12, i13,
                   i20, i21, i22, i23, i30, i31, i32, i33 };
}

is even more "funny".

I'm looking into whether the vectorizer cost model for these vector
constructors makes sense.  Currently the cost is

      case vec_construct:
        elements = TYPE_VECTOR_SUBPARTS (vectype);
        return ix86_cost->vec_stmt_cost * (elements / 2 + 1);

which for v16qi and generic tuning would be 9 vector stmts.  The assembly for
the above contains 47 instructions.  Not sure where elements / 2 + 1 comes
from.
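Purely as a what-if, scaling the cost with the element count instead would
look like the following (not a proposed patch, just the shape of the change
in the i386 cost hook):

      case vec_construct:
        /* What-if: charge one vector statement per element, which would
           make a v16qi construct cost 16 instead of 9.  */
        elements = TYPE_VECTOR_SUBPARTS (vectype);
        return ix86_cost->vec_stmt_cost * elements;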
Another example where the vectorizer thinks vectorization is profitable:

#define N 16
unsigned int out[N];
unsigned int in[N] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15};

__attribute__ ((noinline)) int
main1 (unsigned int x, unsigned int y)
{
  int i;
  unsigned int a0, a1, a2, a3;

  a0 = in[0]; a1 = in[1]; a2 = in[2]; a3 = in[3];

  out[0] = a0 * x;
  out[1] = a1 * y;
  out[2] = a2 * x;
  out[3] = a3 * y;
}

generates

main1:
.LFB0:
        .cfi_startproc
        movl    %edi, -12(%rsp)
        movd    -12(%rsp), %xmm0
        movl    %esi, -12(%rsp)
        movd    -12(%rsp), %xmm3
        movdqa  in(%rip), %xmm2
        punpckldq       %xmm3, %xmm0
        psrlq   $32, %xmm2
        punpcklqdq      %xmm0, %xmm0
        movdqa  %xmm0, %xmm1
        psrlq   $32, %xmm0
        pmuludq %xmm2, %xmm0
        pshufd  $8, %xmm0, %xmm0
        pmuludq in(%rip), %xmm1
        pshufd  $8, %xmm1, %xmm1
        punpckldq       %xmm0, %xmm1
        movaps  %xmm1, out(%rip)
        ret

slightly less obfuscated when we allow gpr<->xmm moves with -mtune=intel:

main1:
.LFB0:
        .cfi_startproc
        movd    %edi, %xmm0
        movd    %esi, %xmm3
        movdqa  in(%rip), %xmm2
        punpckldq       %xmm3, %xmm0
        punpcklqdq      %xmm0, %xmm0
        psrlq   $32, %xmm2
        movdqa  %xmm0, %xmm1
        psrlq   $32, %xmm0
        pmuludq in(%rip), %xmm1
        pmuludq %xmm2, %xmm0
        pshufd  $8, %xmm1, %xmm1
        pshufd  $8, %xmm0, %xmm0
        punpckldq       %xmm0, %xmm1
        movdqa  %xmm1, out(%rip)
        ret

So for the { x, y, x, y } construction we generate

        movd    %edi, %xmm0
        movd    %esi, %xmm3
        punpckldq       %xmm3, %xmm0
        punpcklqdq      %xmm0, %xmm0

no f*** idea where all the shifting and shuffling comes from...  This is just

  vect_cst_.7_18 = {x_6(D), y_9(D), x_6(D), y_9(D)};
  vect_a0_2.5_17 = MEM[(unsigned int *)&in];
  vect__7.6_19 = vect_a0_2.5_17 * vect_cst_.7_18;
  MEM[(unsigned int *)&out] = vect__7.6_19;
Somehow we get into very weird initial RTL generated by expand...

(insn 12 11 13 (set (reg:V2DI 101)
        (mult:V2DI (zero_extend:V2DI (vec_select:V2SI (reg:V4SI 95 [ vect_cst_.7 ])
                    (parallel [
                            (const_int 0 [0])
                            (const_int 2 [0x2])
                        ])))
            (zero_extend:V2DI (vec_select:V2SI (mem/c:V4SI (reg/f:DI 94) [1 MEM[(unsigned int *)&in]+0 S16 A256])
                    (parallel [
                            (const_int 0 [0])
                            (const_int 2 [0x2])
                        ]))))) t.c:17 -1
     (nil))

(WTF!?  As if there were no integer vector multiply with SSE2 but only DImode
ones?!)
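Indeed plain SSE2 only has the widening pmuludq (plus the 16-bit pmullw), so
the V4SI multiply gets emulated with the shift/shuffle dance above.  With
SSE4.1's pmulld available the whole function reduces to something like this
intrinsics sketch (main1_sketch is mine, purely for illustration):

#include <smmintrin.h>

extern unsigned int in[16], out[16];

void main1_sketch (unsigned int x, unsigned int y)
{
  __m128i c = _mm_set_epi32 (y, x, y, x);              /* { x, y, x, y } */
  __m128i a = _mm_loadu_si128 ((const __m128i *) in);
  _mm_storeu_si128 ((__m128i *) out,
                    _mm_mullo_epi32 (a, c));           /* pmulld */
}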
On a related note, store_constructor handles VECTOR_TYPE constructors through
the vec_init optab, but that doesn't work for vector elements:

      /* Don't use vec_init<mode> if some elements have VECTOR_TYPE.  */
      if (icode != CODE_FOR_nothing)
        {
          tree value;

          FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (exp), idx, value)
            if (TREE_CODE (TREE_TYPE (value)) == VECTOR_TYPE)
              {
                icode = CODE_FOR_nothing;
                break;
              }
        }

Instead it could, for say a V16QI constructor of four V4QI elements, pun the
elements to SImode, create a V4SImode vector via vec_init and then pun that to
V16QI -- all punning via subregs.  The vectorizer currently creates such
vector constructors from the grouped strided load/store support.
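At the source level the proposed route corresponds to something like the
following (concat_sketch is a hypothetical illustration of punning V4QI
elements to SImode, building a V4SI from the punned values and punning the
result back to V16QI; the real change would of course use subregs inside
store_constructor):

typedef unsigned char v4qi __attribute__((vector_size(4)));
typedef unsigned char v16qi __attribute__((vector_size(16)));
typedef unsigned int v4si_u __attribute__((vector_size(16)));

v16qi concat_sketch (v4qi a, v4qi b, v4qi c, v4qi d)
{
  unsigned int ai, bi, ci, di;
  /* Pun each V4QI element to SImode ...  */
  __builtin_memcpy (&ai, &a, 4);
  __builtin_memcpy (&bi, &b, 4);
  __builtin_memcpy (&ci, &c, 4);
  __builtin_memcpy (&di, &d, 4);
  /* ... build a V4SI vector from the punned values ...  */
  v4si_u tmp = (v4si_u) { ai, bi, ci, di };
  /* ... and pun that back to V16QI.  */
  return (v16qi) tmp;
}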
Testcase simulating all the QImode vector cases the vectorizer may create:

char a[1024];
char b[1024];

void foobar (int s)
{
  for (int i = 0; i < 16; ++i)
    {
      b[i] = a[s*i];
    }
}

void foo (int s)
{
  for (int i = 0; i < 8; ++i)
    {
      b[2*i] = a[s*i];
      b[2*i + 1] = a[s*i + 1];
    }
}

void bar (int s)
{
  for (int i = 0; i < 4; ++i)
    {
      b[4*i] = a[s*i];
      b[4*i + 1] = a[s*i + 1];
      b[4*i + 2] = a[s*i + 2];
      b[4*i + 3] = a[s*i + 3];
    }
}

void baz (int s)
{
  for (int i = 0; i < 2; ++i)
    {
      b[8*i] = a[s*i];
      b[8*i + 1] = a[s*i + 1];
      b[8*i + 2] = a[s*i + 2];
      b[8*i + 3] = a[s*i + 3];
      b[8*i + 4] = a[s*i + 4];
      b[8*i + 5] = a[s*i + 5];
      b[8*i + 6] = a[s*i + 6];
      b[8*i + 7] = a[s*i + 7];
    }
}

Compile with -fdisable-tree-cunrolli.  foobar creates abysmal code and baz
needlessly goes through the stack.  With plain -msse2 none of the code-gen is
great except for foo, which ends up using pinsrw.  With -msse4 baz fails to
use pinsrq and foobar fails to use pinsrb.
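For baz, for example, the hoped-for -msse4.1 shape would be roughly the
following (baz_sketch is mine and assumes x86-64, with the two strided 8-byte
groups loaded as DImode values):

#include <smmintrin.h>

extern char a[1024], b[1024];

void baz_sketch (int s)
{
  long long lo, hi;
  __builtin_memcpy (&lo, a, 8);         /* a[0] .. a[7]     */
  __builtin_memcpy (&hi, a + s, 8);     /* a[s] .. a[s + 7] */
  /* movq + pinsrq instead of going through the stack.  */
  __m128i v = _mm_insert_epi64 (_mm_cvtsi64_si128 (lo), hi, 1);
  _mm_storeu_si128 ((__m128i *) b, v);
}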
*** Bug 71467 has been marked as a duplicate of this bug. ***
Testcase from PR71467 exposing the issue from vector lowering:

typedef int __v8si __attribute__ ((__vector_size__ (32)));

void vz (__v8si *v)
{
  *v <<= 1;
}

It's really a middle-end issue as well.  The choices are making vec_init take
two modes, the destination mode plus the element mode, like
vec_init<v8si><v4si>, or simply making it accept any element mode(s?),
essentially turning it into a (vec_concat (vec_concat ...) ...).

For integer vectors expansion can also try expanding (v8si) { v4si, v4si } as
(v8si) (v2ti) { (ti) v4si, (ti) v4si }, thus going via integer modes.  But x86
doesn't support (v2ti) { TImode, TImode }, for example.

Or we can add a different optab, say one for vec_concat, and expand the
construction by pair-wise vec_concat (that's essentially what targets would do
with vec_init, but they can of course choose optimal sequences there).

I will probably work on the middle-end side but any input from target
maintainers is welcome.
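A C-level sketch of what vector lowering makes of the PR71467 testcase when
only V4SI shifts are available (vz_lowered is hypothetical; on GIMPLE the
re-assembly at the end is a CONSTRUCTOR with two V4SI elements, conceptually
(vec_concat:V8SI lo hi), which is exactly what vec_init cannot express today):

typedef int __v4si __attribute__ ((__vector_size__ (16)));
typedef int __v8si __attribute__ ((__vector_size__ (32)));

__v8si vz_lowered (__v8si v)
{
  /* Shift the two supported-size halves separately ...  */
  __v4si lo = { v[0], v[1], v[2], v[3] };
  __v4si hi = { v[4], v[5], v[6], v[7] };
  lo <<= 1;
  hi <<= 1;
  /* ... and build the V8SI result from them.  */
  return (__v8si) { lo[0], lo[1], lo[2], lo[3],
                    hi[0], hi[1], hi[2], hi[3] };
}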
The description and comment#2 and comment#6 still hold.  Once you enable
-mavx2 things get better, and also when avoiding the gpr<->xmm-through-memory
path that we enable with generic tuning.
Someone might want to take another look at this.  From what I can see the
first few examples are now optimized well, but I can't say the same for the
later ones: some of them look at least close to ideal, while others are badly
optimized, to say the least.