[hjl@gnu-6 gcc]$ cat /tmp/v.c
typedef struct {
  float f1;
  float f2;
  float f3;
  float f4;
} test_structure_8;

test_structure_8
struct8 (test_structure_8 ts)
{
  ts.f1 += 1;
  ts.f2 += 1;
  ts.f3 += 1;
  ts.f4 += 1;
  return ts;
}
[hjl@gnu-6 gcc]$ ./xgcc -B./ -O3 -S /tmp/v.c
[hjl@gnu-6 gcc]$ cat v.s
    .file "v.c"
    .section .text.unlikely,"ax",@progbits
.LCOLDB1:
    .text
.LHOTB1:
    .p2align 4,,15
    .globl struct8
    .type struct8, @function
struct8:
.LFB0:
    .cfi_startproc
    movq %xmm1, -32(%rsp)
    movq %xmm0, -40(%rsp)
    movups -40(%rsp), %xmm0
    addps .LC0(%rip), %xmm0
    movaps %xmm0, -56(%rsp)
    movq -48(%rsp), %rax
    movq -56(%rsp), %xmm0
    movq %rax, -56(%rsp)
    movq -56(%rsp), %xmm1
    ret
    .cfi_endproc
.LFE0:
    .size struct8, .-struct8
    .section .text.unlikely
.LCOLDE1:
    .text
.LHOTE1:
    .section .rodata.cst16,"aM",@progbits,16
    .align 16
.LC0:
    .long 1065353216
    .long 1065353216
    .long 1065353216
    .long 1065353216
    .ident "GCC: (GNU) 4.9.0 20131210 (experimental)"
    .section .note.GNU-stack,"",@progbits
[hjl@gnu-6 gcc]$

At the very least, we should generate the following, which loads the upper half of the result directly into %xmm1 instead of bouncing it through %rax and the stack:

    .globl struct8
    .type struct8, @function
struct8:
.LFB0:
    .cfi_startproc
    movq %xmm1, -32(%rsp)
    movq %xmm0, -40(%rsp)
    movups -40(%rsp), %xmm0
    addps .LC0(%rip), %xmm0
    movaps %xmm0, -56(%rsp)
    movq -48(%rsp), %xmm1
    movq -56(%rsp), %xmm0
    ret
    .cfi_endproc
.LFE0:
    .size struct8, .-struct8
struct8 (struct test_structure_8 ts)
{
  vector(4) float vect__2.10;
  vector(4) float vect_ts_f1_9.9;
  struct test_structure_8 D.1759;

  ;; basic block 2, loop depth 0
  ;; pred: ENTRY
  vect_ts_f1_9.9_15 = MEM[(struct *)&ts];
  vect__2.10_17 = vect_ts_f1_9.9_15 + { 1.0e+0, 1.0e+0, 1.0e+0, 1.0e+0 };
  MEM[(struct *)&D.1759] = vect__2.10_17;
  return D.1759;
  ;; succ: EXIT
}

OTOH, the generated code is much, much worse if you disable vectorization. So we should make sure that the stack slot we spill the arguments to is properly aligned, so that we can avoid the UNSPEC_LOADU. This can eventually lead to cleanup opportunities. Ideally, of course, we'd recognize that we can load more optimally from the argument registers into a V4SF register (but due to the way we do argument setup during expand, this may not be easily possible). It is, of course, the bad ABI choice that leads to this mess ...