Bug 59464

Summary: Unnecessary vector register spill
Product: gcc Reporter: H.J. Lu <hjl.tools>
Component: target Assignee: Not yet assigned to anyone <unassigned>
Status: NEW ---    
Severity: normal CC: dimhen, ubizjak
Priority: P3 Keywords: missed-optimization
Version: 4.9.0   
Target Milestone: ---   
Host: Target: x86-64
Build: Known to work:
Known to fail: Last reconfirmed: 2013-12-11 00:00:00
Bug Depends on:    
Bug Blocks: 53947, 89582, 101926    

Description H.J. Lu 2013-12-11 00:54:17 UTC
[hjl@gnu-6 gcc]$ cat /tmp/v.c
typedef struct
{
  float f1;
  float f2;
  float f3;
  float f4;
} test_structure_8;
test_structure_8 struct8 (test_structure_8 ts)
{
  ts.f1 += 1;
  ts.f2 += 1;
  ts.f3 += 1;
  ts.f4 += 1;
  return ts;
}
[hjl@gnu-6 gcc]$ ./xgcc -B./ -O3 -S /tmp/v.c
[hjl@gnu-6 gcc]$ cat v.s
	.file	"v.c"
	.section	.text.unlikely,"ax",@progbits
.LCOLDB1:
	.text
.LHOTB1:
	.p2align 4,,15
	.globl	struct8
	.type	struct8, @function
struct8:
.LFB0:
	.cfi_startproc
	movq	%xmm1, -32(%rsp)
	movq	%xmm0, -40(%rsp)
	movups	-40(%rsp), %xmm0
	addps	.LC0(%rip), %xmm0
	movaps	%xmm0, -56(%rsp)
	movq	-48(%rsp), %rax
	movq	-56(%rsp), %xmm0
	movq	%rax, -56(%rsp)
	movq	-56(%rsp), %xmm1
	ret
	.cfi_endproc
.LFE0:
	.size	struct8, .-struct8
	.section	.text.unlikely
.LCOLDE1:
	.text
.LHOTE1:
	.section	.rodata.cst16,"aM",@progbits,16
	.align 16
.LC0:
	.long	1065353216
	.long	1065353216
	.long	1065353216
	.long	1065353216
	.ident	"GCC: (GNU) 4.9.0 20131210 (experimental)"
	.section	.note.GNU-stack,"",@progbits
[hjl@gnu-6 gcc]$ 

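At the very least, we should load %xmm1 directly from the stack slot instead of going through %rax and a second store/reload: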

	.globl	struct8
	.type	struct8, @function
struct8:
.LFB0:
	.cfi_startproc
	movq	%xmm1, -32(%rsp)
	movq	%xmm0, -40(%rsp)
	movups	-40(%rsp), %xmm0
	addps	.LC0(%rip), %xmm0
	movaps	%xmm0, -56(%rsp)
	movq	-48(%rsp), %xmm1
	movq	-56(%rsp), %xmm0
	ret
	.cfi_endproc
.LFE0:
	.size	struct8, .-struct8
Comment 1 Richard Biener 2013-12-11 09:21:23 UTC
struct8 (struct test_structure_8 ts)
{
  vector(4) float vect__2.10;
  vector(4) float vect_ts_f1_9.9;
  struct test_structure_8 D.1759;

;;   basic block 2, loop depth 0
;;    pred:       ENTRY
  vect_ts_f1_9.9_15 = MEM[(struct  *)&ts];
  vect__2.10_17 = vect_ts_f1_9.9_15 + { 1.0e+0, 1.0e+0, 1.0e+0, 1.0e+0 };
  MEM[(struct  *)&D.1759] = vect__2.10_17;
  return D.1759;
;;    succ:       EXIT

}

OTOH the code generated is much, much worse if you disable vectorization.

Then we should make sure the stack slot we spill the arguments to is
properly aligned so we can avoid the UNSPEC_LOADU.  This eventually
can lead to cleanup opportunities.

Ideally, of course, we'd recognize that we can load directly from the argument
registers into a V4SF register (but due to the way we do argument
setup during expand this may not be easily possible).

It's of course the bad ABI choice that leads to this mess ...