Bug 59464 - Unnecessary vector register spill
Summary: Unnecessary vector register spill
Status: NEW
Alias: None
Product: gcc
Classification: Unclassified
Component: target (show other bugs)
Version: 4.9.0
Importance: P3 normal
Target Milestone: ---
Assignee: Not yet assigned to anyone
URL:
Keywords:
Depends on:
Blocks: vectorizer 89582
  Show dependency tree / graph
 
Reported: 2013-12-11 00:54 UTC by H.J. Lu
Modified: 2019-04-05 07:34 UTC (History)
2 users (show)

See Also:
Host:
Target: x86-64
Build:
Known to work:
Known to fail:
Last reconfirmed: 2013-12-11 00:00:00


Attachments

Note You need to log in before you can comment on or make changes to this bug.
Description H.J. Lu 2013-12-11 00:54:17 UTC
[hjl@gnu-6 gcc]$ cat /tmp/v.c
typedef struct
{
  float f1;
  float f2;
  float f3;
  float f4;
} test_structure_8;
test_structure_8 struct8 (test_structure_8 ts)
{
  ts.f1 += 1;
  ts.f2 += 1;
  ts.f3 += 1;
  ts.f4 += 1;
  return ts;
}
[hjl@gnu-6 gcc]$ ./xgcc -B./ -O3 -S /tmp/v.c
[hjl@gnu-6 gcc]$ cat v.s
	.file	"v.c"
	.section	.text.unlikely,"ax",@progbits
.LCOLDB1:
	.text
.LHOTB1:
	.p2align 4,,15
	.globl	struct8
	.type	struct8, @function
struct8:
.LFB0:
	.cfi_startproc
	movq	%xmm1, -32(%rsp)
	movq	%xmm0, -40(%rsp)
	movups	-40(%rsp), %xmm0
	addps	.LC0(%rip), %xmm0
	movaps	%xmm0, -56(%rsp)
	movq	-48(%rsp), %rax
	movq	-56(%rsp), %xmm0
	movq	%rax, -56(%rsp)
	movq	-56(%rsp), %xmm1
	ret
	.cfi_endproc
.LFE0:
	.size	struct8, .-struct8
	.section	.text.unlikely
.LCOLDE1:
	.text
.LHOTE1:
	.section	.rodata.cst16,"aM",@progbits,16
	.align 16
.LC0:
	.long	1065353216
	.long	1065353216
	.long	1065353216
	.long	1065353216
	.ident	"GCC: (GNU) 4.9.0 20131210 (experimental)"
	.section	.note.GNU-stack,"",@progbits
[hjl@gnu-6 gcc]$ 

At a minimum, we should generate:

	.globl	struct8
	.type	struct8, @function
struct8:
.LFB0:
	.cfi_startproc
	movq	%xmm1, -32(%rsp)
	movq	%xmm0, -40(%rsp)
	movups	-40(%rsp), %xmm0
	addps	.LC0(%rip), %xmm0
	movaps	%xmm0, -56(%rsp)
	movq	-48(%rsp), %xmm1
	movq	-56(%rsp), %xmm0
	ret
	.cfi_endproc
.LFE0:
	.size	struct8, .-struct8
Comment 1 Richard Biener 2013-12-11 09:21:23 UTC
struct8 (struct test_structure_8 ts)
{
  vector(4) float vect__2.10;
  vector(4) float vect_ts_f1_9.9;
  struct test_structure_8 D.1759;

;;   basic block 2, loop depth 0
;;    pred:       ENTRY
  vect_ts_f1_9.9_15 = MEM[(struct  *)&ts];
  vect__2.10_17 = vect_ts_f1_9.9_15 + { 1.0e+0, 1.0e+0, 1.0e+0, 1.0e+0 };
  MEM[(struct  *)&D.1759] = vect__2.10_17;
  return D.1759;
;;    succ:       EXIT

}

On the other hand, the generated code is much, much worse if you disable vectorization.

Given that, we should make sure the stack slot we spill the arguments to is
properly aligned so that we can avoid the unaligned load (UNSPEC_LOADU).  This
can eventually lead to cleanup opportunities.

Ideally, of course, we'd recognize that we can load more optimally from the
argument registers into a V4SF register (but due to the way we do argument
setup during expand, this may not be easily possible).

It's of course the bad ABI choice that leads to this mess ...