Bug 59262 - __attribute__ ((optimize())) broken (and corrupts optimization of the whole compilation unit)
Summary: __attribute__ ((optimize())) broken (and corrupts optimization of the whole ...
Status: RESOLVED WONTFIX
Alias: None
Product: gcc
Classification: Unclassified
Component: tree-optimization (show other bugs)
Version: 4.9.0
Importance: P3 normal
Target Milestone: ---
Assignee: Not yet assigned to anyone
URL:
Keywords:
Depends on:
Blocks:
 
Reported: 2013-11-23 11:58 UTC by vincenzo Innocente
Modified: 2013-11-25 09:22 UTC (History)
0 users

See Also:
Host:
Target:
Build:
Known to work:
Known to fail:
Last reconfirmed:


Attachments

Note You need to log in before you can comment on or make changes to this bug.
Description vincenzo Innocente 2013-11-23 11:58:21 UTC
Seen in the latest 4.9 trunk; also seen in 4.8.1.
Take the following test case:
cat attribute.cc
// Trivial inline helper: returns x+y. Each loop in this file calls it once per element.
inline float sum(float x, float y) { return x+y;}


// Each OPTn macro attaches an optimize attribute to exactly one function
// through a forward declaration. The compile commands in this report define
// one macro at a time (-DOPT1, -DOPT2 or -DOPT3).
#ifdef OPT1
float foo1() __attribute__ ((optimize("O3", "fast-math")));
#endif
#ifdef OPT2
float foo2() __attribute__ ((optimize("fast-math")));
#endif
#ifdef OPT3
float foo3() __attribute__ ((optimize("O3")));
#endif

// Global input arrays read by foo1, foo2, foo3 and bar.
float x[1024], y[1024];

// Returns the sum of sum(x[i], y[i]) over all 1024 elements.
// Declared with optimize("O3", "fast-math") only when OPT1 is defined.
float foo1() {
  float ret=0;
  for (int i=0; i<1024; ++i) 
     ret += sum(x[i],y[i]);
  return ret;
}


// Returns the sum of sum(x[i], y[i]) over all 1024 elements (same body as foo1).
// Declared with optimize("fast-math") only when OPT2 is defined.
float foo2() {
  float ret=0;
  for (int i=0; i<1024; ++i)
     ret += sum(x[i],y[i]);
  return ret;
}


// Returns the sum of sum(x[i], y[i]) over all 1024 elements (same body as foo1).
// Declared with optimize("O3") only when OPT3 is defined.
float foo3() {
  float ret=0;
  for (int i=0; i<1024; ++i)
     ret += sum(x[i],y[i]);
  return ret;
}


// Returns the sum of sum(x[i], y[i]) over all 1024 elements (same body as foo1).
// This file never declares an optimize attribute for bar, whichever OPTn is defined.
float bar() {
  float ret=0;
  for (int i=0; i<1024; ++i)
     ret += sum(x[i],y[i]);
  return ret;
}

c++ -O2 -ftree-vectorize -S attribute.cc -march=corei7 -DOPT1 ; cat attribute.s
	.file	"attribute.cc"
# [-DOPT1 build] Out-of-line weak COMDAT copy of sum(float, float).
# Adds %xmm1 into %xmm0 and returns; it is called from _Z4foo1v's loop in this listing.
	.section	.text._Z3sumff,"axG",@progbits,_Z3sumff,comdat
	.p2align 4,,15
	.weak	_Z3sumff
	.type	_Z3sumff, @function
_Z3sumff:
.LFB0:
	.cfi_startproc
	addss	%xmm1, %xmm0
	ret
	.cfi_endproc
.LFE0:
	.size	_Z3sumff, .-_Z3sumff
# [-DOPT1 build] foo1(), the function declared with optimize("O3", "fast-math").
# The loop is scalar and calls _Z3sumff once per element, so sum() was not inlined.
#   %rbx     = byte offset into x/y; callee-saved, so it survives the call
#   12(%rsp) = running total, kept on the stack across the call
# NOTE: this function is aligned with `.p2align 4,,-1`, whereas the other
# functions in this listing use `.p2align 4,,15`.
	.text
	.p2align 4,,-1
	.globl	_Z4foo1v
	.type	_Z4foo1v, @function
_Z4foo1v:
.LFB1:
	.cfi_startproc
	pushq	%rbx
	.cfi_def_cfa_offset 16
	.cfi_offset 3, -16
	pxor	%xmm3, %xmm3
	xorl	%ebx, %ebx
	subq	$16, %rsp
	.cfi_def_cfa_offset 32
	# Initialise the running total in its stack slot to zero.
	movss	%xmm3, 12(%rsp)
	.p2align 4,,10
	.p2align 3
.L3:
	# One float per iteration: load x[i] and y[i], call sum, then add the
	# result to the running total.
	movss	x(%rbx), %xmm0
	addq	$4, %rbx
	movss	y-4(%rbx), %xmm1
	call	_Z3sumff
	addss	12(%rsp), %xmm0
	movss	%xmm0, 12(%rsp)
	# 4096 bytes = 1024 floats * 4 bytes.
	cmpq	$4096, %rbx
	jne	.L3
	addq	$16, %rsp
	.cfi_def_cfa_offset 16
	popq	%rbx
	.cfi_def_cfa_offset 8
	ret
	.cfi_endproc
.LFE1:
	.size	_Z4foo1v, .-_Z4foo1v
# [-DOPT1 build] foo2(); no optimize attribute is declared for it in this build.
# The loop is vectorized: each iteration handles 16 bytes (four floats) with
# packed movaps/addps and there is no call to sum. The four partial sums in
# %xmm0 are then reduced by two haddps instructions.
	.p2align 4,,15
	.globl	_Z4foo2v
	.type	_Z4foo2v, @function
_Z4foo2v:
.LFB2:
	.cfi_startproc
	xorl	%eax, %eax
	pxor	%xmm0, %xmm0
	.p2align 4,,10
	.p2align 3
.L8:
	movaps	x(%rax), %xmm1
	addq	$16, %rax
	addps	y-16(%rax), %xmm1
	addps	%xmm1, %xmm0
	cmpq	$4096, %rax
	jne	.L8
	# Horizontal reduction of the four lanes, then add the constant .LC0
	# (defined as .long 0 in this listing).
	haddps	%xmm0, %xmm0
	haddps	%xmm0, %xmm0
	addss	.LC0(%rip), %xmm0
	ret
	.cfi_endproc
.LFE2:
	.size	_Z4foo2v, .-_Z4foo2v
# [-DOPT1 build] foo3(); no optimize attribute is declared for it in this build.
# The loop is vectorized: each iteration handles 16 bytes (four floats) with
# packed movaps/addps and there is no call to sum. The four partial sums in
# %xmm0 are then reduced by two haddps instructions.
	.p2align 4,,15
	.globl	_Z4foo3v
	.type	_Z4foo3v, @function
_Z4foo3v:
.LFB3:
	.cfi_startproc
	xorl	%eax, %eax
	pxor	%xmm0, %xmm0
	.p2align 4,,10
	.p2align 3
.L11:
	movaps	x(%rax), %xmm1
	addq	$16, %rax
	addps	y-16(%rax), %xmm1
	addps	%xmm1, %xmm0
	cmpq	$4096, %rax
	jne	.L11
	# Horizontal reduction of the four lanes, then add the constant .LC0
	# (defined as .long 0 in this listing).
	haddps	%xmm0, %xmm0
	haddps	%xmm0, %xmm0
	addss	.LC0(%rip), %xmm0
	ret
	.cfi_endproc
.LFE3:
	.size	_Z4foo3v, .-_Z4foo3v
# [-DOPT1 build] bar(); the test case never gives it an optimize attribute.
# The loop is vectorized: each iteration handles 16 bytes (four floats) with
# packed movaps/addps and there is no call to sum. The four partial sums in
# %xmm0 are then reduced by two haddps instructions.
	.p2align 4,,15
	.globl	_Z3barv
	.type	_Z3barv, @function
_Z3barv:
.LFB4:
	.cfi_startproc
	xorl	%eax, %eax
	pxor	%xmm0, %xmm0
	.p2align 4,,10
	.p2align 3
.L14:
	movaps	x(%rax), %xmm1
	addq	$16, %rax
	addps	y-16(%rax), %xmm1
	addps	%xmm1, %xmm0
	cmpq	$4096, %rax
	jne	.L14
	# Horizontal reduction of the four lanes, then add the constant .LC0
	# (defined as .long 0 in this listing).
	haddps	%xmm0, %xmm0
	haddps	%xmm0, %xmm0
	addss	.LC0(%rip), %xmm0
	ret
	.cfi_endproc
.LFE4:
	.size	_Z3barv, .-_Z3barv
# [-DOPT1 build] Storage for the global arrays y and x: 4096 zero bytes each
# (1024 floats * 4 bytes), 32-byte aligned, in .bss.
	.globl	y
	.bss
	.align 32
	.type	y, @object
	.size	y, 4096
y:
	.zero	4096
	.globl	x
	.align 32
	.type	x, @object
	.size	x, 4096
x:
	.zero	4096
# .LC0: 4-byte read-only constant with all bits zero. The vectorized functions
# in this listing add it to their reduced sum with addss.
	.section	.rodata.cst4,"aM",@progbits,4
	.align 4
.LC0:
	.long	0
	.ident	"GCC: (GNU) 4.9.0 20131110 (experimental) [trunk revision 204623]"
	.section	.note.GNU-stack,"",@progbits
 c++ -O2 -ftree-vectorize -S attribute.cc -march=corei7 -DOPT2 ; cat attribute.s
	.file	"attribute.cc"
# [-DOPT2 build] foo1(); no optimize attribute is declared for it in this build.
# The loop is vectorized: each iteration handles 16 bytes (four floats) with
# packed movaps/addps and there is no call to sum. The four partial sums in
# %xmm0 are then reduced by two haddps instructions.
	.text
	.p2align 4,,15
	.globl	_Z4foo1v
	.type	_Z4foo1v, @function
_Z4foo1v:
.LFB1:
	.cfi_startproc
	xorl	%eax, %eax
	pxor	%xmm0, %xmm0
	.p2align 4,,10
	.p2align 3
.L2:
	movaps	x(%rax), %xmm1
	addq	$16, %rax
	addps	y-16(%rax), %xmm1
	addps	%xmm1, %xmm0
	cmpq	$4096, %rax
	jne	.L2
	# Horizontal reduction of the four lanes, then add the constant .LC0
	# (defined as .long 0 in this listing).
	haddps	%xmm0, %xmm0
	haddps	%xmm0, %xmm0
	addss	.LC0(%rip), %xmm0
	ret
	.cfi_endproc
.LFE1:
	.size	_Z4foo1v, .-_Z4foo1v
# [-DOPT2 build] foo2(), the function declared with optimize("fast-math").
# sum() is inlined (a plain addss, no call), but the loop stays scalar: one
# float (4 bytes) per iteration, accumulated directly in %xmm0.
# NOTE: this function is aligned with `.p2align 4,,-1`, whereas the other
# functions in this listing use `.p2align 4,,15`.
	.p2align 4,,-1
	.globl	_Z4foo2v
	.type	_Z4foo2v, @function
_Z4foo2v:
.LFB2:
	.cfi_startproc
	xorl	%eax, %eax
	pxor	%xmm0, %xmm0
	.p2align 4,,10
	.p2align 3
.L6:
	movss	x(%rax), %xmm1
	addq	$4, %rax
	addss	y-4(%rax), %xmm1
	addss	%xmm1, %xmm0
	# 4096 bytes = 1024 floats * 4 bytes.
	cmpq	$4096, %rax
	jne	.L6
	ret
	.cfi_endproc
.LFE2:
	.size	_Z4foo2v, .-_Z4foo2v
# [-DOPT2 build] foo3(); no optimize attribute is declared for it in this build.
# The loop is vectorized: each iteration handles 16 bytes (four floats) with
# packed movaps/addps and there is no call to sum. The four partial sums in
# %xmm0 are then reduced by two haddps instructions.
	.p2align 4,,15
	.globl	_Z4foo3v
	.type	_Z4foo3v, @function
_Z4foo3v:
.LFB3:
	.cfi_startproc
	xorl	%eax, %eax
	pxor	%xmm0, %xmm0
	.p2align 4,,10
	.p2align 3
.L9:
	movaps	x(%rax), %xmm1
	addq	$16, %rax
	addps	y-16(%rax), %xmm1
	addps	%xmm1, %xmm0
	cmpq	$4096, %rax
	jne	.L9
	# Horizontal reduction of the four lanes, then add the constant .LC0
	# (defined as .long 0 in this listing).
	haddps	%xmm0, %xmm0
	haddps	%xmm0, %xmm0
	addss	.LC0(%rip), %xmm0
	ret
	.cfi_endproc
.LFE3:
	.size	_Z4foo3v, .-_Z4foo3v
# [-DOPT2 build] bar(); the test case never gives it an optimize attribute.
# The loop is vectorized: each iteration handles 16 bytes (four floats) with
# packed movaps/addps and there is no call to sum. The four partial sums in
# %xmm0 are then reduced by two haddps instructions.
	.p2align 4,,15
	.globl	_Z3barv
	.type	_Z3barv, @function
_Z3barv:
.LFB4:
	.cfi_startproc
	xorl	%eax, %eax
	pxor	%xmm0, %xmm0
	.p2align 4,,10
	.p2align 3
.L12:
	movaps	x(%rax), %xmm1
	addq	$16, %rax
	addps	y-16(%rax), %xmm1
	addps	%xmm1, %xmm0
	cmpq	$4096, %rax
	jne	.L12
	# Horizontal reduction of the four lanes, then add the constant .LC0
	# (defined as .long 0 in this listing).
	haddps	%xmm0, %xmm0
	haddps	%xmm0, %xmm0
	addss	.LC0(%rip), %xmm0
	ret
	.cfi_endproc
.LFE4:
	.size	_Z3barv, .-_Z3barv
# [-DOPT2 build] Storage for the global arrays y and x: 4096 zero bytes each
# (1024 floats * 4 bytes), 32-byte aligned, in .bss.
	.globl	y
	.bss
	.align 32
	.type	y, @object
	.size	y, 4096
y:
	.zero	4096
	.globl	x
	.align 32
	.type	x, @object
	.size	x, 4096
x:
	.zero	4096
# .LC0: 4-byte read-only constant with all bits zero. The vectorized functions
# in this listing add it to their reduced sum with addss.
	.section	.rodata.cst4,"aM",@progbits,4
	.align 4
.LC0:
	.long	0
	.ident	"GCC: (GNU) 4.9.0 20131110 (experimental) [trunk revision 204623]"
	.section	.note.GNU-stack,"",@progbits

[innocent@vinavx2 bugs48]$ c++ -O2 -ftree-vectorize -S attribute.cc -march=corei7 -DOPT3 ; cat attribute.s
	.file	"attribute.cc"
# [-DOPT3 build] Out-of-line weak COMDAT copy of sum(float, float).
# Adds %xmm1 into %xmm0 and returns; it is called from _Z4foo3v's loop in this listing.
	.section	.text._Z3sumff,"axG",@progbits,_Z3sumff,comdat
	.p2align 4,,15
	.weak	_Z3sumff
	.type	_Z3sumff, @function
_Z3sumff:
.LFB0:
	.cfi_startproc
	addss	%xmm1, %xmm0
	ret
	.cfi_endproc
.LFE0:
	.size	_Z3sumff, .-_Z3sumff
# [-DOPT3 build] foo1(); no optimize attribute is declared for it in this build.
# sum() is inlined (a plain addss, no call) and the loop is scalar: one float
# (4 bytes) per iteration, accumulated directly in %xmm0.
	.text
	.p2align 4,,15
	.globl	_Z4foo1v
	.type	_Z4foo1v, @function
_Z4foo1v:
.LFB1:
	.cfi_startproc
	xorl	%eax, %eax
	pxor	%xmm0, %xmm0
	.p2align 4,,10
	.p2align 3
.L3:
	movss	y(%rax), %xmm1
	addq	$4, %rax
	addss	x-4(%rax), %xmm1
	addss	%xmm1, %xmm0
	# 4096 bytes = 1024 floats * 4 bytes.
	cmpq	$4096, %rax
	jne	.L3
	ret
	.cfi_endproc
.LFE1:
	.size	_Z4foo1v, .-_Z4foo1v
# [-DOPT3 build] foo2(); no optimize attribute is declared for it in this build.
# sum() is inlined (a plain addss, no call) and the loop is scalar: one float
# (4 bytes) per iteration, accumulated directly in %xmm0.
	.p2align 4,,15
	.globl	_Z4foo2v
	.type	_Z4foo2v, @function
_Z4foo2v:
.LFB2:
	.cfi_startproc
	xorl	%eax, %eax
	pxor	%xmm0, %xmm0
	.p2align 4,,10
	.p2align 3
.L7:
	movss	y(%rax), %xmm1
	addq	$4, %rax
	addss	x-4(%rax), %xmm1
	addss	%xmm1, %xmm0
	# 4096 bytes = 1024 floats * 4 bytes.
	cmpq	$4096, %rax
	jne	.L7
	ret
	.cfi_endproc
.LFE2:
	.size	_Z4foo2v, .-_Z4foo2v
# [-DOPT3 build] foo3(), the function declared with optimize("O3").
# The loop is scalar and calls _Z3sumff once per element, so sum() was not inlined.
#   %rbx     = byte offset into x/y; callee-saved, so it survives the call
#   12(%rsp) = running total, kept on the stack across the call
# NOTE: this function is aligned with `.p2align 4,,-1`, whereas the other
# functions in this listing use `.p2align 4,,15`.
	.p2align 4,,-1
	.globl	_Z4foo3v
	.type	_Z4foo3v, @function
_Z4foo3v:
.LFB3:
	.cfi_startproc
	pushq	%rbx
	.cfi_def_cfa_offset 16
	.cfi_offset 3, -16
	pxor	%xmm3, %xmm3
	xorl	%ebx, %ebx
	subq	$16, %rsp
	.cfi_def_cfa_offset 32
	# Initialise the running total in its stack slot to zero.
	movss	%xmm3, 12(%rsp)
	.p2align 4,,10
	.p2align 3
.L10:
	# One float per iteration: load x[i] and y[i], call sum, then add the
	# result to the running total.
	movss	x(%rbx), %xmm0
	addq	$4, %rbx
	movss	y-4(%rbx), %xmm1
	call	_Z3sumff
	addss	12(%rsp), %xmm0
	movss	%xmm0, 12(%rsp)
	# 4096 bytes = 1024 floats * 4 bytes.
	cmpq	$4096, %rbx
	jne	.L10
	addq	$16, %rsp
	.cfi_def_cfa_offset 16
	popq	%rbx
	.cfi_def_cfa_offset 8
	ret
	.cfi_endproc
.LFE3:
	.size	_Z4foo3v, .-_Z4foo3v
# [-DOPT3 build] bar(); the test case never gives it an optimize attribute.
# sum() is inlined (a plain addss, no call) and the loop is scalar: one float
# (4 bytes) per iteration, accumulated directly in %xmm0.
	.p2align 4,,15
	.globl	_Z3barv
	.type	_Z3barv, @function
_Z3barv:
.LFB4:
	.cfi_startproc
	xorl	%eax, %eax
	pxor	%xmm0, %xmm0
	.p2align 4,,10
	.p2align 3
.L14:
	movss	y(%rax), %xmm1
	addq	$4, %rax
	addss	x-4(%rax), %xmm1
	addss	%xmm1, %xmm0
	# 4096 bytes = 1024 floats * 4 bytes.
	cmpq	$4096, %rax
	jne	.L14
	ret
	.cfi_endproc
.LFE4:
	.size	_Z3barv, .-_Z3barv
# [-DOPT3 build] Storage for the global arrays y and x: 4096 zero bytes each
# (1024 floats * 4 bytes), 32-byte aligned, in .bss.
# This listing has no .LC0 constant; none of its functions reference one.
	.globl	y
	.bss
	.align 32
	.type	y, @object
	.size	y, 4096
y:
	.zero	4096
	.globl	x
	.align 32
	.type	x, @object
	.size	x, 4096
x:
	.zero	4096
	.ident	"GCC: (GNU) 4.9.0 20131110 (experimental) [trunk revision 204623]"
	.section	.note.GNU-stack,"",@progbits


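Notice how
float foo1() __attribute__ ((optimize("O3", "fast-math")));
causes foo2, foo3 and bar to be vectorized, while preventing sum() from being inlined into foo1 itself.
float foo2() __attribute__ ((optimize("fast-math")));
instead causes every function EXCEPT foo2 to be vectorized.
With -DOPT3, foo3 calls sum() out of line and none of the functions is vectorized.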
Comment 1 Marek Polacek 2013-11-23 14:00:40 UTC
__attribute__ ((optimize (""))) is not suited for this; its purpose is mostly for debugging.
Comment 2 Richard Biener 2013-11-25 09:22:49 UTC
More specifically, we cannot inline across changes of -ffast-math, as that would
generate wrong code.  Thus the non-fast-math 'sum' cannot be inlined
into a fast-math annotated function, which then of course defeats
vectorization.

Generally I'd advise against using the optimize attribute.