Bug 69871 - Type punned structs returned by value optimized poorly due to SRA
Summary: Type punned structs returned by value optimized poorly due to SRA
Status: RESOLVED FIXED
Alias: None
Product: gcc
Classification: Unclassified
Component: tree-optimization (show other bugs)
Version: 5.3.0
Importance: P3 enhancement
Target Milestone: 8.0
Assignee: Not yet assigned to anyone
URL:
Keywords: missed-optimization
Depends on:
Blocks:
 
Reported: 2016-02-19 10:30 UTC by Phil Ruffwind
Modified: 2023-04-27 23:16 UTC (History)
2 users (show)

See Also:
Host:
Target:
Build:
Known to work: 8.1.0
Known to fail: 6.1.0, 7.1.0
Last reconfirmed: 2016-02-19 00:00:00


Attachments

Note You need to log in before you can comment on or make changes to this bug.
Description Phil Ruffwind 2016-02-19 10:30:58 UTC
The following code, which unpacks a 32-bit integer into a struct of four bytes, does not optimize as well as it should.  While "unpack" seems to optimize just fine, trivial wrappers of the function do not seem to get optimized nearly as well:

- Two of the wrappers ("wrapper", "wrapper2") are completely identical yet they do not result in the same assembly code.  One is optimized well, the other is not.
- Adding another layer of indirection ("wrapperwrapper") also prevents the optimization from occurring.

The problem occurs not only for union-based type-punning, but also for similar tricks that involve:

  - memcpy, where all three wrappers would optimize poorly, or
  - bitshift operators, where even "unpack" would optimize poorly.

See also: https://gcc.gnu.org/ml/gcc/2016-02/msg00244.html

The code was compiled with "gcc -fverbose-asm -Wall -S -O3 foo.c" on Linux 4.4.1 x86-64.  The GCC binaries are part of Arch Linux's gcc-multilib 5.3.0-4 binary package.

---

/* Four single-byte members; this is the struct type that every function
   in this test case returns by value. */
struct alpha {
    char a, b, c, d;
};

/* Reinterpret the unsigned value x as a struct alpha by writing it to one
   member of a union and reading the other member back (union-based type
   punning).  Returns the struct by value. */
struct alpha unpack(unsigned x)
{
    /* Both members share the same storage. */
    union {
        struct alpha r;
        unsigned i;
    } u;
    u.i = x;      /* store through the integer view ...  */
    return u.r;   /* ... and read back through the struct view */
}

/* Trivial forwarder: passes y to unpack() and returns its result
   unchanged. */
struct alpha wrapper(unsigned y)
{
    return unpack(y);
}

/* Second trivial forwarder to unpack(); its body is the same single
   statement as wrapper()'s, differing only in the function name. */
struct alpha wrapper2(unsigned y)
{
    return unpack(y);
}

/* Adds one more level of forwarding: calls wrapper(), which in turn
   calls unpack(). */
struct alpha wrapperwrapper(unsigned y)
{
    return wrapper(y);
}

---

	.file	"foo.c"
# GNU C11 (GCC) version 5.3.0 (x86_64-unknown-linux-gnu)
#	compiled by GNU C version 5.3.0, GMP version 6.1.0, MPFR version 3.1.3-p5, MPC version 1.0.3
# GGC heuristics: --param ggc-min-expand=100 --param ggc-min-heapsize=131072
# options passed:  foo.c -mtune=generic -march=x86-64 -O3 -Wall
# -fverbose-asm
# options enabled:  -faggressive-loop-optimizations -falign-labels
# -fasynchronous-unwind-tables -fauto-inc-dec -fbranch-count-reg
# -fcaller-saves -fchkp-check-incomplete-type -fchkp-check-read
# -fchkp-check-write -fchkp-instrument-calls -fchkp-narrow-bounds
# -fchkp-optimize -fchkp-store-bounds -fchkp-use-static-bounds
# -fchkp-use-static-const-bounds -fchkp-use-wrappers
# -fcombine-stack-adjustments -fcommon -fcompare-elim -fcprop-registers
# -fcrossjumping -fcse-follow-jumps -fdefer-pop
# -fdelete-null-pointer-checks -fdevirtualize -fdevirtualize-speculatively
# -fdwarf2-cfi-asm -fearly-inlining -feliminate-unused-debug-types
# -fexpensive-optimizations -fforward-propagate -ffunction-cse -fgcse
# -fgcse-after-reload -fgcse-lm -fgnu-runtime -fgnu-unique
# -fguess-branch-probability -fhoist-adjacent-loads -fident -fif-conversion
# -fif-conversion2 -findirect-inlining -finline -finline-atomics
# -finline-functions -finline-functions-called-once
# -finline-small-functions -fipa-cp -fipa-cp-alignment -fipa-cp-clone
# -fipa-icf -fipa-icf-functions -fipa-icf-variables -fipa-profile
# -fipa-pure-const -fipa-ra -fipa-reference -fipa-sra -fira-hoist-pressure
# -fira-share-save-slots -fira-share-spill-slots
# -fisolate-erroneous-paths-dereference -fivopts -fkeep-static-consts
# -fleading-underscore -flifetime-dse -flra-remat -flto-odr-type-merging
# -fmath-errno -fmerge-constants -fmerge-debug-strings
# -fmove-loop-invariants -fomit-frame-pointer -foptimize-sibling-calls
# -foptimize-strlen -fpartial-inlining -fpeephole -fpeephole2
# -fpredictive-commoning -fprefetch-loop-arrays -free -freg-struct-return
# -freorder-blocks -freorder-blocks-and-partition -freorder-functions
# -frerun-cse-after-loop -fsched-critical-path-heuristic
# -fsched-dep-count-heuristic -fsched-group-heuristic -fsched-interblock
# -fsched-last-insn-heuristic -fsched-rank-heuristic -fsched-spec
# -fsched-spec-insn-heuristic -fsched-stalled-insns-dep -fschedule-fusion
# -fschedule-insns2 -fsemantic-interposition -fshow-column -fshrink-wrap
# -fsigned-zeros -fsplit-ivs-in-unroller -fsplit-wide-types -fssa-phiopt
# -fstdarg-opt -fstrict-aliasing -fstrict-overflow
# -fstrict-volatile-bitfields -fsync-libcalls -fthread-jumps
# -ftoplevel-reorder -ftrapping-math -ftree-bit-ccp -ftree-builtin-call-dce
# -ftree-ccp -ftree-ch -ftree-coalesce-vars -ftree-copy-prop
# -ftree-copyrename -ftree-cselim -ftree-dce -ftree-dominator-opts
# -ftree-dse -ftree-forwprop -ftree-fre -ftree-loop-distribute-patterns
# -ftree-loop-if-convert -ftree-loop-im -ftree-loop-ivcanon
# -ftree-loop-optimize -ftree-loop-vectorize -ftree-parallelize-loops=
# -ftree-partial-pre -ftree-phiprop -ftree-pre -ftree-pta -ftree-reassoc
# -ftree-scev-cprop -ftree-sink -ftree-slp-vectorize -ftree-slsr -ftree-sra
# -ftree-switch-conversion -ftree-tail-merge -ftree-ter -ftree-vrp
# -funit-at-a-time -funswitch-loops -funwind-tables -fverbose-asm
# -fzero-initialized-in-bss -m128bit-long-double -m64 -m80387
# -malign-stringops -mavx256-split-unaligned-load
# -mavx256-split-unaligned-store -mfancy-math-387 -mfp-ret-in-387 -mfxsr
# -mglibc -mieee-fp -mlong-double-80 -mmmx -mno-sse4 -mpush-args -mred-zone
# -msse -msse2 -mtls-direct-seg-refs -mvzeroupper

	.section	.text.unlikely,"ax",@progbits
.LCOLDB0:
	.text
.LHOTB0:
	.p2align 4,,15
	.globl	unpack
	.type	unpack, @function
# ---------------------------------------------------------------------
# struct alpha unpack(unsigned x) -- compiler output (GCC 5.3.0, -O3).
# The whole body is a single register copy: the incoming argument in
# %edi is moved to %eax and the function returns.
# ---------------------------------------------------------------------
unpack:
.LFB0:
	.cfi_startproc
	movl	%edi, %eax	# x, x
	ret
	.cfi_endproc
.LFE0:
	.size	unpack, .-unpack
	.section	.text.unlikely
.LCOLDE0:
	.text
.LHOTE0:
	.section	.text.unlikely
.LCOLDB1:
	.text
.LHOTB1:
	.p2align 4,,15
	.globl	wrapper
	.type	wrapper, @function
# ---------------------------------------------------------------------
# struct alpha wrapper(unsigned y) -- compiler output (GCC 5.3.0, -O3).
# This is the poorly optimized case from the report.  The sequence
# takes y apart into its four bytes and ORs them back together, so the
# value left in %eax equals the value that arrived in %edi; a single
# "movl %edi, %eax" would produce the same result.
# Scratch registers written here: %eax, %edx, %edi.
# ---------------------------------------------------------------------
wrapper:
.LFB5:
	.cfi_startproc
	movl	%edi, %eax	# y, y
	# edx = 0, so the bits above %dx stay clear once %dl/%dh are filled.
	xorl	%edx, %edx	# retval.9
	# eax = sign-extended bits 8..15 of y; only %al is consumed below.
	movsbl	%ah, %eax	# y, SR.14
	# dl = bits 0..7 of y.
	movb	%dil, %dl	# y, retval.9
	# dh = bits 8..15 of y.
	movb	%al, %dh	# SR.14, retval.9
	movl	%edi, %eax	# y, tmp101
	# edi = y & 0xFF000000 (bits 24..31).
	andl	$-16777216, %edi	#, tmp105
	# eax = y & 0x00FF0000 (bits 16..23).
	andl	$16711680, %eax	#, tmp101
	# edx = low 16 bits assembled above, zero-extended.
	movzwl	%dx, %edx	# retval.9, tmp103
	# Merge bits 16..23, then bits 24..31, into the result.
	orl	%eax, %edx	# tmp101, tmp106
	movl	%edx, %eax	# tmp106, tmp107
	orl	%edi, %eax	# tmp105, tmp107
	ret
	.cfi_endproc
.LFE5:
	.size	wrapper, .-wrapper
	.section	.text.unlikely
.LCOLDE1:
	.text
.LHOTE1:
	.section	.text.unlikely
.LCOLDB2:
	.text
.LHOTB2:
	.p2align 4,,15
	.globl	wrapper2
	.type	wrapper2, @function
# ---------------------------------------------------------------------
# struct alpha wrapper2(unsigned y) -- compiler output (GCC 5.3.0, -O3).
# The well-optimized case: the body is one register copy from %edi to
# %eax followed by ret, the same two instructions emitted for unpack.
# ---------------------------------------------------------------------
wrapper2:
.LFB2:
	.cfi_startproc
	movl	%edi, %eax	# y, y
	ret
	.cfi_endproc
.LFE2:
	.size	wrapper2, .-wrapper2
	.section	.text.unlikely
.LCOLDE2:
	.text
.LHOTE2:
	.section	.text.unlikely
.LCOLDB3:
	.text
.LHOTB3:
	.p2align 4,,15
	.globl	wrapperwrapper
	.type	wrapperwrapper, @function
# ---------------------------------------------------------------------
# struct alpha wrapperwrapper(unsigned y) -- compiler output
# (GCC 5.3.0, -O3).
# Poorly optimized: y is split into its four bytes and ORed back
# together, leaving in %eax the same value that arrived in %edi.  No
# call instruction appears in the body.
# Scratch registers written here: %eax, %edx, %edi.
# ---------------------------------------------------------------------
wrapperwrapper:
.LFB3:
	.cfi_startproc
	movl	%edi, %eax	# y, y
	# edx = 0, so the bits above %dx stay clear once %dl/%dh are filled.
	xorl	%edx, %edx	# D.1859
	# eax = sign-extended bits 8..15 of y; only %al is consumed below.
	movsbl	%ah, %eax	# y, SR.5
	# dl = bits 0..7 of y.
	movb	%dil, %dl	# y, D.1859
	# dh = bits 8..15 of y.
	movb	%al, %dh	# SR.5, D.1859
	movl	%edi, %eax	# y, tmp101
	# edi = y & 0xFF000000 (bits 24..31).
	andl	$-16777216, %edi	#, tmp105
	# eax = y & 0x00FF0000 (bits 16..23).
	andl	$16711680, %eax	#, tmp101
	# edx = low 16 bits assembled above, zero-extended.
	movzwl	%dx, %edx	# D.1859, tmp103
	# Merge bits 16..23, then bits 24..31, into the result.
	orl	%eax, %edx	# tmp101, tmp106
	movl	%edx, %eax	# tmp106, tmp107
	orl	%edi, %eax	# tmp105, tmp107
	ret
	.cfi_endproc
.LFE3:
	.size	wrapperwrapper, .-wrapperwrapper
	.section	.text.unlikely
.LCOLDE3:
	.text
.LHOTE3:
	.ident	"GCC: (GNU) 5.3.0"
	.section	.note.GNU-stack,"",@progbits
Comment 1 Richard Biener 2016-02-19 11:33:27 UTC
Confirmed also on x86_64.  It's SRA messing things up - it possibly should
take into account whether args / return values will end up in registers or not
[maybe we should re-write those into SSA in some way].
Comment 2 Andrew Pinski 2023-04-27 23:16:36 UTC
Fixed fully in GCC 8. Most likely by r8-4769-g4b84d9b8f9a6