For the following example: struct Vec3{ double x, y, z; }; void vadd_v2(struct Vec3* a, struct Vec3* out); struct Vec3 use_v1(struct Vec3 *in){ struct Vec3 out; vadd_v2(in, &out); return out; } the resulting assembler (-O2 -Wall) is: use_v1: pushq %r12 movq %rdi, %r12 movq %rsi, %rdi subq $32, %rsp movq %rsp, %rsi call vadd_v2 movq 16(%rsp), %rax movdqa (%rsp), %xmm0 movq %rax, 16(%r12) movq %r12, %rax movups %xmm0, (%r12) addq $32, %rsp popq %r12 ret However, the hidden pointer could be passed directly into vadd_v2, which is what clang is doing: use_v1: # @use_v1 pushq %rbx movq %rdi, %rbx movq %rsi, %rdi movq %rbx, %rsi callq vadd_v2 movq %rbx, %rax popq %rbx retq See also https://godbolt.org/z/rT41Sj
NRVO happens in the C++ front-end, but not the C one, and the general tree-nrv.c is rather limited.
Confirmed.
I have another example of this broken-ness... In qerrst.h: typedef struct { char s[64] ; } qerrst_t ; extern qerrst_t qerrst0(int err) ; In qerrst.c: #include "qerrst.h" #include <stdio.h> #include <string.h> extern qerrst_t qerrst0(int err) { qerrst_t st ; snprintf(st.s, sizeof(st.s), "errno=%d", err) ; return st ; } In qmain.c: #include <stdio.h> #include "qerrst.h" int main(int argc, char* argv[]) { int err = argc ; qerrst_t z ; printf("qerrst0()='%s'\n", qerrst0(err).s) ; z = qerrst0(err) ; printf("qerrst0()='%s'\n", z.s) ; return 0 ; } Compile: gcc -O2 -Wall ../qerrst.c ../qmain.c [gcc 9.2.1] function main: push %rbp mov %edi,%esi mov %edi,%ebp sub $0xc0,%rsp lea 0x80(%rsp),%rdi # "hidden pointer" callq 0x4011c0 <qerrst0> lea 0x80(%rsp),%rsi # use "hidden pointer" for printf mov $0x402019,%edi xor %eax,%eax callq 0x401030 <printf@plt> mov %ebp,%esi mov %rsp,%rdi # another "hidden pointer" callq 0x4011c0 <qerrst0> movdqu (%rsp),%xmm0 lea 0x40(%rsp),%rsi # address of 'qerrst_t z' xor %eax,%eax movdqu 0x10(%rsp),%xmm1 movdqu 0x20(%rsp),%xmm2 mov $0x402019,%edi movdqu 0x30(%rsp),%xmm3 movaps %xmm0,0x40(%rsp) # copy "hidden" to 'qerrst_t z' movaps %xmm1,0x50(%rsp) movaps %xmm2,0x60(%rsp) movaps %xmm3,0x70(%rsp) callq 0x401030 <printf@plt> add $0xc0,%rsp xor %eax,%eax pop %rbp retq function qerrst0: push %r12 mov %esi,%ecx mov %rdi,%r12 # save "hidden pointer" mov $0x402010,%edx mov $0x40,%esi xor %eax,%eax sub $0x40,%rsp # allocate 'qerrst_t st' mov %rsp,%rdi callq 0x401040 <snprintf@plt> movdqa (%rsp),%xmm0 mov %r12,%rax movdqa 0x10(%rsp),%xmm1 movdqa 0x20(%rsp),%xmm2 movdqa 0x30(%rsp),%xmm3 movups %xmm0,(%r12) # copy st !!! movups %xmm1,0x10(%r12) movups %xmm2,0x20(%r12) movups %xmm3,0x30(%r12) add $0x40,%rsp pop %r12 retq So, I looked at the AMD64 ABI (Draft 0.99.7 – November 17, 2014 – 15:08), Section 3.2.3 Parameter Passing, p22: Returning of Values: .... 2. If the type has class MEMORY, then the caller provides space for the return value and passes the address of this storage in %rdi as if it were the first argument to the function. In effect, this address becomes a “hidden” first argument. This storage must not overlap any data visible to the callee through other names than this argument. So... the ABI appears to say that the callee does *not* need to do any copying *ever*. So... why is the qerrst0() function doing a copy ? This pushes the problem back to the caller. If the caller can be sure that the final destination is not visible to the callee, it too can avoid copying. In the case above, gcc fails to spot that 'qerrst_t z' in main() is not visible to anything beyond main(). FWIW: clang (8.0.0) avoids the spurious copy in qerrst0(), but not the unnecessary copy to 'qerrst_t z'. ------------------------------------------------------- I guess that functions returning large(ish) struct is not deemed worth supporting properly ?