return value aliasing ?
Joachim Schöberl
joachim.schoeberl@tuwien.ac.at
Thu Mar 27 20:30:00 GMT 2014
Hello,
I am experimenting with AVX optimization of a complex code, and found that
the issue already shows up in this simplified, scalar test case:
I compiled this code
// Minimal wrapper around a single double, used as the reproducer for the
// code-generation question in this post.
class MyDouble
{
// The wrapped value.
double data;
public:
// User-provided copy constructor. It copies the one member, which is what
// the implicitly generated copy constructor would do as well; the post
// reports that removing it makes both functions compile to the fast code.
MyDouble (const MyDouble & v2) : data(v2.data) { ; }
// Converting constructor from a plain double (used for `sum = 0.0`).
MyDouble (double d) : data(d) { ; }
// Adds d's value to this object and returns *this.
// Note that the operand is taken by value, so each call copies it.
MyDouble & operator+= (MyDouble d)
{ data += d.data; return *this; }
};
// Adds x to an accumulator m times, starting from 0.0, and returns the
// accumulator. The loop body does not run when m <= 0, so the result is 0.0.
// This variant returns the named local `sum` directly; the post reports that
// GCC 4.8 -O3 then stores the running sum to memory on every iteration.
MyDouble MyFuncBad (int m, MyDouble x)
{
MyDouble sum = 0.0;
for (int i = 0; i < m; i++) sum += x;
// Returning the named local lets the compiler build `sum` in the return
// slot itself (copy elision is permitted here).
return sum;
}
// Same computation as MyFuncBad: adds x to an accumulator m times, starting
// from 0.0 (result is 0.0 when m <= 0). The only source difference is the
// return statement, which returns an explicit copy of the accumulator.
// The post reports that GCC 4.8 -O3 then keeps the sum in a register and
// stores it only once, after the loop.
MyDouble MyFuncGood (int m, MyDouble x)
{
MyDouble sum = 0.0;
for (int i = 0; i < m; i++) sum += x;
// Explicit copy through the user-provided copy constructor.
return MyDouble(sum);
}
with 'gcc-4.8 -O3 -S testdouble.cpp' and got the assembly code below.
I would expect that both functions are equivalent.
In MyFuncBad, the intermediate sum is written to memory in every
iteration, while in MyFuncGood only the final result is written to
memory. Is this necessary for correctness?
Can the return value alias the call-by-value argument?
In the AVX-code the performance difference was about 30 percent.
If I use the default copy constructor, I also get the faster code.
Is there something wrong with my (here, of course, unnecessary) copy
constructor?
Thanks a lot for an explanation
Joachim
The generated code is:
.file "testdouble.cpp"
.text
# MyDouble MyFuncBad(int m, MyDouble x) -- GCC 4.8.1 -O3 output, AT&T syntax.
# Register roles as used by the instructions below:
#   %rdi  = address the result is stored to (copied to %rax for the return)
#   %esi  = m, the loop bound
#   %rdx  = address x is read from
#   %xmm0 = running sum
#   %ecx  = loop counter i
# NOTE(review): presumably %rdi is the hidden return-slot pointer and %rdx a
# by-reference copy of x, because the user-provided copy constructor makes
# MyDouble non-trivial for calls -- confirm against the SysV / C++ ABI.
.p2align 4,,15
.globl _Z9MyFuncBadi8MyDouble
.type _Z9MyFuncBadi8MyDouble, @function
_Z9MyFuncBadi8MyDouble:
.LFB7:
.cfi_startproc
# sum = 0.0
xorpd %xmm0, %xmm0
# i = 0
xorl %ecx, %ecx
# Set flags for the m <= 0 check; the moves that follow leave flags intact.
testl %esi, %esi
# Return value in %rax = address of the result.
movq %rdi, %rax
# Store the initial 0.0 into the result slot before the loop.
movsd %xmm0, (%rdi)
# m <= 0: nothing to add, result stays 0.0.
jle .L1
.p2align 4,,10
.p2align 3
.L5:
# sum += x, with x re-read from memory on every iteration.
# NOTE(review): presumably the compiler does not prove that the result slot
# (%rax) and x (%rdx) cannot overlap, so the store below forces this reload.
addsd (%rdx), %xmm0 <-- WHY RELOAD ???
# i++
addl $1, %ecx
# Compare i with m; movsd below does not change the flags.
cmpl %esi, %ecx
# Write the running sum to the result slot on every iteration.
movsd %xmm0, (%rax) <-- WHY THE WRITE IN THE LOOP ????
# Loop while i != m.
jne .L5
.L1:
rep; ret
.cfi_endproc
.LFE7:
.size _Z9MyFuncBadi8MyDouble, .-_Z9MyFuncBadi8MyDouble
# MyDouble MyFuncGood(int m, MyDouble x) -- GCC 4.8.1 -O3 output, AT&T syntax.
# Register roles as used by the instructions below:
#   %rdi  = address the result is stored to (copied to %rax for the return)
#   %esi  = m, the loop bound
#   %rdx  = address x is read from; reused as the loop counter afterwards
#   %xmm0 = running sum
#   %xmm1 = value of x, loaded once before the loop
.p2align 4,,15
.globl _Z10MyFuncGoodi8MyDouble
.type _Z10MyFuncGoodi8MyDouble, @function
_Z10MyFuncGoodi8MyDouble:
.LFB8:
.cfi_startproc
# Set flags for the m <= 0 check; the next two instructions leave flags intact.
testl %esi, %esi
# Return value in %rax = address of the result.
movq %rdi, %rax
# sum = 0.0
xorpd %xmm0, %xmm0
# m <= 0: skip the loop and store 0.0.
jle .L8
# Zeroes %xmm0 again; it is already 0.0 at this point.
xorpd %xmm0, %xmm0
# Load x once, before the loop.
movsd (%rdx), %xmm1
# i = 0; %rdx is free to reuse because x has already been loaded.
xorl %edx, %edx
.p2align 4,,10
.p2align 3
.L9:
# i++
addl $1, %edx
# sum += x, register to register -- no memory traffic inside the loop.
addsd %xmm1, %xmm0
# Loop while i != m.
cmpl %esi, %edx
jne .L9
.L8:
# Single store of the final sum to the result slot.
movsd %xmm0, (%rax) <---- AFTER THE LOOP
ret
.cfi_endproc
.LFE8:
.size _Z10MyFuncGoodi8MyDouble, .-_Z10MyFuncGoodi8MyDouble
.ident "GCC: (Ubuntu 4.8.1-2ubuntu1~12.04) 4.8.1"
.section .note.GNU-stack,"",@progbits
More information about the Gcc-help
mailing list