[Bug tree-optimization/63537] New: Missed optimization: Loop unrolling adds extra copy when returning aggregate
tavianator at gmail dot com
gcc-bugzilla@gcc.gnu.org
Tue Oct 14 19:02:00 GMT 2014
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=63537
Bug ID: 63537
Summary: Missed optimization: Loop unrolling adds extra copy
when returning aggregate
Product: gcc
Version: 4.9.1
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: tree-optimization
Assignee: unassigned at gcc dot gnu.org
Reporter: tavianator at gmail dot com
Created attachment 33715
--> https://gcc.gnu.org/bugzilla/attachment.cgi?id=33715&action=edit
Reproducer
At -O2 and above on x86_64, this manually unrolled loop generates much better
code than the automatically unrolled one:
/* Three-component vector of doubles; the aggregate return type for PR 63537. */
struct vec {
double n[3];
};
/* Scale each component of lhs by rhs, with the loop unrolled by hand.
   Per the report, at -O2 this compiles to three mulsd/movsd pairs that
   store directly into the sret buffer (%rdi) with no intermediate copy. */
struct vec mul_unrolled(struct vec lhs, double rhs) {
struct vec ret;
ret.n[0] = lhs.n[0]*rhs;
ret.n[1] = lhs.n[1]*rhs;
ret.n[2] = lhs.n[2]*rhs;
return ret;
}
This generates the beautiful:
movsd 16(%rsp), %xmm2
movq %rdi, %rax
movsd 24(%rsp), %xmm1
mulsd %xmm0, %xmm2
mulsd %xmm0, %xmm1
mulsd 8(%rsp), %xmm0
movsd %xmm2, 8(%rdi)
movsd %xmm1, 16(%rdi)
movsd %xmm0, (%rdi)
ret
In contrast, at -O2 this:
/* Same computation as mul_unrolled, written as a for loop.  The reported
   missed optimization: after GCC unrolls (and at -O3 vectorizes) the loop,
   ret is materialized at -40(%rsp) and then copied into the sret buffer
   (%rdi), instead of being stored there directly as in the manual version. */
struct vec mul_loop(struct vec lhs, double rhs) {
struct vec ret;
for (int i = 0; i < 3; ++i) {
ret.n[i] = lhs.n[i]*rhs;
}
return ret;
}
generates this:
movsd 8(%rsp), %xmm1
movq %rdi, %rax
mulsd %xmm0, %xmm1
movsd %xmm1, -40(%rsp)
movq -40(%rsp), %rdx
movsd 16(%rsp), %xmm1
mulsd %xmm0, %xmm1
movq %rdx, (%rdi)
mulsd 24(%rsp), %xmm0
movsd %xmm1, -32(%rsp)
movq -32(%rsp), %rdx
movsd %xmm0, -24(%rsp)
movq %rdx, 8(%rdi)
movq -24(%rsp), %rdx
movq %rdx, 16(%rdi)
ret
which puts the result in -40(%rsp) and then copies it to (%rdi). At -O3 it
gets vectorized but the extra copy is still there:
movapd %xmm0, %xmm1
mulsd 24(%rsp), %xmm0
movupd 8(%rsp), %xmm2
movq %rdi, %rax
unpcklpd %xmm1, %xmm1
mulpd %xmm1, %xmm2
movsd %xmm0, -24(%rsp)
movaps %xmm2, -40(%rsp)
movq -40(%rsp), %rdx
movq %rdx, (%rdi)
movq -32(%rsp), %rdx
movq %rdx, 8(%rdi)
movq -24(%rsp), %rdx
movq %rdx, 16(%rdi)
More information about the Gcc-bugs
mailing list