[Bug tree-optimization/63537] New: Missed optimization: Loop unrolling adds extra copy when returning aggregate

tavianator at gmail dot com
Tue Oct 14 19:02:00 GMT 2014


https://gcc.gnu.org/bugzilla/show_bug.cgi?id=63537

            Bug ID: 63537
           Summary: Missed optimization: Loop unrolling adds extra copy
                    when returning aggregate
           Product: gcc
           Version: 4.9.1
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: tree-optimization
          Assignee: unassigned at gcc dot gnu.org
          Reporter: tavianator at gmail dot com

Created attachment 33715
  --> https://gcc.gnu.org/bugzilla/attachment.cgi?id=33715&action=edit
Reproducer

At -O2 and above on x86_64, this manually unrolled loop generates much better
code than the automatically unrolled one:

    struct vec {
        double n[3];
    };

    struct vec mul_unrolled(struct vec lhs, double rhs) {
        struct vec ret;
        ret.n[0] = lhs.n[0]*rhs;
        ret.n[1] = lhs.n[1]*rhs;
        ret.n[2] = lhs.n[2]*rhs;
        return ret;
    }

This generates the beautiful:

    movsd    16(%rsp), %xmm2
    movq    %rdi, %rax
    movsd    24(%rsp), %xmm1
    mulsd    %xmm0, %xmm2
    mulsd    %xmm0, %xmm1
    mulsd    8(%rsp), %xmm0
    movsd    %xmm2, 8(%rdi)
    movsd    %xmm1, 16(%rdi)
    movsd    %xmm0, (%rdi)
    ret
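
(For anyone less fluent in assembly: under the x86_64 SysV ABI the struct is returned
through a hidden pointer passed in %rdi.  The code above is roughly what a hand-written
version that stores through that pointer directly would look like; the `out` parameter
and the function name in the sketch below are my own illustration, not anything GCC
emits.)

    /* Sketch only, reusing struct vec from above: `out` stands in for the
       hidden return pointer (%rdi).  Each product is stored straight into
       the caller's return slot, with no intermediate struct. */
    static void mul_unrolled_sketch(struct vec *out,
                                    const struct vec *lhs, double rhs)
    {
        out->n[0] = lhs->n[0] * rhs;
        out->n[1] = lhs->n[1] * rhs;
        out->n[2] = lhs->n[2] * rhs;
    }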

In contrast, at -O2 this:

    struct vec mul_loop(struct vec lhs, double rhs) {
        struct vec ret;
        for (int i = 0; i < 3; ++i) {
            ret.n[i] = lhs.n[i]*rhs;
        }
        return ret;
    }

generates this:

    movsd    8(%rsp), %xmm1
    movq    %rdi, %rax
    mulsd    %xmm0, %xmm1
    movsd    %xmm1, -40(%rsp)
    movq    -40(%rsp), %rdx
    movsd    16(%rsp), %xmm1
    mulsd    %xmm0, %xmm1
    movq    %rdx, (%rdi)
    mulsd    24(%rsp), %xmm0
    movsd    %xmm1, -32(%rsp)
    movq    -32(%rsp), %rdx
    movsd    %xmm0, -24(%rsp)
    movq    %rdx, 8(%rdi)
    movq    -24(%rsp), %rdx
    movq    %rdx, 16(%rdi)
    ret

which builds the result in a stack temporary at -40(%rsp) and then copies it, one
quadword at a time, into the return slot at (%rdi).  At -O3 the multiplication gets
vectorized, but the extra copy through the stack temporary is still there:

    movapd    %xmm0, %xmm1
    mulsd    24(%rsp), %xmm0
    movupd    8(%rsp), %xmm2
    movq    %rdi, %rax
    unpcklpd    %xmm1, %xmm1
    mulpd    %xmm1, %xmm2
    movsd    %xmm0, -24(%rsp)
    movaps    %xmm2, -40(%rsp)
    movq    -40(%rsp), %rdx
    movq    %rdx, (%rdi)
    movq    -32(%rsp), %rdx
    movq    %rdx, 8(%rdi)
    movq    -24(%rsp), %rdx
    movq    %rdx, 16(%rdi)
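
In C terms, the generated code at both -O2 and -O3 behaves roughly like the sketch
below: the products land in a stack temporary (the -40(%rsp) slot) and are then copied
quadword by quadword into the return slot.  As above, `out` is just my spelling of the
hidden return pointer and the helper name is made up for illustration.

    /* Sketch only, reusing struct vec from above.  The temporary
       corresponds to the -40(%rsp) slot and the final assignment to the
       movq pairs that go through %rdx. */
    static void mul_loop_sketch(struct vec *out,
                                const struct vec *lhs, double rhs)
    {
        struct vec tmp;                  /* the stack temporary */
        for (int i = 0; i < 3; ++i)
            tmp.n[i] = lhs->n[i] * rhs;
        *out = tmp;                      /* the extra copy */
    }

What I would expect is for the temporary and the copy to be elided so the stores go
straight through the return pointer, exactly as they do in mul_unrolled.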


