[Bug c/56083] New: Vectorizer uses xor/movlps/movhps rather than movups

ljkarkk2 at cc dot hut.fi gcc-bugzilla@gcc.gnu.org
Wed Jan 23 15:39:00 GMT 2013


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=56083

             Bug #: 56083
           Summary: Vectorizer uses xor/movlps/movhps rather than movups
    Classification: Unclassified
           Product: gcc
           Version: unknown
            Status: UNCONFIRMED
          Severity: minor
          Priority: P3
         Component: c
        AssignedTo: unassigned@gcc.gnu.org
        ReportedBy: ljkarkk2@cc.hut.fi


Unnecessarily complex machine code is generated on x86-64. Perhaps there is a
reason for this, but to me it seems like the compiler is failing to optimize
properly. The asm labels have been changed and comments added; other than that,
the listings are exactly as produced by the respective compilers for this C code:

void negate(float* fvec) {
    fvec[0] *= -1;
    fvec[1] *= -1;
    fvec[2] *= -1;
    fvec[3] *= -1;
}

.LC0:
        .long   2147483648              # float -0.000000e+00
        .long   2147483648              # float -0.000000e+00
        .long   2147483648              # float -0.000000e+00
        .long   2147483648              # float -0.000000e+00
.LC1:
        .long   3212836864              # float -1.000000e+00
        .long   3212836864              # float -1.000000e+00
        .long   3212836864              # float -1.000000e+00
        .long   3212836864              # float -1.000000e+00

# Hand optimized
        movups  (%rdi), %xmm0       # Load fvec into xmm0
        xorps   .LC0(%rip), %xmm0   # Invert sign bits of each float (xor -0.0f)
        movups  %xmm0, (%rdi)       # Store xmm0 to fvec

# Clang -O3 -mllvm -vectorize
        movups  (%rdi), %xmm0       # Load fvec
        mulps   .LC1(%rip), %xmm0   # Multiply by negative ones
        movups  %xmm0, (%rdi)       # Store fvec

# GCC 4.7.2 -O3
        xorps   %xmm0, %xmm0        # Zero xmm0 (unnecessary)
        movlps  (%rdi), %xmm0       # Load two lower floats
        movhps  8(%rdi), %xmm0      # Load two higher floats
        xorps   .LC0(%rip), %xmm0   # Invert sign bits
        movlps  %xmm0, (%rdi)       # Store two lower floats
        movhps  %xmm0, 8(%rdi)      # Store two higher floats
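
For reference, the hand-optimized sequence above corresponds to the following
SSE intrinsics sketch. It is not part of the original report; the function name
negate_sse is made up for illustration, and only standard <xmmintrin.h>
intrinsics are used:

#include <xmmintrin.h>

/* Sketch of the expected codegen: unaligned load, flip the sign bit of each
   lane by XORing with -0.0f (0x80000000), unaligned store.  negate_sse is an
   illustrative name, not from the bug report. */
void negate_sse(float* fvec) {
    __m128 v    = _mm_loadu_ps(fvec);     /* movups (%rdi), %xmm0     */
    __m128 mask = _mm_set1_ps(-0.0f);     /* .LC0: four sign-bit masks */
    v = _mm_xor_ps(v, mask);              /* xorps .LC0(%rip), %xmm0  */
    _mm_storeu_ps(fvec, v);               /* movups %xmm0, (%rdi)     */
}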


