[Bug tree-optimization/55155] New: Autovectorization does not use unaligned loads/stores

sgunderson at bigfoot dot com gcc-bugzilla@gcc.gnu.org
Wed Oct 31 21:33:00 GMT 2012


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=55155

             Bug #: 55155
           Summary: Autovectorization does not use unaligned loads/stores
    Classification: Unclassified
           Product: gcc
           Version: 4.7.1
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: tree-optimization
        AssignedTo: unassigned@gcc.gnu.org
        ReportedBy: sgunderson@bigfoot.com


Hi,

I am on

  gcc version 4.7.1 (Debian 4.7.1-7) 

and a project of mine had code that looked like this:

beklager:~> cat example.cpp
void func(float * __restrict prod_features, float * __restrict grad_prod_features, float alpha, unsigned num_prods) {
    float *pf = (float *)__builtin_assume_aligned(prod_features, 16);
    float *gpf = (float *)__builtin_assume_aligned(grad_prod_features, 16);
    for (unsigned i = 0; i < num_prods * 16; ++i) {
        prod_features[i] -= alpha * grad_prod_features[i];
        //pf[i] -= alpha * gpf[i];
    }
}

This would seem like a great case for autovectorization, so I tried:

beklager:~> g++ -Wall -O2 -ftree-vectorize -msse4.1 -c example.cpp 
example.cpp: In function ‘void func(float*, float*, float, unsigned int)’:
example.cpp:2:9: warning: unused variable ‘pf’ [-Wunused-variable]
example.cpp:3:9: warning: unused variable ‘gpf’ [-Wunused-variable]

The resulting code, however, is a train wreck:
beklager:~> objdump --disassemble --demangle example.o         

example.o:     file format elf64-x86-64


Disassembly of section .text:

0000000000000000 <func(float*, float*, float, unsigned int)>:
   0:    55                       push   %rbp
   1:    c1 e2 04                 shl    $0x4,%edx
   4:    85 d2                    test   %edx,%edx
   6:    53                       push   %rbx
   7:    0f 84 ef 00 00 00        je     fc <func(float*, float*, float, unsigned int)+0xfc>
   d:    49 89 f8                 mov    %rdi,%r8
  10:    41 83 e0 0f              and    $0xf,%r8d
  14:    49 c1 e8 02              shr    $0x2,%r8
  18:    49 f7 d8                 neg    %r8
  1b:    41 83 e0 03              and    $0x3,%r8d
  1f:    44 39 c2                 cmp    %r8d,%edx
  22:    44 0f 42 c2              cmovb  %edx,%r8d
  26:    83 fa 04                 cmp    $0x4,%edx
  29:    0f 87 d0 00 00 00        ja     ff <func(float*, float*, float, unsigned int)+0xff>
  2f:    41 89 d0                 mov    %edx,%r8d
  32:    31 c0                    xor    %eax,%eax
  34:    0f 1f 40 00              nopl   0x0(%rax)
  38:    f3 0f 10 14 86           movss  (%rsi,%rax,4),%xmm2
  3d:    8d 48 01                 lea    0x1(%rax),%ecx
  40:    f3 0f 59 d0              mulss  %xmm0,%xmm2
  44:    f3 0f 10 0c 87           movss  (%rdi,%rax,4),%xmm1
  49:    f3 0f 5c ca              subss  %xmm2,%xmm1
  4d:    f3 0f 11 0c 87           movss  %xmm1,(%rdi,%rax,4)
  52:    48 83 c0 01              add    $0x1,%rax
  56:    41 39 c0                 cmp    %eax,%r8d
  59:    77 dd                    ja     38 <func(float*, float*, float, unsigned int)+0x38>
  5b:    44 39 c2                 cmp    %r8d,%edx
  5e:    0f 84 98 00 00 00        je     fc <func(float*, float*, float, unsigned int)+0xfc>
  64:    89 d5                    mov    %edx,%ebp
  66:    45 89 c1                 mov    %r8d,%r9d
  69:    44 29 c5                 sub    %r8d,%ebp
  6c:    41 89 eb                 mov    %ebp,%r11d
  6f:    41 c1 eb 02              shr    $0x2,%r11d
  73:    42 8d 1c 9d 00 00 00     lea    0x0(,%r11,4),%ebx
  7a:    00 
  7b:    85 db                    test   %ebx,%ebx
  7d:    74 59                    je     d8 <func(float*, float*, float, unsigned int)+0xd8>
  7f:    0f 28 c8                 movaps %xmm0,%xmm1
  82:    49 c1 e1 02              shl    $0x2,%r9
  86:    0f 57 db                 xorps  %xmm3,%xmm3
  89:    4e 8d 14 0f              lea    (%rdi,%r9,1),%r10
  8d:    0f c6 c9 00              shufps $0x0,%xmm1,%xmm1
  91:    49 01 f1                 add    %rsi,%r9
  94:    31 c0                    xor    %eax,%eax
  96:    45 31 c0                 xor    %r8d,%r8d
  99:    0f 28 e1                 movaps %xmm1,%xmm4
  9c:    0f 1f 40 00              nopl   0x0(%rax)
  a0:    0f 28 cb                 movaps %xmm3,%xmm1
  a3:    41 83 c0 01              add    $0x1,%r8d
  a7:    41 0f 28 14 02           movaps (%r10,%rax,1),%xmm2
  ac:    41 0f 12 0c 01           movlps (%r9,%rax,1),%xmm1
  b1:    41 0f 16 4c 01 08        movhps 0x8(%r9,%rax,1),%xmm1
  b7:    0f 59 cc                 mulps  %xmm4,%xmm1
  ba:    0f 5c d1                 subps  %xmm1,%xmm2
  bd:    41 0f 29 14 02           movaps %xmm2,(%r10,%rax,1)
  c2:    48 83 c0 10              add    $0x10,%rax
  c6:    45 39 d8                 cmp    %r11d,%r8d
  c9:    72 d5                    jb     a0 <func(float*, float*, float, unsigned int)+0xa0>
  cb:    01 d9                    add    %ebx,%ecx
  cd:    39 dd                    cmp    %ebx,%ebp
  cf:    74 2b                    je     fc <func(float*, float*, float, unsigned int)+0xfc>
  d1:    0f 1f 80 00 00 00 00     nopl   0x0(%rax)
  d8:    41 89 c8                 mov    %ecx,%r8d
  db:    83 c1 01                 add    $0x1,%ecx
  de:    f3 42 0f 10 14 86        movss  (%rsi,%r8,4),%xmm2
  e4:    4a 8d 04 87              lea    (%rdi,%r8,4),%rax
  e8:    39 ca                    cmp    %ecx,%edx
  ea:    f3 0f 59 d0              mulss  %xmm0,%xmm2
  ee:    f3 0f 10 08              movss  (%rax),%xmm1
  f2:    f3 0f 5c ca              subss  %xmm2,%xmm1
  f6:    f3 0f 11 08              movss  %xmm1,(%rax)
  fa:    77 dc                    ja     d8 <func(float*, float*, float, unsigned int)+0xd8>
  fc:    5b                       pop    %rbx
  fd:    5d                       pop    %rbp
  fe:    c3                       retq   
  ff:    45 85 c0                 test   %r8d,%r8d
 102:    0f 85 2a ff ff ff        jne    32 <func(float*, float*, float, unsigned int)+0x32>
 108:    31 c9                    xor    %ecx,%ecx
 10a:    e9 55 ff ff ff           jmpq   64 <func(float*, float*, float, unsigned int)+0x64>

There are two potential issues here:

1. The compiler knows that my two arrays are not necessarily 16-byte aligned, so it
emits a huge amount of prologue and epilogue code around the vector loop. (If I
comment out the line in the inner loop and uncomment the one next to it, much of
this code disappears.) It should simply write the loop using unaligned loads/stores
(movups) instead of piecing the vectors together itself with movlps and movhps; see
the intrinsics sketch below.
2. For some reason, it doesn't see that (num_prods * 16) is always divisible by
four, so it emits extra scalar code to handle a remainder that can never occur.
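
For reference, here is roughly the inner loop I would expect, written by hand with
SSE intrinsics (an untested sketch; func_unaligned is just an illustrative name):

#include <xmmintrin.h>

// Hand-written equivalent using unaligned loads/stores (movups).
// Since num_prods * 16 is always a multiple of 4, stepping by 4 floats
// covers the whole array with no scalar prologue or epilogue.
void func_unaligned(float * __restrict prod_features,
                    float * __restrict grad_prod_features,
                    float alpha, unsigned num_prods)
{
    const __m128 valpha = _mm_set1_ps(alpha);  // broadcast alpha to all lanes
    for (unsigned i = 0; i < num_prods * 16; i += 4) {
        __m128 g = _mm_loadu_ps(&grad_prod_features[i]);  // movups load
        __m128 p = _mm_loadu_ps(&prod_features[i]);       // movups load
        p = _mm_sub_ps(p, _mm_mul_ps(valpha, g));
        _mm_storeu_ps(&prod_features[i], p);              // movups store
    }
}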

If I change num_prods to a constant (e.g. 64) and use the pointers that are
assumed to be 16-byte aligned, the output is much saner:
beklager:~> cat example.cpp                                        
void func(float * __restrict prod_features, float * __restrict grad_prod_features, float alpha, unsigned num_prods) {
    float *pf = (float *)__builtin_assume_aligned(prod_features, 16);
    float *gpf = (float *)__builtin_assume_aligned(grad_prod_features, 16);
    for (unsigned i = 0; i < 64 * 16; ++i) {
        //prod_features[i] -= alpha * grad_prod_features[i];
        pf[i] -= alpha * gpf[i];
    }
}

beklager:~> g++ -Wall -O2 -ftree-vectorize -msse4.1 -c example.cpp 
beklager:~> objdump --disassemble --demangle example.o             

example.o:     file format elf64-x86-64


Disassembly of section .text:

0000000000000000 <func(float*, float*, float, unsigned int)>:
   0:    0f 28 c8                 movaps %xmm0,%xmm1
   3:    31 c0                    xor    %eax,%eax
   5:    0f c6 c9 00              shufps $0x0,%xmm1,%xmm1
   9:    0f 28 d1                 movaps %xmm1,%xmm2
   c:    0f 1f 40 00              nopl   0x0(%rax)
  10:    0f 28 0c 06              movaps (%rsi,%rax,1),%xmm1
  14:    0f 59 ca                 mulps  %xmm2,%xmm1
  17:    0f 28 04 07              movaps (%rdi,%rax,1),%xmm0
  1b:    0f 5c c1                 subps  %xmm1,%xmm0
  1e:    0f 29 04 07              movaps %xmm0,(%rdi,%rax,1)
  22:    48 83 c0 10              add    $0x10,%rax
  26:    48 3d 00 10 00 00        cmp    $0x1000,%rax
  2c:    75 e2                    jne    10 <func(float*, float*, float, unsigned int)+0x10>
  2e:    f3 c3                    repz retq 

although in this case, one could argue that one of the aligned loads should have
been folded into an arithmetic instruction as a memory operand (most plausibly
the mulps input; the prod_features load cannot be folded into the subps, since
SSE only allows memory as the subtrahend, i.e. reg = reg - mem).
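
For what it's worth, issue 2 can presumably be worked around at the source level
by promising the compiler that the trip count is a multiple of four, e.g. via
__builtin_unreachable() (a sketch; I have not verified that 4.7's vectorizer
actually drops the epilogue in response):

void func(float * __restrict prod_features,
          float * __restrict grad_prod_features,
          float alpha, unsigned num_prods)
{
    float *pf = (float *)__builtin_assume_aligned(prod_features, 16);
    float *gpf = (float *)__builtin_assume_aligned(grad_prod_features, 16);
    unsigned n = num_prods * 16;
    if (n % 4 != 0)
        __builtin_unreachable();  // promise: n is always a multiple of 4
    for (unsigned i = 0; i < n; ++i)
        pf[i] -= alpha * gpf[i];
}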


