Bug 55155 - Autovectorization does not use unaligned loads/stores
Status: RESOLVED FIXED
Alias: None
Product: gcc
Classification: Unclassified
Component: tree-optimization
Version: 4.7.1
Importance: P3 normal
Target Milestone: 8.0
Assignee: Not yet assigned to anyone
Depends on: 55157
Reported: 2012-10-31 21:32 UTC by Steinar H. Gunderson
Modified: 2021-06-08 08:28 UTC

Description Steinar H. Gunderson 2012-10-31 21:32:37 UTC
Hi,

I am on

  gcc version 4.7.1 (Debian 4.7.1-7) 

and a project of mine had code that looked like this:

beklager:~> cat example.cpp
void func(float * __restrict prod_features, float * __restrict grad_prod_features, float alpha, unsigned num_prods) {
	float *pf = (float *)__builtin_assume_aligned(prod_features, 16);
	float *gpf = (float *)__builtin_assume_aligned(grad_prod_features, 16);
	for (unsigned i = 0; i < num_prods * 16; ++i) {
		prod_features[i] -= alpha * grad_prod_features[i];
		//pf[i] -= alpha * gpf[i];
	}
}

This would seem like a great case for autovectorization, so I tried:

beklager:~> g++ -Wall -O2 -ftree-vectorize -msse4.1 -c example.cpp 
example.cpp: In function ‘void func(float*, float*, float, unsigned int)’:
example.cpp:2:9: warning: unused variable ‘pf’ [-Wunused-variable]
example.cpp:3:9: warning: unused variable ‘gpf’ [-Wunused-variable]

The resulting code, however, is a train wreck:
beklager:~> objdump --disassemble --demangle example.o
example.o:     file format elf64-x86-64


Disassembly of section .text:

0000000000000000 <func(float*, float*, float, unsigned int)>:
   0:	55                   	push   %rbp
   1:	c1 e2 04             	shl    $0x4,%edx
   4:	85 d2                	test   %edx,%edx
   6:	53                   	push   %rbx
   7:	0f 84 ef 00 00 00    	je     fc <func(float*, float*, float, unsigned int)+0xfc>
   d:	49 89 f8             	mov    %rdi,%r8
  10:	41 83 e0 0f          	and    $0xf,%r8d
  14:	49 c1 e8 02          	shr    $0x2,%r8
  18:	49 f7 d8             	neg    %r8
  1b:	41 83 e0 03          	and    $0x3,%r8d
  1f:	44 39 c2             	cmp    %r8d,%edx
  22:	44 0f 42 c2          	cmovb  %edx,%r8d
  26:	83 fa 04             	cmp    $0x4,%edx
  29:	0f 87 d0 00 00 00    	ja     ff <func(float*, float*, float, unsigned int)+0xff>
  2f:	41 89 d0             	mov    %edx,%r8d
  32:	31 c0                	xor    %eax,%eax
  34:	0f 1f 40 00          	nopl   0x0(%rax)
  38:	f3 0f 10 14 86       	movss  (%rsi,%rax,4),%xmm2
  3d:	8d 48 01             	lea    0x1(%rax),%ecx
  40:	f3 0f 59 d0          	mulss  %xmm0,%xmm2
  44:	f3 0f 10 0c 87       	movss  (%rdi,%rax,4),%xmm1
  49:	f3 0f 5c ca          	subss  %xmm2,%xmm1
  4d:	f3 0f 11 0c 87       	movss  %xmm1,(%rdi,%rax,4)
  52:	48 83 c0 01          	add    $0x1,%rax
  56:	41 39 c0             	cmp    %eax,%r8d
  59:	77 dd                	ja     38 <func(float*, float*, float, unsigned int)+0x38>
  5b:	44 39 c2             	cmp    %r8d,%edx
  5e:	0f 84 98 00 00 00    	je     fc <func(float*, float*, float, unsigned int)+0xfc>
  64:	89 d5                	mov    %edx,%ebp
  66:	45 89 c1             	mov    %r8d,%r9d
  69:	44 29 c5             	sub    %r8d,%ebp
  6c:	41 89 eb             	mov    %ebp,%r11d
  6f:	41 c1 eb 02          	shr    $0x2,%r11d
  73:	42 8d 1c 9d 00 00 00 	lea    0x0(,%r11,4),%ebx
  7a:	00 
  7b:	85 db                	test   %ebx,%ebx
  7d:	74 59                	je     d8 <func(float*, float*, float, unsigned int)+0xd8>
  7f:	0f 28 c8             	movaps %xmm0,%xmm1
  82:	49 c1 e1 02          	shl    $0x2,%r9
  86:	0f 57 db             	xorps  %xmm3,%xmm3
  89:	4e 8d 14 0f          	lea    (%rdi,%r9,1),%r10
  8d:	0f c6 c9 00          	shufps $0x0,%xmm1,%xmm1
  91:	49 01 f1             	add    %rsi,%r9
  94:	31 c0                	xor    %eax,%eax
  96:	45 31 c0             	xor    %r8d,%r8d
  99:	0f 28 e1             	movaps %xmm1,%xmm4
  9c:	0f 1f 40 00          	nopl   0x0(%rax)
  a0:	0f 28 cb             	movaps %xmm3,%xmm1
  a3:	41 83 c0 01          	add    $0x1,%r8d
  a7:	41 0f 28 14 02       	movaps (%r10,%rax,1),%xmm2
  ac:	41 0f 12 0c 01       	movlps (%r9,%rax,1),%xmm1
  b1:	41 0f 16 4c 01 08    	movhps 0x8(%r9,%rax,1),%xmm1
  b7:	0f 59 cc             	mulps  %xmm4,%xmm1
  ba:	0f 5c d1             	subps  %xmm1,%xmm2
  bd:	41 0f 29 14 02       	movaps %xmm2,(%r10,%rax,1)
  c2:	48 83 c0 10          	add    $0x10,%rax
  c6:	45 39 d8             	cmp    %r11d,%r8d
  c9:	72 d5                	jb     a0 <func(float*, float*, float, unsigned int)+0xa0>
  cb:	01 d9                	add    %ebx,%ecx
  cd:	39 dd                	cmp    %ebx,%ebp
  cf:	74 2b                	je     fc <func(float*, float*, float, unsigned int)+0xfc>
  d1:	0f 1f 80 00 00 00 00 	nopl   0x0(%rax)
  d8:	41 89 c8             	mov    %ecx,%r8d
  db:	83 c1 01             	add    $0x1,%ecx
  de:	f3 42 0f 10 14 86    	movss  (%rsi,%r8,4),%xmm2
  e4:	4a 8d 04 87          	lea    (%rdi,%r8,4),%rax
  e8:	39 ca                	cmp    %ecx,%edx
  ea:	f3 0f 59 d0          	mulss  %xmm0,%xmm2
  ee:	f3 0f 10 08          	movss  (%rax),%xmm1
  f2:	f3 0f 5c ca          	subss  %xmm2,%xmm1
  f6:	f3 0f 11 08          	movss  %xmm1,(%rax)
  fa:	77 dc                	ja     d8 <func(float*, float*, float, unsigned int)+0xd8>
  fc:	5b                   	pop    %rbx
  fd:	5d                   	pop    %rbp
  fe:	c3                   	retq   
  ff:	45 85 c0             	test   %r8d,%r8d
 102:	0f 85 2a ff ff ff    	jne    32 <func(float*, float*, float, unsigned int)+0x32>
 108:	31 c9                	xor    %ecx,%ecx
 10a:	e9 55 ff ff ff       	jmpq   64 <func(float*, float*, float, unsigned int)+0x64>

There are two potential issues here:

1. It knows that my two arrays are not necessarily 16-byte aligned, so it emits a huge body of code around the loop. (If I comment out the line in the inner loop and uncomment the one next to it, much of this code disappears.) It should simply write the loop using unaligned loads/stores (movups) instead of piecing the vectors together itself with movlps and movhps; see the intrinsics sketch after this list.
2. For some reason, it doesn't understand that (num_prods * 16) is divisible by four, so it has extra code to handle that case.
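For reference, the loop body that issue 1 asks for is straightforward to write by hand with SSE intrinsics. This is only a sketch of the desired shape of the generated code (the function name and loop structure are mine, not from the original report); because num_prods * 16 is always a multiple of 4, no scalar tail loop is needed:

#include <xmmintrin.h>

void func_unaligned(float * __restrict prod_features, float * __restrict grad_prod_features, float alpha, unsigned num_prods) {
	const __m128 valpha = _mm_set1_ps(alpha);                 // broadcast alpha to all four lanes
	for (unsigned i = 0; i < num_prods * 16; i += 4) {
		__m128 g = _mm_loadu_ps(&grad_prod_features[i]);  // unaligned load (movups)
		__m128 p = _mm_loadu_ps(&prod_features[i]);       // unaligned load (movups)
		p = _mm_sub_ps(p, _mm_mul_ps(valpha, g));
		_mm_storeu_ps(&prod_features[i], p);              // unaligned store (movups)
	}
}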

If I change num_prods to a constant (e.g. 64) and use the pointers that are assumed to be 16-byte aligned, the output is much saner:
beklager:~> cat example.cpp                                        
void func(float * __restrict prod_features, float * __restrict grad_prod_features, float alpha, unsigned num_prods) {
	float *pf = (float *)__builtin_assume_aligned(prod_features, 16);
	float *gpf = (float *)__builtin_assume_aligned(grad_prod_features, 16);
	for (unsigned i = 0; i < 64 * 16; ++i) {
		//prod_features[i] -= alpha * grad_prod_features[i];
		pf[i] -= alpha * gpf[i];
	}
}

beklager:~> g++ -Wall -O2 -ftree-vectorize -msse4.1 -c example.cpp 
beklager:~> objdump --disassemble --demangle example.o
example.o:     file format elf64-x86-64


Disassembly of section .text:

0000000000000000 <func(float*, float*, float, unsigned int)>:
   0:	0f 28 c8             	movaps %xmm0,%xmm1
   3:	31 c0                	xor    %eax,%eax
   5:	0f c6 c9 00          	shufps $0x0,%xmm1,%xmm1
   9:	0f 28 d1             	movaps %xmm1,%xmm2
   c:	0f 1f 40 00          	nopl   0x0(%rax)
  10:	0f 28 0c 06          	movaps (%rsi,%rax,1),%xmm1
  14:	0f 59 ca             	mulps  %xmm2,%xmm1
  17:	0f 28 04 07          	movaps (%rdi,%rax,1),%xmm0
  1b:	0f 5c c1             	subps  %xmm1,%xmm0
  1e:	0f 29 04 07          	movaps %xmm0,(%rdi,%rax,1)
  22:	48 83 c0 10          	add    $0x10,%rax
  26:	48 3d 00 10 00 00    	cmp    $0x1000,%rax
  2c:	75 e2                	jne    10 <func(float*, float*, float, unsigned int)+0x10>
  2e:	f3 c3                	repz retq 

although in this case, one could argue that it should have fused the movaps load and the subps into a single subps with a memory operand.
Comment 1 Andrew Pinski 2012-10-31 21:44:20 UTC
  <bb 2>:
  _19 = num_prods_6(D) * 16;
  if (_19 != 0)
    goto <bb 4>;
  else
    goto <bb 3>;

  <bb 3>:
  return;

  <bb 4>:
  _16 = ASSERT_EXPR <_19, _19 != 0>;
...
  if (_16 <= 4)
    goto <bb 5>;
  else
    goto <bb 21>;

We should have an ASSERT_EXPR that _16 is also greater than or equal to 16: since _19 is num_prods_6(D) * 16, a nonzero _19 must be a multiple of 16 and therefore at least 16.

Note: this dump is with unsigned changed to __SIZE_TYPE__, so that casting does not get in the way.
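That is, the testcase behind the dump above is presumably along these lines (a sketch of the modified testcase, not taken verbatim from the report; __SIZE_TYPE__ is GCC's predefined macro for size_t):

void func(float * __restrict prod_features, float * __restrict grad_prod_features, float alpha, __SIZE_TYPE__ num_prods) {
	__SIZE_TYPE__ n = num_prods * 16;  // _19 in the dump above
	for (__SIZE_TYPE__ i = 0; i < n; ++i)
		prod_features[i] -= alpha * grad_prod_features[i];
}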
Comment 2 Andrew Pinski 2021-06-08 08:28:36 UTC
This has been fixed since at least GCC 8; the vectorized loop now uses unaligned loads/stores (movups) and has no scalar prologue or epilogue:
_Z4funcPfS_fj:
        sall    $4, %edx
        je      .L1
        shrl    $2, %edx
        xorl    %eax, %eax
        shufps  $0, %xmm0, %xmm0
        salq    $4, %rdx
        .p2align 4,,10
        .p2align 3
.L3:
        movups  (%rsi,%rax), %xmm1
        movups  (%rdi,%rax), %xmm2
        mulps   %xmm0, %xmm1
        subps   %xmm1, %xmm2
        movups  %xmm2, (%rdi,%rax)
        addq    $16, %rax
        cmpq    %rax, %rdx
        jne     .L3
.L1:
        ret
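For reference, assembler output like the above should be reproducible with a GCC 8 or newer toolchain using something along these lines (the command is reconstructed to match the listing, not taken from the report):

g++ -O2 -ftree-vectorize -msse4.1 -S -o - example.cpp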