This is the mail archive of the gcc-bugs@gcc.gnu.org mailing list for the GCC project.
[Bug other/80634] New: strangely missed vectorization optimizations
- From: "steven at uplinklabs dot net" <gcc-bugzilla at gcc dot gnu dot org>
- To: gcc-bugs at gcc dot gnu dot org
- Date: Thu, 04 May 2017 21:45:38 +0000
- Subject: [Bug other/80634] New: strangely missed vectorization optimizations
- Auto-submitted: auto-generated
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=80634
Bug ID: 80634
Summary: strangely missed vectorization optimizations
Product: gcc
Version: 6.3.1
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: other
Assignee: unassigned at gcc dot gnu dot org
Reporter: steven at uplinklabs dot net
Target Milestone: ---
Created attachment 41322
--> https://gcc.gnu.org/bugzilla/attachment.cgi?id=41322&action=edit
gcc 6.3.1 outputs for ELEMS=1 through ELEMS=32
(Not sure which component is the correct one for this issue).
I've noticed that the vectorizer makes some strange choices occasionally, and
will turn some straightforward code into a large branchy code sequence. Take
this, for example:
void saxpy(float a, float * restrict x, float * restrict y)
{
for (int i = 0; i < ELEMS; ++i)
y[i] = a*x[i] + y[i];
}
If I use the flags "-O3 -march=haswell" (or "-O3 -xAVX2" on ICC) and use
varying definitions of ELEMS, I will sometimes get odd results with GCC 6.3.1.
Here's -DELEMS=6 with GCC 6.3.1:
saxpy:
vshufps $0, %xmm0, %xmm0, %xmm1
vmovups (%rsi), %xmm2
vfmadd132ps (%rdi), %xmm2, %xmm1
vmovss 20(%rsi), %xmm3
vmovups %xmm1, (%rsi)
vmovss 16(%rdi), %xmm1
vfmadd213ss 16(%rsi), %xmm0, %xmm1
vfmadd132ss 20(%rdi), %xmm3, %xmm0
vmovss %xmm1, 16(%rsi)
vmovss %xmm0, 20(%rsi)
ret
Seems reasonable.
Here's -DELEMS=7 with GCC 6.3.1:
saxpy:
movq %rsi, %rax
shrq $2, %rax
negq %rax
andl $3, %eax
je .L7
vmovss (%rdi), %xmm1
vfmadd213ss (%rsi), %xmm0, %xmm1
vmovss %xmm1, (%rsi)
cmpl $1, %eax
je .L8
vmovss 4(%rdi), %xmm1
vfmadd213ss 4(%rsi), %xmm0, %xmm1
vmovss %xmm1, 4(%rsi)
cmpl $3, %eax
jne .L9
vmovss 8(%rdi), %xmm1
vfmadd213ss 8(%rsi), %xmm0, %xmm1
movl $4, %r8d
movl $3, %edx
vmovss %xmm1, 8(%rsi)
.L3:
movl $7, %ecx
movl %eax, %r9d
subl %eax, %ecx
.L2:
leaq (%rsi,%r9,4), %rax
vshufps $0, %xmm0, %xmm0, %xmm1
vmovaps (%rax), %xmm3
vfmadd132ps (%rdi,%r9,4), %xmm3, %xmm1
vmovaps %xmm1, (%rax)
leal 4(%rdx), %eax
cmpl $4, %ecx
je .L19
cltq
addl $5, %edx
leaq (%rsi,%rax,4), %rcx
vmovss (%rdi,%rax,4), %xmm1
vfmadd213ss (%rcx), %xmm0, %xmm1
vmovss %xmm1, (%rcx)
cmpl $5, %r8d
je .L17
movslq %edx, %rdx
leaq (%rsi,%rdx,4), %rax
vmovss (%rdi,%rdx,4), %xmm1
vfmadd213ss (%rax), %xmm0, %xmm1
vmovss %xmm1, (%rax)
cmpl $6, %r8d
je .L17
vmovss 24(%rsi), %xmm2
vfmadd132ss 24(%rdi), %xmm2, %xmm0
vmovss %xmm0, 24(%rsi)
ret
.L17:
ret
.L7:
movl $7, %ecx
xorl %r9d, %r9d
movl $7, %r8d
xorl %edx, %edx
jmp .L2
.L19:
ret
.L8:
movl $6, %r8d
movl $1, %edx
jmp .L3
.L9:
movl $5, %r8d
movl $2, %edx
jmp .L3
This might be explained away by it being an odd number just short of a power of
two, but ICC does an apparently better job (one packed FMA plus three single
FMAs):
saxpy:
vbroadcastss %xmm0, %xmm2
vmovups (%rdi), %xmm1
vmovss 16(%rdi), %xmm3
vmovss 20(%rdi), %xmm4
vmovss 24(%rdi), %xmm5
vfmadd213ps (%rsi), %xmm1, %xmm2
vfmadd213ss 16(%rsi), %xmm0, %xmm3
vfmadd213ss 20(%rsi), %xmm0, %xmm4
vfmadd213ss 24(%rsi), %xmm5, %xmm0
vmovups %xmm2, (%rsi)
vmovss %xmm3, 16(%rsi)
vmovss %xmm4, 20(%rsi)
vmovss %xmm0, 24(%rsi)
ret
The results from GCC 6.3.1 for ELEMS values 8 through 14 look fine (short
branchless code sequences similar to what ICC emits), but things go to crap
again for what seems to be *any* value ELEMS=15 or above.
It even misses the opportunity with ELEMS=16 to just do two packed FMAs with
YMM registers:
saxpy:
movq %rsi, %rax
shrq $2, %rax
negq %rax
andl $7, %eax
je .L7
vmovss (%rdi), %xmm1
vfmadd213ss (%rsi), %xmm0, %xmm1
vmovss %xmm1, (%rsi)
cmpl $1, %eax
je .L8
vmovss 4(%rdi), %xmm1
vfmadd213ss 4(%rsi), %xmm0, %xmm1
vmovss %xmm1, 4(%rsi)
cmpl $2, %eax
je .L9
vmovss 8(%rdi), %xmm1
vfmadd213ss 8(%rsi), %xmm0, %xmm1
vmovss %xmm1, 8(%rsi)
cmpl $3, %eax
je .L10
vmovss 12(%rdi), %xmm1
vfmadd213ss 12(%rsi), %xmm0, %xmm1
vmovss %xmm1, 12(%rsi)
cmpl $4, %eax
je .L11
vmovss 16(%rdi), %xmm1
vfmadd213ss 16(%rsi), %xmm0, %xmm1
vmovss %xmm1, 16(%rsi)
cmpl $5, %eax
je .L12
vmovss 20(%rdi), %xmm1
vfmadd213ss 20(%rsi), %xmm0, %xmm1
vmovss %xmm1, 20(%rsi)
cmpl $7, %eax
jne .L13
vmovss 24(%rdi), %xmm1
vfmadd213ss 24(%rsi), %xmm0, %xmm1
movl $9, %r9d
movl $7, %r10d
vmovss %xmm1, 24(%rsi)
.L3:
movl $16, %ecx
movl %eax, %edx
movl $8, %r8d
movl $1, %r11d
subl %eax, %ecx
.L2:
salq $2, %rdx
vbroadcastss %xmm0, %ymm1
leaq (%rdi,%rdx), %rax
addq %rsi, %rdx
vmovups (%rax), %ymm2
vfmadd213ps (%rdx), %ymm1, %ymm2
vmovaps %ymm2, (%rdx)
cmpl $2, %r11d
jne .L4
vmovaps 32(%rdx), %ymm4
vfmadd132ps 32(%rax), %ymm4, %ymm1
vmovaps %ymm1, 32(%rdx)
.L4:
movl %r9d, %edx
leal (%r8,%r10), %eax
subl %r8d, %edx
cmpl %r8d, %ecx
je .L29
movslq %eax, %r8
leaq (%rsi,%r8,4), %rcx
vmovss (%rdi,%r8,4), %xmm1
vfmadd213ss (%rcx), %xmm0, %xmm1
vmovss %xmm1, (%rcx)
leal 1(%rax), %ecx
cmpl $1, %edx
je .L29
movslq %ecx, %rcx
leaq (%rsi,%rcx,4), %r8
vmovss (%rdi,%rcx,4), %xmm1
leal 2(%rax), %ecx
vfmadd213ss (%r8), %xmm0, %xmm1
vmovss %xmm1, (%r8)
cmpl $2, %edx
je .L29
movslq %ecx, %rcx
leaq (%rsi,%rcx,4), %r8
vmovss (%rdi,%rcx,4), %xmm1
leal 3(%rax), %ecx
vfmadd213ss (%r8), %xmm0, %xmm1
vmovss %xmm1, (%r8)
cmpl $3, %edx
je .L29
movslq %ecx, %rcx
leaq (%rsi,%rcx,4), %r8
vmovss (%rdi,%rcx,4), %xmm1
leal 4(%rax), %ecx
vfmadd213ss (%r8), %xmm0, %xmm1
vmovss %xmm1, (%r8)
cmpl $4, %edx
je .L29
movslq %ecx, %rcx
leaq (%rsi,%rcx,4), %r8
vmovss (%rdi,%rcx,4), %xmm1
leal 5(%rax), %ecx
vfmadd213ss (%r8), %xmm0, %xmm1
vmovss %xmm1, (%r8)
cmpl $5, %edx
je .L29
movslq %ecx, %rcx
addl $6, %eax
leaq (%rsi,%rcx,4), %r8
vmovss (%rdi,%rcx,4), %xmm1
vfmadd213ss (%r8), %xmm0, %xmm1
vmovss %xmm1, (%r8)
cmpl $6, %edx
je .L29
cltq
leaq (%rsi,%rax,4), %rdx
vmovss (%rdx), %xmm3
vfmadd132ss (%rdi,%rax,4), %xmm3, %xmm0
vmovss %xmm0, (%rdx)
.L29:
vzeroupper
ret
.L7:
movl $16, %r8d
movl $16, %ecx
xorl %edx, %edx
xorl %r10d, %r10d
movl $2, %r11d
movl $16, %r9d
jmp .L2
.L13:
movl $10, %r9d
movl $6, %r10d
jmp .L3
.L8:
movl $15, %r9d
movl $1, %r10d
jmp .L3
.L9:
movl $14, %r9d
movl $2, %r10d
jmp .L3
.L10:
movl $13, %r9d
movl $3, %r10d
jmp .L3
.L11:
movl $12, %r9d
movl $4, %r10d
jmp .L3
.L12:
movl $11, %r9d
movl $5, %r10d
jmp .L3
ICC gets ELEMS=16 right:
saxpy:
vmovups (%rdi), %ymm1
vmovups 32(%rdi), %ymm2
vbroadcastss %xmm0, %ymm3
vfmadd213ps (%rsi), %ymm3, %ymm1
vfmadd213ps 32(%rsi), %ymm2, %ymm3
vmovups %ymm1, (%rsi)
vmovups %ymm3, 32(%rsi)
vzeroupper
ret
I'll attach the code outputs for ELEMS values 1 through 32 using GCC 6.3.1 and
ICC 17.0.1.