Vectorization: Loop peeling with misaligned support.
Ondřej Bílka
neleai@seznam.cz
Fri Nov 15 22:26:00 GMT 2013
On Fri, Nov 15, 2013 at 09:17:14AM -0800, Hendrik Greving wrote:
> Also keep in mind that usually costs go up significantly if
> misalignment causes cache line splits (processor will fetch 2 lines).
> There are non-linear costs of filling up the store queue in modern
> out-of-order processors (x86). Bottom line is that it's much better to
> peel e.g. for AVX2/AVX3 if the loop would cause loads that cross cache
> line boundaries otherwise. The solution is to either actually always
> peel for alignment, or insert an additional check for cache line
> boundaries (for high trip count loops).
That is quite a bold claim; do you have a benchmark to support it?
Since Nehalem there has been no overhead on unaligned SSE loads beyond fetching
the extra cache line, and since Haswell AVX2 loads behave in a similar way.
You are forgetting that the loop needs both cache lines when it issues an
unaligned load, so that load takes roughly the maximum of the times needed to
access those two lines; the misses overlap. With peeling you access the first
cache line in the prologue and only afterwards access the second one in the loop,
effectively doubling the running time when both lines have to come from main memory.
You also need to weigh all of the factors, not just show that one factor is
expensive. Several factors are in play here; the cost of branch
misprediction is the main argument against peeling, so you need to
show that the cost of unaligned loads is bigger than the cost of the branch
mispredictions a peeled implementation adds.
As a quick example of why peeling is generally a bad idea I did a simple
benchmark. Could somebody with a Haswell also test the attached code generated
by gcc -O3 -march=core-avx2 (files set[13]_avx2.s)?
For the test we repeatedly call a function set with a pointer picked
pseudo-randomly from a 262144-byte region to stress the L2 cache; the relevant
part of the tester is the following (file test.c):
for (i = 0; i < 100000000; i++) {
  set (ptr + 64 * (p % (SIZE / 64) + 60), ptr2 + 64 * (q % (SIZE / 64) + 60));
First we vectorize the following function. The vectorizer does peeling here
(the assembly is a bit long, see file set1.s):
void set(int *p, int *q){
  int i;
  for (i=0; i<128; i++)
    p[i] = 42 * p[i];
}
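To make it easier to see where the extra work comes from, here is roughly the
shape of the peeled strategy written with AVX2 intrinsics (an illustrative
sketch of my own, compiled with -march=core-avx2; it is not the attached
set1.s itself and the name set_peeled is made up): a scalar prologue runs
until the pointer is 32-byte aligned, then an aligned vector body, then a
scalar epilogue, and the prologue length depends on the incoming alignment.

#include <immintrin.h>
#include <stdint.h>

/* Sketch of the peeled strategy for p[i] = 42 * p[i], 128 elements.  */
void set_peeled (int *p)
{
  int i = 0;
  /* Scalar prologue: run until p + i is 32-byte aligned.  */
  while (i < 128 && ((uintptr_t) (p + i) & 31) != 0)
    {
      p[i] = 42 * p[i];
      i++;
    }
  /* Aligned vector body.  */
  __m256i c = _mm256_set1_epi32 (42);
  for (; i + 8 <= 128; i += 8)
    {
      __m256i v = _mm256_load_si256 ((__m256i const *) (p + i));
      _mm256_store_si256 ((__m256i *) (p + i), _mm256_mullo_epi32 (v, c));
    }
  /* Scalar epilogue for whatever is left.  */
  for (; i < 128; i++)
    p[i] = 42 * p[i];
}

The prologue touches the first cache line on its own and only then does the
vector body touch the next one, which is exactly the serialization described
above.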
When I ran it I got
$ gcc -c -O3 -DSIZE=262144 test.c
$ gcc test.o set1.s
$ time ./a.out
real 0m3.724s
user 0m3.724s
sys 0m0.000s
Now what happens if we use separate input and output arrays? The gcc
vectorizer fortunately does not peel in this case (file set2.s), which
gives better performance:
void set(int *p, int *q){
  int i;
  for (i=0; i<128; i++)
    p[i] = 42 * q[i];
}
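For comparison, the non-peeled strategy is just a single loop with unaligned
vector accesses and no alignment-dependent branches, roughly like the sketch
below (again only an illustration with a made-up name; the attached set2.s
actually uses 16-byte SSE and strength-reduces the multiply by 42 into shifts
and subtractions, but the structure is the same).

#include <immintrin.h>

/* Sketch of the non-peeled strategy for p[i] = 42 * q[i], 128 elements.  */
void set_unpeeled (int *p, int *q)
{
  __m256i c = _mm256_set1_epi32 (42);
  int i;
  for (i = 0; i + 8 <= 128; i += 8)
    {
      __m256i v = _mm256_loadu_si256 ((__m256i const *) (q + i));
      _mm256_storeu_si256 ((__m256i *) (p + i), _mm256_mullo_epi32 (v, c));
    }
}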
$ gcc test.o set2.s
$ time ./a.out
real 0m3.169s
user 0m3.170s
sys 0m0.000s
The speedup here can be partially explained by the fact that in-place
modification runs slower. To eliminate this possibility we change the
assembly to make the input the same as the output (file set3.s):
jb .L15
.L7:
xorl %eax, %eax
+ movq %rdi, %rsi
.p2align 4,,10
.p2align 3
.L5:
$ gcc test.o set3.s
$ time ./a.out
real 0m3.169s
user 0m3.170s
sys 0m0.000s
This is still faster than what the peeling vectorizer generated. And in this
test I did not even vary the alignment; it is constant, so branch misprediction
is not an issue here.
-------------- next part --------------
#define _GNU_SOURCE
#include <stdlib.h>
#include <malloc.h> /* pvalloc */
int main(){
  char *ptr = pvalloc(2 * SIZE + 128);
  char *ptr2 = pvalloc(2 * SIZE + 128);
  unsigned long p = 31;
  unsigned long q = 17;
  int i;
  for (i=0; i < 100000000; i++) {
    set (ptr + 64 * (p % (SIZE / 64) + 60), ptr2 + 64 * (q % (SIZE / 64) + 60));
    p = 11 * p + 3;
    q = 13 * p + 5;
  }
}
-------------- next part --------------
.file "set1.c"
.text
.p2align 4,,15
.globl set
.type set, @function
set:
.LFB0:
.cfi_startproc
leaq 32(%rdi), %rax
cmpq %rax, %rsi
jb .L12
movq %rdi, %rsi
.L6:
vmovdqu (%rsi), %ymm1
vmovdqa .LC0(%rip), %ymm0
vpmulld %ymm0, %ymm1, %ymm1
vmovdqu %ymm1, (%rdi)
vmovdqu 32(%rsi), %ymm1
vpmulld %ymm0, %ymm1, %ymm1
vmovdqu %ymm1, 32(%rdi)
vmovdqu 64(%rsi), %ymm1
vpmulld %ymm0, %ymm1, %ymm1
vmovdqu %ymm1, 64(%rdi)
vmovdqu 96(%rsi), %ymm1
vpmulld %ymm0, %ymm1, %ymm1
vmovdqu %ymm1, 96(%rdi)
vmovdqu 128(%rsi), %ymm1
vpmulld %ymm0, %ymm1, %ymm1
vmovdqu %ymm1, 128(%rdi)
vmovdqu 160(%rsi), %ymm1
vpmulld %ymm0, %ymm1, %ymm1
vmovdqu %ymm1, 160(%rdi)
vmovdqu 192(%rsi), %ymm1
vpmulld %ymm0, %ymm1, %ymm1
vmovdqu %ymm1, 192(%rdi)
vmovdqu 224(%rsi), %ymm1
vpmulld %ymm0, %ymm1, %ymm1
vmovdqu %ymm1, 224(%rdi)
vmovdqu 256(%rsi), %ymm1
vpmulld %ymm0, %ymm1, %ymm1
vmovdqu %ymm1, 256(%rdi)
vmovdqu 288(%rsi), %ymm1
vpmulld %ymm0, %ymm1, %ymm1
vmovdqu %ymm1, 288(%rdi)
vmovdqu 320(%rsi), %ymm1
vpmulld %ymm0, %ymm1, %ymm1
vmovdqu %ymm1, 320(%rdi)
vmovdqu 352(%rsi), %ymm1
vpmulld %ymm0, %ymm1, %ymm1
vmovdqu %ymm1, 352(%rdi)
vmovdqu 384(%rsi), %ymm1
vpmulld %ymm0, %ymm1, %ymm1
vmovdqu %ymm1, 384(%rdi)
vmovdqu 416(%rsi), %ymm1
vpmulld %ymm0, %ymm1, %ymm1
vmovdqu %ymm1, 416(%rdi)
vmovdqu 448(%rsi), %ymm1
vpmulld %ymm0, %ymm1, %ymm1
vmovdqu %ymm1, 448(%rdi)
vmovdqu 480(%rsi), %ymm1
vpmulld %ymm0, %ymm1, %ymm0
vmovdqu %ymm0, 480(%rdi)
vzeroupper
ret
.p2align 4,,10
.p2align 3
.L12:
leaq 32(%rsi), %rax
cmpq %rax, %rdi
jae .L6
xorl %eax, %eax
.p2align 4,,10
.p2align 3
.L5:
movl (%rsi,%rax), %edx
movl $42, %ecx
imull %ecx, %edx
movl %edx, (%rdi,%rax)
addq $4, %rax
cmpq $512, %rax
jne .L5
rep ret
.cfi_endproc
.LFE0:
.size set, .-set
.section .rodata.cst32,"aM",@progbits,32
.align 32
.LC0:
.long 42
.long 42
.long 42
.long 42
.long 42
.long 42
.long 42
.long 42
.ident "GCC: (Debian 4.8.1-10) 4.8.1"
.section .note.GNU-stack,"",@progbits
-------------- next part --------------
.file "set2.c"
.text
.p2align 4,,15
.globl set
.type set, @function
set:
.LFB0:
.cfi_startproc
leaq 16(%rdi), %rax
cmpq %rax, %rsi
jb .L15
.L7:
xorl %eax, %eax
.p2align 4,,10
.p2align 3
.L5:
movdqu (%rsi,%rax), %xmm1
pslld $1, %xmm1
movdqa %xmm1, %xmm0
pslld $2, %xmm0
psubd %xmm1, %xmm0
movdqa %xmm0, %xmm1
pslld $3, %xmm1
psubd %xmm0, %xmm1
movdqu %xmm1, (%rdi,%rax)
addq $16, %rax
cmpq $512, %rax
jne .L5
rep ret
.p2align 4,,10
.p2align 3
.L15:
leaq 16(%rsi), %rax
cmpq %rax, %rdi
jae .L7
xorl %eax, %eax
.p2align 4,,10
.p2align 3
.L6:
movl (%rsi,%rax), %edx
movl $42, %ecx
imull %ecx, %edx
movl %edx, (%rdi,%rax)
addq $4, %rax
cmpq $512, %rax
jne .L6
rep ret
.cfi_endproc
.LFE0:
.size set, .-set
.ident "GCC: (Debian 4.8.1-10) 4.8.1"
.section .note.GNU-stack,"",@progbits
-------------- next part --------------
.file "set2.c"
.text
.p2align 4,,15
.globl set
.type set, @function
set:
.LFB0:
.cfi_startproc
leaq 16(%rdi), %rax
cmpq %rax, %rsi
jb .L15
.L7:
xorl %eax, %eax
movq %rdi, %rsi
.p2align 4,,10
.p2align 3
.L5:
movdqu (%rsi,%rax), %xmm1
pslld $1, %xmm1
movdqa %xmm1, %xmm0
pslld $2, %xmm0
psubd %xmm1, %xmm0
movdqa %xmm0, %xmm1
pslld $3, %xmm1
psubd %xmm0, %xmm1
movdqu %xmm1, (%rdi,%rax)
addq $16, %rax
cmpq $512, %rax
jne .L5
rep ret
.p2align 4,,10
.p2align 3
.L15:
leaq 16(%rsi), %rax
cmpq %rax, %rdi
jae .L7
xorl %eax, %eax
.p2align 4,,10
.p2align 3
.L6:
movl (%rsi,%rax), %edx
movl $42, %ecx
imull %ecx, %edx
movl %edx, (%rdi,%rax)
addq $4, %rax
cmpq $512, %rax
jne .L6
rep ret
.cfi_endproc
.LFE0:
.size set, .-set
.ident "GCC: (Debian 4.8.1-10) 4.8.1"
.section .note.GNU-stack,"",@progbits
-------------- next part --------------
.file "set1.c"
.text
.p2align 4,,15
.globl set
.type set, @function
set:
.LFB0:
.cfi_startproc
movq %rdi, %rax
andl $31, %eax
shrq $2, %rax
negq %rax
andl $7, %eax
je .L7
movl (%rdi), %edx
movl $42, %r11d
imull %r11d, %edx
cmpl $1, %eax
movl %edx, (%rdi)
jbe .L8
movl 4(%rdi), %edx
movl $42, %r10d
imull %r10d, %edx
cmpl $2, %eax
movl %edx, 4(%rdi)
jbe .L9
movl 8(%rdi), %edx
movl $42, %r9d
imull %r9d, %edx
cmpl $3, %eax
movl %edx, 8(%rdi)
jbe .L10
movl 12(%rdi), %edx
movl $42, %r8d
imull %r8d, %edx
cmpl $4, %eax
movl %edx, 12(%rdi)
jbe .L11
movl 16(%rdi), %edx
movl $42, %esi
imull %esi, %edx
cmpl $5, %eax
movl %edx, 16(%rdi)
jbe .L12
movl 20(%rdi), %edx
movl $42, %ecx
imull %ecx, %edx
cmpl $6, %eax
movl %edx, 20(%rdi)
jbe .L13
movl 24(%rdi), %edx
movl $42, %r11d
movl $7, %r9d
imull %r11d, %edx
movl %edx, 24(%rdi)
movl $121, %edx
.L2:
movl $128, %ecx
vmovdqa .LC0(%rip), %ymm0
subl %eax, %ecx
movl %eax, %eax
leaq (%rdi,%rax,4), %rax
movl %ecx, %r8d
shrl $3, %r8d
vpmulld (%rax), %ymm0, %ymm1
vmovdqa %ymm1, (%rax)
cmpl $15, %r8d
vpmulld 32(%rax), %ymm0, %ymm1
vmovdqa %ymm1, 32(%rax)
leal 0(,%r8,8), %esi
vpmulld 64(%rax), %ymm0, %ymm1
vmovdqa %ymm1, 64(%rax)
vpmulld 96(%rax), %ymm0, %ymm1
vmovdqa %ymm1, 96(%rax)
vpmulld 128(%rax), %ymm0, %ymm1
vmovdqa %ymm1, 128(%rax)
vpmulld 160(%rax), %ymm0, %ymm1
vmovdqa %ymm1, 160(%rax)
vpmulld 192(%rax), %ymm0, %ymm1
vmovdqa %ymm1, 192(%rax)
vpmulld 224(%rax), %ymm0, %ymm1
vmovdqa %ymm1, 224(%rax)
vpmulld 256(%rax), %ymm0, %ymm1
vmovdqa %ymm1, 256(%rax)
vpmulld 288(%rax), %ymm0, %ymm1
vmovdqa %ymm1, 288(%rax)
vpmulld 320(%rax), %ymm0, %ymm1
vmovdqa %ymm1, 320(%rax)
vpmulld 352(%rax), %ymm0, %ymm1
vmovdqa %ymm1, 352(%rax)
vpmulld 384(%rax), %ymm0, %ymm1
vmovdqa %ymm1, 384(%rax)
vpmulld 416(%rax), %ymm0, %ymm1
vmovdqa %ymm1, 416(%rax)
vpmulld 448(%rax), %ymm0, %ymm1
vmovdqa %ymm1, 448(%rax)
jbe .L4
vpmulld 480(%rax), %ymm0, %ymm0
vmovdqa %ymm0, 480(%rax)
.L4:
leal (%r9,%rsi), %eax
subl %esi, %edx
cmpl %esi, %ecx
je .L29
movslq %eax, %rcx
movl $42, %r9d
leaq (%rdi,%rcx,4), %rcx
movl (%rcx), %esi
imull %r9d, %esi
cmpl $1, %edx
movl %esi, (%rcx)
leal 1(%rax), %ecx
je .L29
movslq %ecx, %rcx
movl $42, %r8d
leaq (%rdi,%rcx,4), %rcx
movl (%rcx), %esi
imull %r8d, %esi
cmpl $2, %edx
movl %esi, (%rcx)
leal 2(%rax), %ecx
je .L29
movslq %ecx, %rcx
movl $42, %r11d
leaq (%rdi,%rcx,4), %rcx
movl (%rcx), %esi
imull %r11d, %esi
cmpl $3, %edx
movl %esi, (%rcx)
leal 3(%rax), %ecx
je .L29
movslq %ecx, %rcx
movl $42, %r10d
leaq (%rdi,%rcx,4), %rcx
movl (%rcx), %esi
imull %r10d, %esi
cmpl $4, %edx
movl %esi, (%rcx)
leal 4(%rax), %ecx
je .L29
movslq %ecx, %rcx
movl $42, %r9d
leaq (%rdi,%rcx,4), %rcx
movl (%rcx), %esi
imull %r9d, %esi
cmpl $5, %edx
movl %esi, (%rcx)
leal 5(%rax), %ecx
je .L29
movslq %ecx, %rcx
movl $42, %r8d
addl $6, %eax
leaq (%rdi,%rcx,4), %rcx
movl (%rcx), %esi
imull %r8d, %esi
cmpl $6, %edx
movl %esi, (%rcx)
je .L29
cltq
movl $42, %r10d
leaq (%rdi,%rax,4), %rax
movl (%rax), %edx
imull %r10d, %edx
movl %edx, (%rax)
.L29:
vzeroupper
ret
.p2align 4,,10
.p2align 3
.L7:
movl $128, %edx
xorl %r9d, %r9d
jmp .L2
.p2align 4,,10
.p2align 3
.L13:
movl $122, %edx
movl $6, %r9d
jmp .L2
.p2align 4,,10
.p2align 3
.L8:
movl $127, %edx
movl $1, %r9d
jmp .L2
.p2align 4,,10
.p2align 3
.L9:
movl $126, %edx
movl $2, %r9d
jmp .L2
.p2align 4,,10
.p2align 3
.L10:
movl $125, %edx
movl $3, %r9d
jmp .L2
.p2align 4,,10
.p2align 3
.L11:
movl $124, %edx
movl $4, %r9d
jmp .L2
.p2align 4,,10
.p2align 3
.L12:
movl $123, %edx
movl $5, %r9d
jmp .L2
.cfi_endproc
.LFE0:
.size set, .-set
.section .rodata.cst32,"aM",@progbits,32
.align 32
.LC0:
.long 42
.long 42
.long 42
.long 42
.long 42
.long 42
.long 42
.long 42
.ident "GCC: (Debian 4.8.1-10) 4.8.1"
.section .note.GNU-stack,"",@progbits
-------------- next part --------------
.file "set1.c"
.text
.p2align 4,,15
.globl set
.type set, @function
set:
.LFB0:
.cfi_startproc
leaq 32(%rdi), %rax
cmpq %rax, %rsi
jb .L12
movq %rdi, %rsi
.L6:
vmovdqu (%rsi), %ymm1
vmovdqa .LC0(%rip), %ymm0
vpmulld %ymm0, %ymm1, %ymm1
vmovdqu %ymm1, (%rdi)
vmovdqu 32(%rsi), %ymm1
vpmulld %ymm0, %ymm1, %ymm1
vmovdqu %ymm1, 32(%rdi)
vmovdqu 64(%rsi), %ymm1
vpmulld %ymm0, %ymm1, %ymm1
vmovdqu %ymm1, 64(%rdi)
vmovdqu 96(%rsi), %ymm1
vpmulld %ymm0, %ymm1, %ymm1
vmovdqu %ymm1, 96(%rdi)
vmovdqu 128(%rsi), %ymm1
vpmulld %ymm0, %ymm1, %ymm1
vmovdqu %ymm1, 128(%rdi)
vmovdqu 160(%rsi), %ymm1
vpmulld %ymm0, %ymm1, %ymm1
vmovdqu %ymm1, 160(%rdi)
vmovdqu 192(%rsi), %ymm1
vpmulld %ymm0, %ymm1, %ymm1
vmovdqu %ymm1, 192(%rdi)
vmovdqu 224(%rsi), %ymm1
vpmulld %ymm0, %ymm1, %ymm1
vmovdqu %ymm1, 224(%rdi)
vmovdqu 256(%rsi), %ymm1
vpmulld %ymm0, %ymm1, %ymm1
vmovdqu %ymm1, 256(%rdi)
vmovdqu 288(%rsi), %ymm1
vpmulld %ymm0, %ymm1, %ymm1
vmovdqu %ymm1, 288(%rdi)
vmovdqu 320(%rsi), %ymm1
vpmulld %ymm0, %ymm1, %ymm1
vmovdqu %ymm1, 320(%rdi)
vmovdqu 352(%rsi), %ymm1
vpmulld %ymm0, %ymm1, %ymm1
vmovdqu %ymm1, 352(%rdi)
vmovdqu 384(%rsi), %ymm1
vpmulld %ymm0, %ymm1, %ymm1
vmovdqu %ymm1, 384(%rdi)
vmovdqu 416(%rsi), %ymm1
vpmulld %ymm0, %ymm1, %ymm1
vmovdqu %ymm1, 416(%rdi)
vmovdqu 448(%rsi), %ymm1
vpmulld %ymm0, %ymm1, %ymm1
vmovdqu %ymm1, 448(%rdi)
vmovdqu 480(%rsi), %ymm1
vpmulld %ymm0, %ymm1, %ymm0
vmovdqu %ymm0, 480(%rdi)
vzeroupper
ret
.p2align 4,,10
.p2align 3
.L12:
leaq 32(%rsi), %rax
cmpq %rax, %rdi
jae .L6
xorl %eax, %eax
.p2align 4,,10
.p2align 3
.L5:
movl (%rsi,%rax), %edx
movl $42, %ecx
imull %ecx, %edx
movl %edx, (%rdi,%rax)
addq $4, %rax
cmpq $512, %rax
jne .L5
rep ret
.cfi_endproc
.LFE0:
.size set, .-set
.section .rodata.cst32,"aM",@progbits,32
.align 32
.LC0:
.long 42
.long 42
.long 42
.long 42
.long 42
.long 42
.long 42
.long 42
.ident "GCC: (Debian 4.8.1-10) 4.8.1"
.section .note.GNU-stack,"",@progbits