Vectorization: Loop peeling with misaligned support.

Ondřej Bílka neleai@seznam.cz
Fri Nov 15 22:26:00 GMT 2013


On Fri, Nov 15, 2013 at 09:17:14AM -0800, Hendrik Greving wrote:
> Also keep in mind that usually costs go up significantly if
> misalignment causes cache line splits (processor will fetch 2 lines).
> There are non-linear costs of filling up the store queue in modern
> out-of-order processors (x86). Bottom line is that it's much better to
> peel e.g. for AVX2/AVX3 if the loop would cause loads that cross cache
> line boundaries otherwise. The solution is to either actually always
> peel for alignment, or insert an additional check for cache line
> boundaries (for high trip count loops).

That is quite a bold claim; do you have a benchmark to support it?

Since Nehalem there has been no overhead for unaligned SSE loads beyond
fetching the extra cache line. As of Haswell, AVX2 loads behave in a similar way.

You are forgetting that the loop needs both cache lines when it issues an
unaligned load, so such a load generally takes the maximum of the times
needed to access those lines. With peeling you access the first cache line
in the prologue and only then, in the loop, access the second one, so the
two fetches are serialized, effectively doubling the running time when both
lines have to come from main memory.
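
To make the crossing concrete, here is a minimal sketch (an illustration
only, not part of the benchmark below; it assumes a 64-byte aligned base
pointer):

#include <immintrin.h>
#include <stdint.h>

/* Illustration only: a 32-byte load at byte offset 48 of a 64-byte aligned
   buffer covers bytes 48..79, so it spans a cache-line boundary.  The
   unaligned load needs both lines at once and waits roughly
   max(t_line0, t_line1); a peeled loop touches line 0 in its prologue and
   line 1 only later in the vector body, so on a cold cache the two fetch
   times add up instead.  */
__m256i
load_crossing (const int32_t *line_aligned_base)
{
  return _mm256_loadu_si256 ((const __m256i *) (line_aligned_base + 12));
}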

You also need to weigh all the factors, not just show that one factor is
expensive. Several factors are in play; the cost of branch misprediction is
the main argument against doing peeling, so you need to show that the cost
of the unaligned loads is bigger than the cost of the branch mispredictions
of a peeled implementation.
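
The prologue length of a peeled implementation depends on the alignment of
the incoming pointer, which is what makes those branches hard to predict; in
the attached peeled assembly it is computed essentially as follows (a C
paraphrase of the generated code, not the exact output):

#include <stdint.h>

/* How many scalar ints the peel prologue has to process before the pointer
   reaches a 32-byte boundary.  With pointers of varying alignment this
   value, and the branches that dispatch on it, change from call to call.  */
static unsigned
peel_count (const int *p)
{
  uintptr_t misalign = ((uintptr_t) p & 31) >> 2;  /* ints past a 32B boundary */
  return (unsigned) (-misalign & 7);               /* ints left to reach it */
}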

As a quick example of why peeling is generally a bad idea I did a simple
benchmark. Could somebody with Haswell also test the attached code generated
by gcc -O3 -march=core-avx2 (files set[13]_avx2.s)?

For the test we repeatedly call a function set with pointers pseudo-randomly
picked from a 262144-byte area to stress the L2 cache; the relevant part of
the tester is the following (file test.c):

for (i=0; i < 100000000; i++) {
     set (ptr + 64 * (p % (SIZE / 64) + 60), ptr2 + 64 * (q % (SIZE / 64) + 60));
     p = 11 * p + 3;
     q = 13 * p + 5;
}

First we vectorize the following function. The vectorizer does peeling here
(the assembly is a bit long, see file set1.s):

void set(int *p, int *q){
  int i;
  for (i=0; i<128; i++)
     p[i] = 42 * p[i];
}
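
The peeled code gcc generates for this roughly has the following shape (a
hand-written AVX2 sketch only to show the structure; the attached assembly
additionally unrolls the vector body):

#include <immintrin.h>
#include <stdint.h>

/* Sketch of the peeled structure: scalar prologue until p is 32-byte
   aligned, aligned vector body, scalar epilogue for the tail.  */
void set_peeled (int *p, int *q)
{
  int i = 0;
  (void) q;                                   /* unused, as in set above */

  /* peel: scalar iterations until p + i hits a 32-byte boundary */
  while ((((uintptr_t) (p + i)) & 31) != 0 && i < 128)
    {
      p[i] = 42 * p[i];
      i++;
    }

  __m256i c = _mm256_set1_epi32 (42);
  /* aligned vector body, 8 ints per iteration */
  for (; i + 8 <= 128; i += 8)
    {
      __m256i v = _mm256_load_si256 ((__m256i *) (p + i));
      _mm256_store_si256 ((__m256i *) (p + i), _mm256_mullo_epi32 (v, c));
    }

  /* scalar epilogue for whatever is left */
  for (; i < 128; i++)
    p[i] = 42 * p[i];
}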

When I ran it I got:

$ gcc -c -O3 -DSIZE=262144 test.c
$ gcc test.o set1.s
$ time ./a.out

real	0m3.724s
user	0m3.724s
sys	0m0.000s

Now what happens if we use separate input and output arrays? The gcc
vectorizer fortunately does not peel in this case (file set2.s), which gives
better performance:

void set(int *p, int *q){
  int i;
  for (i=0; i<128; i++)
     p[i] = 42 * q[i];
}
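
The generated set2.s has no prologue; it is roughly equivalent to the
following sketch with unaligned loads and stores (again a hand-written
approximation; 42*x is computed with shifts and subtracts as in the
generated code):

#include <emmintrin.h>

/* Sketch of the non-peeled structure: unaligned 16-byte loads and stores,
   with 42*x built as 2x, then 8x-2x = 6x, then 48x-6x = 42x.  */
void set_unpeeled (int *p, int *q)
{
  int i;
  for (i = 0; i < 128; i += 4)
    {
      __m128i x2  = _mm_slli_epi32 (_mm_loadu_si128 ((__m128i *) (q + i)), 1);
      __m128i x6  = _mm_sub_epi32 (_mm_slli_epi32 (x2, 2), x2);
      __m128i x42 = _mm_sub_epi32 (_mm_slli_epi32 (x6, 3), x6);
      _mm_storeu_si128 ((__m128i *) (p + i), x42);
    }
}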

$ gcc test.o set2.s
$ time ./a.out

real	0m3.169s
user	0m3.170s
sys	0m0.000s


Part of the speedup here can be explained by the fact that in-place
modification runs slower. To eliminate this possibility we change the
assembly to make the input the same as the output (file set3.s):

 	jb	.L15
 .L7:
 	xorl	%eax, %eax
+	movq	%rdi, %rsi
 	.p2align 4,,10
 	.p2align 3
 .L5:

$ gcc test.o set3.s
$ time ./a.out

real	0m3.169s
user	0m3.170s
sys	0m0.000s

This is still faster than what the peeling vectorizer generated.

And note that in this test the alignment is constant, so branch
misprediction is not even an issue here.
-------------- next part --------------
#define _GNU_SOURCE
#include <stdlib.h>
#include <malloc.h>	/* pvalloc */

/* set is provided by the set*.s file that gets linked in.  */
void set (char *p, char *q);

int main(){
   char *ptr = pvalloc(2 * SIZE + 128);
   char *ptr2 = pvalloc(2 * SIZE + 128);

   unsigned long p = 31;
   unsigned long q = 17;

   int i;
   for (i=0; i < 100000000; i++) {
     /* Call set with pseudo-random cache-line-aligned offsets inside a
        SIZE-byte window to stress the L2 cache.  */
     set (ptr + 64 * (p % (SIZE / 64) + 60), ptr2 + 64 * (q % (SIZE / 64) + 60));
     p = 11 * p + 3;
     q = 13 * p + 5;
   }
   return 0;
}
-------------- next part --------------
	.file	"set1.c"
	.text
	.p2align 4,,15
	.globl	set
	.type	set, @function
set:
.LFB0:
	.cfi_startproc
	leaq	32(%rdi), %rax
	cmpq	%rax, %rsi
	jb	.L12
	movq  %rdi, %rsi
.L6:
	vmovdqu	(%rsi), %ymm1
	vmovdqa	.LC0(%rip), %ymm0
	vpmulld	%ymm0, %ymm1, %ymm1
	vmovdqu	%ymm1, (%rdi)
	vmovdqu	32(%rsi), %ymm1
	vpmulld	%ymm0, %ymm1, %ymm1
	vmovdqu	%ymm1, 32(%rdi)
	vmovdqu	64(%rsi), %ymm1
	vpmulld	%ymm0, %ymm1, %ymm1
	vmovdqu	%ymm1, 64(%rdi)
	vmovdqu	96(%rsi), %ymm1
	vpmulld	%ymm0, %ymm1, %ymm1
	vmovdqu	%ymm1, 96(%rdi)
	vmovdqu	128(%rsi), %ymm1
	vpmulld	%ymm0, %ymm1, %ymm1
	vmovdqu	%ymm1, 128(%rdi)
	vmovdqu	160(%rsi), %ymm1
	vpmulld	%ymm0, %ymm1, %ymm1
	vmovdqu	%ymm1, 160(%rdi)
	vmovdqu	192(%rsi), %ymm1
	vpmulld	%ymm0, %ymm1, %ymm1
	vmovdqu	%ymm1, 192(%rdi)
	vmovdqu	224(%rsi), %ymm1
	vpmulld	%ymm0, %ymm1, %ymm1
	vmovdqu	%ymm1, 224(%rdi)
	vmovdqu	256(%rsi), %ymm1
	vpmulld	%ymm0, %ymm1, %ymm1
	vmovdqu	%ymm1, 256(%rdi)
	vmovdqu	288(%rsi), %ymm1
	vpmulld	%ymm0, %ymm1, %ymm1
	vmovdqu	%ymm1, 288(%rdi)
	vmovdqu	320(%rsi), %ymm1
	vpmulld	%ymm0, %ymm1, %ymm1
	vmovdqu	%ymm1, 320(%rdi)
	vmovdqu	352(%rsi), %ymm1
	vpmulld	%ymm0, %ymm1, %ymm1
	vmovdqu	%ymm1, 352(%rdi)
	vmovdqu	384(%rsi), %ymm1
	vpmulld	%ymm0, %ymm1, %ymm1
	vmovdqu	%ymm1, 384(%rdi)
	vmovdqu	416(%rsi), %ymm1
	vpmulld	%ymm0, %ymm1, %ymm1
	vmovdqu	%ymm1, 416(%rdi)
	vmovdqu	448(%rsi), %ymm1
	vpmulld	%ymm0, %ymm1, %ymm1
	vmovdqu	%ymm1, 448(%rdi)
	vmovdqu	480(%rsi), %ymm1
	vpmulld	%ymm0, %ymm1, %ymm0
	vmovdqu	%ymm0, 480(%rdi)
	vzeroupper
	ret
	.p2align 4,,10
	.p2align 3
.L12:
	leaq	32(%rsi), %rax
	cmpq	%rax, %rdi
	jae	.L6
	xorl	%eax, %eax
	.p2align 4,,10
	.p2align 3
.L5:
	movl	(%rsi,%rax), %edx
	movl	$42, %ecx
	imull	%ecx, %edx
	movl	%edx, (%rdi,%rax)
	addq	$4, %rax
	cmpq	$512, %rax
	jne	.L5
	rep ret
	.cfi_endproc
.LFE0:
	.size	set, .-set
	.section	.rodata.cst32,"aM",@progbits,32
	.align 32
.LC0:
	.long	42
	.long	42
	.long	42
	.long	42
	.long	42
	.long	42
	.long	42
	.long	42
	.ident	"GCC: (Debian 4.8.1-10) 4.8.1"
	.section	.note.GNU-stack,"",@progbits
-------------- next part --------------
	.file	"set2.c"
	.text
	.p2align 4,,15
	.globl	set
	.type	set, @function
set:
.LFB0:
	.cfi_startproc
	leaq	16(%rdi), %rax
	cmpq	%rax, %rsi
	jb	.L15
.L7:
	xorl	%eax, %eax
	.p2align 4,,10
	.p2align 3
.L5:
	movdqu	(%rsi,%rax), %xmm1
	pslld	$1, %xmm1
	movdqa	%xmm1, %xmm0
	pslld	$2, %xmm0
	psubd	%xmm1, %xmm0
	movdqa	%xmm0, %xmm1
	pslld	$3, %xmm1
	psubd	%xmm0, %xmm1
	movdqu	%xmm1, (%rdi,%rax)
	addq	$16, %rax
	cmpq	$512, %rax
	jne	.L5
	rep ret
	.p2align 4,,10
	.p2align 3
.L15:
	leaq	16(%rsi), %rax
	cmpq	%rax, %rdi
	jae	.L7
	xorl	%eax, %eax
	.p2align 4,,10
	.p2align 3
.L6:
	movl	(%rsi,%rax), %edx
	movl	$42, %ecx
	imull	%ecx, %edx
	movl	%edx, (%rdi,%rax)
	addq	$4, %rax
	cmpq	$512, %rax
	jne	.L6
	rep ret
	.cfi_endproc
.LFE0:
	.size	set, .-set
	.ident	"GCC: (Debian 4.8.1-10) 4.8.1"
	.section	.note.GNU-stack,"",@progbits
-------------- next part --------------
	.file	"set2.c"
	.text
	.p2align 4,,15
	.globl	set
	.type	set, @function
set:
.LFB0:
	.cfi_startproc
	leaq	16(%rdi), %rax
	cmpq	%rax, %rsi
	jb	.L15
.L7:
	xorl	%eax, %eax
	movq	%rdi, %rsi
	.p2align 4,,10
	.p2align 3
.L5:
	movdqu	(%rsi,%rax), %xmm1
	pslld	$1, %xmm1
	movdqa	%xmm1, %xmm0
	pslld	$2, %xmm0
	psubd	%xmm1, %xmm0
	movdqa	%xmm0, %xmm1
	pslld	$3, %xmm1
	psubd	%xmm0, %xmm1
	movdqu	%xmm1, (%rdi,%rax)
	addq	$16, %rax
	cmpq	$512, %rax
	jne	.L5
	rep ret
	.p2align 4,,10
	.p2align 3
.L15:
	leaq	16(%rsi), %rax
	cmpq	%rax, %rdi
	jae	.L7
	xorl	%eax, %eax
	.p2align 4,,10
	.p2align 3
.L6:
	movl	(%rsi,%rax), %edx
	movl	$42, %ecx
	imull	%ecx, %edx
	movl	%edx, (%rdi,%rax)
	addq	$4, %rax
	cmpq	$512, %rax
	jne	.L6
	rep ret
	.cfi_endproc
.LFE0:
	.size	set, .-set
	.ident	"GCC: (Debian 4.8.1-10) 4.8.1"
	.section	.note.GNU-stack,"",@progbits
-------------- next part --------------
	.file	"set1.c"
	.text
	.p2align 4,,15
	.globl	set
	.type	set, @function
set:
.LFB0:
	.cfi_startproc
	movq	%rdi, %rax
	andl	$31, %eax
	shrq	$2, %rax
	negq	%rax
	andl	$7, %eax
	je	.L7
	movl	(%rdi), %edx
	movl	$42, %r11d
	imull	%r11d, %edx
	cmpl	$1, %eax
	movl	%edx, (%rdi)
	jbe	.L8
	movl	4(%rdi), %edx
	movl	$42, %r10d
	imull	%r10d, %edx
	cmpl	$2, %eax
	movl	%edx, 4(%rdi)
	jbe	.L9
	movl	8(%rdi), %edx
	movl	$42, %r9d
	imull	%r9d, %edx
	cmpl	$3, %eax
	movl	%edx, 8(%rdi)
	jbe	.L10
	movl	12(%rdi), %edx
	movl	$42, %r8d
	imull	%r8d, %edx
	cmpl	$4, %eax
	movl	%edx, 12(%rdi)
	jbe	.L11
	movl	16(%rdi), %edx
	movl	$42, %esi
	imull	%esi, %edx
	cmpl	$5, %eax
	movl	%edx, 16(%rdi)
	jbe	.L12
	movl	20(%rdi), %edx
	movl	$42, %ecx
	imull	%ecx, %edx
	cmpl	$6, %eax
	movl	%edx, 20(%rdi)
	jbe	.L13
	movl	24(%rdi), %edx
	movl	$42, %r11d
	movl	$7, %r9d
	imull	%r11d, %edx
	movl	%edx, 24(%rdi)
	movl	$121, %edx
.L2:
	movl	$128, %ecx
	vmovdqa	.LC0(%rip), %ymm0
	subl	%eax, %ecx
	movl	%eax, %eax
	leaq	(%rdi,%rax,4), %rax
	movl	%ecx, %r8d
	shrl	$3, %r8d
	vpmulld	(%rax), %ymm0, %ymm1
	vmovdqa	%ymm1, (%rax)
	cmpl	$15, %r8d
	vpmulld	32(%rax), %ymm0, %ymm1
	vmovdqa	%ymm1, 32(%rax)
	leal	0(,%r8,8), %esi
	vpmulld	64(%rax), %ymm0, %ymm1
	vmovdqa	%ymm1, 64(%rax)
	vpmulld	96(%rax), %ymm0, %ymm1
	vmovdqa	%ymm1, 96(%rax)
	vpmulld	128(%rax), %ymm0, %ymm1
	vmovdqa	%ymm1, 128(%rax)
	vpmulld	160(%rax), %ymm0, %ymm1
	vmovdqa	%ymm1, 160(%rax)
	vpmulld	192(%rax), %ymm0, %ymm1
	vmovdqa	%ymm1, 192(%rax)
	vpmulld	224(%rax), %ymm0, %ymm1
	vmovdqa	%ymm1, 224(%rax)
	vpmulld	256(%rax), %ymm0, %ymm1
	vmovdqa	%ymm1, 256(%rax)
	vpmulld	288(%rax), %ymm0, %ymm1
	vmovdqa	%ymm1, 288(%rax)
	vpmulld	320(%rax), %ymm0, %ymm1
	vmovdqa	%ymm1, 320(%rax)
	vpmulld	352(%rax), %ymm0, %ymm1
	vmovdqa	%ymm1, 352(%rax)
	vpmulld	384(%rax), %ymm0, %ymm1
	vmovdqa	%ymm1, 384(%rax)
	vpmulld	416(%rax), %ymm0, %ymm1
	vmovdqa	%ymm1, 416(%rax)
	vpmulld	448(%rax), %ymm0, %ymm1
	vmovdqa	%ymm1, 448(%rax)
	jbe	.L4
	vpmulld	480(%rax), %ymm0, %ymm0
	vmovdqa	%ymm0, 480(%rax)
.L4:
	leal	(%r9,%rsi), %eax
	subl	%esi, %edx
	cmpl	%esi, %ecx
	je	.L29
	movslq	%eax, %rcx
	movl	$42, %r9d
	leaq	(%rdi,%rcx,4), %rcx
	movl	(%rcx), %esi
	imull	%r9d, %esi
	cmpl	$1, %edx
	movl	%esi, (%rcx)
	leal	1(%rax), %ecx
	je	.L29
	movslq	%ecx, %rcx
	movl	$42, %r8d
	leaq	(%rdi,%rcx,4), %rcx
	movl	(%rcx), %esi
	imull	%r8d, %esi
	cmpl	$2, %edx
	movl	%esi, (%rcx)
	leal	2(%rax), %ecx
	je	.L29
	movslq	%ecx, %rcx
	movl	$42, %r11d
	leaq	(%rdi,%rcx,4), %rcx
	movl	(%rcx), %esi
	imull	%r11d, %esi
	cmpl	$3, %edx
	movl	%esi, (%rcx)
	leal	3(%rax), %ecx
	je	.L29
	movslq	%ecx, %rcx
	movl	$42, %r10d
	leaq	(%rdi,%rcx,4), %rcx
	movl	(%rcx), %esi
	imull	%r10d, %esi
	cmpl	$4, %edx
	movl	%esi, (%rcx)
	leal	4(%rax), %ecx
	je	.L29
	movslq	%ecx, %rcx
	movl	$42, %r9d
	leaq	(%rdi,%rcx,4), %rcx
	movl	(%rcx), %esi
	imull	%r9d, %esi
	cmpl	$5, %edx
	movl	%esi, (%rcx)
	leal	5(%rax), %ecx
	je	.L29
	movslq	%ecx, %rcx
	movl	$42, %r8d
	addl	$6, %eax
	leaq	(%rdi,%rcx,4), %rcx
	movl	(%rcx), %esi
	imull	%r8d, %esi
	cmpl	$6, %edx
	movl	%esi, (%rcx)
	je	.L29
	cltq
	movl	$42, %r10d
	leaq	(%rdi,%rax,4), %rax
	movl	(%rax), %edx
	imull	%r10d, %edx
	movl	%edx, (%rax)
.L29:
	vzeroupper
	ret
	.p2align 4,,10
	.p2align 3
.L7:
	movl	$128, %edx
	xorl	%r9d, %r9d
	jmp	.L2
	.p2align 4,,10
	.p2align 3
.L13:
	movl	$122, %edx
	movl	$6, %r9d
	jmp	.L2
	.p2align 4,,10
	.p2align 3
.L8:
	movl	$127, %edx
	movl	$1, %r9d
	jmp	.L2
	.p2align 4,,10
	.p2align 3
.L9:
	movl	$126, %edx
	movl	$2, %r9d
	jmp	.L2
	.p2align 4,,10
	.p2align 3
.L10:
	movl	$125, %edx
	movl	$3, %r9d
	jmp	.L2
	.p2align 4,,10
	.p2align 3
.L11:
	movl	$124, %edx
	movl	$4, %r9d
	jmp	.L2
	.p2align 4,,10
	.p2align 3
.L12:
	movl	$123, %edx
	movl	$5, %r9d
	jmp	.L2
	.cfi_endproc
.LFE0:
	.size	set, .-set
	.section	.rodata.cst32,"aM",@progbits,32
	.align 32
.LC0:
	.long	42
	.long	42
	.long	42
	.long	42
	.long	42
	.long	42
	.long	42
	.long	42
	.ident	"GCC: (Debian 4.8.1-10) 4.8.1"
	.section	.note.GNU-stack,"",@progbits

