[Bug middle-end/108410] New: x264 averaging loop not optimized well for avx512
hubicka at gcc dot gnu.org
gcc-bugzilla@gcc.gnu.org
Sat Jan 14 20:55:39 GMT 2023
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108410
Bug ID: 108410
Summary: x264 averaging loop not optimized well for avx512
Product: gcc
Version: 13.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: middle-end
Assignee: unassigned at gcc dot gnu.org
Reporter: hubicka at gcc dot gnu.org
Target Milestone: ---
The x264 benchmark has a loop averaging two unsigned char arrays that is
executed with relatively low trip counts, which does not play well with our
vectorized code. For AVX512, most of the time is spent in the unvectorized
variant, since the average number of iterations is too small to ever reach the
vector code.
The table below shows runtimes (in seconds) of averaging a given block size
with the scalar loop, with the vectorized loop for the individual vector
widths, and with aocc codegen:

size   scalar    128    256    512   aocc
   2     8.13   9.49   9.49   9.49   9.49
   4     5.79   6.10   6.10   7.45   6.78
   6     5.44   5.43   5.42   6.78   5.87
   8     5.19   2.71   5.31   6.44   5.42
  12     5.14   3.17   5.33   6.10   4.97
  16     4.85   1.19   1.53   5.93   1.36
  20     4.82   2.03   1.90   6.10   1.90
  24     4.60   0.96   2.58   6.10   2.26
  28     4.51   1.55   2.97   6.00   2.55
  32     4.52   0.68   0.60   0.60   0.77
  34     4.77   0.96   0.88   0.80   0.96
  38     4.42   1.36   1.37   1.17   1.29
  42     4.40   0.84   1.82   1.73   1.63
So for sizes 2-8 the scalar loop wins. For sizes 12-16 128-bit vectorization
wins, while 20-28 behaves oddly. However, avx512 vectorization is a huge loss
for all sizes up to 31 bytes. aocc seems to win for 16 bytes.
Note that one problem is that for the 256-bit vector we completely peel the
epilogue loop (since the trip count fits in max-completely-peeled-insns and
max-completely-peel-times). Doubling both parameters makes the avx512 epilogue
unrolled too, but it does not seem to help the x264 benchmark itself.
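For reference, a quick way to experiment with this is to double both params on
the command line; the values below assume the current defaults of 200 and 16,
which may differ on trunk:

# Assumed defaults are 200/16; adjust to the actual values when testing.
gcc -Ofast -march=native -mprefer-vector-width=512 \
    --param max-completely-peeled-insns=400 \
    --param max-completely-peel-times=32 bmk.c -o bmk.512.peel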
bmk.c:
#include <stdlib.h>

unsigned char a[10000];
unsigned char b[10000];
unsigned char c[10000];

__attribute__ ((weak))
void
avg (unsigned char *a, unsigned char *b, unsigned char *c, int size)
{
  for (int i = 0; i < size; i++)
    {
      a[i] = (b[i] + c[i] + 1) >> 1;
    }
}

int
main (int argc, char **argv)
{
  int size = atoi (argv[1]);
  for (long i = 0; i < 10000000000 / size; i++)
    {
      avg (a, b, c, size);
    }
  return 0;
}
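As a side note, the loop body is exactly the rounding average implemented by
the x86 pavgb instruction, which is why the vectorized versions below boil
down to vpavgb. A minimal intrinsic sketch of one 16-byte step (an
illustration only, not either compiler's output; avg16 is a made-up helper
name):

#include <emmintrin.h>

/* One 16-byte step: _mm_avg_epu8 computes (b[i] + c[i] + 1) >> 1
   per unsigned byte, exactly matching the C loop body.  */
static void
avg16 (unsigned char *a, const unsigned char *b, const unsigned char *c)
{
  __m128i vb = _mm_loadu_si128 ((const __m128i *) b);
  __m128i vc = _mm_loadu_si128 ((const __m128i *) c);
  _mm_storeu_si128 ((__m128i *) a, _mm_avg_epu8 (vb, vc));
}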
bmk.sh:
gcc -Ofast -march=native bmk.c -fno-tree-vectorize -o bmk.scalar
gcc -Ofast -march=native bmk.c -mprefer-vector-width=128 -o bmk.128
gcc -Ofast -march=native bmk.c -mprefer-vector-width=256 -o bmk.256
gcc -Ofast -march=native bmk.c -mprefer-vector-width=512 -o bmk.512
~/aocc-compiler-4.0.0/bin/clang -Ofast -march=native bmk.c -o bmk.aocc
echo "size scalar 128 256 512 aocc"
for size in 2 4 6 8 12 16 20 24 28 32 34 38 42
do
scalar=`time -f "%e" ./bmk.scalar $size 2>&1`
v128=`time -f "%e" ./bmk.128 $size 2>&1`
v256=`time -f "%e" ./bmk.256 $size 2>&1`
v512=`time -f "%e" ./bmk.512 $size 2>&1`
aocc=`time -f "%e" ./bmk.aocc $size 2>&1`
printf "%5i %7.2f %7.2f %7.2f %7.2f %7.2f\n" $size $scalar $v128 $v256 $v512
$aocc
done
aocc codegen:
# %bb.0: # %entry
pushq %rbx
.cfi_def_cfa_offset 16
.cfi_offset %rbx, -16
testl %ecx, %ecx
jle .LBB0_15
# %bb.1: # %iter.check
movl %ecx, %r8d
cmpl $16, %ecx
jae .LBB0_3
# %bb.2:
xorl %eax, %eax
jmp .LBB0_14
.LBB0_3: # %vector.memcheck
leaq (%rsi,%r8), %r9
leaq (%rdi,%r8), %rax
leaq (%rdx,%r8), %r10
cmpq %rdi, %r9
seta %r11b
cmpq %rsi, %rax
seta %bl
cmpq %rdi, %r10
seta %r9b
cmpq %rdx, %rax
seta %r10b
xorl %eax, %eax
testb %bl, %r11b
jne .LBB0_14
# %bb.4: # %vector.memcheck
andb %r10b, %r9b
jne .LBB0_14
# %bb.5: # %vector.main.loop.iter.check
cmpl $128, %ecx
jae .LBB0_7
# %bb.6:
xorl %eax, %eax
jmp .LBB0_11
.LBB0_7: # %vector.ph
movl %r8d, %eax
andl $-128, %eax
xorl %ecx, %ecx
.p2align 4, 0x90
.LBB0_8: # %vector.body
# =>This Inner Loop Header: Depth=1
vmovdqu (%rdx,%rcx), %ymm0
vmovdqu 32(%rdx,%rcx), %ymm1
vmovdqu 64(%rdx,%rcx), %ymm2
vmovdqu 96(%rdx,%rcx), %ymm3
vpavgb (%rsi,%rcx), %ymm0, %ymm0
vpavgb 32(%rsi,%rcx), %ymm1, %ymm1
vpavgb 64(%rsi,%rcx), %ymm2, %ymm2
vpavgb 96(%rsi,%rcx), %ymm3, %ymm3
vmovdqu %ymm0, (%rdi,%rcx)
vmovdqu %ymm1, 32(%rdi,%rcx)
vmovdqu %ymm2, 64(%rdi,%rcx)
vmovdqu %ymm3, 96(%rdi,%rcx)
subq $-128, %rcx
cmpq %rcx, %rax
jne .LBB0_8
# %bb.9: # %middle.block
cmpq %r8, %rax
je .LBB0_15
# %bb.10: # %vec.epilog.iter.check
testb $112, %r8b
je .LBB0_14
.LBB0_11: # %vec.epilog.ph
movq %rax, %rcx
movl %r8d, %eax
andl $-16, %eax
.p2align 4, 0x90
.LBB0_12: # %vec.epilog.vector.body
# =>This Inner Loop Header: Depth=1
vmovdqu (%rdx,%rcx), %xmm0
vpavgb (%rsi,%rcx), %xmm0, %xmm0
vmovdqu %xmm0, (%rdi,%rcx)
addq $16, %rcx
cmpq %rcx, %rax
jne .LBB0_12
# %bb.13: # %vec.epilog.middle.block
cmpq %r8, %rax
je .LBB0_15
.p2align 4, 0x90
.LBB0_14: # %for.body
# =>This Inner Loop Header: Depth=1
movzbl (%rsi,%rax), %ecx
movzbl (%rdx,%rax), %ebx
leal 1(%rcx,%rbx), %ecx
shrl %ecx
movb %cl, (%rdi,%rax)
incq %rax
cmpq %rax, %r8
jne .LBB0_14
.LBB0_15: # %for.cond.cleanup
popq %rbx
.cfi_def_cfa_offset 8
vzeroupper
retq
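In rough C form, the control flow aocc generates looks like the sketch below
(the thresholds 16 and 128 match the assembly; the overlap check and helper
no_overlap are simplified stand-ins, and the vector bodies are written as
scalar loops with comments noting what the real code uses):

/* Simplified stand-in for the vector.memcheck block above.  */
static int
no_overlap (unsigned char *a, unsigned char *b, unsigned char *c, int n)
{
  return (a + n <= b || b + n <= a) && (a + n <= c || c + n <= a);
}

void
avg_sketch (unsigned char *a, unsigned char *b, unsigned char *c, int size)
{
  int i = 0;
  if (size >= 16 && no_overlap (a, b, c, size))
    {
      /* Main loop: 128 bytes/iteration (four 32-byte vpavgb in the asm).  */
      for (; i + 128 <= size; i += 128)
        for (int j = i; j < i + 128; j++)
          a[j] = (b[j] + c[j] + 1) >> 1;
      /* Vector epilogue: 16 bytes/iteration (one xmm vpavgb).  */
      for (; i + 16 <= size; i += 16)
        for (int j = i; j < i + 16; j++)
          a[j] = (b[j] + c[j] + 1) >> 1;
    }
  /* Scalar tail for the remainder (and the overlapping case).  */
  for (; i < size; i++)
    a[i] = (b[i] + c[i] + 1) >> 1;
}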
Trunk generates the following. The prologue can be simplified (i.e. the
cmpl $30, %eax replaced by cmpq $31, %rcx), and there is a 256-bit move at .L4
that is just not used for small block sizes because of the prologue check.
avg:
.LFB11:
.cfi_startproc
movq %rdx, %r8
movl %ecx, %edx
testl %ecx, %ecx
jle .L27
leal -1(%rcx), %eax
movl %ecx, %r9d
cmpl $30, %eax
jbe .L3
leaq 1(%rsi), %r10
movq %rdi, %rcx
subq %r10, %rcx
cmpq $62, %rcx
jbe .L3
leaq 1(%r8), %r10
movq %rdi, %rcx
subq %r10, %rcx
cmpq $62, %rcx
jbe .L3
cmpl $62, %eax
jbe .L12
movl %edx, %ecx
xorl %eax, %eax
shrl $6, %ecx
salq $6, %rcx
.p2align 4
.p2align 3
.L5:
vmovdqu8 (%rsi,%rax), %zmm1
vpavgb (%r8,%rax), %zmm1, %zmm0
vmovdqu8 %zmm0, (%rdi,%rax)
addq $64, %rax
cmpq %rax, %rcx
jne .L5
movl %edx, %eax
andl $-64, %eax
movl %eax, %ecx
cmpl %eax, %edx
je .L26
movl %edx, %r9d
subl %eax, %r9d
leal -1(%r9), %r10d
cmpl $30, %r10d
jbe .L7
.L4:
vmovdqu8 (%rsi,%rcx), %ymm2
vpavgb (%r8,%rcx), %ymm2, %ymm0
vmovdqu8 %ymm0, (%rdi,%rcx)
movl %r9d, %ecx
andl $-32, %ecx
addl %ecx, %eax
andl $31, %r9d
je .L26
.L7:
cltq
.p2align 4
.p2align 3
.L9:
movzbl (%rsi,%rax), %r9d
movzbl (%r8,%rax), %ecx
leal 1(%r9,%rcx), %ecx
sarl %ecx
movb %cl, (%rdi,%rax)
incq %rax
cmpl %eax, %edx
jg .L9
.L26:
vzeroupper
.L27:
ret
.p2align 4
.p2align 3
.L3:
movslq %edx, %rcx
xorl %eax, %eax
.p2align 4
.p2align 3
.L10:
movzbl (%rsi,%rax), %r9d
movzbl (%r8,%rax), %edx
leal 1(%r9,%rdx), %edx
sarl %edx
movb %dl, (%rdi,%rax)
incq %rax
cmpq %rcx, %rax
jne .L10
ret
.L12:
xorl %ecx, %ecx
xorl %eax, %eax
jmp .L4
.cfi_endproc
.LFE11: