[Bug middle-end/108410] New: x264 averaging loop not optimized well for avx512

hubicka at gcc dot gnu.org gcc-bugzilla@gcc.gnu.org
Sat Jan 14 20:55:39 GMT 2023


https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108410

            Bug ID: 108410
           Summary: x264 averaging loop not optimized well for avx512
           Product: gcc
           Version: 13.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: middle-end
          Assignee: unassigned at gcc dot gnu.org
          Reporter: hubicka at gcc dot gnu.org
  Target Milestone: ---

The x264 benchmark has a loop averaging two unsigned char arrays that is
executed with relatively low trip counts, which does not play well with our
vectorized code.  For AVX512 most of the time is spent in the unvectorized
variant, since the average number of iterations is too small to reach the
vector code.

This table shows runtimes (in seconds) of averaging a given block size with
the scalar loop, with the vectorized loop at each preferred vector width, and
with aocc codegen:

size   scalar     128     256     512    aocc
    2    8.13    9.49    9.49    9.49    9.49
    4    5.79    6.10    6.10    7.45    6.78
    6    5.44    5.43    5.42    6.78    5.87
    8    5.19    2.71    5.31    6.44    5.42
   12    5.14    3.17    5.33    6.10    4.97
   16    4.85    1.19    1.53    5.93    1.36
   20    4.82    2.03    1.90    6.10    1.90
   24    4.60    0.96    2.58    6.10    2.26
   28    4.51    1.55    2.97    6.00    2.55
   32    4.52    0.68    0.60    0.60    0.77
   34    4.77    0.96    0.88    0.80    0.96
   38    4.42    1.36    1.37    1.17    1.29
   42    4.40    0.84    1.82    1.73    1.63

So for sizes 2-8 the scalar loop wins.
For sizes 12-16 the 128-bit vectorization wins, while sizes 20-28 behave
funnily.  However, avx512 vectorization is a huge loss for all sizes up to 31
bytes.  aocc seems to win for 16 bytes.
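
For comparison, a masked AVX512 epilogue would handle any tail of 1-63 bytes
in a single predicated vpavgb, so short blocks would never reach the scalar
loop at all.  A minimal hand-written sketch of the idea (a hypothetical
avg_masked using AVX512BW intrinsics; not what either compiler currently
emits):

#include <immintrin.h>

void
avg_masked (unsigned char *a, unsigned char *b, unsigned char *c, int size)
{
  int i = 0;
  /* Full 64-byte iterations.  */
  for (; i + 64 <= size; i += 64)
    {
      __m512i vb = _mm512_loadu_si512 (b + i);
      __m512i vc = _mm512_loadu_si512 (c + i);
      /* vpavgb computes (x + y + 1) >> 1, matching the C loop.  */
      _mm512_storeu_si512 (a + i, _mm512_avg_epu8 (vb, vc));
    }
  /* Masked tail: one predicated iteration covers 1-63 remaining bytes.  */
  if (i < size)
    {
      __mmask64 k = (__mmask64) (~0ULL >> (64 - (size - i)));
      __m512i vb = _mm512_maskz_loadu_epi8 (k, b + i);
      __m512i vc = _mm512_maskz_loadu_epi8 (k, c + i);
      _mm512_mask_storeu_epi8 (a + i, k, _mm512_avg_epu8 (vb, vc));
    }
}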

Note that one problem is that for 256-bit vectors we completely peel the
epilogue loop (since the trip count fits within max-completely-peeled-insns
and max-completely-peel-times).  Bumping both parameters to twice their
default makes the avx512 epilogue unrolled too, but it does not seem to help
the x264 benchmark itself.
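
For reference, the bump can be passed via --param; the defaults assumed here
(200 insns, 16 peels) are what I believe current GCC uses:

gcc -Ofast -march=native -mprefer-vector-width=512 \
    --param max-completely-peeled-insns=400 \
    --param max-completely-peel-times=32 bmk.c -o bmk.512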

bmk.c:
#include <stdlib.h>
unsigned char a[10000];
unsigned char b[10000];
unsigned char c[10000];

__attribute__ ((weak))
void
avg (unsigned char *a, unsigned char *b, unsigned char *c, int size)
{
  for (int i = 0; i < size; i++)
    {
      a[i] = (b[i] + c[i] + 1) >> 1;
    }
}
int
main(int argc, char**argv)
{
  int size = atoi (argv[1]);
  /* Keep the total amount of work constant (~10G bytes processed), so
     runtimes are directly comparable across block sizes.  */
  for (long i = 0; i < 10000000000 / size; i++)
    {
      avg (a, b, c, size);
    }
  return 0;
}

bmk.sh:
gcc -Ofast -march=native bmk.c -fno-tree-vectorize -o bmk.scalar
gcc -Ofast -march=native bmk.c -mprefer-vector-width=128 -o bmk.128
gcc -Ofast -march=native bmk.c -mprefer-vector-width=256 -o bmk.256
gcc -Ofast -march=native bmk.c -mprefer-vector-width=512 -o bmk.512
~/aocc-compiler-4.0.0/bin/clang -Ofast -march=native bmk.c -o bmk.aocc

echo "size   scalar     128     256     512    aocc"
for size in 2 4 6 8 12 16 20 24 28 32 34 38 42
do
  scalar=`time -f "%e" ./bmk.scalar $size 2>&1`
  v128=`time -f "%e" ./bmk.128 $size 2>&1`
  v256=`time -f "%e" ./bmk.256 $size 2>&1`
  v512=`time -f "%e" ./bmk.512 $size 2>&1`
  aocc=`time -f "%e" ./bmk.aocc $size 2>&1`
  printf "%5i %7.2f %7.2f %7.2f %7.2f %7.2f\n" $size $scalar $v128 $v256 $v512 $aocc
done


aocc codegen:
# %bb.0:                                # %entry
        pushq   %rbx
        .cfi_def_cfa_offset 16
        .cfi_offset %rbx, -16
        testl   %ecx, %ecx
        jle     .LBB0_15
# %bb.1:                                # %iter.check
        movl    %ecx, %r8d
        cmpl    $16, %ecx
        jae     .LBB0_3
# %bb.2:
        xorl    %eax, %eax
        jmp     .LBB0_14
.LBB0_3:                                # %vector.memcheck
        leaq    (%rsi,%r8), %r9
        leaq    (%rdi,%r8), %rax
        leaq    (%rdx,%r8), %r10
        cmpq    %rdi, %r9
        seta    %r11b
        cmpq    %rsi, %rax
        seta    %bl
        cmpq    %rdi, %r10
        seta    %r9b
        cmpq    %rdx, %rax
        seta    %r10b
        xorl    %eax, %eax
        testb   %bl, %r11b
        jne     .LBB0_14
# %bb.4:                                # %vector.memcheck
        andb    %r10b, %r9b
        jne     .LBB0_14
# %bb.5:                                # %vector.main.loop.iter.check
        cmpl    $128, %ecx
        jae     .LBB0_7
# %bb.6:
        xorl    %eax, %eax
        jmp     .LBB0_11
.LBB0_7:                                # %vector.ph
        movl    %r8d, %eax
        andl    $-128, %eax
        xorl    %ecx, %ecx
        .p2align        4, 0x90
.LBB0_8:                                # %vector.body
                                        # =>This Inner Loop Header: Depth=1
        vmovdqu (%rdx,%rcx), %ymm0
        vmovdqu 32(%rdx,%rcx), %ymm1
        vmovdqu 64(%rdx,%rcx), %ymm2
        vmovdqu 96(%rdx,%rcx), %ymm3
        vpavgb  (%rsi,%rcx), %ymm0, %ymm0
        vpavgb  32(%rsi,%rcx), %ymm1, %ymm1
        vpavgb  64(%rsi,%rcx), %ymm2, %ymm2
        vpavgb  96(%rsi,%rcx), %ymm3, %ymm3
        vmovdqu %ymm0, (%rdi,%rcx)
        vmovdqu %ymm1, 32(%rdi,%rcx)
        vmovdqu %ymm2, 64(%rdi,%rcx)
        vmovdqu %ymm3, 96(%rdi,%rcx)
        subq    $-128, %rcx
        cmpq    %rcx, %rax
        jne     .LBB0_8
# %bb.9:                                # %middle.block
        cmpq    %r8, %rax
        je      .LBB0_15
# %bb.10:                               # %vec.epilog.iter.check
        testb   $112, %r8b
        je      .LBB0_14
.LBB0_11:                               # %vec.epilog.ph
        movq    %rax, %rcx
        movl    %r8d, %eax
        andl    $-16, %eax
        .p2align        4, 0x90
.LBB0_12:                               # %vec.epilog.vector.body
                                        # =>This Inner Loop Header: Depth=1
        vmovdqu (%rdx,%rcx), %xmm0
        vpavgb  (%rsi,%rcx), %xmm0, %xmm0
        vmovdqu %xmm0, (%rdi,%rcx)
        addq    $16, %rcx
        cmpq    %rcx, %rax
        jne     .LBB0_12
# %bb.13:                               # %vec.epilog.middle.block
        cmpq    %r8, %rax
        je      .LBB0_15
        .p2align        4, 0x90
.LBB0_14:                               # %for.body
                                        # =>This Inner Loop Header: Depth=1
        movzbl  (%rsi,%rax), %ecx
        movzbl  (%rdx,%rax), %ebx
        leal    1(%rcx,%rbx), %ecx
        shrl    %ecx
        movb    %cl, (%rdi,%rax)
        incq    %rax
        cmpq    %rax, %r8
        jne     .LBB0_14
.LBB0_15:                               # %for.cond.cleanup
        popq    %rbx
        .cfi_def_cfa_offset 8
        vzeroupper
        retq


Trunk currently generates the following.  The prologue can be simplified
(i.e. cmpl $30, %eax replaced by cmpl $31, %ecx, saving the leal that
computes size - 1), and there is a 256-bit move at .L4 that is just not used
for small block sizes because of the prologue check.
avg:
.LFB11:
        .cfi_startproc
        movq    %rdx, %r8
        movl    %ecx, %edx
        testl   %ecx, %ecx
        jle     .L27
        leal    -1(%rcx), %eax
        movl    %ecx, %r9d
        cmpl    $30, %eax
        jbe     .L3
        leaq    1(%rsi), %r10
        movq    %rdi, %rcx
        subq    %r10, %rcx
        cmpq    $62, %rcx
        jbe     .L3
        leaq    1(%r8), %r10
        movq    %rdi, %rcx
        subq    %r10, %rcx
        cmpq    $62, %rcx
        jbe     .L3
        cmpl    $62, %eax
        jbe     .L12
        movl    %edx, %ecx
        xorl    %eax, %eax
        shrl    $6, %ecx
        salq    $6, %rcx
        .p2align 4
        .p2align 3
.L5:
        vmovdqu8        (%rsi,%rax), %zmm1
        vpavgb  (%r8,%rax), %zmm1, %zmm0
        vmovdqu8        %zmm0, (%rdi,%rax)
        addq    $64, %rax
        cmpq    %rax, %rcx
        jne     .L5
        movl    %edx, %eax
        andl    $-64, %eax
        movl    %eax, %ecx
        cmpl    %eax, %edx
        je      .L26
        movl    %edx, %r9d
        subl    %eax, %r9d
        leal    -1(%r9), %r10d
        cmpl    $30, %r10d
        jbe     .L7
.L4:
        vmovdqu8        (%rsi,%rcx), %ymm2
        vpavgb  (%r8,%rcx), %ymm2, %ymm0
        vmovdqu8        %ymm0, (%rdi,%rcx)
        movl    %r9d, %ecx
        andl    $-32, %ecx
        addl    %ecx, %eax
        andl    $31, %r9d
        je      .L26
.L7:
        cltq
        .p2align 4
        .p2align 3
.L9:
        movzbl  (%rsi,%rax), %r9d
        movzbl  (%r8,%rax), %ecx
        leal    1(%r9,%rcx), %ecx
        sarl    %ecx
        movb    %cl, (%rdi,%rax)
        incq    %rax
        cmpl    %eax, %edx
        jg      .L9
.L26:
        vzeroupper
.L27:
        ret
        .p2align 4
        .p2align 3
.L3:
        movslq  %edx, %rcx
        xorl    %eax, %eax
        .p2align 4
        .p2align 3
.L10:
        movzbl  (%rsi,%rax), %r9d
        movzbl  (%r8,%rax), %edx
        leal    1(%r9,%rdx), %edx
        sarl    %edx
        movb    %dl, (%rdi,%rax)
        incq    %rax
        cmpq    %rcx, %rax
        jne     .L10
        ret
.L12:
        xorl    %ecx, %ecx
        xorl    %eax, %eax
        jmp     .L4
        .cfi_endproc
.LFE11:
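
The simplified prologue check mentioned above could be just the following (my
sketch, not compiler output; size is still live in %ecx and size > 0 is
already established at that point):

        cmpl    $31, %ecx
        jbe     .L3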

