[PATCH 2/2][RFC] Add loop masking support for x86

Richard Biener <rguenther@suse.de>
Thu Jul 15 15:15:19 GMT 2021


On Thu, 15 Jul 2021, Richard Sandiford wrote:

> Richard Biener <rguenther@suse.de> writes:
> > On Thu, 15 Jul 2021, Hongtao Liu wrote:
> >
> >> On Thu, Jul 15, 2021 at 6:45 PM Richard Biener via Gcc-patches
> >> <gcc-patches@gcc.gnu.org> wrote:
> >> >
> >> > On Thu, Jul 15, 2021 at 12:30 PM Richard Biener <rguenther@suse.de> wrote:
> >> > >
> >> > > The following extends the existing loop masking support using
> >> > > SVE WHILE_ULT to x86 by providing an alternate way to produce the
> >> > > mask using VEC_COND_EXPRs.  So with --param vect-partial-vector-usage
> >> > > you can now enable masked vectorized epilogues (=1) or fully
> >> > > masked vector loops (=2).
> >> > >
> >> > > What's missing is using a scalar IV for the loop control
> >> > > (but in principle AVX512 can use the mask here - just the patch
> >> > > doesn't seem to work for AVX512 yet for some reason - likely
> >> > > expand_vec_cond_expr_p doesn't work there).  What's also missing
> >> > > is providing more support for predicated operations in the case
> >> > > of reductions either via VEC_COND_EXPRs or via implementing
> >> > > some of the .COND_{ADD,SUB,MUL...} internal functions as mapping
> >> > > to masked AVX512 operations.
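
(For illustration, a scalar sketch of what each lane of the compare-generated
mask computes; the function and variable names here are mine and this is not
what the vectorizer emits.  AVX2 has no unsigned vector compare, so the
sequences below bias both operands by 0x80000000 - the vpsubd %ymm15
instructions - and use the signed vpcmpgtd instead, which yields the same
result.)

/* Sketch only: lane 'lane' of the loop mask is active while
   iv + lane < niters (unsigned).  The loop exits once all lanes are
   zero (vptest/jne), and the all-ones lanes drive vpmaskmovd.  */
static inline unsigned
lane_active_p (unsigned iv, unsigned lane, unsigned niters)
{
  return iv + lane < niters ? 0xffffffffu : 0u;
}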
> >> > >
> >> > > For AVX2 and
> >> > >
> >> > > int foo (unsigned *a, unsigned * __restrict b, int n)
> >> > > {
> >> > >   unsigned sum = 1;
> >> > >   for (int i = 0; i < n; ++i)
> >> > >     b[i] += a[i];
> >> > >   return sum;
> >> > > }
> >> > >
> >> > > we get
> >> > >
> >> > > .L3:
> >> > >         vpmaskmovd      (%rsi,%rax), %ymm0, %ymm3
> >> > >         vpmaskmovd      (%rdi,%rax), %ymm0, %ymm1
> >> > >         addl    $8, %edx
> >> > >         vpaddd  %ymm3, %ymm1, %ymm1
> >> > >         vpmaskmovd      %ymm1, %ymm0, (%rsi,%rax)
> >> > >         vmovd   %edx, %xmm1
> >> > >         vpsubd  %ymm15, %ymm2, %ymm0
> >> > >         addq    $32, %rax
> >> > >         vpbroadcastd    %xmm1, %ymm1
> >> > >         vpaddd  %ymm4, %ymm1, %ymm1
> >> > >         vpsubd  %ymm15, %ymm1, %ymm1
> >> > >         vpcmpgtd        %ymm1, %ymm0, %ymm0
> >> > >         vptest  %ymm0, %ymm0
> >> > >         jne     .L3
> >> > >
> >> > > for the fully masked loop body and for the masked epilogue
> >> > > we see
> >> > >
> >> > > .L4:
> >> > >         vmovdqu (%rsi,%rax), %ymm3
> >> > >         vpaddd  (%rdi,%rax), %ymm3, %ymm0
> >> > >         vmovdqu %ymm0, (%rsi,%rax)
> >> > >         addq    $32, %rax
> >> > >         cmpq    %rax, %rcx
> >> > >         jne     .L4
> >> > >         movl    %edx, %eax
> >> > >         andl    $-8, %eax
> >> > >         testb   $7, %dl
> >> > >         je      .L11
> >> > > .L3:
> >> > >         subl    %eax, %edx
> >> > >         vmovdqa .LC0(%rip), %ymm1
> >> > >         salq    $2, %rax
> >> > >         vmovd   %edx, %xmm0
> >> > >         movl    $-2147483648, %edx
> >> > >         addq    %rax, %rsi
> >> > >         vmovd   %edx, %xmm15
> >> > >         vpbroadcastd    %xmm0, %ymm0
> >> > >         vpbroadcastd    %xmm15, %ymm15
> >> > >         vpsubd  %ymm15, %ymm1, %ymm1
> >> > >         vpsubd  %ymm15, %ymm0, %ymm0
> >> > >         vpcmpgtd        %ymm1, %ymm0, %ymm0
> >> > >         vpmaskmovd      (%rsi), %ymm0, %ymm1
> >> > >         vpmaskmovd      (%rdi,%rax), %ymm0, %ymm2
> >> > >         vpaddd  %ymm2, %ymm1, %ymm1
> >> > >         vpmaskmovd      %ymm1, %ymm0, (%rsi)
> >> > > .L11:
> >> > >         vzeroupper
> >> > >
> >> > > compared to
> >> > >
> >> > > .L3:
> >> > >         movl    %edx, %r8d
> >> > >         subl    %eax, %r8d
> >> > >         leal    -1(%r8), %r9d
> >> > >         cmpl    $2, %r9d
> >> > >         jbe     .L6
> >> > >         leaq    (%rcx,%rax,4), %r9
> >> > >         vmovdqu (%rdi,%rax,4), %xmm2
> >> > >         movl    %r8d, %eax
> >> > >         andl    $-4, %eax
> >> > >         vpaddd  (%r9), %xmm2, %xmm0
> >> > >         addl    %eax, %esi
> >> > >         andl    $3, %r8d
> >> > >         vmovdqu %xmm0, (%r9)
> >> > >         je      .L2
> >> > > .L6:
> >> > >         movslq  %esi, %r8
> >> > >         leaq    0(,%r8,4), %rax
> >> > >         movl    (%rdi,%r8,4), %r8d
> >> > >         addl    %r8d, (%rcx,%rax)
> >> > >         leal    1(%rsi), %r8d
> >> > >         cmpl    %r8d, %edx
> >> > >         jle     .L2
> >> > >         addl    $2, %esi
> >> > >         movl    4(%rdi,%rax), %r8d
> >> > >         addl    %r8d, 4(%rcx,%rax)
> >> > >         cmpl    %esi, %edx
> >> > >         jle     .L2
> >> > >         movl    8(%rdi,%rax), %edx
> >> > >         addl    %edx, 8(%rcx,%rax)
> >> > > .L2:
> >> > >
> >> > > I'm giving this a little testing right now but will dig into why
> >> > > I don't get masked loops when AVX512 is enabled.
> >> >
> >> > Ah, a simple thinko - rgroup_controls vectypes seem to be
> >> > always VECTOR_BOOLEAN_TYPE_P and thus we can
> >> > use expand_vec_cmp_expr_p.  The AVX512 fully masked
> >> > loop then looks like
> >> >
> >> > .L3:
> >> >         vmovdqu32       (%rsi,%rax,4), %ymm2{%k1}
> >> >         vmovdqu32       (%rdi,%rax,4), %ymm1{%k1}
> >> >         vpaddd  %ymm2, %ymm1, %ymm0
> >> >         vmovdqu32       %ymm0, (%rsi,%rax,4){%k1}
> >> >         addq    $8, %rax
> >> >         vpbroadcastd    %eax, %ymm0
> >> >         vpaddd  %ymm4, %ymm0, %ymm0
> >> >         vpcmpud $6, %ymm0, %ymm3, %k1
> >> >         kortestb        %k1, %k1
> >> >         jne     .L3
> >> >
> >> > I guess for x86 it's not worth preserving the VEC_COND_EXPR
> >> > mask generation but other archs may not provide all required vec_cmp
> >> > expanders.
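
(Again purely for illustration: with AVX512F/AVX512VL the mask update in the
loop above is essentially the intrinsics sketch below.  The function name and
locals are mine; vpcmpud writes the compare result straight into a %k
register and kortestb tests it for the loop branch.)

#include <immintrin.h>

/* Sketch of the AVX512 mask update: lane i stays active while
   iv + i < niters, as an unsigned compare into a mask register.
   Needs -mavx512f -mavx512vl.  */
static inline __mmask8
next_loop_mask (unsigned iv, unsigned niters)
{
  __m256i lanes = _mm256_setr_epi32 (0, 1, 2, 3, 4, 5, 6, 7);
  __m256i idx = _mm256_add_epi32 (_mm256_set1_epi32 ((int) iv), lanes);
  return _mm256_cmp_epu32_mask (idx, _mm256_set1_epi32 ((int) niters),
                                _MM_CMPINT_LT);
}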
> >> 
> >> For the main loop, the fully-masked loop's codegen seems much worse.
> >> Basically, we need at least 4 instructions to do what while_ult does on arm.
> >> 
> >>          vpbroadcastd    %eax, %ymm0
> >>          vpaddd  %ymm4, %ymm0, %ymm0
> >>          vpcmpud $6, %ymm0, %ymm3, %k1
> >>          kortestb        %k1, %k1
> >> vs
> >>        whilelo(or some other while<op>)
> >> 
> >> More instructions are needed for AVX2 since there's no direct
> >> instruction for .COND_{ADD,SUB,...}.
> >> 
> >> original
> >> .L4:
> >>         vmovdqu (%rcx,%rax), %ymm1
> >>         vpaddd (%rdi,%rax), %ymm1, %ymm0
> >>         vmovdqu %ymm0, (%rcx,%rax)
> >>         addq $32, %rax
> >>         cmpq %rax, %rsi
> >>         jne .L4
> >> 
> >> vs
> >> AVX512 fully-masked loop
> >> .L3:
> >>          vmovdqu32       (%rsi,%rax,4), %ymm2{%k1}
> >>          vmovdqu32       (%rdi,%rax,4), %ymm1{%k1}
> >>          vpaddd  %ymm2, %ymm1, %ymm0
> >>          vmovdqu32       %ymm0, (%rsi,%rax,4){%k1}
> >>          addq    $8, %rax
> >>          vpbroadcastd    %eax, %ymm0
> >>          vpaddd  %ymm4, %ymm0, %ymm0
> >>          vpcmpud $6, %ymm0, %ymm3, %k1
> >>          kortestb        %k1, %k1
> >>          jne     .L3
> >> 
> >> vs
> >> AVX2 fully-masked loop
> >> .L3:
> >>          vpmaskmovd      (%rsi,%rax), %ymm0, %ymm3
> >>          vpmaskmovd      (%rdi,%rax), %ymm0, %ymm1
> >>          addl    $8, %edx
> >>          vpaddd  %ymm3, %ymm1, %ymm1
> >>          vpmaskmovd      %ymm1, %ymm0, (%rsi,%rax)
> >>          vmovd   %edx, %xmm1
> >>          vpsubd  %ymm15, %ymm2, %ymm0
> >>          addq    $32, %rax
> >>          vpbroadcastd    %xmm1, %ymm1
> >>          vpaddd  %ymm4, %ymm1, %ymm1
> >>          vpsubd  %ymm15, %ymm1, %ymm1
> >>          vpcmpgtd        %ymm1, %ymm0, %ymm0
> >>         vptest  %ymm0, %ymm0
> >>          jne     .L3
> >> 
> >> vs  arm64's code
> >> 
> >> .L3:
> >>     ld1w z1.s, p0/z, [x1, x3, lsl 2]
> >>     ld1w z0.s, p0/z, [x0, x3, lsl 2]
> >>     add z0.s, z0.s, z1.s
> >>     st1w z0.s, p0, [x1, x3, lsl 2]
> >>     add x3, x3, x4
> >>     whilelo p0.s, w3, w2
> >>     b.any .L3
> >
> > Yes, that's true - it might still be OK for vectorizing epilogues
> > and thus --param vect-partial-vector-usage=1
> >
> > Can AVX512 do any better than this?
> >
> >         vpbroadcastd    %eax, %ymm0
> >         vpaddd  %ymm4, %ymm0, %ymm0
> >         vpcmpud $6, %ymm0, %ymm3, %k1
> >
> > Note with multiple types involved things get even worse
> > since you need masks for each vector mode.  But as far as
> > I can see that's the same for SVE (but there we have the
> > single-instruction whilelo).  I guess we'll also generate
> > wrong code at the moment for the case where we need
> > multiple vectors to hold the full mask - vect_gen_while
> > doesn't seem to be prepared for this?
> >
> > So with
> >
> > int foo (unsigned long *a, unsigned * __restrict b, int n)
> > {
> >   unsigned sum = 1;
> >   for (int i = 0; i < n; ++i)
> >     {
> >       b[i] += a[i];
> >     }
> >   return sum;
> > }
> >
> > SVE uses
> >
> > .L3:
> >         ld1d    z0.d, p0/z, [x1, x3, lsl 3]
> >         ld1d    z1.d, p0/z, [x0, x3, lsl 3]
> >         adr     z0.d, [z0.d, z1.d, lsl 2]
> >         st1d    z0.d, p0, [x1, x3, lsl 3]
> >         add     x3, x3, x4
> >         whilelo p0.d, w3, w2
> >         b.any   .L3
> >
> > so p0 vs. p0/z, whatever that means, and it looks like
> > the vector add can somehow concatenate z0.d and z1.d.
> > Truly fascinating ;)
> 
> It looks like this is from a version in which “b” was also long.
> For the loop above I get:
> 
> .L3:
>         ld1d    z0.d, p0/z, [x0, x3, lsl 3]
>         ld1w    z1.d, p0/z, [x1, x3, lsl 2]
>         add     z0.s, z0.s, z1.s
>         st1w    z0.d, p0, [x1, x3, lsl 2]
>         add     x3, x3, x4
>         whilelo p0.d, w3, w2
>         b.any   .L3
> 
> where the ld1w is an extending load and the st1w is a truncating store.
> 
> But yeah, there's a hard-coded assumption that a mask for 2 SIs
> can also control 1 DI, etc.  For example:
> 
> void foo (unsigned long *a, unsigned * __restrict b, int n)
> {
>   for (int i = 0; i < n; ++i)
>     {
>       a[i * 2] += 1;
>       a[i * 2 + 1] += 2;
>       b[i * 4] += 1;
>       b[i * 4 + 1] += 2;
>       b[i * 4 + 2] += 3;
>       b[i * 4 + 3] += 4;
>     }
> }
> 
> becomes:
> 
> .L3:
>         ld1d    z0.d, p0/z, [x0]
>         add     z0.d, z0.d, z2.d
>         st1d    z0.d, p0, [x0]
>         ld1w    z0.s, p0/z, [x1, x3, lsl 2]
>         add     z0.s, z0.s, z1.s
>         st1w    z0.s, p0, [x1, x3, lsl 2]
>         add     x0, x0, x4
>         add     x3, x3, x5
>         whilelo p0.s, x3, x2
>         b.any   .L3
> 
> This is explained more in the big comment above rgroup_controls
> (guess you've already seen it).

OK, guess I was more looking at

#define N 32
int foo (unsigned long *a, unsigned long * __restrict b,
         unsigned int *c, unsigned int * __restrict d,
         int n)
{
  unsigned sum = 1;
  for (int i = 0; i < n; ++i)
    {
      b[i] += a[i];
      d[i] += c[i];
    }
  return sum;
}

where on x86 with AVX512 we vectorize with V8DI and V16SI and
generate two masks for the two copies of V8DI (VF is 16) and one
mask for V16SI.  With SVE I see

        punpklo p1.h, p0.b
        punpkhi p2.h, p0.b

that's something I expected to see for AVX512 as well, using the V16SI
mask and unpacking it into two V8DI ones.  But I see

        vpbroadcastd    %eax, %ymm0
        vpaddd  %ymm12, %ymm0, %ymm0
        vpcmpud $6, %ymm0, %ymm11, %k3
        vpbroadcastd    %eax, %xmm0
        vpaddd  %xmm10, %xmm0, %xmm0
        vpcmpud $1, %xmm7, %xmm0, %k1
        vpcmpud $6, %xmm0, %xmm8, %k2
        kortestb        %k1, %k1
        jne     .L3

so three %k masks are generated by vpcmpud.  I'll have to look at what
the magic for SVE is and why that doesn't trigger for x86 here.
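
For reference, the unpacking itself would be cheap on AVX512 since a %k
register is just a bitmask; roughly, as an intrinsics sketch for the loop
above (function and variable names are mine, not vectorizer output):

#include <immintrin.h>

/* Sketch: one V16SI mask controls the V16SI copy; the two V8DI copies
   use its low and high halves - the analogue of SVE punpklo/punpkhi.
   Needs -mavx512f.  */
void
masked_body (unsigned long *b, const unsigned long *a,
             unsigned *d, const unsigned *c, __mmask16 m)
{
  __mmask8 m_lo = (__mmask8) m;          /* lanes 0..7  */
  __mmask8 m_hi = (__mmask8) (m >> 8);   /* lanes 8..15 */

  __m512i a0 = _mm512_maskz_loadu_epi64 (m_lo, a);
  __m512i a1 = _mm512_maskz_loadu_epi64 (m_hi, a + 8);
  __m512i b0 = _mm512_maskz_loadu_epi64 (m_lo, b);
  __m512i b1 = _mm512_maskz_loadu_epi64 (m_hi, b + 8);
  _mm512_mask_storeu_epi64 (b,     m_lo, _mm512_add_epi64 (b0, a0));
  _mm512_mask_storeu_epi64 (b + 8, m_hi, _mm512_add_epi64 (b1, a1));

  __m512i c0 = _mm512_maskz_loadu_epi32 (m, c);
  __m512i d0 = _mm512_maskz_loadu_epi32 (m, d);
  _mm512_mask_storeu_epi32 (d, m, _mm512_add_epi32 (d0, c0));
}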

Richard.

> > It looks like --param vect-partial-vector-usage defaults to 2,
> > Power forces it to 1 (power10) or 0 otherwise.
> >
> > I think we'd need a target hook that toggles this per mode
> > so we could tune this dependent on AVX512 vectorization vs. AVX2.
> 
> Yeah, keying it off the architecture made sense for Power because the
> only purpose of len_load/store is to support partial vectorisation.
> But I agree a hook is needed now that we have MASK_LOAD/STORE for
> general use and are emulating WHILE_ULT.
> 
> Thanks,
> Richard
> 

-- 
Richard Biener <rguenther@suse.de>
SUSE Software Solutions Germany GmbH, Maxfeldstrasse 5, 90409 Nuernberg,
Germany; GF: Felix Imendörffer; HRB 36809 (AG Nuernberg)

