Bug 111793 - OpenMP SIMD inbranch clones for AVX512 are highly sub-optimal
Status: NEW
Alias: None
Product: gcc
Classification: Unclassified
Component: tree-optimization
Version: 14.0
Priority: P3
Severity: normal
Target Milestone: ---
Assignee: Not yet assigned to anyone
Keywords: missed-optimization
Blocks: vectorizer
Reported: 2023-10-13 07:28 UTC by Richard Biener
Modified: 2024-06-18 05:49 UTC

Target: x86_64-*-*
Last reconfirmed: 2023-10-13 00:00:00


Description Richard Biener 2023-10-13 07:28:13 UTC

    
Comment 1 Richard Biener 2023-10-13 07:33:14 UTC
#pragma omp declare simd simdlen(16) inbranch
__attribute__((noinline)) int
foo (int a, int b)
{
  return a + b;
}
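
For context, the masked clone only matters when the call sits under a condition in a simd loop; a hypothetical caller (bar, out and n are illustrative names, not from the PR) would be:

/* When this loop is vectorized (e.g. with -O3 -fopenmp-simd -mavx512f)
   the conditional call is replaced by a call to the inbranch clone
   _ZGVeM16vv_foo, with the lane predicate a[i] > 0 passed as the mask
   argument.  Assumes the declaration of foo above is visible.  */
void
bar (int *restrict out, int *restrict a, int *restrict b, int n)
{
  #pragma omp simd
  for (int i = 0; i < n; ++i)
    if (a[i] > 0)
      out[i] = foo (a[i], b[i]);
}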

we present if-conversion with the following, which I think is good:

  <bb 3> [local count: 1073741824]:
  # iter.49_5 = PHI <0(2), iter.49_6(7)>
  # ivtmp_19 = PHI <16(2), ivtmp_18(7)>
  b_2 = b.45[iter.49_5];
  a_1 = a.44[iter.49_5];
  _8 = mask.48_7(D) >> iter.49_5;
  _9 = _8 & 1;
  if (_9 == 0)
    goto <bb 8>; [20.00%]
  else
    goto <bb 4>; [80.00%]

  <bb 8> [local count: 214748360]:
  goto <bb 5>; [100.00%]

  <bb 4> [local count: 858993464]:
  _3 = a_1 + b_2;
  retval.43[iter.49_5] = _3;

  <bb 5> [local count: 1073741824]:
  iter.49_6 = iter.49_5 + 1;
  ivtmp_18 = ivtmp_19 - 1;
  if (ivtmp_18 != 0)
    goto <bb 7>; [100.00%]
  else
    goto <bb 6>; [0.00%]

and we if-convert the body to

  <bb 3> [local count: 1073741824]:
  # iter.49_5 = PHI <iter.49_6(7), 0(15)>
  # ivtmp_19 = PHI <ivtmp_18(7), 16(15)>
  b_2 = b.45[iter.49_5];
  a_1 = a.44[iter.49_5];
  _8 = mask.48_7(D) >> iter.49_5;
  _9 = _8 & 1; 
  _22 = _9 == 0;
  _36 = (unsigned int) a_1;
  _37 = (unsigned int) b_2;
  _38 = _36 + _37;
  _3 = (int) _38;
  _39 = ~_22;
  _40 = &retval.43[iter.49_5];
  .MASK_STORE (_40, 32B, _39, _3);
  iter.49_6 = iter.49_5 + 1;
  ivtmp_18 = ivtmp_19 - 1;
  if (ivtmp_18 != 0)
    goto <bb 7>; [100.00%]
  else
    goto <bb 6>; [0.00%]

but rather than recovering the mask in its original form, we vectorize this as

  vect_cst__50 = {mask.48_7(D), mask.48_7(D), mask.48_7(D), mask.48_7(D), mask.48_7(D), mask.48_7(D), mask.48_7(D), mask.48_7(D), mask.48_7(D), mask.48_7(D), mask.48_7(D), mask.48_7(D), mask.48_7(D), mask.48_7(D), mask.48_7(D), mask.48_7(D)};

  <bb 3> [local count: 10631108]:
  # vect_vec_iv_.125_42 = PHI <_43(7), { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }(2)>
  # vectp_b.126_44 = PHI <vectp_b.126_45(7), &b.45(2)>
  # vectp_a.129_47 = PHI <vectp_a.129_48(7), &a.44(2)>
  # vectp_retval.140_61 = PHI <vectp_retval.140_62(7), &retval.43(2)>
  # ivtmp_64 = PHI <ivtmp_65(7), 0(2)>
  _43 = vect_vec_iv_.125_42 + { 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16 };
  vect_b_2.128_46 = MEM <vector(16) int> [(int *)vectp_b.126_44];
  vect_a_1.131_49 = MEM <vector(16) int> [(int *)vectp_a.129_47];
  vect__8.132_51 = vect_cst__50 >> vect_vec_iv_.125_42;
  vect__9.133_53 = vect__8.132_51 & { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 };
  mask__22.134_55 = vect__9.133_53 == { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
  vect__36.135_56 = VIEW_CONVERT_EXPR<vector(16) unsigned int>(vect_a_1.131_49);
  vect__37.136_57 = VIEW_CONVERT_EXPR<vector(16) unsigned int>(vect_b_2.128_46);
  vect__38.137_58 = vect__36.135_56 + vect__37.136_57;
  vect__3.138_59 = VIEW_CONVERT_EXPR<vector(16) int>(vect__38.137_58);
  mask__39.139_60 = ~mask__22.134_55;
  if (mask__39.139_60 == { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 })
    goto <bb 20>; [20.00%]
  else
    goto <bb 21>; [80.00%]

  <bb 21> [local count: 8504886]:
  .MASK_STORE (vectp_retval.140_61, 512B, mask__39.139_60, vect__3.138_59);

  <bb 20> [local count: 10631108]:
  vectp_b.126_45 = vectp_b.126_44 + 64;
  vectp_a.129_48 = vectp_a.129_47 + 64;
  vectp_retval.140_62 = vectp_retval.140_61 + 64;
  ivtmp_65 = ivtmp_64 + 1;
  if (ivtmp_65 < 1)
    goto <bb 7>; [0.00%]
  else
    goto <bb 17>; [100.00%]

in the end leaving us with

  <bb 2> [local count: 1073741824]:
  vect_cst__50 = {mask.48_7(D), mask.48_7(D), mask.48_7(D), mask.48_7(D), mask.48_7(D), mask.48_7(D), mask.48_7(D), mask.48_7(D), mask.48_7(D), mask.48_7(D), mask.48_7(D), mask.48_7(D), mask.48_7(D), mask.48_7(D), mask.48_7(D), mask.48_7(D)};
  vect__8.132_51 = vect_cst__50 >> { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 };
  vect__9.133_53 = vect__8.132_51 & { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 };
  vect__36.135_56 = VIEW_CONVERT_EXPR<vector(16) unsigned int>(simd.46_13(D));
  vect__37.136_57 = VIEW_CONVERT_EXPR<vector(16) unsigned int>(simd.47_15(D));
  vect__38.137_58 = vect__36.135_56 + vect__37.136_57;
  vect__3.138_59 = VIEW_CONVERT_EXPR<vector(16) int>(vect__38.137_58);
  mask__39.139_60 = vect__9.133_53 != { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
  if (mask__39.139_60 == { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 })
    goto <bb 4>; [20.00%]
  else
    goto <bb 3>; [80.00%]

  <bb 3> [local count: 858993419]:
  .MASK_STORE (&retval.43, 512B, mask__39.139_60, vect__3.138_59);

  <bb 4> [local count: 1073741824]:
  _10 = VIEW_CONVERT_EXPR<vector(16) int>(retval.43);
  return _10;

and

_ZGVeM16vv_foo:
.LFB4:
        .cfi_startproc
        movl    $1, %eax
        pushq   %rbp
        .cfi_def_cfa_offset 16 
        .cfi_offset 6, -16
        vpbroadcastd    %edi, %zmm2
        vpaddd  %zmm1, %zmm0, %zmm0
        vpbroadcastd    %eax, %zmm3
        movq    %rsp, %rbp
        .cfi_def_cfa_register 6
        andq    $-64, %rsp
        vpsrlvd .LC0(%rip), %zmm2, %zmm2
        vpxor   %xmm1, %xmm1, %xmm1
        vpandd  %zmm3, %zmm2, %zmm2
        vpcmpd  $4, %zmm1, %zmm2, %k1
        kortestw        %k1, %k1
        je      .L25
        vmovdqa32       %zmm0, -64(%rsp){%k1}
.L25:
        vmovdqa32       -64(%rsp), %zmm0
        leave
        .cfi_def_cfa 7, 8
        ret
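
For contrast, a hand-written intrinsics sketch (an illustration of what the clone could look like, not GCC output; it assumes inactive result lanes may be left zero) moves the incoming bitmask straight into a k register:

#include <immintrin.h>

/* The 16-bit mask argument becomes a mask register via a single kmovw;
   the broadcast/shift/and/compare sequence and the stack round-trip
   through retval are gone.  */
__m512i
foo_inbranch_sketch (__m512i a, __m512i b, unsigned short mask)
{
  __mmask16 k = (__mmask16) mask;            /* kmovw */
  return _mm512_maskz_add_epi32 (k, a, b);   /* vpaddd with zero-masking */
}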
Comment 2 Richard Biener 2023-10-13 07:43:48 UTC
For AVX2 and simdlen(8) we get

  <bb 2> [local count: 1073741824]:
  vect__35.98_51 = VIEW_CONVERT_EXPR<vector(8) unsigned int>(simd.25_11(D));
  vect__36.99_52 = VIEW_CONVERT_EXPR<vector(8) unsigned int>(simd.26_13(D));
  vect__37.100_53 = vect__35.98_51 + vect__36.99_52;
  vect__3.101_54 = VIEW_CONVERT_EXPR<vector(8) int>(vect__37.100_53);
  mask__38.102_55 = mask.27_15(D) != { 0, 0, 0, 0, 0, 0, 0, 0 };
  if (mask__38.102_55 == { 0, 0, 0, 0, 0, 0, 0, 0 })
    goto <bb 4>; [20.00%]
  else
    goto <bb 3>; [80.00%]

  <bb 3> [local count: 858993419]:
  .MASK_STORE (&retval.21, 256B, mask__38.102_55, vect__3.101_54);

  <bb 4> [local count: 1073741824]:
  _8 = VIEW_CONVERT_EXPR<vector(8) int>(retval.21);
  return _8;

which is much more reasonable.  I'm not sure whether the compare against 0
is required or whether the ABI guarantees either 0 or -1 in the elements.
That we end up going through memory due to the use of a .MASK_STORE
unfortunately persists into the final assembly:

_ZGVdM8vv_foo:  
.LFB3:
        .cfi_startproc
        vpaddd  %ymm1, %ymm0, %ymm0
        vpxor   %xmm1, %xmm1, %xmm1
        pushq   %rbp
        .cfi_def_cfa_offset 16
        .cfi_offset 6, -16
        vpcmpeqd        %ymm1, %ymm2, %ymm2
        movq    %rsp, %rbp
        .cfi_def_cfa_register 6
        andq    $-32, %rsp
        vpcmpeqd        %ymm1, %ymm2, %ymm2
        vptest  %ymm2, %ymm2
        je      .L12
        vpmaskmovd      %ymm0, %ymm2, -32(%rsp)
.L12:
        vmovdqa -32(%rsp), %ymm0
        leave
        .cfi_def_cfa 7, 8
        ret

there's the opportunity to rewrite the .MASK_STORE to a VEC_COND_EXPR
and rewrite retval.21 into SSA form.  Maybe if-conversion can see this
already, given retval.21 is a local automatic whose address is not taken.
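
A minimal sketch of that rewrite, using GCC vector extensions (illustrative only; foo_blend_sketch and v8si are made-up names, and inactive lanes are zeroed on the assumption the caller ignores them):

/* The vector ternary is GIMPLE's VEC_COND_EXPR; it keeps the result in
   a register and blends on the mask instead of masked-storing to a
   stack slot and reloading.  Assumes mask elements are 0 or -1.  */
typedef int v8si __attribute__ ((vector_size (32)));

v8si
foo_blend_sketch (v8si a, v8si b, v8si mask)
{
  v8si sum = a + b;
  return mask ? sum : (v8si) { 0 };
}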
Comment 3 Jakub Jelinek 2023-10-13 09:08:41 UTC
(In reply to Richard Biener from comment #2)
> which is much more reasonable - I'm not sure whether the compare against 0
> is required or if the ABI guarantees either 0 or -1 in the elements.

Yes, it does.  All-ones elements, that is; the element type can be an unsigned integral or a floating-point type as well...
Comment 4 Jakub Jelinek 2023-10-13 09:13:07 UTC
So, shouldn't we pattern match in match.pd (or something else)
  vect_cst__50 = {mask.48_7(D), mask.48_7(D), mask.48_7(D), mask.48_7(D), mask.48_7(D), mask.48_7(D), mask.48_7(D), mask.48_7(D), mask.48_7(D), mask.48_7(D), mask.48_7(D), mask.48_7(D), mask.48_7(D), mask.48_7(D), mask.48_7(D), mask.48_7(D)};
  vect__8.132_51 = vect_cst__50 >> { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 };
  vect__9.133_53 = vect__8.132_51 & { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 };
  mask__39.139_60 = vect__9.133_53 != { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
back into mask__39.139_60 = mask.48_7(D); ?
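
As a sanity check of that identity, a scalar model (purely illustrative) shows the shift/and/compare sequence reproduces the mask bit for bit:

#include <assert.h>

/* Scalar model of the proposed fold: lane i of the vectorized sequence
   computes ((mask >> i) & 1) != 0, which is exactly bit i of mask, so
   the broadcast/shift/and/compare chain is the identity on the 16 mask
   bits.  */
int
main (void)
{
  unsigned short mask = 0xbeef;               /* arbitrary test value */
  unsigned short recovered = 0;
  for (int i = 0; i < 16; ++i)
    if (((mask >> i) & 1) != 0)               /* per-lane computation */
      recovered |= (unsigned short) (1u << i);
  assert (recovered == mask);
  return 0;
}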
Comment 5 Richard Biener 2023-10-13 09:39:36 UTC
(In reply to Jakub Jelinek from comment #4)
> So, shouldn't we pattern match in match.pd (or something else)
>   vect_cst__50 = {mask.48_7(D), mask.48_7(D), mask.48_7(D), mask.48_7(D),
> mask.48_7(D), mask.48_7(D), mask.48_7(D), mask.48_7(D), mask.48_7(D),
> mask.48_7(D), mask.48_7(D), mask.48_7(D), mask.48_7(D), mask.48_7(D),
> mask.48_7(D), mask.48_7(D)};
>   vect__8.132_51 = vect_cst__50 >> { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
> 12, 13, 14, 15 };
>   vect__9.133_53 = vect__8.132_51 & { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
> 1, 1, 1 };
>   mask__39.139_60 = vect__9.133_53 != { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
> 0, 0, 0, 0 };
> back into mask__39.139_60 = mask.48_7(D); ?

Yes, that's a possibility.  I wonder if it's possible to arrange things in the vectorizer itself so that costing gets more accurate (probably not that important for OMP SIMD though).

Maybe it works a bit better if we did mask & (1 << iv), but I guess we
canonicalize that back.

I've opened this for tracking for now, working on PR111795 first.
Comment 6 Richard Biener 2024-06-14 13:26:29 UTC
A non-OpenMP testcase for the AVX512 mask inefficiency:

unsigned foo (unsigned *a, unsigned short mask)
{
  unsigned sum = 0;
  for (int i = 0; i < 16; ++i)
    if ((mask >> i) & 1)
      sum += a[i];
  return sum;
}

I think the AVX2 one is settled as being as good as it can get.
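
For reference, a hand-written intrinsics version of the testcase (a sketch assuming AVX512F and the _mm512_reduce_add_epi32 helper; foo_sketch is an illustrative name) shows the mask going straight into a k register:

#include <immintrin.h>

/* The bitmask predicates a zero-masking load, so inactive lanes
   contribute 0 to the horizontal sum.  */
unsigned
foo_sketch (unsigned *a, unsigned short mask)
{
  __mmask16 k = (__mmask16) mask;               /* kmovw */
  __m512i v = _mm512_maskz_loadu_epi32 (k, a);
  return (unsigned) _mm512_reduce_add_epi32 (v);
}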
Comment 7 GCC Commits 2024-06-18 05:46:46 UTC
The master branch has been updated by Richard Biener <rguenth@gcc.gnu.org>:

https://gcc.gnu.org/g:4b75ed33fa5fd604897e7a30e79bd28d46598373

commit r15-1391-g4b75ed33fa5fd604897e7a30e79bd28d46598373
Author: Richard Biener <rguenther@suse.de>
Date:   Fri Jun 14 14:46:08 2024 +0200

    Enhance if-conversion for automatic arrays
    
    Automatic arrays that are not address-taken should not be subject to
    store data races.  This applies to OMP SIMD in-branch lowered
    functions result array which for the testcase otherwise prevents
    vectorization with SSE and for AVX and AVX512 ends up with spurious
    .MASK_STORE to the stack surviving.
    
    This inefficiency was noted in PR111793.
    
    I've introduced ref_can_have_store_data_races, commonizing uses
    of flag_store_data_races in if-conversion, cselim and store motion.
    
            PR tree-optimization/111793
            * tree-ssa-alias.h (ref_can_have_store_data_races): Declare.
            * tree-ssa-alias.cc (ref_can_have_store_data_races): New
            function.
            * tree-if-conv.cc (ifcvt_memrefs_wont_trap): Use
            ref_can_have_store_data_races to allow more unconditional
            stores.
            * tree-ssa-loop-im.cc (execute_sm): Likewise.
            * tree-ssa-phiopt.cc (cond_store_replacement): Likewise.
    
            * gcc.dg/vect/vect-simd-clone-21.c: New testcase.
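
Per the ChangeLog above, the new predicate lives in tree-ssa-alias.cc; a rough paraphrase of its logic (a sketch based on the commit description, not the exact committed code; it uses GCC-internal APIs and only compiles inside the GCC tree):

/* Paraphrase: a store to REF need not worry about introducing store
   data races if the user allows them globally, or if the base object
   is an automatic variable that cannot be aliased and is therefore
   thread-private.  */
bool
ref_can_have_store_data_races (tree ref)
{
  /* With -fallow-store-data-races races are permitted anyway.  */
  if (flag_store_data_races)
    return false;

  tree base = get_base_address (ref);
  if (DECL_P (base)
      && auto_var_p (base)
      && !may_be_aliased (base))
    /* Automatic, non-aliased variables are private to the thread.  */
    return false;

  return true;
}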
Comment 8 Richard Biener 2024-06-18 05:49:01 UTC
The committed fix should resolve the stray masked stores observed, but not the inefficiency in dealing with incoming AVX512 mask arguments.