Bug 101668 - BB vectorizer doesn't handle lowpart of existing vector
Summary: BB vectorizer doesn't handle lowpart of existing vector
Status: NEW
Alias: None
Product: gcc
Classification: Unclassified
Component: tree-optimization
Version: 12.0
Importance: P3 normal
Target Milestone: ---
Assignee: Not yet assigned to anyone
URL:
Keywords: missed-optimization
Depends on:
Blocks: vectorizer
 
Reported: 2021-07-29 01:47 UTC by Hongtao.liu
Modified: 2021-08-29 05:02 UTC
CC: 1 user

See Also:
Host:
Target:
Build:
Known to work:
Known to fail:
Last reconfirmed: 2021-08-28 00:00:00


Description Hongtao.liu 2021-07-29 01:47:47 UTC
cat test.c

typedef int v16si __attribute__((vector_size (64)));
typedef long long v8di __attribute__((vector_size (64)));

void
bar_s32_s64 (v8di * dst, v16si src)
{
  long long tem[8];
  tem[0] = src[0];
  tem[1] = src[1];
  tem[2] = src[2];
  tem[3] = src[3];
  tem[4] = src[4];
  tem[5] = src[5];
  tem[6] = src[6];
  tem[7] = src[7];
  dst[0] = *(v8di *) tem;
}
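The slp2 dump below should be reproducible with GCC's pass dump flag; the exact invocation isn't given in the report, so this is my assumption:

gcc -O3 -march=skylake-avx512 -fdump-tree-slp2 -S test.c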

With gcc -O3 -march=skylake-avx512 we fail to vectorize this case after my r12-2549, because I increased the vec_construct cost for SKX/CLX. Here's the dump for slp2:

  <bb 2> [local count: 1073741824]:
  _1 = BIT_FIELD_REF <src_18(D), 32, 0>;
  _2 = (long long int) _1;
  _3 = BIT_FIELD_REF <src_18(D), 32, 32>;
  _4 = (long long int) _3;
  _5 = BIT_FIELD_REF <src_18(D), 32, 64>;
  _6 = (long long int) _5;
  _7 = BIT_FIELD_REF <src_18(D), 32, 96>;
  _8 = (long long int) _7;
  _9 = BIT_FIELD_REF <src_18(D), 32, 128>;
  _10 = (long long int) _9;
  _11 = BIT_FIELD_REF <src_18(D), 32, 160>;
  _12 = (long long int) _11;
  _13 = BIT_FIELD_REF <src_18(D), 32, 192>;
  _14 = (long long int) _13;
  _15 = BIT_FIELD_REF <src_18(D), 32, 224>;
  _31 = {_1, _3, _5, _7, _9, _11, _13, _15};
  vect__2.4_32 = (vector(8) long long int) _31;
  _16 = (long long int) _15;
  MEM <vector(8) long long int> [(long long int *)&tem] = vect__2.4_32;
  _17 = MEM[(v8di *)&tem];
  *dst_28(D) = _17;
  tem ={v} {CLOBBER};
  return;

But actually there's no need to vec_construct from each individual element; it will be optimized to

   <bb 2> [local count: 1073741824]:
  _2 = BIT_FIELD_REF <src_18(D), 256, 0>;
  vect__2.4_32 = (vector(8) long long int) _2;
  *dst_28(D) = vect__2.4_32;
  return;

So once slp2 can recognize this optimization and cost the vec_construct more accurately, we can avoid this regression.
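
For comparison, here is a minimal hand-written C sketch of that final form, assuming GCC's generic vector extensions and __builtin_convertvector; the v8si typedef and the bar_s32_s64_by_hand name are only illustrative, not part of the testcase:

typedef int v16si __attribute__((vector_size (64)));
typedef int v8si __attribute__((vector_size (32)));
typedef long long v8di __attribute__((vector_size (64)));

void
bar_s32_s64_by_hand (v8di * dst, v16si src)
{
  v8si lo;
  /* Lowpart: the first 8 ints (256 bits) of the 512-bit input.  */
  __builtin_memcpy (&lo, &src, sizeof (lo));
  /* Sign-extend each 32-bit lane to 64 bits.  */
  *dst = __builtin_convertvector (lo, v8di);
}

With -march=skylake-avx512 -mprefer-vector-width=512 this should end up as a single vpmovsxdq plus a store.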
Comment 1 Richard Biener 2021-07-29 06:55:56 UTC
The basic-block vectorizer is currently limited as to what "existing" vectors it recognizes.  In this testcase we're accessing only the lowpart of 'src',
something we cannot yet model in vectorizable_slp_permutation.  The specific
case isn't hard to fix; we'd get

  <bb 2> [local count: 1073741824]:
  _31 = VIEW_CONVERT_EXPR<vector(8) int>(src_18(D));
  vect__2.4_33 = [vec_unpack_lo_expr] _31;
  vect__2.4_34 = [vec_unpack_hi_expr] _31;
  MEM <vector(4) long long int> [(long long int *)&tem] = vect__2.4_33;
  MEM <vector(4) long long int> [(long long int *)&tem + 32B] = vect__2.4_34;
  _17 = MEM[(v8di *)&tem];
  *dst_28(D) = _17;
  tem ={v} {CLOBBER};
  return;

so we then fail to elide the temporary, producing

bar_s32_s64:
.LFB0:
        .cfi_startproc
        pushq   %rbp
        .cfi_def_cfa_offset 16
        .cfi_offset 6, -16
        vpmovsxdq       %xmm0, %ymm1
        vextracti128    $0x1, %ymm0, %xmm0
        movq    %rsp, %rbp
        .cfi_def_cfa_register 6
        andq    $-64, %rsp
        subq    $8, %rsp
        vpmovsxdq       %xmm0, %ymm0
        vmovdqa %ymm1, -56(%rsp)
        vmovdqa %ymm0, -24(%rsp)
        vmovdqa64       -56(%rsp), %zmm2
        vmovdqa64       %zmm2, (%rdi)
        leave
        .cfi_def_cfa 7, 8
        ret

It looks like there's no V8SI->V8DI conversion optab, or we choose V4DI as
the preferred vector mode for some other reason.
Comment 2 Hongtao.liu 2021-07-29 07:03:40 UTC
> It looks like there's no V8SI->V8DI conversion optab, or we choose V4DI as
> the preferred vector mode for some other reason.

We do have one; we just need to add -mprefer-vector-width=512, then we'll get:

bar_s32_s64:
  vpmovsxdq zmm0, ymm0
  vmovdqa64 ZMMWORD PTR [rdi], zmm0
  ret
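
For reference, the invocation producing this output would be something like the following (the exact flags, in particular -masm=intel for the syntax shown, are my assumption):

gcc -O3 -march=skylake-avx512 -mprefer-vector-width=512 -masm=intel -S test.c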