Bug 97770 - [ICELAKE]Missing vectorization for vpopcnt
Summary: [ICELAKE]Missing vectorization for vpopcnt
Status: NEW
Alias: None
Product: gcc
Classification: Unclassified
Component: tree-optimization (show other bugs)
Version: 11.0
: P3 normal
Target Milestone: ---
Assignee: Not yet assigned to anyone
URL:
Keywords: missed-optimization
Depends on:
Blocks: vectorizer
  Show dependency treegraph
 
Reported: 2020-11-10 02:12 UTC by Hongtao.liu
Modified: 2020-12-03 11:24 UTC (History)
6 users (show)

See Also:
Host:
Target: x86_64-*-* i?86-*-*
Build:
Known to work:
Known to fail:
Last reconfirmed: 2020-11-10 00:00:00


Attachments

Note You need to log in before you can comment on or make changes to this bug.
Description Hongtao.liu 2020-11-10 02:12:22 UTC
cat test.c
---
void
foo(int* __restrict dest, int* src, int n)
{
  for (int i = 0; i != 8; i++)
    dest[i] = __builtin_popcount (src[i]);
}
---

gcc -O3 -march=icelake-server -S -fopt-info-all

Inlined 0 calls, eliminated 0 functions

test.c:4:3: missed: couldn't vectorize loop
test.c:5:15: missed: not vectorized: relevant stmt not supported: _7 = __builtin_popcount (_5);
test.c:2:1: note: vectorized 0 loops in function.
test.c:4:3: note: ***** Analysis failed with vector mode VOID
test.c:4:3: note: ***** Analysis failed with vector mode V8SI
test.c:4:3: note: ***** Skipping vector mode V32QI, which would repeat the analysis for V8SI
test.c:6:1: note: ***** Analysis failed with vector mode VOID


This loop could be vectorized by ICC and Clang:

foo(int*, int*, int):
        vpopcntd  ymm0, YMMWORD PTR [rsi]                       #5.15
        vmovdqu   YMMWORD PTR [rdi], ymm0                       #5.5
        vzeroupper                                              #6.1
        ret
Comment 1 Hongtao.liu 2020-11-10 02:14:30 UTC
For target side, we need to add expander for popcountm2 with m vector mode
Comment 2 Hongtao.liu 2020-11-10 03:04:37 UTC
After adding expander, successfully vectorize the loop.
---
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index b153a87fb98..e8159997c40 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -22678,6 +22678,12 @@ (define_insn "avx5124vnniw_vp4dpwssds_maskz"
     (set_attr ("prefix") ("evex"))
     (set_attr ("mode") ("TI"))])
 
+(define_expand "popcount<mode>2"
+  [(set (match_operand:VI48_AVX512VL 0 "register_operand")
+       (popcount:VI48_AVX512VL
+         (match_operand:VI48_AVX512VL 1 "nonimmediate_operand")))]
+  "TARGET_AVX512VPOPCNTDQ")
+
 (define_insn "vpopcount<mode><mask_name>"
   [(set (match_operand:VI48_AVX512VL 0 "register_operand" "=v")
        (popcount:VI48_AVX512VL
@@ -22722,6 +22728,12 @@ (define_insn "*restore_multiple_leave_return<mode>"
   "TARGET_SSE && TARGET_64BIT"
   "jmp\t%P1")
 
+(define_insn "popcount<mode>2"
+  [(set (match_operand:VI12_AVX512VL 0 "register_operand" "=v")
+       (popcount:VI12_AVX512VL
+         (match_operand:VI12_AVX512VL 1 "nonimmediate_operand" "vm")))]
+  "TARGET_AVX512BITALG")
+
 (define_insn "vpopcount<mode><mask_name>"
   [(set (match_operand:VI12_AVX512VL 0 "register_operand" "=v")
        (popcount:VI12_AVX512VL

---

But for vector byte/word/quadword, vectorizer still use vpopcntd, but not vpopcnt{b,w,q}, missing corresponding ifn?

void
fooq(long long* __restrict dest, long long* src)
{
  for (int i = 0; i != 4; i++)
    dest[i] = __builtin_popcount (src[i]);
}

void
foow(short* __restrict dest, short* src)
{
  for (int i = 0; i != 16; i++)
    dest[i] = __builtin_popcount (src[i]);
}

void
foob(char* __restrict dest, char* src)
{
  for (int i = 0; i != 32; i++)
    dest[i] = __builtin_popcount (src[i]);
}


dump of test.c.164.vect

;; Function foow (foow, funcdef_no=0, decl_uid=4228, cgraph_uid=1, symbol_order=0)

Merging blocks 2 and 6
foow (short int * restrict dest, short int * src)
{
  vector(8) short int * vectp_dest.10;
  vector(8) short int * vectp_dest.9;
  vector(8) short int vect__8.8;
  vector(4) int vect__6.7;
  vector(4) unsigned int vect__5.6;
  vector(8) short int vect__4.5;
  vector(8) short int * vectp_src.4;
  vector(8) short int * vectp_src.3;
  int i;
  long unsigned int _1;
  long unsigned int _2;
  short int * _3;
  short int _4;
  unsigned int _5;
  int _6;
  short int * _7;
  short int _8;
  unsigned int ivtmp_26;
  unsigned int ivtmp_28;
  unsigned int ivtmp_34;
  unsigned int ivtmp_35;

  <bb 2> [local count: 119292720]:

  <bb 3> [local count: 119292719]:
  # i_19 = PHI <i_15(5), 0(2)>
  # ivtmp_35 = PHI <ivtmp_34(5), 8(2)>
  # vectp_src.3_24 = PHI <vectp_src.3_23(5), src_12(D)(2)>
  # vectp_dest.9_9 = PHI <vectp_dest.9_29(5), dest_13(D)(2)>
  # ivtmp_26 = PHI <ivtmp_28(5), 0(2)>
  _1 = (long unsigned int) i_19;
  _2 = _1 * 2;
  _3 = src_12(D) + _2;
  vect__4.5_22 = MEM <vector(8) short int> [(short int *)vectp_src.3_24];
  _4 = *_3;
  vect__5.6_21 = [vec_unpack_lo_expr] vect__4.5_22;
  vect__5.6_18 = [vec_unpack_hi_expr] vect__4.5_22;
  _5 = (unsigned int) _4;
  vect__6.7_17 = .POPCOUNT (vect__5.6_21);
  vect__6.7_16 = .POPCOUNT (vect__5.6_18);
  _6 = 0;
  _7 = dest_13(D) + _2;
  vect__8.8_10 = VEC_PACK_TRUNC_EXPR <vect__6.7_17, vect__6.7_16>;
  _8 = (short int) _6;
  MEM <vector(8) short int> [(short int *)vectp_dest.9_9] = vect__8.8_10;
  i_15 = i_19 + 1;
  ivtmp_34 = ivtmp_35 - 1;
  vectp_src.3_23 = vectp_src.3_24 + 16;
  vectp_dest.9_29 = vectp_dest.9_9 + 16;
  ivtmp_28 = ivtmp_26 + 1;
  if (ivtmp_28 < 1)
    goto <bb 5>; [0.00%]
  else
    goto <bb 4>; [100.00%]

  <bb 5> [local count: 0]:
  goto <bb 3>; [100.00%]

  <bb 4> [local count: 119292720]:
  return;

}
Comment 3 Hongtao.liu 2020-11-10 03:21:47 UTC
> But for vector byte/word/quadword, vectorizer still use vpopcntd, but not
> vpopcnt{b,w,q}, missing corresponding ifn?

We don't have __builtin_popcount{w,b}, but we have __builtin_popcountl.

for testcase
---
void
fooq(unsigned long long* __restrict dest, unsigned long long* src)
{
  for (int i = 0; i != 4; i++)
    dest[i] = __builtin_popcountl (src[i]);
}
----

icc/clang generate
---
_Z4fooqPxS_:                            # @_Z4fooqPxS_
        vpopcntq        ymm0, ymmword ptr [rsi]
        vmovdqu ymmword ptr [rdi], ymm0
        vzeroupper
        ret
---

But gcc generate
---
fooq:
.LFB0:
	.cfi_startproc
	vpopcntq	16(%rsi), %xmm1
	vpopcntq	(%rsi), %xmm0
	vshufps	$136, %xmm1, %xmm0, %xmm0
	vpmovsxdq	%xmm0, %xmm1
	vpsrldq	$8, %xmm0, %xmm0
	vpmovsxdq	%xmm0, %xmm0
	vmovdqu	%xmm1, (%rdi)
	vmovdqu	%xmm0, 16(%rdi)
	ret
	.cfi_endproc
---

dump for 164.vect

---
;; Function fooq (fooq, funcdef_no=0, decl_uid=4228, cgraph_uid=1, symbol_order=0)

Merging blocks 2 and 6
fooq (long long unsigned int * restrict dest, long long unsigned int * src)
{
  vector(2) long long unsigned int * vectp_dest.10;
  vector(2) long long unsigned int * vectp_dest.9;
  vector(2) long long unsigned int vect__7.8;
  vector(4) int vect__5.7;
  vector(2) long long unsigned int vect__4.6;
  vector(2) long long unsigned int vect__4.5;
  vector(2) long long unsigned int * vectp_src.4;
  vector(2) long long unsigned int * vectp_src.3;
  int i;
  long unsigned int _1;
  long unsigned int _2;
  long long unsigned int * _3;
  long long unsigned int _4;
  int _5;
  long long unsigned int * _6;
  long long unsigned int _7;
  vector(2) long long unsigned int _8;
  vector(2) long long unsigned int _26;
  unsigned int ivtmp_30;
  unsigned int ivtmp_31;
  unsigned int ivtmp_36;
  unsigned int ivtmp_37;

  <bb 2> [local count: 214748368]:

  <bb 3> [local count: 214748371]:
  # i_18 = PHI <i_14(5), 0(2)>
  # ivtmp_31 = PHI <ivtmp_30(5), 4(2)>
  # vectp_src.3_20 = PHI <vectp_src.3_17(5), src_11(D)(2)>
  # vectp_dest.9_24 = PHI <vectp_dest.9_32(5), dest_12(D)(2)>
  # ivtmp_36 = PHI <ivtmp_37(5), 0(2)>
  _1 = (long unsigned int) i_18;
  _2 = _1 * 8;
  _3 = src_11(D) + _2;
  vect__4.5_16 = MEM <vector(2) long long unsigned int> [(long long unsigned int *)vectp_src.3_20];
  vectp_src.3_15 = vectp_src.3_20 + 16;
  vect__4.6_9 = MEM <vector(2) long long unsigned int> [(long long unsigned int *)vectp_src.3_15];
  _4 = *_3;
  _8 = .POPCOUNT (vect__4.5_16);
  _26 = .POPCOUNT (vect__4.6_9);
  vect__5.7_22 = VEC_PACK_TRUNC_EXPR <_8, _26>; --- Why do we do this?
  _5 = 0;
  _6 = dest_12(D) + _2;
  vect__7.8_23 = [vec_unpack_lo_expr] vect__5.7_22;
  vect__7.8_25 = [vec_unpack_hi_expr] vect__5.7_22;
  _7 = (long long unsigned int) _5;
  MEM <vector(2) long long unsigned int> [(long long unsigned int *)vectp_dest.9_24] = vect__7.8_23;
  vectp_dest.9_34 = vectp_dest.9_24 + 16;
  MEM <vector(2) long long unsigned int> [(long long unsigned int *)vectp_dest.9_34] = vect__7.8_25;
  i_14 = i_18 + 1;
  ivtmp_30 = ivtmp_31 - 1;
  vectp_src.3_17 = vectp_src.3_15 + 16;
  vectp_dest.9_32 = vectp_dest.9_34 + 16;
  ivtmp_37 = ivtmp_36 + 1;
  if (ivtmp_37 < 1)
    goto <bb 5>; [0.00%]
  else
    goto <bb 4>; [100.00%]

  <bb 5> [local count: 0]:
  goto <bb 3>; [100.00%]

  <bb 4> [local count: 214748368]:
  return;

}
---
Comment 4 Richard Biener 2020-11-10 08:12:28 UTC
What's missing is middle-end folding support to narrow popcount to the appropriate internal function call with byte/half-word width when target support
is available.  But I'm quite sure there's no scalar popcount instruction
operating on half-word or byte pieces of a GPR?

Alternatively the vectorizer can use patterns to do this.
Comment 5 Hongtao.liu 2020-11-10 08:37:00 UTC
(In reply to Richard Biener from comment #4)
> What's missing is middle-end folding support to narrow popcount to the
> appropriate internal function call with byte/half-word width when target
> support
> is available.  But I'm quite sure there's no scalar popcount instruction
> operating on half-word or byte pieces of a GPR?
> 
> Alternatively the vectorizer can use patterns to do this.

Yes, but for 64bit width, vectorizer generate suboptimal code.

sse #c3

  vector(2) long long unsigned int vect__4.6;
  vector(2) long long unsigned int vect__4.5;
  vector(2) long long unsigned int _8;
  vector(2) long long unsigned int _26;

  ...
  ...

  _8 = .POPCOUNT (vect__4.5_16);
  _26 = .POPCOUNT (vect__4.6_9);
  vect__5.7_22 = VEC_PACK_TRUNC_EXPR <_8, _26>; --- Why do we do this?
  vector(4) int vect__5.7;


It could generate directly

  v4di = .POPCOUNT (v4di);
Comment 6 Richard Biener 2020-11-10 08:55:45 UTC
(In reply to Hongtao.liu from comment #5)
> (In reply to Richard Biener from comment #4)
> > What's missing is middle-end folding support to narrow popcount to the
> > appropriate internal function call with byte/half-word width when target
> > support
> > is available.  But I'm quite sure there's no scalar popcount instruction
> > operating on half-word or byte pieces of a GPR?
> > 
> > Alternatively the vectorizer can use patterns to do this.
> 
> Yes, but for 64bit width, vectorizer generate suboptimal code.
> 
> sse #c3
> 
>   vector(2) long long unsigned int vect__4.6;
>   vector(2) long long unsigned int vect__4.5;
>   vector(2) long long unsigned int _8;
>   vector(2) long long unsigned int _26;
> 
>   ...
>   ...
> 
>   _8 = .POPCOUNT (vect__4.5_16);
>   _26 = .POPCOUNT (vect__4.6_9);
>   vect__5.7_22 = VEC_PACK_TRUNC_EXPR <_8, _26>; --- Why do we do this?
>   vector(4) int vect__5.7;
> 
> 
> It could generate directly
> 
>   v4di = .POPCOUNT (v4di);

I guess that the vectorized popcount IFN is defined to be VnDI -> VnDI
but we want to have VnSImode results.  This means the instruction is
wrongly modeled in vectorized form?

Note the vectorizer isn't very good in handling narrowing operations here.

If you can push the missing patterns I can have a look.  Bonus points for
a correctness testcase (from the above I think we're generating wrong code).
Comment 7 Thomas Koenig 2020-11-10 09:07:43 UTC
Some literature:

https://arxiv.org/pdf/1611.07612
Comment 8 Uroš Bizjak 2020-11-10 10:14:46 UTC
(In reply to Richard Biener from comment #4)
> What's missing is middle-end folding support to narrow popcount to the
> appropriate internal function call with byte/half-word width when target
> support
> is available.  But I'm quite sure there's no scalar popcount instruction
> operating on half-word or byte pieces of a GPR?

x86 has popcnt that operates on 16bit register.

https://www.felixcloutier.com/x86/popcnt
Comment 9 Hongtao.liu 2020-11-12 05:18:57 UTC
> I guess that the vectorized popcount IFN is defined to be VnDI -> VnDI
> but we want to have VnSImode results.  This means the instruction is
> wrongly modeled in vectorized form?
> 

Yes, because we have __builtin_popcount{l,ll} defined as {BT_FN_INT_ULONG, BT_FN_INT_ULONGLONG}

but for vectorized form, gcc require mode of src and dest to be the same. 

popcountm2:

Store into operand 0 the number of 1-bits in operand 1.
m is either a scalar or vector integer mode. When it is a scalar, operand 1 has
mode m but operand 0 can have whatever scalar integer mode is suitable for the
target. The compiler will insert conversion instructions as necessary (typically to convert the result to the same width as int). When m is a vector, both operands must have mode m. This pattern is not allowed to FAIL.
Comment 10 Richard Biener 2020-11-12 07:33:32 UTC
Hmm, but

DEF_INTERNAL_INT_FN (POPCOUNT, ECF_CONST | ECF_NOTHROW, popcount, unary)

so there's clearly a mismatch between either the vectorizers interpretation
or the optab.  But as far as I can see this is not a direct internal fn so
vectorizable_internal_function shouldn't apply and I do not see the x86
backend handle POPCOUNT in the vectorizable function target hook.

So w/o a compiler capable I can't trace how the vectorizer vectorizes this
and thus I have no idea where it goes wrong ...
Comment 11 Hongtao.liu 2020-11-12 08:47:16 UTC
A patch is posted at https://gcc.gnu.org/pipermail/gcc-patches/2020-November/558777.html
Comment 12 CVS Commits 2020-12-03 02:03:13 UTC
The master branch has been updated by hongtao Liu <liuhongt@gcc.gnu.org>:

https://gcc.gnu.org/g:81d590760c31e11e3a09135f4e182aea232035f2

commit r11-5693-g81d590760c31e11e3a09135f4e182aea232035f2
Author: Hongyu Wang <hongyu.wang@intel.com>
Date:   Wed Nov 11 09:41:13 2020 +0800

    Add popcount<mode> expander to enable popcount auto vectorization under AVX512BITALG/AVX512POPCNTDQ target.
    
    gcc/ChangeLog
    
            PR target/97770
            * config/i386/sse.md (popcount<mode>2): New expander
            for SI/DI vector modes.
            (popcount<mode>2): Likewise for QI/HI vector modes.
    
    gcc/testsuite/ChangeLog
    
            PR target/97770
            * gcc.target/i386/avx512bitalg-pr97770-1.c: New test.
            * gcc.target/i386/avx512vpopcntdq-pr97770-1.c: Likewise.
            * gcc.target/i386/avx512vpopcntdq-pr97770-2.c: Likewise.
            * gcc.target/i386/avx512vpopcntdqvl-pr97770-1.c: Likewise.
Comment 13 Hongtao.liu 2020-12-03 02:04:08 UTC
(In reply to Richard Biener from comment #10)
> Hmm, but
> 
> DEF_INTERNAL_INT_FN (POPCOUNT, ECF_CONST | ECF_NOTHROW, popcount, unary)
> 
> so there's clearly a mismatch between either the vectorizers interpretation
> or the optab.  But as far as I can see this is not a direct internal fn so
> vectorizable_internal_function shouldn't apply and I do not see the x86
> backend handle POPCOUNT in the vectorizable function target hook.
> 
> So w/o a compiler capable I can't trace how the vectorizer vectorizes this
> and thus I have no idea where it goes wrong ...

capable compiler is ready.
Comment 14 Richard Biener 2020-12-03 11:24:27 UTC
So we vectorize to

  _18 = .POPCOUNT (vect__5.7_22);
  _17 = .POPCOUNT (vect__5.7_21);
  vect__6.8_16 = VEC_PACK_TRUNC_EXPR <_18, _17>;
  _6 = 0;
  _7 = dest_13(D) + _2;
  vect__8.9_10 = [vec_unpack_lo_expr] vect__6.8_16;
  vect__8.9_9 = [vec_unpack_hi_expr] vect__6.8_16;
  _8 = (long long int) _6;

which is exactly the issue that in the scalar code we have a 'int' producing
popcount with long long argument but the vector IFN produces a result of the
same width as the argument.  So the vectorizer compensates for that
(VEC_PACK_TRUNC_EXPR) and then vectorizes the widening that's in the scalar
code (vec_unpack_{lo,hi}_expr).  The fix for this and for the missing
byte and word variants is to add a pattern to tree-vect-patterns.c for this
case matching it to the .POPCOUNT internal function.  That possibly applies
to other bitops, too, like parity, ctz, ffs, etc.  There's quite some
_widen helpers in the pattern recog code so I'm not sure how complicated
it is to match

 (long)popcountl(long)

and

 (short)popcount((int)short)

Richard may have a good idea since he did the last "big" surgery there.