cat test.c --- void foo(int* __restrict dest, int* src, int n) { for (int i = 0; i != 8; i++) dest[i] = __builtin_popcount (src[i]); } --- gcc -O3 -march=icelake-server -S -fopt-info-all Inlined 0 calls, eliminated 0 functions test.c:4:3: missed: couldn't vectorize loop test.c:5:15: missed: not vectorized: relevant stmt not supported: _7 = __builtin_popcount (_5); test.c:2:1: note: vectorized 0 loops in function. test.c:4:3: note: ***** Analysis failed with vector mode VOID test.c:4:3: note: ***** Analysis failed with vector mode V8SI test.c:4:3: note: ***** Skipping vector mode V32QI, which would repeat the analysis for V8SI test.c:6:1: note: ***** Analysis failed with vector mode VOID This loop could be vectorized by ICC and Clang: foo(int*, int*, int): vpopcntd ymm0, YMMWORD PTR [rsi] #5.15 vmovdqu YMMWORD PTR [rdi], ymm0 #5.5 vzeroupper #6.1 ret
On the target side, we need to add an expander for popcountm2, where m is a vector mode.
After adding expander, successfully vectorize the loop. --- diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index b153a87fb98..e8159997c40 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -22678,6 +22678,12 @@ (define_insn "avx5124vnniw_vp4dpwssds_maskz" (set_attr ("prefix") ("evex")) (set_attr ("mode") ("TI"))]) +(define_expand "popcount<mode>2" + [(set (match_operand:VI48_AVX512VL 0 "register_operand") + (popcount:VI48_AVX512VL + (match_operand:VI48_AVX512VL 1 "nonimmediate_operand")))] + "TARGET_AVX512VPOPCNTDQ") + (define_insn "vpopcount<mode><mask_name>" [(set (match_operand:VI48_AVX512VL 0 "register_operand" "=v") (popcount:VI48_AVX512VL @@ -22722,6 +22728,12 @@ (define_insn "*restore_multiple_leave_return<mode>" "TARGET_SSE && TARGET_64BIT" "jmp\t%P1") +(define_insn "popcount<mode>2" + [(set (match_operand:VI12_AVX512VL 0 "register_operand" "=v") + (popcount:VI12_AVX512VL + (match_operand:VI12_AVX512VL 1 "nonimmediate_operand" "vm")))] + "TARGET_AVX512BITALG") + (define_insn "vpopcount<mode><mask_name>" [(set (match_operand:VI12_AVX512VL 0 "register_operand" "=v") (popcount:VI12_AVX512VL --- But for vector byte/word/quadword, vectorizer still use vpopcntd, but not vpopcnt{b,w,q}, missing corresponding ifn? 
void fooq(long long* __restrict dest, long long* src) { for (int i = 0; i != 4; i++) dest[i] = __builtin_popcount (src[i]); } void foow(short* __restrict dest, short* src) { for (int i = 0; i != 16; i++) dest[i] = __builtin_popcount (src[i]); } void foob(char* __restrict dest, char* src) { for (int i = 0; i != 32; i++) dest[i] = __builtin_popcount (src[i]); } dump of test.c.164.vect ;; Function foow (foow, funcdef_no=0, decl_uid=4228, cgraph_uid=1, symbol_order=0) Merging blocks 2 and 6 foow (short int * restrict dest, short int * src) { vector(8) short int * vectp_dest.10; vector(8) short int * vectp_dest.9; vector(8) short int vect__8.8; vector(4) int vect__6.7; vector(4) unsigned int vect__5.6; vector(8) short int vect__4.5; vector(8) short int * vectp_src.4; vector(8) short int * vectp_src.3; int i; long unsigned int _1; long unsigned int _2; short int * _3; short int _4; unsigned int _5; int _6; short int * _7; short int _8; unsigned int ivtmp_26; unsigned int ivtmp_28; unsigned int ivtmp_34; unsigned int ivtmp_35; <bb 2> [local count: 119292720]: <bb 3> [local count: 119292719]: # i_19 = PHI <i_15(5), 0(2)> # ivtmp_35 = PHI <ivtmp_34(5), 8(2)> # vectp_src.3_24 = PHI <vectp_src.3_23(5), src_12(D)(2)> # vectp_dest.9_9 = PHI <vectp_dest.9_29(5), dest_13(D)(2)> # ivtmp_26 = PHI <ivtmp_28(5), 0(2)> _1 = (long unsigned int) i_19; _2 = _1 * 2; _3 = src_12(D) + _2; vect__4.5_22 = MEM <vector(8) short int> [(short int *)vectp_src.3_24]; _4 = *_3; vect__5.6_21 = [vec_unpack_lo_expr] vect__4.5_22; vect__5.6_18 = [vec_unpack_hi_expr] vect__4.5_22; _5 = (unsigned int) _4; vect__6.7_17 = .POPCOUNT (vect__5.6_21); vect__6.7_16 = .POPCOUNT (vect__5.6_18); _6 = 0; _7 = dest_13(D) + _2; vect__8.8_10 = VEC_PACK_TRUNC_EXPR <vect__6.7_17, vect__6.7_16>; _8 = (short int) _6; MEM <vector(8) short int> [(short int *)vectp_dest.9_9] = vect__8.8_10; i_15 = i_19 + 1; ivtmp_34 = ivtmp_35 - 1; vectp_src.3_23 = vectp_src.3_24 + 16; vectp_dest.9_29 = vectp_dest.9_9 + 16; ivtmp_28 = 
ivtmp_26 + 1; if (ivtmp_28 < 1) goto <bb 5>; [0.00%] else goto <bb 4>; [100.00%] <bb 5> [local count: 0]: goto <bb 3>; [100.00%] <bb 4> [local count: 119292720]: return; }
> But for vector byte/word/quadword, vectorizer still use vpopcntd, but not > vpopcnt{b,w,q}, missing corresponding ifn? We don't have __builtin_popcount{w,b}, but we have __builtin_popcountl. for testcase --- void fooq(unsigned long long* __restrict dest, unsigned long long* src) { for (int i = 0; i != 4; i++) dest[i] = __builtin_popcountl (src[i]); } ---- icc/clang generate --- _Z4fooqPxS_: # @_Z4fooqPxS_ vpopcntq ymm0, ymmword ptr [rsi] vmovdqu ymmword ptr [rdi], ymm0 vzeroupper ret --- But gcc generate --- fooq: .LFB0: .cfi_startproc vpopcntq 16(%rsi), %xmm1 vpopcntq (%rsi), %xmm0 vshufps $136, %xmm1, %xmm0, %xmm0 vpmovsxdq %xmm0, %xmm1 vpsrldq $8, %xmm0, %xmm0 vpmovsxdq %xmm0, %xmm0 vmovdqu %xmm1, (%rdi) vmovdqu %xmm0, 16(%rdi) ret .cfi_endproc --- dump for 164.vect --- ;; Function fooq (fooq, funcdef_no=0, decl_uid=4228, cgraph_uid=1, symbol_order=0) Merging blocks 2 and 6 fooq (long long unsigned int * restrict dest, long long unsigned int * src) { vector(2) long long unsigned int * vectp_dest.10; vector(2) long long unsigned int * vectp_dest.9; vector(2) long long unsigned int vect__7.8; vector(4) int vect__5.7; vector(2) long long unsigned int vect__4.6; vector(2) long long unsigned int vect__4.5; vector(2) long long unsigned int * vectp_src.4; vector(2) long long unsigned int * vectp_src.3; int i; long unsigned int _1; long unsigned int _2; long long unsigned int * _3; long long unsigned int _4; int _5; long long unsigned int * _6; long long unsigned int _7; vector(2) long long unsigned int _8; vector(2) long long unsigned int _26; unsigned int ivtmp_30; unsigned int ivtmp_31; unsigned int ivtmp_36; unsigned int ivtmp_37; <bb 2> [local count: 214748368]: <bb 3> [local count: 214748371]: # i_18 = PHI <i_14(5), 0(2)> # ivtmp_31 = PHI <ivtmp_30(5), 4(2)> # vectp_src.3_20 = PHI <vectp_src.3_17(5), src_11(D)(2)> # vectp_dest.9_24 = PHI <vectp_dest.9_32(5), dest_12(D)(2)> # ivtmp_36 = PHI <ivtmp_37(5), 0(2)> _1 = (long unsigned int) i_18; _2 = _1 * 8; _3 = 
src_11(D) + _2; vect__4.5_16 = MEM <vector(2) long long unsigned int> [(long long unsigned int *)vectp_src.3_20]; vectp_src.3_15 = vectp_src.3_20 + 16; vect__4.6_9 = MEM <vector(2) long long unsigned int> [(long long unsigned int *)vectp_src.3_15]; _4 = *_3; _8 = .POPCOUNT (vect__4.5_16); _26 = .POPCOUNT (vect__4.6_9); vect__5.7_22 = VEC_PACK_TRUNC_EXPR <_8, _26>; --- Why do we do this? _5 = 0; _6 = dest_12(D) + _2; vect__7.8_23 = [vec_unpack_lo_expr] vect__5.7_22; vect__7.8_25 = [vec_unpack_hi_expr] vect__5.7_22; _7 = (long long unsigned int) _5; MEM <vector(2) long long unsigned int> [(long long unsigned int *)vectp_dest.9_24] = vect__7.8_23; vectp_dest.9_34 = vectp_dest.9_24 + 16; MEM <vector(2) long long unsigned int> [(long long unsigned int *)vectp_dest.9_34] = vect__7.8_25; i_14 = i_18 + 1; ivtmp_30 = ivtmp_31 - 1; vectp_src.3_17 = vectp_src.3_15 + 16; vectp_dest.9_32 = vectp_dest.9_34 + 16; ivtmp_37 = ivtmp_36 + 1; if (ivtmp_37 < 1) goto <bb 5>; [0.00%] else goto <bb 4>; [100.00%] <bb 5> [local count: 0]: goto <bb 3>; [100.00%] <bb 4> [local count: 214748368]: return; } ---
What's missing is middle-end folding support to narrow popcount to the appropriate internal function call with byte/half-word width when target support is available. But I'm quite sure there's no scalar popcount instruction operating on half-word or byte pieces of a GPR? Alternatively the vectorizer can use patterns to do this.
(In reply to Richard Biener from comment #4) > What's missing is middle-end folding support to narrow popcount to the > appropriate internal function call with byte/half-word width when target > support > is available. But I'm quite sure there's no scalar popcount instruction > operating on half-word or byte pieces of a GPR? > > Alternatively the vectorizer can use patterns to do this. Yes, but for 64bit width, vectorizer generate suboptimal code. sse #c3 vector(2) long long unsigned int vect__4.6; vector(2) long long unsigned int vect__4.5; vector(2) long long unsigned int _8; vector(2) long long unsigned int _26; ... ... _8 = .POPCOUNT (vect__4.5_16); _26 = .POPCOUNT (vect__4.6_9); vect__5.7_22 = VEC_PACK_TRUNC_EXPR <_8, _26>; --- Why do we do this? vector(4) int vect__5.7; It could generate directly v4di = .POPCOUNT (v4di);
(In reply to Hongtao.liu from comment #5) > (In reply to Richard Biener from comment #4) > > What's missing is middle-end folding support to narrow popcount to the > > appropriate internal function call with byte/half-word width when target > > support > > is available. But I'm quite sure there's no scalar popcount instruction > > operating on half-word or byte pieces of a GPR? > > > > Alternatively the vectorizer can use patterns to do this. > > Yes, but for 64bit width, vectorizer generate suboptimal code. > > sse #c3 > > vector(2) long long unsigned int vect__4.6; > vector(2) long long unsigned int vect__4.5; > vector(2) long long unsigned int _8; > vector(2) long long unsigned int _26; > > ... > ... > > _8 = .POPCOUNT (vect__4.5_16); > _26 = .POPCOUNT (vect__4.6_9); > vect__5.7_22 = VEC_PACK_TRUNC_EXPR <_8, _26>; --- Why do we do this? > vector(4) int vect__5.7; > > > It could generate directly > > v4di = .POPCOUNT (v4di); I guess that the vectorized popcount IFN is defined to be VnDI -> VnDI but we want to have VnSImode results. This means the instruction is wrongly modeled in vectorized form? Note the vectorizer isn't very good in handling narrowing operations here. If you can push the missing patterns I can have a look. Bonus points for a correctness testcase (from the above I think we're generating wrong code).
Some literature: https://arxiv.org/pdf/1611.07612
(In reply to Richard Biener from comment #4) > What's missing is middle-end folding support to narrow popcount to the > appropriate internal function call with byte/half-word width when target > support > is available. But I'm quite sure there's no scalar popcount instruction > operating on half-word or byte pieces of a GPR? x86 has popcnt that operates on 16bit register. https://www.felixcloutier.com/x86/popcnt
> I guess that the vectorized popcount IFN is defined to be VnDI -> VnDI > but we want to have VnSImode results. This means the instruction is > wrongly modeled in vectorized form? > Yes, because we have __builtin_popcount{l,ll} defined as {BT_FN_INT_ULONG, BT_FN_INT_ULONGLONG}, but for the vectorized form, gcc requires the modes of src and dest to be the same. popcountm2: Store into operand 0 the number of 1-bits in operand 1. m is either a scalar or vector integer mode. When it is a scalar, operand 1 has mode m but operand 0 can have whatever scalar integer mode is suitable for the target. The compiler will insert conversion instructions as necessary (typically to convert the result to the same width as int). When m is a vector, both operands must have mode m. This pattern is not allowed to FAIL.
Hmm, but DEF_INTERNAL_INT_FN (POPCOUNT, ECF_CONST | ECF_NOTHROW, popcount, unary) so there's clearly a mismatch between either the vectorizers interpretation or the optab. But as far as I can see this is not a direct internal fn so vectorizable_internal_function shouldn't apply and I do not see the x86 backend handle POPCOUNT in the vectorizable function target hook. So w/o a compiler capable I can't trace how the vectorizer vectorizes this and thus I have no idea where it goes wrong ...
A patch is posted at https://gcc.gnu.org/pipermail/gcc-patches/2020-November/558777.html
The master branch has been updated by hongtao Liu <liuhongt@gcc.gnu.org>: https://gcc.gnu.org/g:81d590760c31e11e3a09135f4e182aea232035f2 commit r11-5693-g81d590760c31e11e3a09135f4e182aea232035f2 Author: Hongyu Wang <hongyu.wang@intel.com> Date: Wed Nov 11 09:41:13 2020 +0800 Add popcount<mode> expander to enable popcount auto vectorization under AVX512BITALG/AVX512POPCNTDQ target. gcc/ChangeLog PR target/97770 * config/i386/sse.md (popcount<mode>2): New expander for SI/DI vector modes. (popcount<mode>2): Likewise for QI/HI vector modes. gcc/testsuite/ChangeLog PR target/97770 * gcc.target/i386/avx512bitalg-pr97770-1.c: New test. * gcc.target/i386/avx512vpopcntdq-pr97770-1.c: Likewise. * gcc.target/i386/avx512vpopcntdq-pr97770-2.c: Likewise. * gcc.target/i386/avx512vpopcntdqvl-pr97770-1.c: Likewise.
(In reply to Richard Biener from comment #10) > Hmm, but > > DEF_INTERNAL_INT_FN (POPCOUNT, ECF_CONST | ECF_NOTHROW, popcount, unary) > > so there's clearly a mismatch between either the vectorizers interpretation > or the optab. But as far as I can see this is not a direct internal fn so > vectorizable_internal_function shouldn't apply and I do not see the x86 > backend handle POPCOUNT in the vectorizable function target hook. > > So w/o a compiler capable I can't trace how the vectorizer vectorizes this > and thus I have no idea where it goes wrong ... A capable compiler is now available.
So we vectorize to _18 = .POPCOUNT (vect__5.7_22); _17 = .POPCOUNT (vect__5.7_21); vect__6.8_16 = VEC_PACK_TRUNC_EXPR <_18, _17>; _6 = 0; _7 = dest_13(D) + _2; vect__8.9_10 = [vec_unpack_lo_expr] vect__6.8_16; vect__8.9_9 = [vec_unpack_hi_expr] vect__6.8_16; _8 = (long long int) _6; which is exactly the issue that in the scalar code we have a 'int' producing popcount with long long argument but the vector IFN produces a result of the same width as the argument. So the vectorizer compensates for that (VEC_PACK_TRUNC_EXPR) and then vectorizes the widening that's in the scalar code (vec_unpack_{lo,hi}_expr). The fix for this and for the missing byte and word variants is to add a pattern to tree-vect-patterns.c for this case matching it to the .POPCOUNT internal function. That possibly applies to other bitops, too, like parity, ctz, ffs, etc. There's quite some _widen helpers in the pattern recog code so I'm not sure how complicated it is to match (long)popcountl(long) and (short)popcount((int)short) Richard may have a good idea since he did the last "big" surgery there.
(In reply to Richard Biener from comment #14) > So we vectorize to > > _18 = .POPCOUNT (vect__5.7_22); > _17 = .POPCOUNT (vect__5.7_21); > vect__6.8_16 = VEC_PACK_TRUNC_EXPR <_18, _17>; > _6 = 0; > _7 = dest_13(D) + _2; > vect__8.9_10 = [vec_unpack_lo_expr] vect__6.8_16; > vect__8.9_9 = [vec_unpack_hi_expr] vect__6.8_16; > _8 = (long long int) _6; > > which is exactly the issue that in the scalar code we have a 'int' producing > popcount with long long argument but the vector IFN produces a result of the > same width as the argument. So the vectorizer compensates for that > (VEC_PACK_TRUNC_EXPR) and then vectorizes the widening that's in the scalar > code (vec_unpack_{lo,hi}_expr). The fix for this and for the missing > byte and word variants is to add a pattern to tree-vect-patterns.c for this > case matching it to the .POPCOUNT internal function. That possibly applies > to other bitops, too, like parity, ctz, ffs, etc. There's quite some > _widen helpers in the pattern recog code so I'm not sure how complicated > it is to match > > (long)popcountl(long) > > and > > (short)popcount((int)short) > > Richard may have a good idea since he did the last "big" surgery there. Any suggestion for this, should we change prototype of builtins or add vec_recog_popcnt_pattern in vectorizer?
Changing the signature will not help given we don't want to regress other cases. Instead we have to somehow remove the pro- and demotions with the help of patterns. I think using .POPCOUNT should work there.
(In reply to Richard Biener from comment #16) > Changing the signature will not help given we don't want to regress other > cases. > Instead we have to somehow remove the pro- and demotions with the help of > patterns. I think using .POPCOUNT should work there. I'm thinking of reimplementing _mm_popcnt_u64 with a backend builtin like __builtin_ia32_popcountll which is defined as ULONGLONG_FTYPE_ULONGLONG, and handle that in TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION.
On Thu, 10 Jun 2021, crazylht at gmail dot com wrote: > https://gcc.gnu.org/bugzilla/show_bug.cgi?id=97770 > > --- Comment #17 from Hongtao.liu <crazylht at gmail dot com> --- > (In reply to Richard Biener from comment #16) > > Changing the signature will not help given we don't want to regress other > > cases. > > Instead we have to somehow remove the pro- and demotions with the help of > > patterns. I think using .POPCOUNT should work there. > > I'm thinking of reimplementing _mm_popcnt_u64 with a backend builtin like > __builtin_ia32_popcountll which is defined as ULONGLONG_FTYPE_ULONGLONG, and > handle that in TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION. That's going to work I guess but it will pessimize general optimization which no longer know this is computing popcount (not sure if the old version exposed that fact).
(In reply to rguenther@suse.de from comment #18) > > That's going to work I guess but it will pessimize general optimization > which no longer know this is computing popcount (not sure if the old > version exposed that fact). Do we have testcase for that?
(In reply to H.J. Lu from comment #19) > (In reply to rguenther@suse.de from comment #18) > > > > That's going to work I guess but it will pessimize general optimization > > which no longer know this is computing popcount (not sure if the old > > version exposed that fact). > > Do we have testcase for that? Sth like int foo (unsigned long long a) { return _mm_popcnt_u64 (a) == 0; } basically anything that triggers a simplification of POPCOUNT as implemented in match.pd (there are quite some). But no, I don't think we have any of those using the x86 intrinsics in the testsuite.
The master branch has been updated by hongtao Liu <liuhongt@gcc.gnu.org>: https://gcc.gnu.org/g:e08a125b208e717f99caa991052537305ca75b6a commit r12-1707-ge08a125b208e717f99caa991052537305ca75b6a Author: liuhongt <hongtao.liu@intel.com> Date: Wed Jun 16 17:34:43 2021 +0800 Add vect_recog_popcount_pattern to handle mismatch between the vectorized popcount IFN and scalar popcount builtin. The patch remove those pro- and demotions when backend support direct optab. For i386: it enables vectorization for vpopcntb/vpopcntw and optimized for vpopcntq. gcc/ChangeLog: PR tree-optimization/97770 * tree-vect-patterns.c (vect_recog_popcount_pattern): New. (vect_recog_func vect_vect_recog_func_ptrs): Add new pattern. gcc/testsuite/ChangeLog: PR tree-optimization/97770 * gcc.target/i386/avx512bitalg-pr97770-1.c: Remove xfail. * gcc.target/i386/avx512vpopcntdq-pr97770-1.c: Remove xfail.
Fixed in GCC 12.