Created attachment 50597 [details]
denbench_mp2decoddata2.cpp

https://godbolt.org/z/EGoz1zx61

cat test.cpp

static inline void idctrow(e_s16 *blk)
{
  e_s32 x0, x1, x2, x3, x4, x5, x6, x7, x8;

  if (!((x1 = blk[4]<<11) | (x2 = blk[6]) | (x3 = blk[2]) | (x4 = blk[1])
        | (x5 = blk[7]) | (x6 = blk[5]) | (x7 = blk[3])))
    {
      blk[0]=blk[1]=blk[2]=blk[3]=blk[4]=blk[5]=blk[6]=blk[7]=(e_s16)blk[0]<<3;
      return;
    }

  x0 = (blk[0]<<11) + 128;

  x8 = 565*(x4+x5);
  x4 = x8 + (2841 -565)*x4;
  x5 = x8 - (2841 +565)*x5;
  x8 = 2408*(x6+x7);
  x6 = x8 - (2408 -1609)*x6;
  x7 = x8 - (2408 +1609)*x7;

  x8 = x0 + x1;
  x0 -= x1;
  x1 = 1108*(x3+x2);
  x2 = x1 - (2676 +1108)*x2;
  x3 = x1 + (2676 -1108)*x3;

  x1 = x4 + x6;
  x4 -= x6;
  x6 = x5 + x7;
  x5 -= x7;

  x7 = x8 + x3;
  x8 -= x3;
  x3 = x0 + x2;
  x0 -= x2;
  x2 = (181*(x4+x5)+128)>>8;
  x4 = (181*(x4-x5)+128)>>8;

  blk[0] = (e_s16)((x7+x1)>>8);
  blk[1] = (e_s16)((x3+x2)>>8);
  blk[2] = (e_s16)((x0+x4)>>8);
  blk[3] = (e_s16)((x8+x6)>>8);
  blk[4] = (e_s16)((x8-x6)>>8);
  blk[5] = (e_s16)((x0-x4)>>8);
  blk[6] = (e_s16)((x3-x2)>>8);
  blk[7] = (e_s16)((x7-x1)>>8);
}

int __attribute__ ((noipa)) Fast_IDCT(e_s16 *block)
{
  e_s32 i;

  for (i=0; i<8; i++)
    idctrow(block+8*i);

  return 1;
}

pass_ifcvt transforms the if branch in idctrow into conditional moves, and pass_vect then finds that, although there is no loop vectorization opportunity, there are opportunities for SLP. The SLP cost model, however, does not consider the cost of these conditional moves, which eventually leaves a large number of redundant test and cmov instructions in the generated code.

test.cpp:76:11: note:   stmt 1 MEM[(e_s16 *)_3 + 2B] = _ifc__264;
test.cpp:76:11: note:   stmt 2 MEM[(e_s16 *)_3 + 4B] = _ifc__267;
test.cpp:76:11: note:   stmt 3 MEM[(e_s16 *)_3 + 6B] = _ifc__270;
test.cpp:76:11: note:   stmt 4 MEM[(e_s16 *)_3 + 8B] = _ifc__273;
test.cpp:76:11: note:   stmt 5 MEM[(e_s16 *)_3 + 10B] = _ifc__276;
test.cpp:76:11: note:   stmt 6 MEM[(e_s16 *)_3 + 12B] = _ifc__279;
test.cpp:76:11: note:   stmt 7 MEM[(e_s16 *)_3 + 14B] = _ifc__282;
test.cpp:76:11: note:   children 0x3ec9580
test.cpp:76:11: note: node (external) 0x3ec9580 (max_nunits=1, refcnt=1)
test.cpp:76:11: note: { _ifc__261, _ifc__264, _ifc__267, _ifc__270, _ifc__273, _ifc__276, _ifc__279, _ifc__282 }
test.cpp:76:11: note: Cost model analysis:
0x3c1aee0 _ifc__261 1 times scalar_store costs 12 in body
0x3c1aee0 _ifc__264 1 times scalar_store costs 12 in body
0x3c1aee0 _ifc__267 1 times scalar_store costs 12 in body
0x3c1aee0 _ifc__270 1 times scalar_store costs 12 in body
0x3c1aee0 _ifc__273 1 times scalar_store costs 12 in body
0x3c1aee0 _ifc__276 1 times scalar_store costs 12 in body
0x3c1aee0 _ifc__279 1 times scalar_store costs 12 in body
0x3c1aee0 _ifc__282 1 times scalar_store costs 12 in body
0x3c1aee0 _ifc__261 1 times unaligned_store (misalign -1) costs 12 in body
0x3c1aee0 <unknown> 1 times vec_construct costs 32 in prologue
test.cpp:76:11: note: Cost model analysis for part in loop 1:
  Vector cost: 44
  Scalar cost: 96

int Fast_IDCT (e_s16 * block)
{
  vector(8) short int * vectp.78;
  vector(8) short int * vectp.77;
  e_s32 x0; e_s32 x1; e_s32 x2; e_s32 x3; e_s32 x4;
  e_s32 x5; e_s32 x6; e_s32 x7; e_s32 x8;
  e_s32 i;
  long unsigned int i.0_1;
  long unsigned int _2;
  e_s16 * _3;
  unsigned long ivtmp_4; unsigned long ivtmp_5;
  short int _10; int _11; int _12; short int _14; short int _16;
  short int _17; short int _19; short int _20; short int _22;
  short int _23; short int _25; short int _26; short int _28; short int _29;
  long int _31;
  int _34; short int _35; int _38; int _39;
  long int _41; long int _43; long int _45; long int _47; long int _49;
  long int _51; long int _55; long int _57; long int _59;
  long int _69; long int _70; long int _71; long int _73; long int _74;
  long int _75; long int _77; long int _78; short int _79; long int _80;
  long int _81; short int _82; long int _83; long int _84; short int _85;
  long int _86; long int _87; short int _88; long int _89; long int _90;
  short int _91; long int _92; long int _93; short int _94; long int _95;
  long int _96; short int _97; long int _98; long int _99; short int _100;
  long int _118;
  bool _121;
  short int pretmp_159; int _160;
  short int _ifc__237; short int _ifc__240; short int _ifc__243;
  short int _ifc__246; short int _ifc__249; short int _ifc__252;
  short int _ifc__255; short int _ifc__258; short int _ifc__261;
  short int _ifc__264; short int _ifc__267; short int _ifc__270;
  short int _ifc__273; short int _ifc__276; short int _ifc__279;
  short int _ifc__282;
  vector(8) short int _1154;

  <bb 2> [local count: 119292720]:
  _121 = 1;

  <bb 3> [local count: 954449105]:
  # i_122 = PHI <i_9(8), 0(2)>
  # ivtmp_5 = PHI <ivtmp_4(8), 8(2)>
  i.0_1 = (long unsigned int) i_122;
  _2 = i.0_1 * 16;
  _3 = block_7(D) + _2;
  _10 = MEM[(e_s16 *)_3 + 8B];
  _11 = (int) _10;
  _12 = _11 << 11;
  x1_13 = (e_s32) _12;
  _14 = MEM[(e_s16 *)_3 + 12B];
  _17 = MEM[(e_s16 *)_3 + 4B];
  _16 = _14 | _17;
  _20 = MEM[(e_s16 *)_3 + 2B];
  _19 = _16 | _20;
  _23 = MEM[(e_s16 *)_3 + 14B];
  _22 = _19 | _23;
  _26 = MEM[(e_s16 *)_3 + 10B];
  _25 = _22 | _26;
  _29 = MEM[(e_s16 *)_3 + 6B];
  _28 = _25 | _29;
  _118 = (long int) _28;
  _31 = x1_13 | _118;
  pretmp_159 = *_3;
  _160 = (int) pretmp_159;
  _34 = _160 << 3;
  _35 = (short int) _34;
  _ifc__237 = _31 != 0 ? _23 : _35;
  _ifc__240 = _31 != 0 ? _14 : _35;
  _ifc__243 = _31 != 0 ? _26 : _35;
  _ifc__246 = _31 != 0 ? _10 : _35;
  _ifc__249 = _31 != 0 ? _29 : _35;
  _ifc__252 = _31 != 0 ? _17 : _35;
  _ifc__255 = _31 != 0 ? _20 : _35;
  _ifc__258 = _31 == 0 ? _35 : pretmp_159;
  x2_15 = (e_s32) _14;
  x3_18 = (e_s32) _17;
  x4_21 = (e_s32) _20;
  x5_24 = (e_s32) _23;
  x6_27 = (e_s32) _26;
  x7_30 = (e_s32) _29;
  _38 = _160 << 11;
  _39 = _38 + 128;
  x0_40 = (e_s32) _39;
  _41 = x4_21 + x5_24;
  x8_42 = _41 * 565;
  _43 = x4_21 * 2276;
  x4_44 = x8_42 + _43;
  _45 = x5_24 * -3406;
  x5_46 = x8_42 + _45;
  _47 = x6_27 + x7_30;
  x8_48 = _47 * 2408;
  _49 = x6_27 * -799;
  x6_50 = x8_48 + _49;
  _51 = x7_30 * -4017;
  x7_52 = x8_48 + _51;
  x8_53 = x1_13 + x0_40;
  x0_54 = x0_40 - x1_13;
  _55 = x2_15 + x3_18;
  x1_56 = _55 * 1108;
  _57 = x2_15 * -3784;
  x2_58 = x1_56 + _57;
  _59 = x3_18 * 1568;
  x3_60 = x1_56 + _59;
  x1_61 = x4_44 + x6_50;
  x4_62 = x4_44 - x6_50;
  x6_63 = x5_46 + x7_52;
  x5_64 = x5_46 - x7_52;
  x7_65 = x8_53 + x3_60;
  x8_66 = x8_53 - x3_60;
  x3_67 = x0_54 + x2_58;
  x0_68 = x0_54 - x2_58;
  _69 = x4_62 + x5_64;
  _70 = _69 * 181;
  _71 = _70 + 128;
  x2_72 = _71 >> 8;
  _73 = x4_62 - x5_64;
  _74 = _73 * 181;
  _75 = _74 + 128;
  x4_76 = _75 >> 8;
  _77 = x1_61 + x7_65;
  _78 = _77 >> 8;
  _79 = (short int) _78;
  _ifc__261 = _31 != 0 ? _79 : _ifc__258;
  _80 = x3_67 + x2_72;
  _81 = _80 >> 8;
  _82 = (short int) _81;
  _ifc__264 = _31 != 0 ? _82 : _ifc__255;
  _83 = x0_68 + x4_76;
  _84 = _83 >> 8;
  _85 = (short int) _84;
  _ifc__267 = _31 != 0 ? _85 : _ifc__252;
  _86 = x6_63 + x8_66;
  _87 = _86 >> 8;
  _88 = (short int) _87;
  _ifc__270 = _31 != 0 ? _88 : _ifc__249;
  _89 = x8_66 - x6_63;
  _90 = _89 >> 8;
  _91 = (short int) _90;
  _ifc__273 = _31 != 0 ? _91 : _ifc__246;
  _92 = x0_68 - x4_76;
  _93 = _92 >> 8;
  _94 = (short int) _93;
  _ifc__276 = _31 != 0 ? _94 : _ifc__243;
  _95 = x3_67 - x2_72;
  _96 = _95 >> 8;
  _97 = (short int) _96;
  _ifc__279 = _31 != 0 ? _97 : _ifc__240;
  _98 = x7_65 - x1_61;
  _99 = _98 >> 8;
  _100 = (short int) _99;
  _ifc__282 = _31 != 0 ? _100 : _ifc__237;
  _1154 = {_ifc__261, _ifc__264, _ifc__267, _ifc__270, _ifc__273, _ifc__276, _ifc__279, _ifc__282};
  vectp.78_1155 = _3;
  MEM <vector(8) short int> [(e_s16 *)vectp.78_1155] = _1154;
  i_9 = i_122 + 1;
  ivtmp_4 = ivtmp_5 - 1;
  if (ivtmp_4 != 0)
    goto <bb 8>; [87.50%]
  else
    goto <bb 7>; [12.50%]

  <bb 8> [local count: 835156386]:
  goto <bb 3>; [100.00%]

  <bb 7> [local count: 119292720]:
  return 1;

}
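To spell the mis-costing out with the figures from the dump above (plain arithmetic on the quoted numbers, nothing new measured): the scalar side is the eight stores at 12 each, i.e. 8 * 12 = 96; the vector side is one unaligned_store (12) plus one vec_construct (32), i.e. 44. SLP therefore looks like a clear win. But the sixteen _ifc__ COND_EXPRs feeding the vector construction are costed on neither side of the comparison, and they are exactly what survives as the test/cmov sequences in the generated code.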
Indeed, loop vectorization throws if-converted bodies at the BB vectorizer as a last resort (because BB vectorization doesn't do if-conversion itself). But the BB vectorizer then uses the if-converted scalar code as the thing to cost against (costing against the not-if-converted loop body isn't really possible). To quote:

  /* If we applied if-conversion then try to vectorize the BB of innermost
     loops.
     ???  Ideally BB vectorization would learn to vectorize control flow
     by applying if-conversion on-the-fly, the following retains the
     if-converted loop body even when only non-if-converted parts took part
     in BB vectorization.  */
  if (flag_tree_slp_vectorize != 0
      && loop_vectorized_call
      && ! loop->inner)
    {

As a "hack" we could try to scalar-cost the always-executed part of the not-if-converted loop body and apply the full bias of this cost vs. the scalar cost of the if-converted body to the scalar cost of the BB vectorization. But that's really apples-to-oranges in the end (as it is now). Maybe we can cost the whole partly vectorized loop body in this mode and compare it against the scalar cost of the original loop. But even the loop vectorizer costs the if-converted scalar loop, so it is off as well.

Long-term, if-conversion needs to be integrated with vectorization so we can at least keep track of which stmts were originally executed conditionally and which not. Short-term I'm not sure we can do much. Doing SLP on the if-converted body does help in quite some cases.
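Spelled out, one reading of that bias idea (the notation is entirely illustrative, not an actual patch):

  bias    = scalar_cost(if-converted body) - scalar_cost(always-executed part of original body)
  decide:   vector_cost  <  scalar_cost(stmts covered by BB vectorization) - bias

i.e. the comparison would effectively be made against the always-executed part of the original loop body, so the conditionally executed stmts no longer inflate the scalar side. As noted above, though, this remains apples-to-oranges because the conditional parts are then represented on neither side.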
GCC 11.1 has been released, retargeting bugs to GCC 11.2.
Is this really a costing issue, or should we instead reject the BB fallback if it leaves any scalar COND_EXPRs around? This would be similar to the way that we reject IFN_MASK_LOAD/STORE calls, except that the COND_EXPR tests would only apply to unvectorised statements and so would need to be tested after SLP discovery rather than before it. (Ideally IFN_MASK_LOAD/STORE would work like that too.)
(In reply to rsandifo@gcc.gnu.org from comment #3)
> Is this really a costing issue, or should we instead reject the
> BB fallback if it leaves any scalar COND_EXPRs around? This would
> be similar to the way that we reject IFN_MASK_LOAD/STORE calls,
> except that the COND_EXPR tests would only apply to unvectorised
> statements and so would need to be tested after SLP discovery
> rather than before it. (Ideally IFN_MASK_LOAD/STORE would work
> like that too.)

I suppose we could do that, but then I'm not sure how exactly we'd do it ;)  Good idea anyway.
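For illustration, a minimal sketch of such a check against GCC's GIMPLE API (the function name is made up, and it assumes it runs after SLP discovery, with stmts that took part in vectorization carrying the visited flag):

  /* Sketch: return true if BB still contains a scalar COND_EXPR that
     no SLP instance covered.  bb_has_unvectorized_cond_expr_p is a
     hypothetical helper, not an actual GCC function.  */
  static bool
  bb_has_unvectorized_cond_expr_p (basic_block bb)
  {
    for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
         gsi_next (&gsi))
      {
        gimple *stmt = gsi_stmt (gsi);
        /* Only stmts not covered by any SLP instance matter; assume
           those were left with the visited flag unset.  */
        if (is_gimple_assign (stmt)
            && !gimple_visited_p (stmt)
            && gimple_assign_rhs_code (stmt) == COND_EXPR)
          return true;
      }
    return false;
  }

The BB fallback would then be rejected when this returns true, mirroring the existing IFN_MASK_LOAD/STORE rejection.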
GCC 11.2 is being released, retargeting bugs to GCC 11.3.
Working on this now.
The master branch has been updated by hongtao Liu <liuhongt@gcc.gnu.org>:

https://gcc.gnu.org/g:819b7c3a339e3bdaf85cd55954c5536bd98aae09

commit r12-3103-g819b7c3a339e3bdaf85cd55954c5536bd98aae09
Author: liuhongt <hongtao.liu@intel.com>
Date:   Wed Aug 4 16:39:31 2021 +0800

    Disable slp in loop vectorizer when cost model is very-cheap.

    Performance impact for the commit with option:
    -march=x86-64 -O2 -ftree-vectorize -fvect-cost-model=very-cheap

    SPEC2017 fprate
    503.bwaves_r     BuildSame
    507.cactuBSSN_r  -0.04
    508.namd_r        0.14
    510.parest_r     -0.54
    511.povray_r      0.10
    519.lbm_r        BuildSame
    521.wrf_r         0.64
    526.blender_r    -0.32
    527.cam4_r        0.17
    538.imagick_r     0.09
    544.nab_r        BuildSame
    549.fotonik3d_r  BuildSame
    554.roms_r       BuildSame
    997.specrand_fr  -0.09
    Geometric mean:   0.02

    SPEC2017 intrate
    500.perlbench_r   0.26
    502.gcc_r         0.21
    505.mcf_r        -0.09
    520.omnetpp_r    BuildSame
    523.xalancbmk_r  BuildSame
    525.x264_r       -0.41
    531.deepsjeng_r  BuildSame
    541.leela_r       0.13
    548.exchange2_r  BuildSame
    557.xz_r         BuildSame
    999.specrand_ir  BuildSame
    Geometric mean:   0.02

    EEMBC: no regression, only improvement or build the same; below are
    the improved benchmarks.

    mp2decoddata1     7.59
    mp2decoddata2    31.80
    mp2decoddata3    12.15
    mp2decoddata4    11.16
    mp2decoddata5    11.19
    mp2decoddata1     7.06
    mp2decoddata2    24.12
    mp2decoddata3    10.83
    mp2decoddata4    10.04
    mp2decoddata5    10.07

    gcc/ChangeLog:

            PR tree-optimization/100089
            * tree-vectorizer.c (try_vectorize_loop_1): Disable slp in loop
            vectorizer when cost model is very-cheap.
So if we agree on a sane way to cost branchy code on the scalar side, then it should be possible to compare the scalar cost of the not-if-converted inner loop body against the full, partially vectorized and if-converted inner loop body. vect_bb_vectorization_profitable_p would have to add the cost of the scalar stmts not covered by vectorization - this set is conveniently available as the set of stmts not having the visited flag set before we clear it here:

  vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo,
                                      vec<slp_instance> slp_instances)
  {
  ...
    /* Unset visited flag.  */
    stmt_info_for_cost *cost;
    FOR_EACH_VEC_ELT (scalar_costs, i, cost)
      gimple_set_visited (cost->stmt_info->stmt, false);

so we'd need to walk over all stmts in the BB and add the cost of the not marked stmts to the vector cost. We'd want to force a single SLP "subgraph" in this mode to avoid going over the whole block multiple times.
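A sketch of that walk (hypothetical; it assumes it runs before the visited flags are cleared, so covered stmts still have the flag set, and estimate_scalar_stmt_cost is a made-up stand-in for whatever the real scalar costing hook would be):

  /* Sketch: charge every scalar stmt not covered by any SLP instance
     (visited flag still unset) to the vector side of the comparison.  */
  unsigned uncovered_cost = 0;
  for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
       gsi_next (&gsi))
    {
      gimple *stmt = gsi_stmt (gsi);
      if (!is_gimple_debug (stmt) && !gimple_visited_p (stmt))
        uncovered_cost += estimate_scalar_stmt_cost (stmt); /* hypothetical */
    }
  vec_cost += uncovered_cost;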
Created attachment 51350 [details]
patch

This implements scanning for not vectorized COND_EXPRs with -fvect-cost-model=very-cheap when vectorizing if-converted loop bodies. It also gives vect_bb_vectorization_profitable_p enough info to eventually do something fancier (namely access to the original, not-if-converted loop body).
The master branch has been updated by Richard Biener <rguenth@gcc.gnu.org>:

https://gcc.gnu.org/g:9216ee6d1195d48388f825cf1b072e570129cbbe

commit r12-3116-g9216ee6d1195d48388f825cf1b072e570129cbbe
Author: Richard Biener <rguenther@suse.de>
Date:   Tue Aug 24 12:25:25 2021 +0200

    tree-optimization/100089 - avoid leaving scalar if-converted code around

    This avoids leaving scalar if-converted code around for the case of BB
    vectorizing an if-converted loop body when using the very-cheap cost
    model.  In this case we scan not vectorized scalar stmts in the
    basic-block vectorized for COND_EXPRs and force the vectorization to
    be marked as not profitable.  The patch also makes sure to always
    consider all BB vectorization subgraphs together for costing purposes
    when vectorizing an if-converted loop body.

    2021-08-24  Richard Biener  <rguenther@suse.de>

            PR tree-optimization/100089
            * tree-vectorizer.h (vect_slp_bb): Rename to ...
            (vect_slp_if_converted_bb): ... this and get the original loop
            as new argument.
            * tree-vectorizer.c (try_vectorize_loop_1): Revert previous fix,
            pass original loop to vect_slp_if_converted_bb.
            * tree-vect-slp.c (vect_bb_vectorization_profitable_p): If
            orig_loop was passed scan the not vectorized stmts for
            COND_EXPRs and force not profitable if found.
            (vect_slp_region): Pass down all SLP instances to costing if
            orig_loop was specified.
            (vect_slp_bbs): Pass through orig_loop.
            (vect_slp_bb): Rename to ...
            (vect_slp_if_converted_bb): ... this and get the original loop
            as new argument.
            (vect_slp_function): Adjust.
So this fixes it for -fvect-cost-model=very-cheap. One could argue that we should enable the code for all cost models, fixing the -O3 regression (and making it backportable to the branch). I'll experiment with "fancy" costing of the stray vectorizations.

I'll also note that the scanning for masked loads/stores (and other ifns not supported) could be handled in the costing as well (but the costing would then need to run also for -fno-vect-cost-model, with just the result ignored if not forced). I'm talking about

  bool require_loop_vectorize = false;
  for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
       gsi_next (&gsi))
    {
      gimple *stmt = gsi_stmt (gsi);
      gcall *call = dyn_cast <gcall *> (stmt);
      if (call && gimple_call_internal_p (call))
        {
          internal_fn ifn = gimple_call_internal_fn (call);
          if (ifn == IFN_MASK_LOAD || ifn == IFN_MASK_STORE
              /* Don't keep the if-converted parts when the ifn with
                 specific type is not supported by the backend.  */
              || (direct_internal_fn_p (ifn)
                  && !direct_internal_fn_supported_p (call,
                                                      OPTIMIZE_FOR_SPEED)))
            {
              require_loop_vectorize = true;
              break;
            }
        }
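Folding that scan into the costing could reuse the same uncovered-stmt walk as for the COND_EXPRs, sketched roughly below (hypothetical, with the same caveats as the earlier snippets; the calls themselves are the ones quoted above):

  /* Sketch: during costing, treat a not-vectorized masked load/store
     or an unsupported direct internal function like an unvectorized
     COND_EXPR and force the not-profitable verdict.  */
  if (gcall *call = dyn_cast <gcall *> (stmt))
    if (gimple_call_internal_p (call))
      {
        internal_fn ifn = gimple_call_internal_fn (call);
        if (ifn == IFN_MASK_LOAD
            || ifn == IFN_MASK_STORE
            || (direct_internal_fn_p (ifn)
                && !direct_internal_fn_supported_p (call,
                                                    OPTIMIZE_FOR_SPEED)))
          return false;  /* not profitable */
      }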
The master branch has been updated by Richard Biener <rguenth@gcc.gnu.org>:

https://gcc.gnu.org/g:199cd0e0f8744ca1e61a95987b2d020a592a46d9

commit r12-6795-g199cd0e0f8744ca1e61a95987b2d020a592a46d9
Author: Richard Biener <rguenther@suse.de>
Date:   Fri Jan 21 13:29:06 2022 +0100

    tree-optimization/100089 - BB vectorization of if-converted loop bodies

    The PR complains that when we only partially BB vectorize an
    if-converted loop body that this can leave unvectorized code
    unconditionally executed and thus effectively slow down code.
    For -O2 we already mitigated the issue by not doing BB vectorization
    when not all if-converted stmts were covered but the issue is
    present with -O3 as well.  Thus the following simply extends the
    fix to cover all but the unlimited cost models.  It is after all
    very likely that we vectorize some stmts, if only a single
    paired store.

    2022-01-21  Richard Biener  <rguenther@suse.de>

            PR tree-optimization/100089
            * tree-vect-slp.cc (vect_slp_region): Reject BB vectorization
            of if-converted loops with unvectorized COND_EXPRs for
            all but the unlimited cost models.
Now fixed for GCC 12 even with -O3.
GCC 11.3 is being released, retargeting bugs to GCC 11.4.