For

#pragma omp declare simd simdlen(16) inbranch
__attribute__((noinline))
int foo (int a, int b)
{
  return a + b;
}

we present if-conversion with the following, which I think is good:

  <bb 3> [local count: 1073741824]:
  # iter.49_5 = PHI <0(2), iter.49_6(7)>
  # ivtmp_19 = PHI <16(2), ivtmp_18(7)>
  b_2 = b.45[iter.49_5];
  a_1 = a.44[iter.49_5];
  _8 = mask.48_7(D) >> iter.49_5;
  _9 = _8 & 1;
  if (_9 == 0)
    goto <bb 8>; [20.00%]
  else
    goto <bb 4>; [80.00%]

  <bb 8> [local count: 214748360]:
  goto <bb 5>; [100.00%]

  <bb 4> [local count: 858993464]:
  _3 = a_1 + b_2;
  retval.43[iter.49_5] = _3;

  <bb 5> [local count: 1073741824]:
  iter.49_6 = iter.49_5 + 1;
  ivtmp_18 = ivtmp_19 - 1;
  if (ivtmp_18 != 0)
    goto <bb 7>; [100.00%]
  else
    goto <bb 6>; [0.00%]

and we if-convert the body to

  <bb 3> [local count: 1073741824]:
  # iter.49_5 = PHI <iter.49_6(7), 0(15)>
  # ivtmp_19 = PHI <ivtmp_18(7), 16(15)>
  b_2 = b.45[iter.49_5];
  a_1 = a.44[iter.49_5];
  _8 = mask.48_7(D) >> iter.49_5;
  _9 = _8 & 1;
  _22 = _9 == 0;
  _36 = (unsigned int) a_1;
  _37 = (unsigned int) b_2;
  _38 = _36 + _37;
  _3 = (int) _38;
  _39 = ~_22;
  _40 = &retval.43[iter.49_5];
  .MASK_STORE (_40, 32B, _39, _3);
  iter.49_6 = iter.49_5 + 1;
  ivtmp_18 = ivtmp_19 - 1;
  if (ivtmp_18 != 0)
    goto <bb 7>; [100.00%]
  else
    goto <bb 6>; [0.00%]

but rather than recovering the mask in its original form we vectorize this as

  vect_cst__50 = {mask.48_7(D), mask.48_7(D), mask.48_7(D), mask.48_7(D), mask.48_7(D), mask.48_7(D), mask.48_7(D), mask.48_7(D), mask.48_7(D), mask.48_7(D), mask.48_7(D), mask.48_7(D), mask.48_7(D), mask.48_7(D), mask.48_7(D), mask.48_7(D)};

  <bb 3> [local count: 10631108]:
  # vect_vec_iv_.125_42 = PHI <_43(7), { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }(2)>
  # vectp_b.126_44 = PHI <vectp_b.126_45(7), &b.45(2)>
  # vectp_a.129_47 = PHI <vectp_a.129_48(7), &a.44(2)>
  # vectp_retval.140_61 = PHI <vectp_retval.140_62(7), &retval.43(2)>
  # ivtmp_64 = PHI <ivtmp_65(7), 0(2)>
  _43 = vect_vec_iv_.125_42 + { 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16 };
  vect_b_2.128_46 = MEM <vector(16) int> [(int *)vectp_b.126_44];
  vect_a_1.131_49 = MEM <vector(16) int> [(int *)vectp_a.129_47];
  vect__8.132_51 = vect_cst__50 >> vect_vec_iv_.125_42;
  vect__9.133_53 = vect__8.132_51 & { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 };
  mask__22.134_55 = vect__9.133_53 == { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
  vect__36.135_56 = VIEW_CONVERT_EXPR<vector(16) unsigned int>(vect_a_1.131_49);
  vect__37.136_57 = VIEW_CONVERT_EXPR<vector(16) unsigned int>(vect_b_2.128_46);
  vect__38.137_58 = vect__36.135_56 + vect__37.136_57;
  vect__3.138_59 = VIEW_CONVERT_EXPR<vector(16) int>(vect__38.137_58);
  mask__39.139_60 = ~mask__22.134_55;
  if (mask__39.139_60 == { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 })
    goto <bb 20>; [20.00%]
  else
    goto <bb 21>; [80.00%]

  <bb 21> [local count: 8504886]:
  .MASK_STORE (vectp_retval.140_61, 512B, mask__39.139_60, vect__3.138_59);

  <bb 20> [local count: 10631108]:
  vectp_b.126_45 = vectp_b.126_44 + 64;
  vectp_a.129_48 = vectp_a.129_47 + 64;
  vectp_retval.140_62 = vectp_retval.140_61 + 64;
  ivtmp_65 = ivtmp_64 + 1;
  if (ivtmp_65 < 1)
    goto <bb 7>; [0.00%]
  else
    goto <bb 17>; [100.00%]

in the end leaving us with

  <bb 2> [local count: 1073741824]:
  vect_cst__50 = {mask.48_7(D), mask.48_7(D), mask.48_7(D), mask.48_7(D), mask.48_7(D), mask.48_7(D), mask.48_7(D), mask.48_7(D), mask.48_7(D), mask.48_7(D), mask.48_7(D), mask.48_7(D), mask.48_7(D), mask.48_7(D), mask.48_7(D), mask.48_7(D)};
  vect__8.132_51 = vect_cst__50 >> { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 };
  vect__9.133_53 = vect__8.132_51 & { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 };
  vect__36.135_56 = VIEW_CONVERT_EXPR<vector(16) unsigned int>(simd.46_13(D));
  vect__37.136_57 = VIEW_CONVERT_EXPR<vector(16) unsigned int>(simd.47_15(D));
  vect__38.137_58 = vect__36.135_56 + vect__37.136_57;
  vect__3.138_59 = VIEW_CONVERT_EXPR<vector(16) int>(vect__38.137_58);
  mask__39.139_60 = vect__9.133_53 != { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
  if (mask__39.139_60 == { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 })
    goto <bb 4>; [20.00%]
  else
    goto <bb 3>; [80.00%]

  <bb 3> [local count: 858993419]:
  .MASK_STORE (&retval.43, 512B, mask__39.139_60, vect__3.138_59);

  <bb 4> [local count: 1073741824]:
  _10 = VIEW_CONVERT_EXPR<vector(16) int>(retval.43);
  return _10;

and

_ZGVeM16vv_foo:
.LFB4:
        .cfi_startproc
        movl    $1, %eax
        pushq   %rbp
        .cfi_def_cfa_offset 16
        .cfi_offset 6, -16
        vpbroadcastd    %edi, %zmm2
        vpaddd  %zmm1, %zmm0, %zmm0
        vpbroadcastd    %eax, %zmm3
        movq    %rsp, %rbp
        .cfi_def_cfa_register 6
        andq    $-64, %rsp
        vpsrlvd .LC0(%rip), %zmm2, %zmm2
        vpxor   %xmm1, %xmm1, %xmm1
        vpandd  %zmm3, %zmm2, %zmm2
        vpcmpd  $4, %zmm1, %zmm2, %k1
        kortestw        %k1, %k1
        je      .L25
        vmovdqa32       %zmm0, -64(%rsp){%k1}
.L25:
        vmovdqa32       -64(%rsp), %zmm0
        leave
        .cfi_def_cfa 7, 8
        ret
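[Editorial aside, not part of the PR comment: as a rough illustration of what a hand-written AVX-512 clone body could look like if the incoming 16-bit mask were used directly as a k register instead of being rebuilt via vpbroadcastd/vpsrlvd/vpandd/vpcmpd. The function name, prototype and the assumption that lanes masked off in the return value are don't-care are mine; this is a sketch, not what GCC emits.]

#include <immintrin.h>

/* Hypothetical counterpart of _ZGVeM16vv_foo, assuming inactive result
   lanes may hold any value: one zero-masking add, no stack round-trip.  */
__m512i foo_clone_sketch (__m512i a, __m512i b, unsigned short mask)
{
  /* Active lanes get a + b, inactive lanes are zeroed.  */
  return _mm512_maskz_add_epi32 ((__mmask16) mask, a, b);
}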
For AVX2 and simdlen(8) we get

  <bb 2> [local count: 1073741824]:
  vect__35.98_51 = VIEW_CONVERT_EXPR<vector(8) unsigned int>(simd.25_11(D));
  vect__36.99_52 = VIEW_CONVERT_EXPR<vector(8) unsigned int>(simd.26_13(D));
  vect__37.100_53 = vect__35.98_51 + vect__36.99_52;
  vect__3.101_54 = VIEW_CONVERT_EXPR<vector(8) int>(vect__37.100_53);
  mask__38.102_55 = mask.27_15(D) != { 0, 0, 0, 0, 0, 0, 0, 0 };
  if (mask__38.102_55 == { 0, 0, 0, 0, 0, 0, 0, 0 })
    goto <bb 4>; [20.00%]
  else
    goto <bb 3>; [80.00%]

  <bb 3> [local count: 858993419]:
  .MASK_STORE (&retval.21, 256B, mask__38.102_55, vect__3.101_54);

  <bb 4> [local count: 1073741824]:
  _8 = VIEW_CONVERT_EXPR<vector(8) int>(retval.21);
  return _8;

which is much more reasonable - I'm not sure whether the compare against 0 is required or if the ABI guarantees either 0 or -1 in the elements.

That we end up going through memory, due to the use of a .MASK_STORE, unfortunately persists into the final assembly:

_ZGVdM8vv_foo:
.LFB3:
        .cfi_startproc
        vpaddd  %ymm1, %ymm0, %ymm0
        vpxor   %xmm1, %xmm1, %xmm1
        pushq   %rbp
        .cfi_def_cfa_offset 16
        .cfi_offset 6, -16
        vpcmpeqd        %ymm1, %ymm2, %ymm2
        movq    %rsp, %rbp
        .cfi_def_cfa_register 6
        andq    $-32, %rsp
        vpcmpeqd        %ymm1, %ymm2, %ymm2
        vptest  %ymm2, %ymm2
        je      .L12
        vpmaskmovd      %ymm0, %ymm2, -32(%rsp)
.L12:
        vmovdqa -32(%rsp), %ymm0
        leave
        .cfi_def_cfa 7, 8
        ret

There's the opportunity to rewrite the .MASK_STORE to a VEC_COND_EXPR and to rewrite retval.21 into SSA form. Maybe if-conversion can see this already, given retval.21 is a local automatic and not address-taken.
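[Editorial aside: a rough C-level sketch of the VEC_COND_EXPR idea mentioned above, assuming the lanes masked off in the return value are don't-care. The function name is made up; the point is only that blending in registers avoids the masked store to and reload from the stack slot.]

#include <immintrin.h>

/* Sketch: select the sum under the 0/-1 per-lane mask instead of doing a
   .MASK_STORE into a stack temporary; inactive lanes are simply zeroed.  */
__m256i foo_avx2_sketch (__m256i a, __m256i b, __m256i mask)
{
  __m256i sum = _mm256_add_epi32 (a, b);
  return _mm256_blendv_epi8 (_mm256_setzero_si256 (), sum, mask);
}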
(In reply to Richard Biener from comment #2)
> which is much more reasonable - I'm not sure whether the compare against 0
> is required or if the ABI guarantees either 0 or -1 in the elements.

Yes, it does. All-ones elements, that is; the element type can be unsigned integral or floating point as well...
So, shouldn't we pattern match (in match.pd or something else)

  vect_cst__50 = {mask.48_7(D), mask.48_7(D), mask.48_7(D), mask.48_7(D), mask.48_7(D), mask.48_7(D), mask.48_7(D), mask.48_7(D), mask.48_7(D), mask.48_7(D), mask.48_7(D), mask.48_7(D), mask.48_7(D), mask.48_7(D), mask.48_7(D), mask.48_7(D)};
  vect__8.132_51 = vect_cst__50 >> { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 };
  vect__9.133_53 = vect__8.132_51 & { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 };
  mask__39.139_60 = vect__9.133_53 != { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };

back into

  mask__39.139_60 = mask.48_7(D);

?
(In reply to Jakub Jelinek from comment #4)
> So, shouldn't we match.pd (or something else) pattern match
> vect_cst__50 = {mask.48_7(D), mask.48_7(D), mask.48_7(D), mask.48_7(D),
> mask.48_7(D), mask.48_7(D), mask.48_7(D), mask.48_7(D), mask.48_7(D),
> mask.48_7(D), mask.48_7(D), mask.48_7(D), mask.48_7(D), mask.48_7(D),
> mask.48_7(D), mask.48_7(D)};
> vect__8.132_51 = vect_cst__50 >> { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
> 12, 13, 14, 15 };
> vect__9.133_53 = vect__8.132_51 & { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
> 1, 1, 1 };
> mask__39.139_60 = vect__9.133_53 != { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
> 0, 0, 0, 0 };
> back into mask__39.139_60 = mask.48_7(D); ?

Yes, that's a possibility. I wonder if it's possible to arrange things in the vectorizer itself so that costing gets more accurate (probably not that important for OMP SIMD though).

Maybe it works a bit better if we did mask & (1 << iv), but I guess we canonicalize that back.

I've opened this for tracking for now, working on PR111795 first.
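[Editorial aside: the identity behind the proposed fold is easy to check in scalar form. Per lane i, ((mask >> i) & 1) != 0 is exactly bit i of mask, so collecting the lane predicates back into a 16-bit value reproduces the original mask. A self-contained check, not GCC code:]

#include <assert.h>

int main (void)
{
  for (unsigned mask = 0; mask < (1u << 16); ++mask)
    {
      unsigned short rebuilt = 0;
      for (int i = 0; i < 16; ++i)
        if (((mask >> i) & 1) != 0)        /* per-lane predicate, as in vect__9 != 0 */
          rebuilt |= (unsigned short) (1u << i);
      assert (rebuilt == (unsigned short) mask);
    }
  return 0;
}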
Non-OpenMP testcase for the AVX512 mask inefficiency:

unsigned foo (unsigned *a, unsigned short mask)
{
  unsigned sum = 0;
  for (int i = 0; i < 16; ++i)
    if ((mask >> i) & 1)
      sum += a[i];
  return sum;
}

I think the AVX2 one is settled as being as good as it can get.
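[Editorial aside: for comparison, hand-written AVX-512F code for this testcase can use the incoming 16-bit mask directly as a __mmask16, so no broadcast/shift/and/compare sequence is needed. The function name is made up; this is a sketch, not a claim about what the vectorizer should emit.]

#include <immintrin.h>

unsigned foo_by_hand (unsigned *a, unsigned short mask)
{
  /* Masked load touches only the active lanes, matching the original loop,
     then a horizontal add reduces the 16 lanes.  */
  __m512i v = _mm512_maskz_loadu_epi32 ((__mmask16) mask, a);
  return (unsigned) _mm512_reduce_add_epi32 (v);
}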
The master branch has been updated by Richard Biener <rguenth@gcc.gnu.org>:

https://gcc.gnu.org/g:4b75ed33fa5fd604897e7a30e79bd28d46598373

commit r15-1391-g4b75ed33fa5fd604897e7a30e79bd28d46598373
Author: Richard Biener <rguenther@suse.de>
Date:   Fri Jun 14 14:46:08 2024 +0200

    Enhance if-conversion for automatic arrays

    Automatic arrays that are not address-taken should not be subject to
    store data races.  This applies to OMP SIMD in-branch lowered functions
    result array which for the testcase otherwise prevents vectorization
    with SSE and for AVX and AVX512 ends up with spurious .MASK_STORE to
    the stack surviving.

    This inefficiency was noted in PR111793.

    I've introduced ref_can_have_store_data_races, commonizing uses of
    flag_store_data_races in if-conversion, cselim and store motion.

            PR tree-optimization/111793
            * tree-ssa-alias.h (ref_can_have_store_data_races): Declare.
            * tree-ssa-alias.cc (ref_can_have_store_data_races): New function.
            * tree-if-conv.cc (ifcvt_memrefs_wont_trap): Use
            ref_can_have_store_data_races to allow more unconditional stores.
            * tree-ssa-loop-im.cc (execute_sm): Likewise.
            * tree-ssa-phiopt.cc (cond_store_replacement): Likewise.
            * gcc.dg/vect/vect-simd-clone-21.c: New testcase.
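[Editorial aside: an illustration of the kind of code the change enables; this is my example, not the committed vect-simd-clone-21.c testcase. A conditional store into a local array that is never address-taken can be if-converted into an unconditional store, because no other thread can access the array and therefore no store data race can be introduced.]

int sum_selected (const int *val, const int *flag, int n)
{
  int tmp[256] = { 0 };                 /* automatic, not address-taken */
  int sum = 0;
  for (int i = 0; i < n && i < 256; ++i)
    if (flag[i])
      tmp[i] = val[i];                  /* conditional store into the local array */
  for (int i = 0; i < 256; ++i)
    sum += tmp[i];
  return sum;
}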
The committed fix should resolve the stray masked stores observed here, but not the inefficiency in dealing with incoming AVX512 mask arguments.