Running 531.deepsjeng_r refrate (ref) peak gcc7-m64 (1 copy) [2024-07-09 15:08:48]
Error with '/abuild/rguenther/cpu2017/bin/specinvoke -d /abuild/rguenther/cpu2017/benchspec/CPU/531.deepsjeng_r/run/run_peak_refrate_gcc7-m64.0000 -f compare.cmd -E -e compare.err -o compare.stdout'; no non-empty output files exist
Command returned exit code 1
*** Miscompare of ref.out; for details see
    /abuild/rguenther/cpu2017/benchspec/CPU/531.deepsjeng_r/run/run_peak_refrate_gcc7-m64.0000/ref.out.mis
I verified that dropping --param vect-partial-vector-usage=2 avoids the verification error.
Verified on trunk as well; GCC 13.3 works.
bitboard.o is miscompiled.  Each of the following, individually, is enough to
trigger the failure:

  bitboard.cpp:466:19: optimized: loop vectorized using 64 byte vectors
  bitboard.cpp:456:19: optimized: loop vectorized using 64 byte vectors
  bitboard.cpp:301:19: optimized: loop vectorized using 64 byte vectors

but these are not:

  bitboard.cpp:395:23: optimized: loop vectorized using 32 byte vectors
  bitboard.cpp:391:23: optimized: loop vectorized using 64 byte vectors
  bitboard.cpp:369:23: optimized: loop vectorized using 32 byte vectors
  bitboard.cpp:369:23: optimized: loop vectorized using 16 byte vectors
The loops are

  for (i = 0; i < 64; i++) {
    KnightMoves[i] = 0;
    if (Rank(i) > 0) {
      if (Rank(i) > 1) {
        if (File(i) > 0) KnightMoves[i] |= Mask[i-17];
        if (File(i) < 7) KnightMoves[i] |= Mask[i-15];
      }
      if (File(i) > 1) KnightMoves[i] |= Mask[i-10];
      if (File(i) < 6) KnightMoves[i] |= Mask[i-6];
    }
    if (Rank(i) < 7) {
      if (Rank(i) < 6) {
        if (File(i) > 0) KnightMoves[i] |= Mask[i+15];
        if (File(i) < 7) KnightMoves[i] |= Mask[i+17];
      }
      if (File(i) > 1) KnightMoves[i] |= Mask[i+6];
      if (File(i) < 6) KnightMoves[i] |= Mask[i+10];
    }
  }

  for (i = 0; i < 64; i++) {
    if (File(i) == FileA) {
      KingPressureMask[i] = KingSafetyMask[i + 1];
    } else if (File(i) == FileH) {
      KingPressureMask[i] = KingSafetyMask[i - 1];
    } else {
      KingPressureMask[i] = KingSafetyMask[i];
    }
  }

  for (i = 0; i < 64; i++) {
    if (File(i) == FileA) {
      KingPressureMask1[i] = KingSafetyMask1[i + 1];
    } else if (File(i) == FileH) {
      KingPressureMask1[i] = KingSafetyMask1[i - 1];
    } else {
      KingPressureMask1[i] = KingSafetyMask1[i];
    }
  }

The last loop is

  <bb 302> [local count: 145013]:

  <bb 183> [local count: 9271420]:
  # i_38 = PHI <_1526(215), 0(302)>
  # ivtmp_1427 = PHI <ivtmp_1430(215), 64(302)>
  _296 = i_38 & 7;
  _1526 = i_38 + 1;
  _380 = _296 == 0;
  _1371 = &KingSafetyMask1[_1526];
  _298 = .MASK_LOAD (_1371, 64B, _380);
  _804 = _296 == 7;
  _1370 = (unsigned int) i_38;
  _1369 = _1370 + 4294967295;
  _299 = (int) _1369;
  _1368 = &KingSafetyMask1[_299];
  _300 = .MASK_LOAD (_1368, 64B, _804);
  _301 = KingSafetyMask1[i_38];
  _ifc__1431 = _804 ? _300 : _301;
  _336 = _380 ? _298 : _ifc__1431;
  KingPressureMask1[i_38] = _336;
  ivtmp_1430 = ivtmp_1427 - 1;
  if (ivtmp_1430 != 0)
    goto <bb 215>; [98.44%]
  else
    goto <bb 189>; [1.56%]

  <bb 215> [local count: 9126407]:
  goto <bb 183>; [100.00%]

vectorized as

  <bb 183> [local count: 579464]:
  # vect_vec_iv_.194_1737 = PHI <_1915(215), { -15, -14, -13, -12, -11, -10, -9, -8, -7, -6, -5, -4, -3, -2, -1, 0 }(198)>
  # vectp_KingSafetyMask1.198_1768 = PHI <vectp_KingSafetyMask1.198_1859(215), &MEM <BITBOARD[64]> [(void *)&KingSafetyMask1 + -112B](198)>
  # vectp_KingSafetyMask1.204_1878 = PHI <vectp_KingSafetyMask1.204_1879(215), &MEM <BITBOARD[64]> [(void *)&KingSafetyMask1 + -128B](198)>
  # vectp_KingSafetyMask1.208_2015 = PHI <vectp_KingSafetyMask1.208_2017(215), &MEM <BITBOARD[64]> [(void *)&KingSafetyMask1 + -120B](198)>
  # vectp_KingPressureMask1.216_2023 = PHI <vectp_KingPressureMask1.216_2025(215), &MEM <BITBOARD[64]> [(void *)&KingPressureMask1 + -120B](198)>
  # ivtmp_2028 = PHI <ivtmp_2030(215), 79(198)>
  # loop_mask_1995 = PHI <_1989(215), { 0, 0, 0, 0, 0, 0, 0, 0 }(198)>
  # loop_mask_1860 = PHI <_1990(215), { 0, 0, 0, 0, 0, 0, 0, 0 }(198)>
  _1915 = vect_vec_iv_.194_1737 + { 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16 };
  vect__296.195_1901 = vect_vec_iv_.194_1737 & { 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7 };
  mask__380.196_1920 = vect__296.195_1901 == { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
  mask_patt_1854.197_1855 = [vec_unpack_lo_expr] mask__380.196_1920;
  mask_patt_1854.197_1733 = [vec_unpack_hi_expr] mask__380.196_1920;
  vec_mask_and_1997 = mask_patt_1854.197_1855 & loop_mask_1860;
  vect_patt_1732.200_1998 = .MASK_LOAD (vectp_KingSafetyMask1.198_1768, 128B, vec_mask_and_1997);
  vectp_KingSafetyMask1.198_1865 = vectp_KingSafetyMask1.198_1768 + 64;
  vec_mask_and_2002 = mask_patt_1854.197_1733 & loop_mask_1995;
  vect_patt_1732.201_2003 = .MASK_LOAD (vectp_KingSafetyMask1.198_1865, 128B, vec_mask_and_2002);
  mask__804.202_1876 = vect__296.195_1901 == { 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7 };
  mask_patt_1734.203_2005 = [vec_unpack_lo_expr] mask__804.202_1876;
  mask_patt_1734.203_2007 = [vec_unpack_hi_expr] mask__804.202_1876;
  vec_mask_and_2010 = mask_patt_1734.203_2005 & loop_mask_1860;
  vect_patt_1772.206_2012 = .MASK_LOAD (vectp_KingSafetyMask1.204_1878, 512B, vec_mask_and_2010);
  vectp_KingSafetyMask1.204_2013 = vectp_KingSafetyMask1.204_1878 + 64;
  vec_mask_and_1980 = mask_patt_1734.203_2007 & loop_mask_1995;
  vect_patt_1772.207_1981 = .MASK_LOAD (vectp_KingSafetyMask1.204_2013, 512B, vec_mask_and_1980);
  vect__301.210_1882 = .MASK_LOAD (vectp_KingSafetyMask1.208_2015, 64B, loop_mask_1860);
  vectp_KingSafetyMask1.208_2018 = vectp_KingSafetyMask1.208_2015 + 64;
  vect__301.211_2019 = .MASK_LOAD (vectp_KingSafetyMask1.208_2018, 64B, loop_mask_1995);
  vect_patt_1775.213_2021 = VEC_COND_EXPR <mask_patt_1734.203_2005, vect_patt_1772.206_2012, vect__301.210_1882>;
  vect_patt_1775.213_2022 = VEC_COND_EXPR <mask_patt_1734.203_2007, vect_patt_1772.207_1981, vect__301.211_2019>;
  vect_patt_1897.215_1984 = VEC_COND_EXPR <mask_patt_1854.197_1855, vect_patt_1732.200_1998, vect_patt_1775.213_2021>;
  vect_patt_1897.215_1985 = VEC_COND_EXPR <mask_patt_1854.197_1733, vect_patt_1732.201_2003, vect_patt_1775.213_2022>;
  .MASK_STORE (vectp_KingPressureMask1.216_2023, 64B, loop_mask_1860, vect_patt_1897.215_1984);
  vectp_KingPressureMask1.216_2026 = vectp_KingPressureMask1.216_2023 + 64;
  .MASK_STORE (vectp_KingPressureMask1.216_2026, 64B, loop_mask_1995, vect_patt_1897.215_1985);
  vectp_KingSafetyMask1.198_1859 = vectp_KingSafetyMask1.198_1865 + 64;
  vectp_KingSafetyMask1.204_1879 = vectp_KingSafetyMask1.204_2013 + 64;
  vectp_KingSafetyMask1.208_2017 = vectp_KingSafetyMask1.208_2018 + 64;
  vectp_KingPressureMask1.216_2025 = vectp_KingPressureMask1.216_2026 + 64;
  ivtmp_2030 = ivtmp_2028 - 16;
  _2031 = (unsigned short) ivtmp_2030;
  _1988 = {_2031, _2031, _2031, _2031, _2031, _2031, _2031, _2031};
  _1989 = { 8, 9, 10, 11, 12, 13, 14, 15 } < _1988;
  _1990 = { 0, 1, 2, 3, 4, 5, 6, 7 } < _1988;
  if (ivtmp_2028 > 16)
    goto <bb 215>; [74.97%]
  else
    goto <bb 529>; [25.03%]

  <bb 215> [local count: 434451]:
  goto <bb 183>; [100.00%]

and with -mtune=cascadelake -mprefer-vector-width=512 we avoid the failure, generating

  <bb 183> [local count: 435039]:
  # i_38 = PHI <_1526(215), 0(198)>
  # ivtmp_1427 = PHI <ivtmp_1430(215), 64(198)>
  # vect_vec_iv_.194_1737 = PHI <_1915(215), { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }(198)>
  # vectp_KingSafetyMask1.198_1768 = PHI <vectp_KingSafetyMask1.198_1859(215), &MEM <BITBOARD[64]> [(void *)&KingSafetyMask1 + 8B](198)>
  # vectp_KingSafetyMask1.204_1876 = PHI <vectp_KingSafetyMask1.204_2005(215), &MEM <BITBOARD[64]> [(void *)&KingSafetyMask1 + -8B](198)>
  # vectp_KingSafetyMask1.208_1879 = PHI <vectp_KingSafetyMask1.208_2010(215), &KingSafetyMask1(198)>
  # vectp_KingPressureMask1.216_2020 = PHI <vectp_KingPressureMask1.216_2021(215), &KingPressureMask1(198)>
  # ivtmp_1984 = PHI <ivtmp_1985(215), 0(198)>
  _1915 = vect_vec_iv_.194_1737 + { 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16 };
  vect__296.195_1901 = vect_vec_iv_.194_1737 & { 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7 };
  mask__380.196_1920 = vect__296.195_1901 == { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
  ...

The difference is peeling for alignment (which is an odd thing to do here, but ...).
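To make the masked GIMPLE above easier to follow, here is a minimal scalar model
of what it computes per element (my own sketch, not compiler output; the guarded
reads stand in for the .MASK_LOADs and the nested conditionals for the
VEC_COND_EXPR chain; king_pressure_model, filea, fileh, other and tmp are
made-up names):

  #include <stdint.h>

  typedef uint64_t BITBOARD;
  BITBOARD KingPressureMask1[64], KingSafetyMask1[64];

  /* Scalar model of the if-converted loop: filea/fileh model the two
     .MASK_LOADs guarded by (i & 7) == 0 resp. (i & 7) == 7, the two selects
     model the VEC_COND_EXPR chain feeding the .MASK_STORE.  */
  void king_pressure_model (void)
  {
    for (int i = 0; i < 64; i++)
      {
        BITBOARD filea = ((i & 7) == 0) ? KingSafetyMask1[i + 1] : 0;
        BITBOARD fileh = ((i & 7) == 7) ? KingSafetyMask1[i - 1] : 0;
        BITBOARD other = KingSafetyMask1[i];
        BITBOARD tmp = ((i & 7) == 7) ? fileh : other;
        KingPressureMask1[i] = ((i & 7) == 0) ? filea : tmp;
      }
  }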
The following fails with -O3 -mavx512vl --param vect-partial-vector-usage=2
-mtune=znver4 -mprefer-vector-width=512 but succeeds with -mtune=cascadelake.

typedef __UINT64_TYPE__ BITBOARD;
BITBOARD KingPressureMask1[64], KingSafetyMask1[64];

void __attribute__((noinline))
foo ()
{
  int i;

  for (i = 0; i < 64; i++) {
    if ((i & 7) == 0) {
      KingPressureMask1[i] = KingSafetyMask1[i + 1];
    } else if ((i & 7) == 7) {
      KingPressureMask1[i] = KingSafetyMask1[i - 1];
    } else {
      KingPressureMask1[i] = KingSafetyMask1[i];
    }
  }
}

BITBOARD verify[64] = {1, 1, 2, 3, 4, 5, 6, 6, 9, 9, 10, 11, 12, 13, 14, 14, 17, 17, 18,
  19, 20, 21, 22, 22, 25, 25, 26, 27, 28, 29, 30, 30, 33, 33, 34, 35, 36, 37, 38, 38, 41,
  41, 42, 43, 44, 45, 46, 46, 49, 49, 50, 51, 52, 53, 54, 54, 57, 57, 58, 59, 60, 61, 62, 62};

int main()
{
  for (int i = 0; i < 64; ++i)
    KingSafetyMask1[i] = i;
  foo ();
  for (int i = 0; i < 64; ++i)
    if (KingPressureMask1[i] != verify[i])
      __builtin_abort ();
  return 0;
}
t.c:9:17: note: misalignment for fully-masked loop: 15

so in the first iteration only the last element should be active.  But

  # loop_mask_58 = PHI <_100(10), { 0, 0, 0, 0, 0, 0, 0, 0 }(2)>
  # loop_mask_57 = PHI <_101(10), { 0, 0, 0, 0, 0, 0, 0, 0 }(2)>

is then wrong, and

  # vect_vec_iv_.6_46 = PHI <_47(10), { -15, -14, -13, -12, -11, -10, -9, -8, -7, -6, -5, -4, -3, -2, -1, 0 }(2)>
  _47 = vect_vec_iv_.6_46 + { 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16 };
  vect__1.7_49 = vect_vec_iv_.6_46 & { 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7 };

are the values for { 1, 2, ... }, thus the next iteration (not relevant for this
particular induction use).  There are then un-(loop-)masked uses of the mask
derived from vect__1.7_49 in

  vect_patt_15.25_84 = VEC_COND_EXPR <mask_patt_23.15_66, vect_patt_20.18_72, vect__6.22_79>;

but ultimately the loop mask is applied in its uses via .MASK_STORE.

I have a fix.
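For illustration, here is a sketch of the intended first-iteration masks (my own
model of the intended result, not the actual GCC code; initial_mask_component is
a made-up helper).  With VF = 16 split into two 8-lane QImode mask components,
each component has to shift the all-ones mask by the misalignment minus that
component's lane offset - the per-component bias:

  #include <stdint.h>
  #include <stdio.h>

  /* With a fully-masked loop, alignment "peeling" masks out the first
     'misalign' lanes of the first vector iteration.  Component k covers
     lanes [8*k, 8*k+8), so its share of the shift is misalign - 8*k,
     clamped to [0, 8].  */
  static uint8_t
  initial_mask_component (unsigned misalign, unsigned component)
  {
    unsigned bias = component * 8;
    unsigned shift = misalign > bias ? misalign - bias : 0;
    if (shift >= 8)
      return 0;                            /* component fully inactive */
    return (uint8_t) (0xffu << shift);     /* low 'shift' lanes masked out */
  }

  int main (void)
  {
    /* misalignment 15: only lane 15 may be active in the first iteration.  */
    printf ("%02x %02x\n",
            initial_mask_component (15, 0), initial_mask_component (15, 1));
    return 0;
  }

This prints "00 80", which is presumably what the kxorb %k4, %k4, %k4 and the
kmovb of $-128 (0x80) correspond to in the fixed GCC 14 branch assembly quoted
in the next comment.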
Note while on the GCC 14 branch with the fix as posted I see the correct

        movl    $-128, %eax
        vpxor   %xmm2, %xmm2, %xmm2
        kxorb   %k4, %k4, %k4
        kmovb   %eax, %k1
        vmovdqu64       KingSafetyMask1-56(%rip), %zmm0{%k1}{z}
        vmovdqu64       KingSafetyMask1-48(%rip), %zmm1{%k1}{z}
        movl    $64, %eax
        kmovb   %eax, %k2
..

oddly enough on trunk while there's

(insn 5 26 76 2 (set (reg:QI 4 si [orig:113 loop_mask_57 ] [113])
        (const_int -128 [0xffffffffffffff80])) "t.c":6:1 91 {*movqi_internal}
     (expr_list:REG_EQUAL (const_int -128 [0xffffffffffffff80])
        (nil)))
(insn:TI 76 5 92 2 (set (reg:QI 73 k5 [orig:113 loop_mask_57 ] [113])
        (reg:QI 4 si [orig:113 loop_mask_57 ] [113])) "t.c":6:1 91 {*movqi_internal}
     (expr_list:REG_DEAD (reg:QI 4 si [orig:113 loop_mask_57 ] [113])
        (nil)))

in .dfinish there's

        movl    $-128, %esi
        kmovw   %esi, %k5

in the assembly and we leak extra set bits into %k5.  I have a debug patch which
then causes the testcase to fail again on trunk but not on the branch.  How do we
end up with kmovw from the above insns?  It looks like *movqi_internal might
benefit from the new [] syntax - maybe alternatives/attributes got mixed up?
(In reply to Richard Biener from comment #7)
> Note while on the GCC 14 branch with the fix as posted I see the correct
> 
>         movl    $-128, %eax
>         vpxor   %xmm2, %xmm2, %xmm2
>         kxorb   %k4, %k4, %k4
>         kmovb   %eax, %k1
>         vmovdqu64       KingSafetyMask1-56(%rip), %zmm0{%k1}{z}
>         vmovdqu64       KingSafetyMask1-48(%rip), %zmm1{%k1}{z}
>         movl    $64, %eax
>         kmovb   %eax, %k2
> ..
> 
> oddly enough on trunk while there's
> 
> (insn 5 26 76 2 (set (reg:QI 4 si [orig:113 loop_mask_57 ] [113])
>         (const_int -128 [0xffffffffffffff80])) "t.c":6:1 91 {*movqi_internal}
>      (expr_list:REG_EQUAL (const_int -128 [0xffffffffffffff80])
>         (nil)))
> (insn:TI 76 5 92 2 (set (reg:QI 73 k5 [orig:113 loop_mask_57 ] [113])
>         (reg:QI 4 si [orig:113 loop_mask_57 ] [113])) "t.c":6:1 91
> {*movqi_internal}
>      (expr_list:REG_DEAD (reg:QI 4 si [orig:113 loop_mask_57 ] [113])
>         (nil)))
> 
> in .dfinish there's
> 
>         movl    $-128, %esi
>         kmovw   %esi, %k5
> 
> in the assembly and we leak extra set bits into %k5.  I have a debug patch
> which then causes the testcase to fail again on trunk but not on the branch.
> How do we end up with kmovw from the above insns?  It looks like
> *movqi_internal might benefit from the new [] syntax - maybe
> alternatives/attributes got mixed up?

*movqi_internal will emit kmovw for the kmov alternatives when -mno-avx512dq;
this was added in r7-4839-g46e89251c471b2.  So I wonder how GCC 14 gets to
choose kmovb with just -mavx512vl - the code is the same for this part.

But using kmovw for a QImode mask is not correct as we don't know the value in
the GPR.  Perhaps we should consider restricting this to kmovb under AVX512DQ
only.
Observed one missed optimization:

        kxorw   %k4, %k4, %k4   # 262   [c=4 l=4]  *movqi_internal/14
        vmovdqu64       %zmm0, KingPressureMask1-120(%rip){%k4}        # 44    [c=65 l=10]  avx512f_storev8di_mask
        vmovdqu64       %zmm0, KingPressureMask1-56(%rip){%k4} # 47    [c=65 l=10]  avx512f_storev8di_mask

When the mask is 0, the maskstore can be optimized away.
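For reference, a minimal intrinsics sketch (my own, assuming AVX-512F and
compilation with -mavx512f) of why such a store is dead: with an all-zero mask
no element is written, so both the kxor producing the mask and the store itself
could be removed.

  #include <immintrin.h>
  #include <stdio.h>

  int main (void)
  {
    long long buf[8] = {1, 2, 3, 4, 5, 6, 7, 8};
    __m512i v = _mm512_set1_epi64 (-1);

    /* All-zero mask: no lane is stored, buf is unchanged.  */
    _mm512_mask_storeu_epi64 (buf, (__mmask8) 0, v);

    printf ("%lld %lld\n", buf[0], buf[7]);   /* still prints 1 8 */
    return 0;
  }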
> But using kmovw for a QImode mask is not correct as we don't know the value
> in the GPR.  Perhaps we should consider restricting this to kmovb under
> AVX512DQ only.

Why?  As long as we only care about the lower 8 bits, kmovw should be fine.
(In reply to Hongtao Liu from comment #10)
> > But using kmovw for a QImode mask is not correct as we don't know the value
> > in the GPR.  Perhaps we should consider restricting this to kmovb under
> > AVX512DQ only.
> 
> Why?  As long as we only care about the lower 8 bits, kmovw should be fine.

Ah yes, I was wrong.  As long as the uses of the mask do not touch those extra
bits there's nothing wrong.  And I suppose the QI->HI conversion will use
zero-extension semantics, so we still get the correct value.
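To spell out the bit-level argument (a plain-C sketch of my own, purely
illustrative; via_kmovw / via_kmovb are made-up names, the comments map them
to the instructions discussed above):

  #include <stdint.h>
  #include <stdio.h>

  int main (void)
  {
    uint32_t gpr = (uint32_t) -128;          /* movl  $-128, %esi  -> 0xffffff80 */
    uint16_t via_kmovw = (uint16_t) gpr;     /* kmovw %esi, %k5    -> 0xff80     */
    uint8_t  via_kmovb = (uint8_t)  gpr;     /* kmovb %esi, %k5    -> 0x80       */

    /* kmovw leaks bits 8-15 into the mask register, but any use of %k5 as an
       8-lane (QImode) mask only consumes the low 8 bits, which are identical.  */
    printf ("kmovw: 0x%04x  kmovb: 0x%02x  low 8 bits equal: %d\n",
            via_kmovw, via_kmovb, (uint8_t) via_kmovw == via_kmovb);
    return 0;
  }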
The master branch has been updated by Richard Biener <rguenth@gcc.gnu.org>:

https://gcc.gnu.org/g:1e3aa9c9278db69d4bdb661a750a7268789188d6

commit r15-2054-g1e3aa9c9278db69d4bdb661a750a7268789188d6
Author: Richard Biener <rguenther@suse.de>
Date:   Mon Jul 15 13:01:24 2024 +0200

    Fixup unaligned load/store cost for znver4
    
    Currently unaligned YMM and ZMM load and store costs are cheaper than
    aligned which causes the vectorizer to purposely mis-align accesses
    by adding an alignment prologue.  It looks like the unaligned costs
    were simply left untouched from znver3 where they equate the aligned
    costs when tweaking aligned costs for znver4.  The following makes
    the unaligned costs equal to the aligned costs.
    
    This avoids the miscompile seen in PR115843 but it's of course not
    a real fix for the issue uncovered there.  But it makes it qualify
    as a regression fix.
    
            PR tree-optimization/115843
            * config/i386/x86-tune-costs.h (znver4_cost): Update unaligned
            load and store cost from the aligned costs.
Hmm, interesting.  We even vectorize this with just -mavx512f but end up using
vector(16) int besides vector(8) long and equality compares of vector(16) int:

        vpcmpd  $0, %zmm7, %zmm0, %k2

According to the docs that's fine with AVX512F.  But then for both long and
double you need byte masks, so I wonder why kmovb isn't in AVX512F ...

I will adjust the testcase to use only AVX512F and push the fix now.  I can't
reproduce the runfail in a different worktree.

Note I don't see all-zero masks, but

  vect_patt_22.11_6 = .MASK_LOAD (&MEM <BITBOARD[64]> [(void *)&KingSafetyMask1 + 8B], 64B, { -1, 0, 0, 0, 0, 0, 0, 0 });

could be optimized to movq $mem, %zmmN (just a single, or more generally a
power-of-two number of, initial elements read).  Not sure whether the
corresponding

  vect_patt_20.17_34 = .MASK_LOAD (&MEM <BITBOARD[64]> [(void *)&KingSafetyMask1 + -8B], 64B, { 0, 0, 0, 0, 0, 0, 0, -1 });

is worth optimizing to xor %zmmN, %zmmN plus pinsr $MEM, %zmmN.  Eliding
constant masks might help to avoid STLF issues due to false dependences on
masked-out elements (IIRC all uarchs currently suffer from that).

Note that even all-zero masks cannot be optimized on GIMPLE currently since the
value of the masked-out lanes isn't well-defined there (we're working on that).
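For the single-active-lane case, a small intrinsics sketch (my own, assuming
AVX-512F; illustrative only) of the equivalence in question: a zero-masking
load with mask 0x01 produces the same vector as a plain 64-bit scalar load
placed into lane 0 with the upper lanes zeroed, i.e. what a single movq/vmovq
would give.

  #include <immintrin.h>
  #include <string.h>
  #include <stdio.h>

  int main (void)
  {
    long long mem[8] = {0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, 0x88};

    /* Models .MASK_LOAD (..., { -1, 0, 0, 0, 0, 0, 0, 0 }) with zero-masking:
       only lane 0 is read, the other lanes are zero.  */
    __m512i masked = _mm512_maskz_loadu_epi64 ((__mmask8) 0x01, mem);

    /* The same value built from one scalar 64-bit load (movq-style).  */
    __m512i scalar = _mm512_set_epi64 (0, 0, 0, 0, 0, 0, 0, mem[0]);

    printf ("equal: %d\n", memcmp (&masked, &scalar, sizeof masked) == 0);
    return 0;
  }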
The master branch has been updated by Richard Biener <rguenth@gcc.gnu.org>:

https://gcc.gnu.org/g:a177be05f6952c3f7e62186d2e138d96c475b81a

commit r15-2055-ga177be05f6952c3f7e62186d2e138d96c475b81a
Author: Richard Biener <rguenther@suse.de>
Date:   Mon Jul 15 13:50:58 2024 +0200

    tree-optimization/115843 - fix wrong-code with fully-masked loop and peeling
    
    When AVX512 uses a fully masked loop and peeling we fail to create the
    correct initial loop mask when the mask is composed of multiple
    components in some cases.  The following fixes this by properly applying
    the bias for the component to the shift amount.
    
            PR tree-optimization/115843
            * tree-vect-loop-manip.cc
            (vect_set_loop_condition_partial_vectors_avx512): Properly bias
            the shift of the initial mask for alignment peeling.
    
            * gcc.dg/vect/pr115843.c: New testcase.
This should be fixed on trunk; I'll backport in time for 14.2.
The releases/gcc-11 branch has been updated by Richard Biener <rguenth@gcc.gnu.org>:

https://gcc.gnu.org/g:bcb2a35a0c04417c407a97d9ff05c2af1d6d1b8d

commit r11-11578-gbcb2a35a0c04417c407a97d9ff05c2af1d6d1b8d
Author: Richard Biener <rguenther@suse.de>
Date:   Mon Jul 15 13:01:24 2024 +0200

    Fixup unaligned load/store cost for znver4
    
    Currently unaligned YMM and ZMM load and store costs are cheaper than
    aligned which causes the vectorizer to purposely mis-align accesses
    by adding an alignment prologue.  It looks like the unaligned costs
    were simply left untouched from znver3 where they equate the aligned
    costs when tweaking aligned costs for znver4.  The following makes
    the unaligned costs equal to the aligned costs.
    
    This avoids the miscompile seen in PR115843 but it's of course not
    a real fix for the issue uncovered there.  But it makes it qualify
    as a regression fix.
    
            PR tree-optimization/115843
            * config/i386/x86-tune-costs.h (znver4_cost): Update unaligned
            load and store cost from the aligned costs.
    
    (cherry picked from commit 1e3aa9c9278db69d4bdb661a750a7268789188d6)
The releases/gcc-14 branch has been updated by Richard Biener <rguenth@gcc.gnu.org>:

https://gcc.gnu.org/g:d702a957753caf020cb550d143e9e9a62f79e9f5

commit r14-10434-gd702a957753caf020cb550d143e9e9a62f79e9f5
Author: Richard Biener <rguenther@suse.de>
Date:   Mon Jul 15 13:01:24 2024 +0200

    Fixup unaligned load/store cost for znver4
    
    Currently unaligned YMM and ZMM load and store costs are cheaper than
    aligned which causes the vectorizer to purposely mis-align accesses
    by adding an alignment prologue.  It looks like the unaligned costs
    were simply left untouched from znver3 where they equate the aligned
    costs when tweaking aligned costs for znver4.  The following makes
    the unaligned costs equal to the aligned costs.
    
    This avoids the miscompile seen in PR115843 but it's of course not
    a real fix for the issue uncovered there.  But it makes it qualify
    as a regression fix.
    
            PR tree-optimization/115843
            * config/i386/x86-tune-costs.h (znver4_cost): Update unaligned
            load and store cost from the aligned costs.
    
    (cherry picked from commit 1e3aa9c9278db69d4bdb661a750a7268789188d6)
The releases/gcc-14 branch has been updated by Richard Biener <rguenth@gcc.gnu.org>:

https://gcc.gnu.org/g:06829e593d2e5611e7924624cb8228795691e2b7

commit r14-10439-g06829e593d2e5611e7924624cb8228795691e2b7
Author: Richard Biener <rguenther@suse.de>
Date:   Mon Jul 15 13:50:58 2024 +0200

    tree-optimization/115843 - fix wrong-code with fully-masked loop and peeling
    
    When AVX512 uses a fully masked loop and peeling we fail to create the
    correct initial loop mask when the mask is composed of multiple
    components in some cases.  The following fixes this by properly applying
    the bias for the component to the shift amount.
    
            PR tree-optimization/115843
            * tree-vect-loop-manip.cc
            (vect_set_loop_condition_partial_vectors_avx512): Properly bias
            the shift of the initial mask for alignment peeling.
    
            * gcc.dg/vect/pr115843.c: New testcase.
    
    (cherry picked from commit a177be05f6952c3f7e62186d2e138d96c475b81a)
Fixed.
The master branch has been updated by hongtao Liu <liuhongt@gcc.gnu.org>:

https://gcc.gnu.org/g:228972b2b7bf50f4776f8ccae0d7c2950827d0f1

commit r15-2127-g228972b2b7bf50f4776f8ccae0d7c2950827d0f1
Author: liuhongt <hongtao.liu@intel.com>
Date:   Tue Jul 16 15:29:01 2024 +0800

    Optimize maskstore when mask is 0 or -1 in UNSPEC_MASKMOV
    
    gcc/ChangeLog:
    
            PR target/115843
            * config/i386/predicates.md (const0_or_m1_operand): New
            predicate.
            * config/i386/sse.md (*<avx512>_store<mode>_mask_1): New
            pre_reload define_insn_and_split.
            (V): Add V32BF,V16BF,V8BF.
            (V4SF_V8BF): Rename to ..
            (V24F_128): .. this.
            (*vec_concat<mode>): Adjust with V24F_128.
            (*vec_concat<mode>_0): Ditto.
    
    gcc/testsuite/ChangeLog:
    
            * gcc.target/i386/pr115843.c: New test.
The releases/gcc-13 branch has been updated by Richard Biener <rguenth@gcc.gnu.org>:

https://gcc.gnu.org/g:b35276655e6767a6e037e58edfa4738317498337

commit r13-8936-gb35276655e6767a6e037e58edfa4738317498337
Author: Richard Biener <rguenther@suse.de>
Date:   Mon Jul 15 13:01:24 2024 +0200

    Fixup unaligned load/store cost for znver4
    
    Currently unaligned YMM and ZMM load and store costs are cheaper than
    aligned which causes the vectorizer to purposely mis-align accesses
    by adding an alignment prologue.  It looks like the unaligned costs
    were simply left untouched from znver3 where they equate the aligned
    costs when tweaking aligned costs for znver4.  The following makes
    the unaligned costs equal to the aligned costs.
    
    This avoids the miscompile seen in PR115843 but it's of course not
    a real fix for the issue uncovered there.  But it makes it qualify
    as a regression fix.
    
            PR tree-optimization/115843
            * config/i386/x86-tune-costs.h (znver4_cost): Update unaligned
            load and store cost from the aligned costs.
    
    (cherry picked from commit 1e3aa9c9278db69d4bdb661a750a7268789188d6)
The releases/gcc-12 branch has been updated by Richard Biener <rguenth@gcc.gnu.org>:

https://gcc.gnu.org/g:f78eb9524bd97679c8baa47a62e82147272719ae

commit r12-10636-gf78eb9524bd97679c8baa47a62e82147272719ae
Author: Richard Biener <rguenther@suse.de>
Date:   Mon Jul 15 13:01:24 2024 +0200

    Fixup unaligned load/store cost for znver4
    
    Currently unaligned YMM and ZMM load and store costs are cheaper than
    aligned which causes the vectorizer to purposely mis-align accesses
    by adding an alignment prologue.  It looks like the unaligned costs
    were simply left untouched from znver3 where they equate the aligned
    costs when tweaking aligned costs for znver4.  The following makes
    the unaligned costs equal to the aligned costs.
    
    This avoids the miscompile seen in PR115843 but it's of course not
    a real fix for the issue uncovered there.  But it makes it qualify
    as a regression fix.
    
            PR tree-optimization/115843
            * config/i386/x86-tune-costs.h (znver4_cost): Update unaligned
            load and store cost from the aligned costs.
    
    (cherry picked from commit 1e3aa9c9278db69d4bdb661a750a7268789188d6)