The reproducer is a bit big, but I was not able to reduce it further. Reproducer: // func.cpp #include <algorithm> extern short var_1, var_29, var_89; extern unsigned var_2, var_11; extern bool var_4; extern long var_6; extern char var_7; extern int var_8, var_10; extern short arr_206[10][14][13][21][14] __attribute__((aligned)); extern int arr_257[]; long f(long l) { return 0 > l ? 0 : l; } void test() { var_11 = var_6; for (char a = 0; a < (char)var_2; a = 6) for (int b = 0; b < var_2; b = ~0) for (int c = 0; c < 2; c = var_1) for (bool d = 0; d < var_4; d = 1) var_29 = f(~var_6); for (short e = 0; e < short(var_6); e = var_6) { for (; 0 < (int)var_6;) ; for (char g = 0; g < 4; g++) for (; std::min(var_7 / 405077347810ULL, (unsigned long long)9); var_7 += 2) for (char h = 0; h < (char)var_8; h += 4) for (short i = 0; i < (var_4 && var_6) + 13; i++) { arr_206[0][g][0][h][i] = var_6; var_89 = std::min(var_4 ?: 709U, (unsigned)var_4); } for (short j = 0; j < var_2; j += 4) for (int k = 0; k < 5U; k = var_10) arr_257[k] = var_6; } } // driver.cpp #include <stdio.h> short var_1 = (short)7531; unsigned int var_2 = 187158918U; bool var_4 = (bool)1; unsigned long long int var_6 = 10263287916162477044ULL; signed char var_7 = 0; long long int var_8 = 21; unsigned int var_10 = 3309705747U; unsigned int var_11 = 222967114U; short var_29 = (short)-22723; short var_89 = (short)-19017; short arr_206 [10] [14] [13] [21] [14] __attribute__((aligned)); int arr_257 [5]; void test(); int main() { test(); for (size_t i_0 = 0; i_0 < 5; ++i_0) printf("%d ", arr_257 [i_0]); printf("\n"); } Error: >$ g++ -march=skylake-avx512 func.cpp driver.cpp -O2 && sde -skx -- ./a.out -2039714828 0 0 0 0 >$ g++ -march=skylake-avx512 func.cpp driver.cpp -O3 && sde -skx -- ./a.out 27636 0 0 0 0 gcc version 11.0.0 20210113 (8fc183ccd0628465205b8a88c29ab69bfe74a08a)
cprop hardreg change (insn 457 499 460 33 (set (reg:SI 39 r11 [orig:86 _11 ] [86]) (reg:SI 37 r9 [orig:86 _11 ] [86])) "test.c":29:36 75 {*movsi_internal} (expr_list:REG_DEAD (reg:SI 37 r9 [orig:86 _11 ] [86]) (nil))) to (insn 457 499 460 33 (set (reg:SI 39 r11 [orig:86 _11 ] [86]) (reg:SI 22 xmm2 [orig:86 _11 ] [86])) "test.c":29:36 75 {*movsi_internal} (expr_list:REG_DEAD (reg:SI 22 xmm2 [orig:86 _11 ] [86]) (nil))) since it thought the lower 32bit of r9 and xmm2 is the same? but with xmm2 defined as kmovw %k0, %edi # 69 [c=4 l=4] *movhi_internal/6 kmovd %k0, %edx # 487 [c=4 l=3] *movsi_internal/16 vmovd %edi, %xmm2 # 489 the bit16-32 is clear with kmovw(note k0 is equal to r9 with SImode, it's var_6 in source code) (insn 69 68 70 12 (set (reg:HI 5 di [orig:96 _52 ] [96]) (reg:HI 68 k0 [orig:82 var_6.0_1 ] [82])) "test.c":21:23 76 {*movhi_internal} (nil)) (insn 489 75 78 12 (set (reg:SI 22 xmm2 [297]) (reg:SI 5 di [orig:96 _52 ] [96])) 75 {*movsi_internal} (nil))
Just for the record, started with r11-4428-g4a369d199bf2f34e but it only made it visible I think.
(In reply to Hongtao.liu from comment #1) > cprop hardreg change > > (insn 457 499 460 33 (set (reg:SI 39 r11 [orig:86 _11 ] [86]) > (reg:SI 37 r9 [orig:86 _11 ] [86])) "test.c":29:36 75 > {*movsi_internal} > (expr_list:REG_DEAD (reg:SI 37 r9 [orig:86 _11 ] [86]) > (nil))) > > to > > (insn 457 499 460 33 (set (reg:SI 39 r11 [orig:86 _11 ] [86]) > (reg:SI 22 xmm2 [orig:86 _11 ] [86])) "test.c":29:36 75 > {*movsi_internal} > (expr_list:REG_DEAD (reg:SI 22 xmm2 [orig:86 _11 ] [86]) > (nil))) > > since it thought the lower 32bit of r9 and xmm2 is the same? > > but with xmm2 defined as > > kmovw %k0, %edi # 69 [c=4 l=4] *movhi_internal/6 > kmovd %k0, %edx # 487 [c=4 l=3] *movsi_internal/16 > vmovd %edi, %xmm2 # 489 > > the bit16-32 is clear with kmovw(note k0 is equal to r9 with SImode, it's > var_6 in source code) > > (insn 69 68 70 12 (set (reg:HI 5 di [orig:96 _52 ] [96]) > (reg:HI 68 k0 [orig:82 var_6.0_1 ] [82])) "test.c":21:23 76 > {*movhi_internal} > (nil)) > > (insn 489 75 78 12 (set (reg:SI 22 xmm2 [297]) > (reg:SI 5 di [orig:96 _52 ] [96])) 75 {*movsi_internal} > (nil)) It seems to be handled here. Cut from copy_value in regcprop.c: ---- /* If SRC had been assigned a mode narrower than the copy, we can't link DEST into the chain, because not all of the pieces of the copy came from oldest_regno. */ else if (sn > hard_regno_nregs (sr, vd->e[sr].mode)) return; ---- Here %edi is set in HImode, but it is then used in SImode and copied to %xmm2. The condition fails to catch this because both SImode and HImode have an nregs of 1. Since the upper part could be garbage, DEST can't be linked into the chain. 
kmovw %k0, %edi # 69 [c=4 l=4] *movhi_internal/6 <----HI kmovd %k0, %edx # 487 [c=4 l=3] *movsi_internal/16 vmovd %edi, %xmm2 # 489 [c=4 l=6] *movsi_internal/13 <----SI sall $16, %edx # 73 [c=4 l=3] *ashlsi3_1/0 kmovw %k0, %r8d # 74 [c=4 l=5] *zero_extendhisi2/1 vpshuflw $0, %xmm2, %xmm0 # 78 [c=4 l=5] *vec_dupv4hi/1 orl %edx, %r8d # 75 [c=4 l=3] *iorsi_1/0 testw %di, %di # 82 [c=4 l=3] *cmphi_ccno_1/0 jle .L52 # 83 [c=12 l=6] *jcc kmovd %k0, %r9d # 85 [c=4 l=4] *movsi_internal/16 <----SI testl %r9d, %r9d # 88 [c=4 l=3] *cmpsi_ccno_1/0
> It seems to be be handled here. > > cut from copy_value in regcprop.c: > ---- > /* If SRC had been assigned a mode narrower than the copy, we can't > link DEST into the chain, because not all of the pieces of the > copy came from oldest_regno. */ > else if (sn > hard_regno_nregs (sr, vd->e[sr].mode)) > return; > ---- > > here we have %edi set as HImode, but use as SImode and be copied to %xmm2, > but the condition failed to check this beacuase both SImode and HImode has > nregs as 1, since the upper part could be garbage, it can't link DEST into > the chain. > > kmovw %k0, %edi # 69 [c=4 l=4] *movhi_internal/6 <----HI > kmovd %k0, %edx # 487 [c=4 l=3] *movsi_internal/16 > vmovd %edi, %xmm2 # 489 [c=4 l=6] *movsi_internal/13 <----SI > sall $16, %edx # 73 [c=4 l=3] *ashlsi3_1/0 > kmovw %k0, %r8d # 74 [c=4 l=5] *zero_extendhisi2/1 > vpshuflw $0, %xmm2, %xmm0 # 78 [c=4 l=5] > *vec_dupv4hi/1 > orl %edx, %r8d # 75 [c=4 l=3] *iorsi_1/0 > testw %di, %di # 82 [c=4 l=3] *cmphi_ccno_1/0 > jle .L52 # 83 [c=12 l=6] *jcc > kmovd %k0, %r9d # 85 [c=4 l=4] *movsi_internal/16 <----SI > testl %r9d, %r9d # 88 [c=4 l=3] *cmpsi_ccno_1/0 and it looks like a generic code bug.
And rewriting the pattern (define_insn "*vec_dupv4hi" [(set (match_operand:V4HI 0 "register_operand" "=y,xYw") (vec_duplicate:V4HI (truncate:HI (match_operand:SI 1 "register_operand" "0,xYw"))))] to (define_insn "*vec_dupv4hi" [(set (match_operand:V4HI 0 "register_operand" "=y,xYw") (vec_duplicate:V4HI (vec_select:HI (match_operand:V4HI 1 "register_operand" "0,xYw") (parallel [(const_int 0)]))))] could avoid this issue.
(In reply to Hongtao.liu from comment #5) > and rewritten pattern > (define_insn "*vec_dupv4hi" > [(set (match_operand:V4HI 0 "register_operand" "=y,xYw") > (vec_duplicate:V4HI > (truncate:HI > (match_operand:SI 1 "register_operand" "0,xYw"))))] > to > > (define_insn "*vec_dupv4hi" > [(set (match_operand:V4HI 0 "register_operand" "=y,xYw") > (vec_duplicate:V4HI > (vecTselect:HI > (match_operand:V4HI 1 "register_operand" "0,xYw") > (parallel [(const_int 0)]))))] > > could avoid this issue. Oh, not workable. --- if (MAYBE_SSE_CLASS_P (regclass) || MAYBE_MMX_CLASS_P (regclass)) { /* Vector registers do not support QI or HImode loads. If we don't disallow a change to these modes, reload will assume it's ok to drop the subreg from (subreg:SI (reg:HI 100) 0). This affects the vec_dupv4hi pattern. */ if (GET_MODE_SIZE (from) < 4) return ---
Another testcase that reproduces the same issue: #include<immintrin.h> typedef short v4hi __attribute__ ((vector_size (8))); typedef int v2si __attribute__ ((vector_size (8))); v4hi b; __attribute__ ((noipa)) v2si foo (__m512i src1, __m512i src2) { __mmask64 m = _mm512_cmpeq_epu8_mask (src1, src2); short s = (short) m; int i = (int)m; b = __extension__ (v4hi) {s, s, s, s}; return __extension__ (v2si) {i, i}; } int main () { __m512i src1 = _mm512_setzero_si512 (); __m512i src2 = _mm512_set_epi8 (0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1); __mmask64 m = _mm512_cmpeq_epu8_mask (src1, src2); v2si a = foo (src1, src2); if (a[0] != (int)m) __builtin_abort (); return 0; }
The master branch has been updated by hongtao Liu <liuhongt@gcc.gnu.org>: https://gcc.gnu.org/g:e711b67a9081ae84c66174a50705dc98ba993a43 commit r11-6828-ge711b67a9081ae84c66174a50705dc98ba993a43 Author: liuhongt <hongtao.liu@intel.com> Date: Mon Jan 18 16:55:32 2021 +0800 Fix incorrect optimization by cprop_hardreg. If SRC had been assigned a mode narrower than the copy, we can't always link DEST into the chain even they have same hard_regno_nregs(i.e. HImode/SImode in i386 backend). i.e kmovw %k0, %edi vmovd %edi, %xmm2 vpshuflw $0, %xmm2, %xmm0 kmovw %k0, %r8d kmovd %k0, %r9d ... - movl %r9d, %r11d + vmovd %xmm2, %r11d gcc/ChangeLog: PR rtl-optimization/98694 * regcprop.c (copy_value): If SRC had been assigned a mode narrower than the copy, we can't link DEST into the chain even they have same hard_regno_nregs(i.e. HImode/SImode in i386 backend). gcc/testsuite/ChangeLog: PR rtl-optimization/98694 * gcc.target/i386/pr98694.c: New test.
Fixed on trunk so far.
Fixed on trunk, latent on the branch(es) where we don't have a testcase(?)
(In reply to Richard Biener from comment #10) > Fixed on trunk, latent on the branch(es) where we don't have a testcase(?) Yes, not sure about the backport.
Related to PR100342.
The releases/gcc-10 branch has been updated by Jakub Jelinek <jakub@gcc.gnu.org>: https://gcc.gnu.org/g:0372a414e7500dccab1eb423a2a620645c820a52 commit r10-10611-g0372a414e7500dccab1eb423a2a620645c820a52 Author: liuhongt <hongtao.liu@intel.com> Date: Mon Jan 18 16:55:32 2021 +0800 Fix incorrect optimization by cprop_hardreg. If SRC had been assigned a mode narrower than the copy, we can't always link DEST into the chain even they have same hard_regno_nregs(i.e. HImode/SImode in i386 backend). i.e kmovw %k0, %edi vmovd %edi, %xmm2 vpshuflw $0, %xmm2, %xmm0 kmovw %k0, %r8d kmovd %k0, %r9d ... - movl %r9d, %r11d + vmovd %xmm2, %r11d gcc/ChangeLog: PR rtl-optimization/98694 * regcprop.c (copy_value): If SRC had been assigned a mode narrower than the copy, we can't link DEST into the chain even they have same hard_regno_nregs(i.e. HImode/SImode in i386 backend). gcc/testsuite/ChangeLog: PR rtl-optimization/98694 * gcc.target/i386/pr98694.c: New test. (cherry picked from commit e711b67a9081ae84c66174a50705dc98ba993a43)
Should this issue be marked as Resolved and Fixed?
Fixed.