Observed gcc-12 ICE on today's gcc when was building firefox checkout. I might have reduced it down to something invalid. Here is the minimal reproducer: // cat dec_reconstruct.cc.cc #pragma GCC target "avx" typedef float __m256 __attribute__((__vector_size__(32))); __m256 _mm256_blendv_ps___Y, _mm256_blendv_ps___M, _mm256_mul_ps___A, _mm256_mul_ps___B, IfThenElse___trans_tmp_9; void IfThenElse(__m256 no) { IfThenElse___trans_tmp_9 = __builtin_ia32_blendvps256( no, _mm256_blendv_ps___Y, _mm256_blendv_ps___M); } #pragma GCC target "avx512vl" void EncodedFromDisplay() { __m256 __trans_tmp_11 = _mm256_mul_ps___A * _mm256_mul_ps___B; IfThenElse(__trans_tmp_11); } -O0 works, -O2 fails: $ /tmp/gcc-c/gcc/xg++ -B/tmp/gcc-c/gcc -c dec_reconstruct.cc.cc -O0 $ /tmp/gcc-c/gcc/xg++ -B/tmp/gcc-c/gcc -c dec_reconstruct.cc.cc -O2 during RTL pass: expand dec_reconstruct.cc.cc: In function 'void EncodedFromDisplay()': dec_reconstruct.cc.cc:10:6: internal compiler error: in expand_insn, at optabs.c:7946 10 | void EncodedFromDisplay() { | ^~~~~~~~~~~~~~~~~~ 0x7f3941d62286 __libc_start_call_main ../sysdeps/nptl/libc_start_call_main.h:58 0x7f3941d62337 __libc_start_main_impl ../csu/libc-start.c:409 Please submit a full bug report, with preprocessed source if appropriate. Please include the complete backtrace with any bug report. See <https://gcc.gnu.org/bugs/> for instructions. $ /tmp/gcc-c/gcc/xg++ -B/tmp/gcc-c/gcc -v Reading specs from /tmp/gcc-c/gcc/specs COLLECT_GCC=/tmp/gcc-c/gcc/xg++ COLLECT_LTO_WRAPPER=/tmp/gcc-c/gcc/lto-wrapper Target: x86_64-pc-linux-gnu Configured with: /home/slyfox/dev/git/gcc/configure --build=x86_64-pc-linux-gnu --host=x86_64-pc-linux-gnu --target=x86_64-pc-linux-gnu --enable-languages=c,c++ --disable-bootstrap --with-multilib-list=m64 --prefix=/tmp/gcc-c/../gcc-native-quick-installed --disable-nls --without-isl --disable-libsanitizer --disable-libvtv --disable-libgomp --disable-libstdcxx-pch --disable-libunwind-exceptions CFLAGS='-O1 ' CXXFLAGS='-O1 ' Thread model: posix Supported LTO compression algorithms: zlib zstd gcc version 12.0.0 20210826 (experimental) (GCC)
Created attachment 51362 [details] dec_reconstruct.cc.cc.orig.xz dec_reconstruct.cc.cc.orig.xz is the original extracted preprocessed file, 5MB. Fails the same as: $ /tmp/gcc-c/gcc/xg++ -B/tmp/gcc-c/gcc -c -x c++ /tmp/dec_reconstruct.cc.cc.orig -O2 during RTL pass: expand In file included from /home/slyfox/dev/hg/firefox-source/obj-x86_64-pc-linux-gnu/dist/include/hwy/foreach_target.h:102, from /home/slyfox/dev/hg/firefox-source/third_party/jpeg-xl/lib/jxl/dec_reconstruct.cc:16: /home/slyfox/dev/hg/firefox-source/third_party/jpeg-xl/lib/jxl/dec_reconstruct.cc: In function 'jxl::Status jxl::N_AVX3::UndoXYBInPlace(jxl::Image3F*, const jxl::Rect&, const jxl::OutputEncodingInfo&)': /home/slyfox/dev/hg/firefox-source/third_party/jpeg-xl/lib/jxl/dec_reconstruct.cc:39:8: internal compiler error: in expand_insn, at optabs.c:7946 39 | Status UndoXYBInPlace(Image3F* idct, const Rect& rect, | ^~~~~~~~~~~~~~ 0x7f6eea182286 __libc_start_call_main ../sysdeps/nptl/libc_start_call_main.h:58 0x7f6eea182337 __libc_start_main_impl ../csu/libc-start.c:409 Please submit a full bug report, with preprocessed source if appropriate. Please include the complete backtrace with any bug report. See <https://gcc.gnu.org/bugs/> for instructions.
It is caused by r12-2679.
(In reply to H.J. Lu from comment #2) > It is caused by r12-2679. Mine.
diff --git a/test.c.032t.ccp1 b/test.c.033t.forwprop1 index 5b18739..c6f0587 100644 --- a/test.c.032t.ccp1 +++ b/test.c.033t.forwprop1 @@ -31,11 +31,12 @@ void EncodedFromDisplay () __m256 __trans_tmp_11; vector(8) float _mm256_mul_ps___A.2_1; vector(8) float _mm256_mul_ps___B.3_2; + vector(8) <signed-boolean:32> _5; vector(8) float _mm256_blendv_ps___M.0_6; vector(8) float _mm256_blendv_ps___Y.1_7; vector(8) int _8; vector(8) <signed-boolean:32> _9; - vector(8) float _10; + vector(8) float _12; <bb 2> : _mm256_mul_ps___A.2_1 = _mm256_mul_ps___A; @@ -45,8 +46,9 @@ void EncodedFromDisplay () _mm256_blendv_ps___Y.1_7 = _mm256_blendv_ps___Y; _8 = VIEW_CONVERT_EXPR<vector(8) int>(_mm256_blendv_ps___M.0_6); _9 = _8 < { 0, 0, 0, 0, 0, 0, 0, 0 }; - _10 = VEC_COND_EXPR <_9, _mm256_blendv_ps___Y.1_7, __trans_tmp_11_4>; - IfThenElse___trans_tmp_9 = _10; + _5 = _8 >= { 0, 0, 0, 0, 0, 0, 0, 0 }; + _12 = .COND_MUL (_5, _mm256_mul_ps___A.2_1, _mm256_mul_ps___B.3_2, _mm256_blendv_ps___Y.1_7); + IfThenElse___trans_tmp_9 = _12; return; fwprop1 should check if the type of _5 satisfies the predicate of cond_mul.
(In reply to Hongtao.liu from comment #4) > diff --git a/test.c.032t.ccp1 b/test.c.033t.forwprop1 > index 5b18739..c6f0587 100644 > --- a/test.c.032t.ccp1 > +++ b/test.c.033t.forwprop1 > @@ -31,11 +31,12 @@ void EncodedFromDisplay () > __m256 __trans_tmp_11; > vector(8) float _mm256_mul_ps___A.2_1; > vector(8) float _mm256_mul_ps___B.3_2; > + vector(8) <signed-boolean:32> _5; > vector(8) float _mm256_blendv_ps___M.0_6; > vector(8) float _mm256_blendv_ps___Y.1_7; > vector(8) int _8; > vector(8) <signed-boolean:32> _9; > - vector(8) float _10; > + vector(8) float _12; > > <bb 2> : > _mm256_mul_ps___A.2_1 = _mm256_mul_ps___A; > @@ -45,8 +46,9 @@ void EncodedFromDisplay () > _mm256_blendv_ps___Y.1_7 = _mm256_blendv_ps___Y; > _8 = VIEW_CONVERT_EXPR<vector(8) int>(_mm256_blendv_ps___M.0_6); > _9 = _8 < { 0, 0, 0, 0, 0, 0, 0, 0 }; > - _10 = VEC_COND_EXPR <_9, _mm256_blendv_ps___Y.1_7, __trans_tmp_11_4>; > - IfThenElse___trans_tmp_9 = _10; > + _5 = _8 >= { 0, 0, 0, 0, 0, 0, 0, 0 }; > + _12 = .COND_MUL (_5, _mm256_mul_ps___A.2_1, _mm256_mul_ps___B.3_2, > _mm256_blendv_ps___Y.1_7); > + IfThenElse___trans_tmp_9 = _12; > return; > > fwprop1 should check if the type of _5 satisfies the predicate of cond_mul. Actually this seems backwards ... what match pattern is doing this?
This is the match pattern which does it: (for uncond_op (UNCOND_BINARY) cond_op (COND_BINARY) (simplify (vec_cond @0 (view_convert? (uncond_op@4 @1 @2)) @3) (with { tree op_type = TREE_TYPE (@4); } (if (vectorized_internal_fn_supported_p (as_internal_fn (cond_op), op_type) && element_precision (type) == element_precision (op_type)) (view_convert (cond_op @0 @1 @2 (view_convert:op_type @3)))))) (simplify (vec_cond @0 @1 (view_convert? (uncond_op@4 @2 @3))) (with { tree op_type = TREE_TYPE (@4); } (if (vectorized_internal_fn_supported_p (as_internal_fn (cond_op), op_type) && element_precision (type) == element_precision (op_type)) (view_convert (cond_op (bit_not @0) @2 @3 (view_convert:op_type @1))))))) I don't see anything wrong here. Are we saying the (bit_not @0) part is causing issues?
The operation works elementwise if the operands are vectors. No I think x86's cond_* patterns are not correct.
That is the mask is a vector mode still for these patterns according to the internals doc. Rather than the scalar mode you have: (match_operand:<avx512fmaskmode> 1 "register_operand")
(In reply to Andrew Pinski from comment #8) > That is the mask is a vector mode still for these patterns according to the > internals doc. > Rather than the scalar mode you have: > (match_operand:<avx512fmaskmode> 1 "register_operand") No, according to doc, mode of operands[1] is decided by TARGET_VECTORIZE_GET_MASK_MODE. Which means integer mode shoud also be accepted. Operands 0, 2, 3 and 4 all have mode m. Operand 1 is a scalar integer if m is scalar, otherwise it has the mode returned by TARGET_VECTORIZE_GET_MASK_ MODE.
(In reply to Hongtao.liu from comment #9) > (In reply to Andrew Pinski from comment #8) > > That is the mask is a vector mode still for these patterns according to the > > internals doc. > > Rather than the scalar mode you have: > > (match_operand:<avx512fmaskmode> 1 "register_operand") > > No, according to doc, mode of operands[1] is decided by > TARGET_VECTORIZE_GET_MASK_MODE. Which means integer mode shoud also be > accepted. > > Operands 0, 2, 3 and 4 all have mode m. Operand 1 is a scalar integer if m is > scalar, otherwise it has the mode returned by TARGET_VECTORIZE_GET_MASK_ > MODE. Looks like vectorized_internal_fn_supported_p might need to be extended here or a better function needs to be used to check on cond_* optabs.
Created attachment 51363 [details] Proposed patch I'm testing this patch.
(In reply to Hongtao.liu from comment #11) > Created attachment 51363 [details] > Proposed patch > > I'm testing this patch. This looks sensible - Richard, any opinion?
There's a reduced test-case: #pragma GCC target "avx" typedef float __m256 __attribute__((__vector_size__(32))); __m256 _mm256_blendv_ps___Y, _mm256_blendv_ps___M, _mm256_mul_ps___A, _mm256_mul_ps___B, IfThenElse___trans_tmp_9; struct Raw256 { using type = __m256; }; struct Vec256 { using Raw = Raw256::type; Raw raw; } UndoXYBInPlace_linear_g; Vec256 IfThenElse(Vec256 no) { IfThenElse___trans_tmp_9 = __builtin_ia32_blendvps256( no.raw, _mm256_blendv_ps___Y, _mm256_blendv_ps___M); return {IfThenElse___trans_tmp_9}; } Vec256 operator*(Vec256, Vec256) { __m256 __trans_tmp_11 = _mm256_mul_ps___A * _mm256_mul_ps___B; return {__trans_tmp_11}; } struct TF_SRGB { template <class D, class V> void EncodedFromDisplay(D, V x) { Vec256 __trans_tmp_12, linear = x * __trans_tmp_12; IfThenElse(linear); } }; #pragma GCC target \ "avx,avx2,bmi,bmi2,fma,f16c,avx512f,avx512vl,avx512dq,avx512bw" void UndoXYBInPlace_d() { TF_SRGB().EncodedFromDisplay(UndoXYBInPlace_d, UndoXYBInPlace_linear_g); }
Even simpler test-case: #pragma GCC target "avx" typedef float __m256 __attribute__((__vector_size__(32))); __m256 _mm256_blendv_ps___Y, _mm256_blendv_ps___M, _mm256_mul_ps___A, _mm256_mul_ps___B, IfThenElse___trans_tmp_9; struct Raw256 { using type = __m256; }; struct Vec256 { using Raw = Raw256::type; Raw raw; } UndoXYBInPlace_linear_g; Vec256 IfThenElse(Vec256 no) { IfThenElse___trans_tmp_9 = __builtin_ia32_blendvps256( no.raw, _mm256_blendv_ps___Y, _mm256_blendv_ps___M); return {IfThenElse___trans_tmp_9}; } Vec256 operator*(Vec256, Vec256) { __m256 __trans_tmp_11 = _mm256_mul_ps___A * _mm256_mul_ps___B; return {__trans_tmp_11}; } struct TF_SRGB { template <class D, class V> void EncodedFromDisplay(D, V x) { Vec256 __trans_tmp_12, linear = x * __trans_tmp_12; IfThenElse(linear); } }; #pragma GCC target "avx512vl" void UndoXYBInPlace_d() { TF_SRGB().EncodedFromDisplay(UndoXYBInPlace_d, UndoXYBInPlace_linear_g); }
The master branch has been updated by hongtao Liu <liuhongt@gcc.gnu.org>: https://gcc.gnu.org/g:a26ff83ed07e33c4aa46f3314553c0d15ca21100 commit r12-3569-ga26ff83ed07e33c4aa46f3314553c0d15ca21100 Author: liuhongt <hongtao.liu@intel.com> Date: Thu Sep 2 13:05:54 2021 +0800 Check mask type when doing cond_op related gimple simplification. gcc/ChangeLog: PR middle-end/102080 * match.pd: Check mask type when doing cond_op related gimple simplification. * tree.c (is_truth_type_for): New function. * tree.h (is_truth_type_for): New declaration. gcc/testsuite/ChangeLog: PR middle-end/102080 * gcc.target/i386/pr102080.c: New test.
So fixed?
Fixed.