I initially observed test failures in the highway-1.0.4 project against r14-1868-ga4df0ce78d6f1b. There, the testsuite fails as follows: The following tests FAILED: 299 - HwyCombineTestGroup/HwyCombineTest.TestAllConcatOddEven/AVX2 # GetParam() = 512 (Subprocess aborted) 684 - HwyMulTestGroup/HwyMulTest.TestAllRearrangeToOddPlusEven/AVX2 # GetParam() = 512 (Subprocess aborted) If I did not miss anything, here is a self-contained example: $ cat shift_test.cc #include <cstring> /* memcmp() */ #include <immintrin.h> typedef __m256i v16x256; typedef __m256i v32x256; typedef __m128i v16x128; static v16x256 Zero_() { return _mm256_setzero_si256(); } static v16x256 Iota0_() { return _mm256_set_epi16( int16_t{15}, int16_t{14}, int16_t{13}, int16_t{12}, int16_t{11}, int16_t{10}, int16_t{9}, int16_t{8}, int16_t{7}, int16_t{6}, int16_t{5}, int16_t{4}, int16_t{3}, int16_t{2}, int16_t{1}, int16_t{0}); } static v16x128 LowerHalf_(v16x256 v) { return _mm256_castsi256_si128(v); } static v16x128 UpperHalf_(v16x256 v) { return _mm256_extracti128_si256(v, 1); } static v32x256 bcast_16_to_32(v16x256 v) { return v; } static v32x256 And_(v32x256 a, v32x256 b) { return _mm256_and_si256(a, b); } static v32x256 Set_16(int t) { return _mm256_set1_epi32(t); } static v16x256 ConcatEven_(v16x256 hi, v16x256 lo) { // Isolate lower 16 bits per u32 so we can pack. 
const v32x256 mask = Set_16(0x0000FFFF); const v32x256 uH = And_(bcast_16_to_32(hi), mask); const v32x256 uL = And_(bcast_16_to_32(lo), mask); const __m256i u16 = _mm256_packus_epi32(uL, uH); return _mm256_permute4x64_epi64(u16, _MM_SHUFFLE(3, 1, 2, 0)); } static v32x256 PromoteTo_(v16x128 v) { return _mm256_cvtepu16_epi32(v); } static v32x256 Shl_32(v32x256 v, v32x256 bits) { return _mm256_sllv_epi32(v, bits); } static v16x256 bcast_32_to_16(v32x256 v) { return v; } static v16x256 AVX2ShlU16Vec256_(v16x256 v, v16x256 bits) { const v32x256 lo_shl_result = Shl_32(PromoteTo_(LowerHalf_(v)), PromoteTo_(LowerHalf_(bits))); const v32x256 hi_shl_result = Shl_32(PromoteTo_(UpperHalf_(v)), PromoteTo_(UpperHalf_(bits))); return ConcatEven_(bcast_32_to_16(hi_shl_result), bcast_32_to_16(lo_shl_result)); } static v16x256 Shl_16(v16x256 v, v16x256 bits) { return AVX2ShlU16Vec256_(v, bits); } static void TestAllVariableShifts() { const auto v0 = Zero_(); const auto values = Iota0_(); const auto r = Shl_16(values, v0); // is there a better way to compare __m256i? if (memcmp(&values, &r, sizeof(r)) != 0) __builtin_trap(); } int main() { TestAllVariableShifts(); } Triggering the bug: $ g++ -o bug -O0 -mavx2 shift_test.cc && ./bug <ok> $ g++ -o bug -O1 -mavx2 shift_test.cc && ./bug Illegal instruction (core dumped) From what I understand, the test generates an Iota sample vector and shifts it left by 0 bits, with the per-lane shift counts passed in a vector register. The test expects the result to be identical to the original Iota value, but somehow gcc-14 generates something else. It is possible that I extracted the example incorrectly and introduced the bug myself. However, neither asan nor ubsan complains about it, so I would expect -O0 and -O1 to produce the same result in any case. $ g++ -v Using built-in specs. 
COLLECT_GCC=/<<NIX>>/gcc-14.0.0/bin/g++ COLLECT_LTO_WRAPPER=/<<NIX>>/gcc-14.0.0/libexec/gcc/x86_64-unknown-linux-gnu/14.0.0/lto-wrapper Target: x86_64-unknown-linux-gnu Configured with: Thread model: posix Supported LTO compression algorithms: zlib gcc version 14.0.0 99999999 (experimental) (GCC)
Created attachment 55336 [details] shift_test.cc
Duplicate of bug 110235. *** This bug has been marked as a duplicate of bug 110235 ***
Note it is _mm256_packus_epi32 which is being miscompiled.