Bug 110274 - [14 Regression] Wrong AVX2 code on highway-1.0.4 on -O1 and above
Summary: [14 Regression] Wrong AVX2 code on highway-1.0.4 on -O1 and above
Status: RESOLVED DUPLICATE of bug 110235
Alias: None
Product: gcc
Classification: Unclassified
Component: target (show other bugs)
Version: 14.0
Importance: P3 normal
Target Milestone: ---
Assignee: Not yet assigned to anyone
URL:
Keywords:
Depends on:
Blocks:
 
Reported: 2023-06-15 22:52 UTC by Sergei Trofimovich
Modified: 2023-06-16 05:11 UTC (History)
1 user (show)

See Also:
Host:
Target:
Build:
Known to work:
Known to fail:
Last reconfirmed:


Attachments
shift_test.cc (875 bytes, text/x-csrc)
2023-06-15 22:53 UTC, Sergei Trofimovich
Details

Note You need to log in before you can comment on or make changes to this bug.
Description Sergei Trofimovich 2023-06-15 22:52:19 UTC
I initially observed test failures in the highway-1.0.4 project when building it with gcc r14-1868-ga4df0ce78d6f1b. There the testsuite fails as follows:

    The following tests FAILED:
        299 - HwyCombineTestGroup/HwyCombineTest.TestAllConcatOddEven/AVX2  # GetParam() = 512 (Subprocess aborted)
        684 - HwyMulTestGroup/HwyMulTest.TestAllRearrangeToOddPlusEven/AVX2  # GetParam() = 512 (Subprocess aborted)

If I did not miss anything, here is a self-contained example:

$ cat shift_test.cc

#include <cstring> /* memcmp() */
#include <immintrin.h>

// Descriptive aliases for the raw AVX2 register types. They only document how
// the lanes are meant to be interpreted (16-bit vs. 32-bit elements); all
// 256-bit aliases are the same type (__m256i), so no conversion is involved.
typedef __m256i v16x256;
typedef __m256i v32x256;
typedef __m128i v16x128;

// Returns a 256-bit vector with every bit cleared.
static v16x256 Zero_() {
  return _mm256_setzero_si256();
}

// Returns sixteen 16-bit lanes holding 0, 1, ..., 15.
// _mm256_set_epi16 takes its arguments from the highest lane down to the
// lowest, so the last argument (0) ends up in lane 0.
static v16x256 Iota0_() {
  return _mm256_set_epi16(
      int16_t{15}, int16_t{14}, int16_t{13}, int16_t{12},
      int16_t{11}, int16_t{10},  int16_t{9},  int16_t{8},
       int16_t{7},  int16_t{6},  int16_t{5},  int16_t{4},
       int16_t{3},  int16_t{2},  int16_t{1},  int16_t{0});
}

// Returns the low 128 bits of v (a cast; the upper half is simply dropped).
static v16x128 LowerHalf_(v16x256 v) { return _mm256_castsi256_si128(v); }
// Returns the high 128 bits of v (128-bit lane index 1).
static v16x128 UpperHalf_(v16x256 v) { return _mm256_extracti128_si256(v, 1); }

// Reinterprets 16-bit lanes as 32-bit lanes. Despite the "bcast" name this is
// not a broadcast: both types alias __m256i and v is returned unchanged.
static v32x256 bcast_16_to_32(v16x256 v) { return v; }
// Bitwise AND of the two 256-bit vectors.
static v32x256 And_(v32x256 a, v32x256 b) { return _mm256_and_si256(a, b); }
// Broadcasts t into every 32-bit lane. Note that, despite the "_16" suffix,
// the lanes written here are 32 bits wide.
static v32x256 Set_16(int t) { return _mm256_set1_epi32(t); }

// Gathers the even-indexed 16-bit lanes (i.e. the low half of every 32-bit
// lane) of lo and hi into one vector: lo's eight values fill the lower 128
// bits of the result, hi's eight values fill the upper 128 bits.
static v16x256 ConcatEven_(v16x256 hi, v16x256 lo) {
  // Isolate lower 16 bits per u32 so we can pack.
  // After masking, every 32-bit lane is <= 0xFFFF, so the unsigned saturation
  // performed by the pack below never alters a value.
  const v32x256 mask = Set_16(0x0000FFFF);
  const v32x256 uH = And_(bcast_16_to_32(hi), mask);
  const v32x256 uL = And_(bcast_16_to_32(lo), mask);
  // The pack works independently on each 128-bit half, so the 64-bit quarters
  // of u16 come out interleaved: [lo.low, hi.low, lo.high, hi.high].
  // Comment 3 on this report identifies this call as the one being
  // miscompiled.
  const __m256i u16 = _mm256_packus_epi32(uL, uH);
  // Swap the two middle quarters to obtain [lo.low, lo.high, hi.low, hi.high].
  return _mm256_permute4x64_epi64(u16, _MM_SHUFFLE(3, 1, 2, 0));
}

// Zero-extends the eight unsigned 16-bit lanes of v to eight 32-bit lanes.
static v32x256 PromoteTo_(v16x128 v) { return _mm256_cvtepu16_epi32(v); }
// Shifts each 32-bit lane of v left by the count held in the matching 32-bit
// lane of bits (per-lane variable shift).
static v32x256 Shl_32(v32x256 v, v32x256 bits) { return _mm256_sllv_epi32(v, bits); }
// Reinterprets 32-bit lanes as 16-bit lanes. Like bcast_16_to_32 this is not
// a broadcast: both types alias __m256i and v is returned unchanged.
static v16x256 bcast_32_to_16(v32x256 v) { return v; }

// Per-lane variable left shift of sixteen 16-bit lanes, built from 32-bit
// operations: each 128-bit half of v and bits is zero-extended to 32-bit
// lanes and shifted with Shl_32, then ConcatEven_ keeps the low 16 bits of
// every 32-bit result and reassembles them in the original lane order.
static v16x256 AVX2ShlU16Vec256_(v16x256 v, v16x256 bits) {
  const v32x256 lo_shl_result = Shl_32(PromoteTo_(LowerHalf_(v)), PromoteTo_(LowerHalf_(bits)));
  const v32x256 hi_shl_result = Shl_32(PromoteTo_(UpperHalf_(v)), PromoteTo_(UpperHalf_(bits)));
  return ConcatEven_(bcast_32_to_16(hi_shl_result), bcast_32_to_16(lo_shl_result));
}

// 16-bit per-lane variable left shift; forwards to AVX2ShlU16Vec256_.
static v16x256 Shl_16(v16x256 v, v16x256 bits) { return AVX2ShlU16Vec256_(v, bits); }

// Shifts the lanes 0..15 left by zero bits and checks that the result is
// bit-identical to the input. A mismatch ends the program via
// __builtin_trap(), which is presumably what the shell reports as
// "Illegal instruction" in the failing -O1 run.
static void TestAllVariableShifts() {
  const auto v0 = Zero_();
  const auto values = Iota0_();
  // All shift counts are zero, so the shift must leave every lane unchanged.
  const auto r = Shl_16(values, v0);

  // is there a better way to compare __m256i?
  // (memcmp compares all sizeof(r) bytes of the two vectors.)
  if (memcmp(&values, &r, sizeof(r)) != 0)
    __builtin_trap();
}

// Entry point: runs the single check; reaching the end means the test passed.
int main() { TestAllVariableShifts(); }



 Triggering the bug:

  $ g++ -o bug -O0 -mavx2 shift_test.cc && ./bug
  <ok>
  $ g++ -o bug -O1 -mavx2 shift_test.cc && ./bug
  Illegal instruction (core dumped)

From what I understand, the test generates an Iota sample vector and shifts it left by 0 bits, with the per-lane shift counts supplied in a second vector register. The test expects the result to be identical to the original Iota value, but gcc-14 somehow generates something else.

It is possible that I extracted the input incorrectly and introduced the bug myself, but neither asan nor ubsan complains about it. Thus I expect -O0 and -O1 to produce the same result in any case.

$ g++ -v
Using built-in specs.
COLLECT_GCC=/<<NIX>>/gcc-14.0.0/bin/g++
COLLECT_LTO_WRAPPER=/<<NIX>>/gcc-14.0.0/libexec/gcc/x86_64-unknown-linux-gnu/14.0.0/lto-wrapper
Target: x86_64-unknown-linux-gnu
Configured with:
Thread model: posix
Supported LTO compression algorithms: zlib
gcc version 14.0.0 99999999 (experimental) (GCC)
Comment 1 Sergei Trofimovich 2023-06-15 22:53:04 UTC
Created attachment 55336 [details]
shift_test.cc
Comment 2 Andrew Pinski 2023-06-15 23:01:52 UTC
Dup of bug 110235.

*** This bug has been marked as a duplicate of bug 110235 ***
Comment 3 Andrew Pinski 2023-06-15 23:02:57 UTC
Note that it is _mm256_packus_epi32 that is being miscompiled.