[Bug target/103611] New: GCC generates suboptimal code for SSE2/SSE4.1 64-bit integer element extraction on 32-bit x86 targets

Wed Dec 8 00:55:07 GMT 2021

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=103611

            Bug ID: 103611
           Summary: GCC generates suboptimal code for SSE2/SSE4.1 64-bit
                    integer element extraction on 32-bit x86 targets
           Product: gcc
           Version: 11.2.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: target
          Assignee: unassigned at gcc dot gnu.org
          Reporter: john_platts at hotmail dot com
  Target Milestone: ---

Here is some code for extracting 64-bit integers from an SSE2 vector:
#include <cstdint>
#include <immintrin.h>

// Extracts the 64-bit integer element selected by ElemIdx (0 or 1) from a
// 128-bit integer vector using only 32-bit shuffles and 32-bit moves.
//
// Template parameter:
//   ElemIdx - index of the 64-bit element to extract; any value other than
//             0 or 1 is rejected at compile time by the static_assert.
// Parameter:
//   vect    - the source vector, viewed here as four 32-bit lanes.
// Returns:
//   The selected element, rebuilt from its low and high 32-bit halves.
template<int ElemIdx>
std::int64_t SSE2ExtractInt64(__m128i vect) noexcept {
    // (ElemIdx & 1) equals ElemIdx only when ElemIdx is 0 or 1.
    static_assert(ElemIdx == (ElemIdx & 1), "ElemIdx must be between 0 and 1");

    // After the branch below, lane 0 of vect holds the low 32 bits of the
    // selected element and lane 0 of vect2 holds the high 32 bits. Only
    // lane 0 of each vector is read afterwards, and the low two bits of the
    // shuffle immediate pick the source lane that lands in lane 0.
    __m128i vect2;
    if constexpr(ElemIdx == 0) {
        // Element 0 occupies lanes 0 (low) and 1 (high); vect already has
        // the low half in lane 0, so only the high half needs to be moved.
        vect2 = _mm_shuffle_epi32(vect, 1);
    } else {
        // Element 1 occupies lanes 2 (low) and 3 (high). vect2 is computed
        // first because the second shuffle overwrites vect.
        vect2 = _mm_shuffle_epi32(vect, 3);
        vect = _mm_shuffle_epi32(vect, 2);
    }

    // _mm_cvtsi128_si32 returns lane 0 as a signed int; converting to
    // uint32_t keeps the later widening from sign-extending either half.
    auto loVal = std::uint32_t(_mm_cvtsi128_si32(vect));
    auto hiVal = std::uint32_t(_mm_cvtsi128_si32(vect2));

    // Widen the high half to 64 bits before shifting so the shift by 32 is
    // performed on a 64-bit value, then merge it with the low half.
    return std::int64_t(loVal) | std::int64_t(std::uint64_t(hiVal) << 32);
}

// Explicit instantiations for both valid element indices, so that the
// compiler emits a standalone definition for each specialization.
template std::int64_t SSE2ExtractInt64<0>(__m128i vect) noexcept;
template std::int64_t SSE2ExtractInt64<1>(__m128i vect) noexcept;

Here is the assembly code that GCC 11.2.0 generates when the above C++ code is
compiled with the -O2 -std=c++17 -march=nocona -mtune=skylake -m32 options:
# GCC output for SSE2ExtractInt64<0>. The result ends up in %edx (high half)
# and %eax (low half). %ebx is saved and restored only so that it can hold a
# zero, and both OR instructions OR with zero, so they leave the result
# values unchanged.
_Z16SSE2ExtractInt64ILi0EExDv2_x:
        pushl   %ebx
        pshufd  $1, %xmm0, %xmm1
        # %ebx = 0; its only use is the orl below.
        xorl    %ebx, %ebx
        movd    %xmm1, %edx
        movd    %xmm0, %eax
        # OR of %edx with zero: the value of %edx is unchanged.
        orl     %ebx, %edx
        # OR of %ah with the constant 0: the value of %eax is unchanged.
        orb     $0, %ah
        popl    %ebx
        ret
# GCC output for SSE2ExtractInt64<1>. The result ends up in %edx (high half)
# and %eax (low half). %esi only ever holds zero, %ebx is pushed and popped
# without being used in between, and the high half takes a detour through
# %ecx before being ORed into a zeroed %edx.
_Z16SSE2ExtractInt64ILi1EExDv2_x:
        pushl   %esi
        pshufd  $3, %xmm0, %xmm1
        # %esi = 0; its only use is the movl into %edx below.
        xorl    %esi, %esi
        # %ebx is saved here and restored below, but nothing in between
        # reads or writes it.
        pushl   %ebx
        pshufd  $2, %xmm0, %xmm0
        # %edx = 0.
        movl    %esi, %edx
        movd    %xmm1, %ecx
        movd    %xmm0, %eax
        popl    %ebx
        # OR of %ah with the constant 0: the value of %eax is unchanged.
        orb     $0, %ah
        # %edx = 0 | %ecx, i.e. a copy of the high half.
        orl     %ecx, %edx
        popl    %esi
        ret

Here is a more efficient implementation of the above functions, without the redundant push/pop, xor, and or instructions:
# Proposed code for SSE2ExtractInt64<0>: one shuffle to bring lane 1 down to
# lane 0, then one movd per result register. No general-purpose register is
# saved and no OR is performed.
_Z16SSE2ExtractInt64ILi0EExDv2_x:
        pshufd  $1, %xmm0, %xmm1
        movd    %xmm1, %edx
        movd    %xmm0, %eax
        ret
# Proposed code for SSE2ExtractInt64<1>: two shuffles bring lanes 3 and 2 down
# to lane 0, then one movd per result register. No general-purpose register
# is saved and no OR is performed.
_Z16SSE2ExtractInt64ILi1EExDv2_x:
        pshufd  $3, %xmm0, %xmm1
        pshufd  $2, %xmm0, %xmm0
        movd    %xmm1, %edx
        movd    %xmm0, %eax
        ret

Here is the code that is generated when the above C++ code is compiled with
clang 13.0.0 with the -O2 -std=c++17 -march=nocona -mtune=skylake -m32 options:
# Clang output for SSE2ExtractInt64<0>: the low half is read from lane 0
# before the shuffle, then lane 1 is broadcast and read as the high half.
# Three instructions plus the return, with no push/pop, xor, or or.
_Z16SSE2ExtractInt64ILi0EExDv2_x:       # @_Z16SSE2ExtractInt64ILi0EExDv2_x
        movd    %xmm0, %eax
        pshufd  $85, %xmm0, %xmm0               # xmm0 = xmm0[1,1,1,1]
        movd    %xmm0, %edx
        retl
# Clang output for SSE2ExtractInt64<1>: each half is shuffled down to lane 0
# and moved straight into its result register. Four instructions plus the
# return, with no push/pop, xor, or or.
_Z16SSE2ExtractInt64ILi1EExDv2_x:       # @_Z16SSE2ExtractInt64ILi1EExDv2_x
        pshufd  $238, %xmm0, %xmm1              # xmm1 = xmm0[2,3,2,3]
        movd    %xmm1, %eax
        pshufd  $255, %xmm0, %xmm0              # xmm0 = xmm0[3,3,3,3]
        movd    %xmm0, %edx
        retl