[Bug target/114944] New: Codegen of __builtin_shuffle for a 16-byte uint8_t vector is suboptimal on SSE2
john_platts at hotmail dot com
gcc-bugzilla@gcc.gnu.org
Sat May 4 14:00:02 GMT 2024
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=114944
Bug ID: 114944
Summary: Codegen of __builtin_shuffle for a 16-byte uint8_t
vector is suboptimal on SSE2
Product: gcc
Version: 13.2.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: target
Assignee: unassigned at gcc dot gnu.org
Reporter: john_platts at hotmail dot com
Target Milestone: ---
Here is a snippet of code that has suboptimal codegen on SSE2:
#include <stdint.h>
#include <emmintrin.h>
__m128i SSE2ShuffleI8(__m128i a, __m128i b) {
typedef uint8_t GccU8M128Vec __attribute__((__vector_size__(16)));
return reinterpret_cast<__m128i>(__builtin_shuffle(
reinterpret_cast<GccU8M128Vec>(a), reinterpret_cast<GccU8M128Vec>(b)));
}
Here is the code that is generated when the above code is compiled on x86_64
GCC 13.2.0 with the -O2 option:
SSE2ShuffleI8(long long __vector(2), long long __vector(2)):
push r15
movd r11d, xmm1
push r14
and r11d, 15
push r13
push r12
push rbp
push rbx
sub rsp, 160
movaps XMMWORD PTR [rsp+8], xmm1
movzx edx, BYTE PTR [rsp+16]
movaps XMMWORD PTR [rsp+24], xmm1
movzx eax, BYTE PTR [rsp+31]
movaps XMMWORD PTR [rsp+40], xmm1
mov rcx, rdx
movzx r15d, BYTE PTR [rsp+46]
and ecx, 15
and eax, 15
movaps XMMWORD PTR [rsp+120], xmm1
movzx ebx, BYTE PTR [rsp+121]
mov QWORD PTR [rsp-120], rcx
and r15d, 15
movaps XMMWORD PTR [rsp+136], xmm0
and ebx, 15
movaps XMMWORD PTR [rsp+104], xmm1
movzx ebp, BYTE PTR [rsp+106]
movaps XMMWORD PTR [rsp+88], xmm1
movzx r12d, BYTE PTR [rsp+91]
movaps XMMWORD PTR [rsp+72], xmm1
movzx r13d, BYTE PTR [rsp+76]
and ebp, 15
movaps XMMWORD PTR [rsp+56], xmm1
movzx r14d, BYTE PTR [rsp+61]
and r12d, 15
movaps XMMWORD PTR [rsp-8], xmm1
movzx edx, BYTE PTR [rsp+1]
and r13d, 15
movaps XMMWORD PTR [rsp-24], xmm1
movzx ecx, BYTE PTR [rsp-14]
and r14d, 15
movaps XMMWORD PTR [rsp-40], xmm1
movzx esi, BYTE PTR [rsp-29]
and edx, 15
movaps XMMWORD PTR [rsp-56], xmm1
movzx edi, BYTE PTR [rsp-44]
and ecx, 15
movaps XMMWORD PTR [rsp-72], xmm1
movzx r8d, BYTE PTR [rsp-59]
and esi, 15
movaps XMMWORD PTR [rsp-88], xmm1
movzx r9d, BYTE PTR [rsp-74]
and edi, 15
movaps XMMWORD PTR [rsp-104], xmm1
movzx r10d, BYTE PTR [rsp-89]
and r8d, 15
movzx eax, BYTE PTR [rsp+136+rax]
movzx r15d, BYTE PTR [rsp+136+r15]
and r9d, 15
movzx r14d, BYTE PTR [rsp+136+r14]
sal rax, 8
movzx ebp, BYTE PTR [rsp+136+rbp]
movzx r13d, BYTE PTR [rsp+136+r13]
and r10d, 15
or rax, r15
movzx r12d, BYTE PTR [rsp+136+r12]
movzx ebx, BYTE PTR [rsp+136+rbx]
sal rax, 8
movzx edi, BYTE PTR [rsp+136+rdi]
movzx r9d, BYTE PTR [rsp+136+r9]
or rax, r14
movzx esi, BYTE PTR [rsp+136+rsi]
movzx r8d, BYTE PTR [rsp+136+r8]
sal rax, 8
movzx ecx, BYTE PTR [rsp+136+rcx]
movzx edx, BYTE PTR [rsp+136+rdx]
or rax, r13
sal rax, 8
or rax, r12
sal rax, 8
or rax, rbp
sal rax, 8
or rax, rbx
movzx ebx, BYTE PTR [rsp+136+r11]
sal rax, 8
mov r11, rax
movzx eax, BYTE PTR [rsp+136+r10]
sal rax, 8
or rax, r9
sal rax, 8
or r11, rbx
or rax, r8
sal rax, 8
or rax, rdi
sal rax, 8
or rax, rsi
sal rax, 8
or rax, rcx
mov rcx, QWORD PTR [rsp-120]
mov QWORD PTR [rsp-120], r11
sal rax, 8
or rax, rdx
movzx edx, BYTE PTR [rsp+136+rcx]
sal rax, 8
or rax, rdx
mov QWORD PTR [rsp-112], rax
movdqa xmm0, XMMWORD PTR [rsp-120]
add rsp, 160
pop rbx
pop rbp
pop r12
pop r13
pop r14
pop r15
ret
The above code allocates more stack space than necessary and
stores xmm1 (the index vector) multiple times.
Here is a more optimal version of SSE2ShuffleI8:
.LSSE2ShuffleI8_Element_Mask:
.byte 15
.byte 15
.byte 15
.byte 15
.byte 15
.byte 15
.byte 15
.byte 15
.byte 15
.byte 15
.byte 15
.byte 15
.byte 15
.byte 15
.byte 15
.byte 15
SSE2ShuffleI8:
push rbp
push r15
push r14
push r13
push r12
push rbx
movdqa XMMWORD PTR [rsp - 24], xmm0
pand xmm1, XMMWORD PTR .LSSE2ShuffleI8_Element_Mask[rip]
movdqa XMMWORD PTR [rsp - 56], xmm1
movzx eax, BYTE PTR [rsp - 56]
movzx ebx, BYTE PTR [rsp - 55]
movzx ecx, BYTE PTR [rsp - 54]
movzx edx, BYTE PTR [rsp - 53]
movzx esi, BYTE PTR [rsp - 52]
movzx edi, BYTE PTR [rsp - 51]
movzx ebp, BYTE PTR [rsp - 50]
movzx r8d, BYTE PTR [rsp - 49]
movzx r9d, BYTE PTR [rsp - 48]
movzx r10d, BYTE PTR [rsp - 47]
movzx r11d, BYTE PTR [rsp - 46]
movzx r12d, BYTE PTR [rsp - 45]
movzx r13d, BYTE PTR [rsp - 44]
movzx r14d, BYTE PTR [rsp - 43]
movzx r15d, BYTE PTR [rsp - 42]
movzx eax, BYTE PTR [rsp + rax - 24]
movzx ebx, BYTE PTR [rsp + rbx - 24]
movzx ecx, BYTE PTR [rsp + rcx - 24]
movzx edx, BYTE PTR [rsp + rdx - 24]
movzx esi, BYTE PTR [rsp + rsi - 24]
movzx edi, BYTE PTR [rsp + rdi - 24]
movzx ebp, BYTE PTR [rsp + rbp - 24]
movzx r8d, BYTE PTR [rsp + r8 - 24]
movd xmm0, eax
movzx eax, BYTE PTR [rsp - 41]
movzx r9d, BYTE PTR [rsp + r9 - 24]
movzx r10d, BYTE PTR [rsp + r10 - 24]
movzx r11d, BYTE PTR [rsp + r11 - 24]
movzx r12d, BYTE PTR [rsp + r12 - 24]
movzx r13d, BYTE PTR [rsp + r13 - 24]
movzx r14d, BYTE PTR [rsp + r14 - 24]
movzx r15d, BYTE PTR [rsp + r15 - 24]
movzx eax, BYTE PTR [rsp + rax - 24]
movd xmm1, ebx
movd xmm2, ecx
movd xmm3, edx
movd xmm4, esi
movd xmm5, edi
movd xmm6, ebp
movd xmm7, r8d
movd xmm8, r9d
movd xmm9, r10d
movd xmm10, r11d
movd xmm11, r12d
movd xmm12, r13d
movd xmm13, r14d
movd xmm14, r15d
movd xmm15, eax
punpcklbw xmm0, xmm1
punpcklbw xmm2, xmm3
punpcklbw xmm4, xmm5
punpcklbw xmm6, xmm7
punpcklbw xmm8, xmm9
punpcklbw xmm10, xmm11
punpcklbw xmm12, xmm13
punpcklbw xmm14, xmm15
punpcklwd xmm0, xmm2
punpcklwd xmm4, xmm6
punpcklwd xmm8, xmm10
punpcklwd xmm12, xmm14
punpckldq xmm0, xmm4
punpckldq xmm8, xmm12
punpcklqdq xmm0, xmm8
pop rbx
pop r12
pop r13
pop r14
pop r15
pop rbp
ret
The second version of the SSE2ShuffleI8 op above requires 79 instructions and
only 80 bytes of stack, compared to the first version of the SSE2ShuffleI8 op
(the one generated by GCC 13.2.0), which allocates 160 bytes of stack, uses an
additional 120 bytes of stack in the red zone, and generates 114 instructions.
More information about the Gcc-bugs
mailing list