[Bug target/114944] New: Codegen of __builtin_shuffle for a 16-byte uint8_t vector is suboptimal on SSE2
john_platts at hotmail dot com
gcc-bugzilla@gcc.gnu.org
Sat May 4 14:00:02 GMT 2024
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=114944
Bug ID: 114944
Summary: Codegen of __builtin_shuffle for a 16-byte uint8_t
vector is suboptimal on SSE2
Product: gcc
Version: 13.2.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: target
Assignee: unassigned at gcc dot gnu.org
Reporter: john_platts at hotmail dot com
Target Milestone: ---
Here is a snippet of code that has suboptimal codegen on SSE2:
#include <stdint.h>
#include <emmintrin.h>
__m128i SSE2ShuffleI8(__m128i a, __m128i b) {
typedef uint8_t GccU8M128Vec __attribute__((__vector_size__(16)));
return reinterpret_cast<__m128i>(__builtin_shuffle(
reinterpret_cast<GccU8M128Vec>(a), reinterpret_cast<GccU8M128Vec>(b)));
}
Here is the code that is generated when the above code is compiled on x86_64
GCC 13.2.0 with the -O2 option:
SSE2ShuffleI8(long long __vector(2), long long __vector(2)):
push r15
movd r11d, xmm1
push r14
and r11d, 15
push r13
push r12
push rbp
push rbx
sub rsp, 160
movaps XMMWORD PTR [rsp+8], xmm1
movzx edx, BYTE PTR [rsp+16]
movaps XMMWORD PTR [rsp+24], xmm1
movzx eax, BYTE PTR [rsp+31]
movaps XMMWORD PTR [rsp+40], xmm1
mov rcx, rdx
movzx r15d, BYTE PTR [rsp+46]
and ecx, 15
and eax, 15
movaps XMMWORD PTR [rsp+120], xmm1
movzx ebx, BYTE PTR [rsp+121]
mov QWORD PTR [rsp-120], rcx
and r15d, 15
movaps XMMWORD PTR [rsp+136], xmm0
and ebx, 15
movaps XMMWORD PTR [rsp+104], xmm1
movzx ebp, BYTE PTR [rsp+106]
movaps XMMWORD PTR [rsp+88], xmm1
movzx r12d, BYTE PTR [rsp+91]
movaps XMMWORD PTR [rsp+72], xmm1
movzx r13d, BYTE PTR [rsp+76]
and ebp, 15
movaps XMMWORD PTR [rsp+56], xmm1
movzx r14d, BYTE PTR [rsp+61]
and r12d, 15
movaps XMMWORD PTR [rsp-8], xmm1
movzx edx, BYTE PTR [rsp+1]
and r13d, 15
movaps XMMWORD PTR [rsp-24], xmm1
movzx ecx, BYTE PTR [rsp-14]
and r14d, 15
movaps XMMWORD PTR [rsp-40], xmm1
movzx esi, BYTE PTR [rsp-29]
and edx, 15
movaps XMMWORD PTR [rsp-56], xmm1
movzx edi, BYTE PTR [rsp-44]
and ecx, 15
movaps XMMWORD PTR [rsp-72], xmm1
movzx r8d, BYTE PTR [rsp-59]
and esi, 15
movaps XMMWORD PTR [rsp-88], xmm1
movzx r9d, BYTE PTR [rsp-74]
and edi, 15
movaps XMMWORD PTR [rsp-104], xmm1
movzx r10d, BYTE PTR [rsp-89]
and r8d, 15
movzx eax, BYTE PTR [rsp+136+rax]
movzx r15d, BYTE PTR [rsp+136+r15]
and r9d, 15
movzx r14d, BYTE PTR [rsp+136+r14]
sal rax, 8
movzx ebp, BYTE PTR [rsp+136+rbp]
movzx r13d, BYTE PTR [rsp+136+r13]
and r10d, 15
or rax, r15
movzx r12d, BYTE PTR [rsp+136+r12]
movzx ebx, BYTE PTR [rsp+136+rbx]
sal rax, 8
movzx edi, BYTE PTR [rsp+136+rdi]
movzx r9d, BYTE PTR [rsp+136+r9]
or rax, r14
movzx esi, BYTE PTR [rsp+136+rsi]
movzx r8d, BYTE PTR [rsp+136+r8]
sal rax, 8
movzx ecx, BYTE PTR [rsp+136+rcx]
movzx edx, BYTE PTR [rsp+136+rdx]
or rax, r13
sal rax, 8
or rax, r12
sal rax, 8
or rax, rbp
sal rax, 8
or rax, rbx
movzx ebx, BYTE PTR [rsp+136+r11]
sal rax, 8
mov r11, rax
movzx eax, BYTE PTR [rsp+136+r10]
sal rax, 8
or rax, r9
sal rax, 8
or r11, rbx
or rax, r8
sal rax, 8
or rax, rdi
sal rax, 8
or rax, rsi
sal rax, 8
or rax, rcx
mov rcx, QWORD PTR [rsp-120]
mov QWORD PTR [rsp-120], r11
sal rax, 8
or rax, rdx
movzx edx, BYTE PTR [rsp+136+rcx]
sal rax, 8
or rax, rdx
mov QWORD PTR [rsp-112], rax
movdqa xmm0, XMMWORD PTR [rsp-120]
add rsp, 160
pop rbx
pop rbp
pop r12
pop r13
pop r14
pop r15
ret
The above code allocates more stack space than necessary and
stores xmm1 (the index vector) multiple times.
Here is a more optimal version of SSE2ShuffleI8:
.LSSE2ShuffleI8_Element_Mask:
.byte 15
.byte 15
.byte 15
.byte 15
.byte 15
.byte 15
.byte 15
.byte 15
.byte 15
.byte 15
.byte 15
.byte 15
.byte 15
.byte 15
.byte 15
.byte 15
SSE2ShuffleI8:
push rbp
push r15
push r14
push r13
push r12
push rbx
movdqa XMMWORD PTR [rsp - 24], xmm0
pand xmm1, XMMWORD PTR .LSSE2ShuffleI8_Element_Mask[rip]
movdqa XMMWORD PTR [rsp - 56], xmm1
movzx eax, BYTE PTR [rsp - 56]
movzx ebx, BYTE PTR [rsp - 55]
movzx ecx, BYTE PTR [rsp - 54]
movzx edx, BYTE PTR [rsp - 53]
movzx esi, BYTE PTR [rsp - 52]
movzx edi, BYTE PTR [rsp - 51]
movzx ebp, BYTE PTR [rsp - 50]
movzx r8d, BYTE PTR [rsp - 49]
movzx r9d, BYTE PTR [rsp - 48]
movzx r10d, BYTE PTR [rsp - 47]
movzx r11d, BYTE PTR [rsp - 46]
movzx r12d, BYTE PTR [rsp - 45]
movzx r13d, BYTE PTR [rsp - 44]
movzx r14d, BYTE PTR [rsp - 43]
movzx r15d, BYTE PTR [rsp - 42]
movzx eax, BYTE PTR [rsp + rax - 24]
movzx ebx, BYTE PTR [rsp + rbx - 24]
movzx ecx, BYTE PTR [rsp + rcx - 24]
movzx edx, BYTE PTR [rsp + rdx - 24]
movzx esi, BYTE PTR [rsp + rsi - 24]
movzx edi, BYTE PTR [rsp + rdi - 24]
movzx ebp, BYTE PTR [rsp + rbp - 24]
movzx r8d, BYTE PTR [rsp + r8 - 24]
movd xmm0, eax
movzx eax, BYTE PTR [rsp - 41]
movzx r9d, BYTE PTR [rsp + r9 - 24]
movzx r10d, BYTE PTR [rsp + r10 - 24]
movzx r11d, BYTE PTR [rsp + r11 - 24]
movzx r12d, BYTE PTR [rsp + r12 - 24]
movzx r13d, BYTE PTR [rsp + r13 - 24]
movzx r14d, BYTE PTR [rsp + r14 - 24]
movzx r15d, BYTE PTR [rsp + r15 - 24]
movzx eax, BYTE PTR [rsp + rax - 24]
movd xmm1, ebx
movd xmm2, ecx
movd xmm3, edx
movd xmm4, esi
movd xmm5, edi
movd xmm6, ebp
movd xmm7, r8d
movd xmm8, r9d
movd xmm9, r10d
movd xmm10, r11d
movd xmm11, r12d
movd xmm12, r13d
movd xmm13, r14d
movd xmm14, r15d
movd xmm15, eax
punpcklbw xmm0, xmm1
punpcklbw xmm2, xmm3
punpcklbw xmm4, xmm5
punpcklbw xmm6, xmm7
punpcklbw xmm8, xmm9
punpcklbw xmm10, xmm11
punpcklbw xmm12, xmm13
punpcklbw xmm14, xmm15
punpcklwd xmm0, xmm2
punpcklwd xmm4, xmm6
punpcklwd xmm8, xmm10
punpcklwd xmm12, xmm14
punpckldq xmm0, xmm4
punpckldq xmm8, xmm12
punpcklqdq xmm0, xmm8
pop rbx
pop r12
pop r13
pop r14
pop r15
pop rbp
ret
The second version of the SSE2ShuffleI8 op above requires 79 instructions and
only 80 bytes of stack, compared to the first version of the SSE2ShuffleI8 op
(the one generated by GCC 13.2.0), which allocates 160 bytes of stack, uses an
additional 120 bytes of stack in the red zone, and generates 114 instructions.
More information about the Gcc-bugs
mailing list