This is the mail archive of the
gcc-bugs@gcc.gnu.org
mailing list for the GCC project.
[Bug c++/77287] New: Much worse code generated compared to clang (stack alignment and spills)
- From: "kobalicek.petr at gmail dot com" <gcc-bugzilla at gcc dot gnu dot org>
- To: gcc-bugs at gcc dot gnu dot org
- Date: Thu, 18 Aug 2016 10:35:34 +0000
- Subject: [Bug c++/77287] New: Much worse code generated compared to clang (stack alignment and spills)
- Authentication-results: sourceware.org; auth=none
- Auto-submitted: auto-generated
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=77287
Bug ID: 77287
Summary: Much worse code generated compared to clang (stack
alignment and spills)
Product: gcc
Version: 6.1.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: c++
Assignee: unassigned at gcc dot gnu.org
Reporter: kobalicek.petr at gmail dot com
Target Milestone: ---
A simple function (artificial code):
#include <immintrin.h>
// Artificial reproducer for the register-allocation comparison below.
//
// Loads ten unaligned 256-bit vectors: a0..a4 from px, py, pz, pw, pa and
// b0..b4 from pb, pc, pd, pc + 32 bytes, pd + 32 bytes. For every pair it
// computes x_i = packus(a_i, b_i) + a_i - b_i with 16-bit lane arithmetic,
// folds x0..x4 together with four more packus operations, and returns
// 32-bit element 1 of the final vector.
//
// px, py, pz, pw, pa and pb must each address at least 32 readable bytes;
// pc and pd at least 64, because a second vector is read from each of them.
int fn(
const int* px, const int* py,
const int* pz, const int* pw,
const int* pa, const int* pb,
const int* pc, const int* pd) {
// Five 'a' operands, one per pointer (unaligned loads).
__m256i a0 = _mm256_loadu_si256((__m256i*)px);
__m256i a1 = _mm256_loadu_si256((__m256i*)py);
__m256i a2 = _mm256_loadu_si256((__m256i*)pz);
__m256i a3 = _mm256_loadu_si256((__m256i*)pw);
__m256i a4 = _mm256_loadu_si256((__m256i*)pa);
// Five 'b' operands. In source order all ten loads precede the first use.
__m256i b0 = _mm256_loadu_si256((__m256i*)pb);
__m256i b1 = _mm256_loadu_si256((__m256i*)pc);
__m256i b2 = _mm256_loadu_si256((__m256i*)pd);
// The cast binds tighter than '+', so '+ 1' advances by one __m256i (32 bytes).
__m256i b3 = _mm256_loadu_si256((__m256i*)pc + 1);
__m256i b4 = _mm256_loadu_si256((__m256i*)pd + 1);
// x_i = packus(a_i, b_i)
__m256i x0 = _mm256_packus_epi16(a0, b0);
__m256i x1 = _mm256_packus_epi16(a1, b1);
__m256i x2 = _mm256_packus_epi16(a2, b2);
__m256i x3 = _mm256_packus_epi16(a3, b3);
__m256i x4 = _mm256_packus_epi16(a4, b4);
// x_i += a_i  (a_i is needed a second time after the pack)
x0 = _mm256_add_epi16(x0, a0);
x1 = _mm256_add_epi16(x1, a1);
x2 = _mm256_add_epi16(x2, a2);
x3 = _mm256_add_epi16(x3, a3);
x4 = _mm256_add_epi16(x4, a4);
// x_i -= b_i  (b_i is needed a second time after the pack)
x0 = _mm256_sub_epi16(x0, b0);
x1 = _mm256_sub_epi16(x1, b1);
x2 = _mm256_sub_epi16(x2, b2);
x3 = _mm256_sub_epi16(x3, b3);
x4 = _mm256_sub_epi16(x4, b4);
// Fold the five partial results into x0, left to right.
x0 = _mm256_packus_epi16(x0, x1);
x0 = _mm256_packus_epi16(x0, x2);
x0 = _mm256_packus_epi16(x0, x3);
x0 = _mm256_packus_epi16(x0, x4);
// Return 32-bit element 1 so that the whole computation stays live.
return _mm256_extract_epi32(x0, 1);
}
Produces the following asm when compiled by GCC (annotated by me):
; GCC 6.1 -O2 -Wall -mavx2 -m32 -fomit-frame-pointer
;
; Register roles in this listing:
;   ecx = argument pointer: [ecx] = px, [ecx+4] = py, ... [ecx+28] = pd
;   ebp = base of the realigned frame; spill slots are [ebp-56] .. [ebp-312]
; a0..a4 / b0..b4 in the comments refer to the variables of the C source.
lea ecx, [esp+4] ; ecx = address of the first stack argument ('px'); return address is at [ecx-4]
and esp, -32 ; Align the stack to 32 bytes
push DWORD PTR [ecx-4] ; Push a copy of the return address
push ebp ; Save frame-pointer even though I told GCC not to
mov ebp, esp ; ebp = base for all spill slots below
push edi ; Save GP regs
push esi
push ebx
push ecx ; Save argument pointer; the epilogue restores esp from it
sub esp, 296 ; Reserve stack for YMM spills
mov eax, DWORD PTR [ecx+16] ; LOAD 'pa'
mov esi, DWORD PTR [ecx+4] ; LOAD 'py'
mov edi, DWORD PTR [ecx] ; LOAD 'px'
mov ebx, DWORD PTR [ecx+8] ; LOAD 'pz'
mov edx, DWORD PTR [ecx+12] ; LOAD 'pw'
mov DWORD PTR [ebp-120], eax ; SPILL 'pa'
mov eax, DWORD PTR [ecx+20] ; LOAD 'pb'
mov DWORD PTR [ebp-152], eax ; SPILL 'pb'
mov eax, DWORD PTR [ecx+24] ; LOAD 'pc'
vmovdqu ymm4, YMMWORD PTR [esi] ; a1 = *py
mov ecx, DWORD PTR [ecx+28] ; LOAD 'pd' (argument pointer in ecx is overwritten)
vmovdqu ymm7, YMMWORD PTR [edi] ; a0 = *px
vmovdqa YMMWORD PTR [ebp-56], ymm4 ; SPILL VEC (a1)
vmovdqu ymm4, YMMWORD PTR [ebx] ; a2 = *pz
mov ebx, DWORD PTR [ebp-152] ; RELOAD 'pb'
vmovdqa YMMWORD PTR [ebp-88], ymm4 ; SPILL VEC (a2)
vmovdqu ymm4, YMMWORD PTR [edx] ; a3 = *pw
mov edx, DWORD PTR [ebp-120] ; RELOAD 'pa'
vmovdqu ymm6, YMMWORD PTR [edx] ; a4 = *pa
vmovdqa YMMWORD PTR [ebp-120], ymm6 ; SPILL VEC (a4)
vmovdqu ymm0, YMMWORD PTR [ecx] ; b2 = *pd
vmovdqu ymm6, YMMWORD PTR [ebx] ; b0 = *pb
vmovdqa ymm5, ymm0 ; Why move anything when using AVX?
vmovdqu ymm0, YMMWORD PTR [eax+32] ; b3 = second vector at pc
vmovdqu ymm2, YMMWORD PTR [eax] ; b1 = *pc
vmovdqa ymm1, ymm0 ; Why move anything when using AVX?
vmovdqu ymm0, YMMWORD PTR [ecx+32] ; b4 = second vector at pd
vmovdqa YMMWORD PTR [ebp-152], ymm2 ; SPILL VEC (b1)
vmovdqa ymm3, ymm0 ; Why move anything when using AVX?
vpackuswb ymm0, ymm7, ymm6 ; pack(a0, b0)
vmovdqa YMMWORD PTR [ebp-184], ymm5 ; SPILL VEC (b2)
vmovdqa YMMWORD PTR [ebp-248], ymm3 ; SPILL VEC (b4)
vmovdqa YMMWORD PTR [ebp-280], ymm0 ; SPILL VEC (pack(a0, b0))
vmovdqa ymm0, YMMWORD PTR [ebp-56] ; ALLOC VEC (reload a1)
vmovdqa YMMWORD PTR [ebp-216], ymm1 ; SPILL VEC (b3)
vpackuswb ymm2, ymm0, YMMWORD PTR [ebp-152] ; Uses SPILL slot (b1)
vmovdqa ymm0, YMMWORD PTR [ebp-88] ; ALLOC VEC (reload a2)
vpackuswb ymm1, ymm4, YMMWORD PTR [ebp-216] ; Uses SPILL slot (b3)
vpackuswb ymm5, ymm0, YMMWORD PTR [ebp-184] ; Uses SPILL slot (b2)
vmovdqa ymm0, YMMWORD PTR [ebp-120] ; ALLOC VEC (reload a4)
vpaddw ymm2, ymm2, YMMWORD PTR [ebp-56] ; Uses SPILL slot (a1)
vpsubw ymm2, ymm2, YMMWORD PTR [ebp-152] ; Uses SPILL slot (b1); ymm2 = x1
vpackuswb ymm3, ymm0, YMMWORD PTR [ebp-248] ; Uses SPILL slot (b4)
vpaddw ymm0, ymm7, YMMWORD PTR [ebp-280] ; Uses SPILL slot (pack(a0, b0))
vpsubw ymm0, ymm0, ymm6 ; ymm0 = x0
vmovdqa ymm7, YMMWORD PTR [ebp-120] ; ALLOC VEC (a4 reloaded a second time)
vpackuswb ymm0, ymm0, ymm2 ; pack(x0, x1)
vpaddw ymm2, ymm4, ymm1 ; a3 + pack(a3, b3)
vpsubw ymm2, ymm2, YMMWORD PTR [ebp-216] ; Uses SPILL slot (b3); ymm2 = x3
vmovdqa YMMWORD PTR [ebp-312], ymm3 ; SPILL VEC (pack(a4, b4))
vpaddw ymm3, ymm5, YMMWORD PTR [ebp-88] ; Uses SPILL slot (a2)
vpsubw ymm3, ymm3, YMMWORD PTR [ebp-184] ; Uses SPILL slot (b2); ymm3 = x2
vpackuswb ymm0, ymm0, ymm3 ; fold in x2
vpaddw ymm1, ymm7, YMMWORD PTR [ebp-312] ; Uses SPILL slot (pack(a4, b4))
vpsubw ymm1, ymm1, YMMWORD PTR [ebp-248] ; Uses SPILL slot (b4); ymm1 = x4
vpackuswb ymm0, ymm0, ymm2 ; fold in x3
vpackuswb ymm0, ymm0, ymm1 ; fold in x4
vpextrd eax, xmm0, 1 ; Return value
vzeroupper
add esp, 296 ; Release the YMM spill area
pop ecx ; Restore the argument pointer saved in the prologue
pop ebx ; Restore GP regs
pop esi
pop edi
pop ebp
lea esp, [ecx-4] ; Undo the alignment: esp points at the original return address again
ret
While clang produces just this:
; Clang 3.8 -O2 -Wall -mavx2 -m32 -fomit-frame-pointer
;
; No prologue, no stack realignment and no spill slots: arguments are read
; directly from [esp + N]. ymm0..ymm4 hold a0..a4 (later x0..x4), while ymm5
; and ymm6 are recycled as temporaries for each b_i and each pack result.
; a0..a4 / b0..b4 in the comments refer to the variables of the C source.
mov eax, dword ptr [esp + 32] ; LOAD 'pd'
mov ecx, dword ptr [esp + 4] ; LOAD 'px'
vmovdqu ymm0, ymmword ptr [ecx] ; a0
mov ecx, dword ptr [esp + 8] ; LOAD 'py'
vmovdqu ymm1, ymmword ptr [ecx] ; a1
mov ecx, dword ptr [esp + 12] ; LOAD 'pz'
vmovdqu ymm2, ymmword ptr [ecx] ; a2
mov ecx, dword ptr [esp + 16] ; LOAD 'pw'
vmovdqu ymm3, ymmword ptr [ecx] ; a3
mov ecx, dword ptr [esp + 20] ; LOAD 'pa'
vmovdqu ymm4, ymmword ptr [ecx] ; a4
mov ecx, dword ptr [esp + 24] ; LOAD 'pb'
vmovdqu ymm5, ymmword ptr [ecx] ; b0
mov ecx, dword ptr [esp + 28] ; LOAD 'pc'
vpackuswb ymm6, ymm0, ymm5 ; pack(a0, b0)
vpsubw ymm0, ymm0, ymm5 ; a0 - b0 (b0 is dead afterwards)
vmovdqu ymm5, ymmword ptr [ecx] ; b1 = *pc, loaded only when needed
vpaddw ymm0, ymm0, ymm6 ; ymm0 = x0
vpackuswb ymm6, ymm1, ymm5 ; pack(a1, b1)
vpsubw ymm1, ymm1, ymm5 ; a1 - b1
vmovdqu ymm5, ymmword ptr [eax] ; b2 = *pd
vpaddw ymm1, ymm1, ymm6 ; ymm1 = x1
vpackuswb ymm6, ymm2, ymm5 ; pack(a2, b2)
vpsubw ymm2, ymm2, ymm5 ; a2 - b2
vmovdqu ymm5, ymmword ptr [ecx + 32] ; b3 = second vector at pc
vpaddw ymm2, ymm2, ymm6 ; ymm2 = x2
vpackuswb ymm6, ymm3, ymm5 ; pack(a3, b3)
vpsubw ymm3, ymm3, ymm5 ; a3 - b3
vmovdqu ymm5, ymmword ptr [eax + 32] ; b4 = second vector at pd
vpaddw ymm3, ymm3, ymm6 ; ymm3 = x3
vpackuswb ymm6, ymm4, ymm5 ; pack(a4, b4)
vpsubw ymm4, ymm4, ymm5 ; a4 - b4
vpaddw ymm4, ymm4, ymm6 ; ymm4 = x4
vpackuswb ymm0, ymm0, ymm1 ; fold x0..x4 into ymm0
vpackuswb ymm0, ymm0, ymm2
vpackuswb ymm0, ymm0, ymm3
vpackuswb ymm0, ymm0, ymm4
vpextrd eax, xmm0, 1 ; Return value
vzeroupper
ret
I have written about this on my blog:
https://asmbits.blogspot.com/2016/08/comparing-register-allocator-of-gcc-and.html
Problems summary:
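1. Spilling GPRs is not needed at all in this case.
2. Spilling YMMs is also questionable, as the instructions can be reordered to
avoid it; see the clang output.
3. The frame pointer (ebp) is still saved and set up even though I compiled
with -fomit-frame-pointer.
4. Using [ebp-X] instead of [esp+Y] produces longer code when `X > 128 && Y <
128`, because the [ebp-X] displacement then no longer fits in a signed 8-bit
field (-128..127) and a 32-bit displacement has to be encoded instead.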
You can quickly verify the outputs by pasting the source here:
https://gcc.godbolt.org/