A simple function (artificial code):

#include <immintrin.h>

int fn(
  const int* px, const int* py, const int* pz, const int* pw,
  const int* pa, const int* pb, const int* pc, const int* pd)
{
  __m256i a0 = _mm256_loadu_si256((__m256i*)px);
  __m256i a1 = _mm256_loadu_si256((__m256i*)py);
  __m256i a2 = _mm256_loadu_si256((__m256i*)pz);
  __m256i a3 = _mm256_loadu_si256((__m256i*)pw);
  __m256i a4 = _mm256_loadu_si256((__m256i*)pa);
  __m256i b0 = _mm256_loadu_si256((__m256i*)pb);
  __m256i b1 = _mm256_loadu_si256((__m256i*)pc);
  __m256i b2 = _mm256_loadu_si256((__m256i*)pd);
  __m256i b3 = _mm256_loadu_si256((__m256i*)pc + 1);
  __m256i b4 = _mm256_loadu_si256((__m256i*)pd + 1);

  __m256i x0 = _mm256_packus_epi16(a0, b0);
  __m256i x1 = _mm256_packus_epi16(a1, b1);
  __m256i x2 = _mm256_packus_epi16(a2, b2);
  __m256i x3 = _mm256_packus_epi16(a3, b3);
  __m256i x4 = _mm256_packus_epi16(a4, b4);

  x0 = _mm256_add_epi16(x0, a0);
  x1 = _mm256_add_epi16(x1, a1);
  x2 = _mm256_add_epi16(x2, a2);
  x3 = _mm256_add_epi16(x3, a3);
  x4 = _mm256_add_epi16(x4, a4);

  x0 = _mm256_sub_epi16(x0, b0);
  x1 = _mm256_sub_epi16(x1, b1);
  x2 = _mm256_sub_epi16(x2, b2);
  x3 = _mm256_sub_epi16(x3, b3);
  x4 = _mm256_sub_epi16(x4, b4);

  x0 = _mm256_packus_epi16(x0, x1);
  x0 = _mm256_packus_epi16(x0, x2);
  x0 = _mm256_packus_epi16(x0, x3);
  x0 = _mm256_packus_epi16(x0, x4);

  return _mm256_extract_epi32(x0, 1);
}

It produces the following asm when compiled by GCC (annotated by me):

; GCC 6.1 -O2 -Wall -mavx2 -m32 -fomit-frame-pointer
lea ecx, [esp+4]                       ; Return address
and esp, -32                           ; Align the stack to 32 bytes
push DWORD PTR [ecx-4]                 ; Push return address
push ebp                               ; Save frame pointer even though I told GCC not to
mov ebp, esp
push edi                               ; Save GP regs
push esi
push ebx
push ecx
sub esp, 296                           ; Reserve stack for YMM spills
mov eax, DWORD PTR [ecx+16]            ; LOAD 'pa'
mov esi, DWORD PTR [ecx+4]             ; LOAD 'py'
mov edi, DWORD PTR [ecx]               ; LOAD 'px'
mov ebx, DWORD PTR [ecx+8]             ; LOAD 'pz'
mov edx, DWORD PTR [ecx+12]            ; LOAD 'pw'
mov DWORD PTR [ebp-120], eax           ; SPILL 'pa'
mov eax, DWORD PTR [ecx+20]            ; LOAD 'pb'
mov DWORD PTR [ebp-152], eax           ; SPILL 'pb'
mov eax, DWORD PTR [ecx+24]            ; LOAD 'pc'
vmovdqu ymm4, YMMWORD PTR [esi]
mov ecx, DWORD PTR [ecx+28]            ; LOAD 'pd'
vmovdqu ymm7, YMMWORD PTR [edi]
vmovdqa YMMWORD PTR [ebp-56], ymm4     ; SPILL VEC
vmovdqu ymm4, YMMWORD PTR [ebx]
mov ebx, DWORD PTR [ebp-152]           ; LOAD 'pb'
vmovdqa YMMWORD PTR [ebp-88], ymm4     ; SPILL VEC
vmovdqu ymm4, YMMWORD PTR [edx]
mov edx, DWORD PTR [ebp-120]           ; LOAD 'pa'
vmovdqu ymm6, YMMWORD PTR [edx]
vmovdqa YMMWORD PTR [ebp-120], ymm6    ; SPILL VEC
vmovdqu ymm0, YMMWORD PTR [ecx]
vmovdqu ymm6, YMMWORD PTR [ebx]
vmovdqa ymm5, ymm0                     ; Why move anything when using AVX?
vmovdqu ymm0, YMMWORD PTR [eax+32]
vmovdqu ymm2, YMMWORD PTR [eax]
vmovdqa ymm1, ymm0                     ; Why move anything when using AVX?
vmovdqu ymm0, YMMWORD PTR [ecx+32]
vmovdqa YMMWORD PTR [ebp-152], ymm2
vmovdqa ymm3, ymm0                     ; Why move anything when using AVX?
vpackuswb ymm0, ymm7, ymm6
vmovdqa YMMWORD PTR [ebp-184], ymm5            ; SPILL VEC
vmovdqa YMMWORD PTR [ebp-248], ymm3            ; SPILL VEC
vmovdqa YMMWORD PTR [ebp-280], ymm0            ; SPILL VEC
vmovdqa ymm0, YMMWORD PTR [ebp-56]             ; ALLOC VEC
vmovdqa YMMWORD PTR [ebp-216], ymm1            ; SPILL VEC
vpackuswb ymm2, ymm0, YMMWORD PTR [ebp-152]    ; Uses SPILL slot
vmovdqa ymm0, YMMWORD PTR [ebp-88]             ; ALLOC VEC
vpackuswb ymm1, ymm4, YMMWORD PTR [ebp-216]    ; Uses SPILL slot
vpackuswb ymm5, ymm0, YMMWORD PTR [ebp-184]    ; Uses SPILL slot
vmovdqa ymm0, YMMWORD PTR [ebp-120]            ; ALLOC VEC
vpaddw ymm2, ymm2, YMMWORD PTR [ebp-56]        ; Uses SPILL slot
vpsubw ymm2, ymm2, YMMWORD PTR [ebp-152]       ; Uses SPILL slot
vpackuswb ymm3, ymm0, YMMWORD PTR [ebp-248]    ; Uses SPILL slot
vpaddw ymm0, ymm7, YMMWORD PTR [ebp-280]       ; Uses SPILL slot
vpsubw ymm0, ymm0, ymm6
vmovdqa ymm7, YMMWORD PTR [ebp-120]            ; ALLOC VEC
vpackuswb ymm0, ymm0, ymm2
vpaddw ymm2, ymm4, ymm1
vpsubw ymm2, ymm2, YMMWORD PTR [ebp-216]       ; Uses SPILL slot
vmovdqa YMMWORD PTR [ebp-312], ymm3            ; SPILL VEC
vpaddw ymm3, ymm5, YMMWORD PTR [ebp-88]        ; Uses SPILL slot
vpsubw ymm3, ymm3, YMMWORD PTR [ebp-184]       ; Uses SPILL slot
vpackuswb ymm0, ymm0, ymm3
vpaddw ymm1, ymm7, YMMWORD PTR [ebp-312]       ; Uses SPILL slot
vpsubw ymm1, ymm1, YMMWORD PTR [ebp-248]       ; Uses SPILL slot
vpackuswb ymm0, ymm0, ymm2
vpackuswb ymm0, ymm0, ymm1
vpextrd eax, xmm0, 1                           ; Return value
vzeroupper
add esp, 296
pop ecx
pop ebx
pop esi
pop edi
pop ebp
lea esp, [ecx-4]
ret

While clang produces just this:

; Clang 3.8 -O2 -Wall -mavx2 -m32 -fomit-frame-pointer
mov eax, dword ptr [esp + 32]          ; LOAD 'pd'
mov ecx, dword ptr [esp + 4]           ; LOAD 'px'
vmovdqu ymm0, ymmword ptr [ecx]
mov ecx, dword ptr [esp + 8]           ; LOAD 'py'
vmovdqu ymm1, ymmword ptr [ecx]
mov ecx, dword ptr [esp + 12]          ; LOAD 'pz'
vmovdqu ymm2, ymmword ptr [ecx]
mov ecx, dword ptr [esp + 16]          ; LOAD 'pw'
vmovdqu ymm3, ymmword ptr [ecx]
mov ecx, dword ptr [esp + 20]          ; LOAD 'pa'
vmovdqu ymm4, ymmword ptr [ecx]
mov ecx, dword ptr [esp + 24]          ; LOAD 'pb'
vmovdqu ymm5, ymmword ptr [ecx]
mov ecx, dword ptr [esp + 28]          ; LOAD 'pc'
vpackuswb ymm6, ymm0, ymm5
vpsubw ymm0, ymm0, ymm5
vmovdqu ymm5, ymmword ptr [ecx]
vpaddw ymm0, ymm0, ymm6
vpackuswb ymm6, ymm1, ymm5
vpsubw ymm1, ymm1, ymm5
vmovdqu ymm5, ymmword ptr [eax]
vpaddw ymm1, ymm1, ymm6
vpackuswb ymm6, ymm2, ymm5
vpsubw ymm2, ymm2, ymm5
vmovdqu ymm5, ymmword ptr [ecx + 32]
vpaddw ymm2, ymm2, ymm6
vpackuswb ymm6, ymm3, ymm5
vpsubw ymm3, ymm3, ymm5
vmovdqu ymm5, ymmword ptr [eax + 32]
vpaddw ymm3, ymm3, ymm6
vpackuswb ymm6, ymm4, ymm5
vpsubw ymm4, ymm4, ymm5
vpaddw ymm4, ymm4, ymm6
vpackuswb ymm0, ymm0, ymm1
vpackuswb ymm0, ymm0, ymm2
vpackuswb ymm0, ymm0, ymm3
vpackuswb ymm0, ymm0, ymm4
vpextrd eax, xmm0, 1                   ; Return value
vzeroupper
ret

I have written about this on my blog: https://asmbits.blogspot.com/2016/08/comparing-register-allocator-of-gcc-and.html

Summary of problems:

1. Spilling GPRs is not needed here at all.
2. Spilling YMMs is also questionable, as some instructions can be reordered; see the clang output.
3. The frame pointer is preserved even though I compiled with -fomit-frame-pointer.
4. Using [ebp-X] instead of [esp+Y] produces longer code when `X > 128 && Y < 128`, because a displacement outside [-128, 127] requires a 4-byte disp32 encoding instead of a 1-byte disp8.

You can quickly verify the outputs by pasting the source here: https://gcc.godbolt.org/
Try adding -march=intel or -mtune=intel. The default tuning for GCC is generic, which is a combination of Intel and AMD tuning. And because AMD tuning prefers not to move data between GPRs and SIMD registers directly (spilling through memory is faster there), generic tuning accounts for that.
With '-mtune=intel' the push/pop sequence is gone, but the YMM register management remains the same - 24 more memory accesses than clang.
-fschedule-insns improves things here - and LRA seems to be happier to spill/reload rather than rematerialize. But in the end the testcase requires careful scheduling of the operations to reduce register lifetimes and thus allow optimal RA with the limited number of registers available. We force a frame pointer because we have to re-align the stack for possible spills.
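To illustrate the scheduling point, here is a hand-scheduled variant of the testcase (a sketch of mine, not part of the original report; 'chain' and 'fn_scheduled' are made-up names). Finishing each pack/add/sub chain before starting the next keeps only about three YMM values live at any point, which is roughly the shape a scheduler has to reach for the RA to succeed without spills:

#include <immintrin.h>

/* One pack/add/sub chain of the testcase; at most three YMM values
   (a, b, x) are live inside it. */
static inline __m256i chain(const int* p, const int* q)
{
  __m256i a = _mm256_loadu_si256((const __m256i*)p);
  __m256i b = _mm256_loadu_si256((const __m256i*)q);
  __m256i x = _mm256_packus_epi16(a, b);
  x = _mm256_add_epi16(x, a);
  return _mm256_sub_epi16(x, b);
}

int fn_scheduled(
  const int* px, const int* py, const int* pz, const int* pw,
  const int* pa, const int* pb, const int* pc, const int* pd)
{
  /* pc + 8 and pd + 8 (in ints) are the +32-byte loads of the original. */
  __m256i x0 = chain(px, pb);
  x0 = _mm256_packus_epi16(x0, chain(py, pc));
  x0 = _mm256_packus_epi16(x0, chain(pz, pd));
  x0 = _mm256_packus_epi16(x0, chain(pw, pc + 8));
  x0 = _mm256_packus_epi16(x0, chain(pa, pd + 8));
  return _mm256_extract_epi32(x0, 1);
}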
Adding -fschedule-insns is definitely a huge improvement in this case. I wonder why this isn't enabled by default at -O2 and -Os, as it really improves things and makes the output shorter - or is that just in this particular case? Here is the assembly produced by GCC with -fschedule-insns:

push ebp
mov ebp, esp
and esp, -32
lea esp, [esp-32]
mov ecx, DWORD PTR [ebp+8]
mov edx, DWORD PTR [ebp+32]
mov eax, DWORD PTR [ebp+36]
vmovdqu ymm5, YMMWORD PTR [ecx]
mov ecx, DWORD PTR [ebp+12]
vmovdqu ymm3, YMMWORD PTR [edx]
vmovdqu ymm6, YMMWORD PTR [eax]
vmovdqu ymm2, YMMWORD PTR [ecx]
mov ecx, DWORD PTR [ebp+28]
vpackuswb ymm7, ymm2, ymm3
vpaddw ymm7, ymm7, ymm2
vpsubw ymm7, ymm7, ymm3
vmovdqu ymm4, YMMWORD PTR [ecx]
mov ecx, DWORD PTR [ebp+16]
vpackuswb ymm0, ymm5, ymm4
vpaddw ymm0, ymm0, ymm5
vpsubw ymm0, ymm0, ymm4
vmovdqu ymm1, YMMWORD PTR [ecx]
vpackuswb ymm0, ymm0, ymm7
mov ecx, DWORD PTR [ebp+20]
vpackuswb ymm2, ymm1, ymm6
vmovdqu ymm4, YMMWORD PTR [edx+32]
vpaddw ymm1, ymm2, ymm1
mov edx, DWORD PTR [ebp+24]
vpsubw ymm1, ymm1, ymm6
vmovdqu ymm5, YMMWORD PTR [ecx]
vpackuswb ymm0, ymm0, ymm1
vpackuswb ymm3, ymm5, ymm4
vmovdqa YMMWORD PTR [esp], ymm3
vmovdqu ymm2, YMMWORD PTR [eax+32]     ; LOOK HERE
vpaddw ymm5, ymm5, YMMWORD PTR [esp]
vmovdqu ymm3, YMMWORD PTR [edx]        ; AND HERE
vpsubw ymm4, ymm5, ymm4
vpackuswb ymm7, ymm3, ymm2
vpackuswb ymm0, ymm0, ymm4
vpaddw ymm3, ymm7, ymm3
vpsubw ymm2, ymm3, ymm2
vpackuswb ymm2, ymm0, ymm2
vpextrd eax, xmm2, 1
vzeroupper
leave
ret

This is pretty close to clang already; however, look at this part:

vmovdqa YMMWORD PTR [esp], ymm3        ; Spill YMM3
vmovdqu ymm2, YMMWORD PTR [eax+32]
vpaddw ymm5, ymm5, YMMWORD PTR [esp]   ; Mem instead of YMM3?
vmovdqu ymm3, YMMWORD PTR [edx]        ; Old YMM3 becomes dead here

The spill is completely unnecessary in our case, and it's the only reason the prologue/epilogue requires code to perform dynamic stack alignment. If this one thing were eliminated, GCC would basically generate code comparable to clang's. But thanks for the -fschedule-insns hint, I didn't know about it.
clang can now produce:

mov eax, dword ptr [esp + 16]
mov ecx, dword ptr [esp + 28]
vmovdqu xmm0, xmmword ptr [ecx + 32]
vmovdqu xmm1, xmmword ptr [eax]
vpackuswb xmm2, xmm1, xmm0
vpsubw xmm0, xmm1, xmm0
vpaddw xmm0, xmm0, xmm2
vpackuswb xmm0, xmm0, xmm0
vpackuswb xmm0, xmm0, xmm0
vpextrd eax, xmm0, 1
ret

I suspect that if the back-end were able to "fold" the builtins into gimple at the gimple level, GCC would do a much better job. Currently we have stuff like:

_27 = __builtin_ia32_vextractf128_si256 (_28, 0);
_26 = __builtin_ia32_vec_ext_v4si (_27, 1); [tail call]

I think both are really just a BIT_FIELD_REF, and it can be simplified even further to just one bitfield extraction rather than what we do now:

vpackuswb %ymm1, %ymm0, %ymm0
vpextrd $1, %xmm0, %eax

Plus it looks like with __builtin_ia32_vextractf128_si256 (_28, 0), clang is able to remove half of the code due to only needing the low 128 bits :).
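For reference, here is a C equivalent of what clang derived (my reconstruction, not from the thread; 'fn_reduced' is a made-up name). The extracted dword depends only on the pw/pc+32 chain of the original, and only on its low 128-bit lane, so the whole function collapses to:

#include <immintrin.h>

int fn_reduced(const int* pw, const int* pc)
{
  /* Low 128-bit lanes of the original a3/b3 loads. */
  __m128i a3 = _mm_loadu_si128((const __m128i*)pw);
  __m128i b3 = _mm_loadu_si128((const __m128i*)(pc + 8)); /* pc + 32 bytes */
  __m128i x3 = _mm_packus_epi16(a3, b3);
  x3 = _mm_add_epi16(x3, a3);
  x3 = _mm_sub_epi16(x3, b3);
  /* The two outer packs of the original only ever see this lane. */
  x3 = _mm_packus_epi16(x3, x3);
  x3 = _mm_packus_epi16(x3, x3);
  return _mm_extract_epi32(x3, 1); /* SSE4.1, implied by -mavx2 */
}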
Yes, the code is not really doing anything useful; I only wrote it to demonstrate the spill problem. Clang actually outsmarted me by removing half of the code :) I think this issue can be closed, I cannot reproduce it with the newest GCC.
(In reply to Andrew Pinski from comment #5)
> I suspect that if the back-end were able to "fold" the builtins into gimple
> at the gimple level, GCC would do a much better job.
> [...]

Yes, let me try this.
(In reply to Hongtao.liu from comment #7)
> Yes, let me try this.

Do we have an IR for unsigned/signed saturation at the gimple level?
(In reply to Hongtao.liu from comment #8)
> Do we have an IR for unsigned/signed saturation at the gimple level?

Not yet. I was just looking for that today because of PR 51492.
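For context, this is the kind of scalar idiom such an IR would let GCC recognize and vectorize (my sketch of the usual formulation, not the exact testcase from PR 51492):

#include <stdint.h>

/* Unsigned saturating add; with a saturation IR the vectorizer could
   turn this loop into paddusb-style instructions. */
void sat_add_u8(uint8_t* a, const uint8_t* b, int n)
{
  for (int i = 0; i < n; ++i) {
    unsigned s = a[i] + b[i];
    a[i] = s > 255 ? 255 : (uint8_t)s;
  }
}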
(In reply to Andrew Pinski from comment #9)
> Not yet. I was just looking for that today because of PR 51492.

But there is an RFC out for it:
https://gcc.gnu.org/pipermail/gcc/2021-May/236015.html
(In reply to Andrew Pinski from comment #10)
> But there is an RFC out for it:
> https://gcc.gnu.org/pipermail/gcc/2021-May/236015.html

Oh, VEC_PACK_SAT_EXPR is exactly what I needed for _mm256_packus_epi16, thanks for the pointer.
(In reply to Hongtao.liu from comment #11)
> Oh, VEC_PACK_SAT_EXPR is exactly what I needed for _mm256_packus_epi16,
> thanks for the pointer.

And ‘vec_pack_ssat_m’/‘vec_pack_usat_m’ for the optabs.
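As a scalar model of what _mm256_packus_epi16 computes (my sketch, not from the thread; note that the x86 instruction narrows per 128-bit lane, while a generic VEC_PACK_SAT_EXPR concatenates the two narrowed operands whole, so the lowering has to account for the lane split):

#include <stdint.h>

/* Saturate a signed 16-bit value to the unsigned 8-bit range,
   as vpackuswb does for each element. */
static uint8_t sat_u8(int16_t v)
{
  return v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v;
}

/* Generic pack-with-unsigned-saturation of two 16-element vectors:
   the first operand fills the low half, the second the high half. */
static void vec_pack_usat(const int16_t a[16], const int16_t b[16],
                          uint8_t out[32])
{
  for (int i = 0; i < 16; ++i) {
    out[i]      = sat_u8(a[i]);
    out[16 + i] = sat_u8(b[i]);
  }
}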
;; Function fn (fn, funcdef_no=5484, decl_uid=32317, cgraph_uid=5485, symbol_order=5484)

int fn (const int * px, const int * py, const int * pz, const int * pw, const int * pa, const int * pb, const int * pc, const int * pd)
{
  vector(16) short unsigned int _3;
  vector(16) short unsigned int _5;
  vector(16) short int _7;
  vector(16) short int _9;
  vector(32) char _12;
  vector(32) unsigned char _14;
  vector(16) short unsigned int _16;
  vector(16) short unsigned int _17;
  vector(16) short int _18;
  vector(16) short int _19;
  vector(32) char _20;
  vector(32) unsigned char _21;
  vector(16) short unsigned int _22;
  vector(16) short unsigned int _23;
  vector(16) short int _24;
  vector(16) short int _25;
  vector(32) char _26;
  vector(32) unsigned char _27;
  vector(16) short unsigned int _28;
  vector(16) short unsigned int _29;
  vector(16) short int _30;
  vector(16) short int _31;
  int _32;
  vector(4) int _33;
  vector(8) int _34;
  vector(32) unsigned char _35;
  vector(32) char _36;
  vector(16) short unsigned int _37;
  vector(16) short unsigned int _38;
  vector(16) short unsigned int _39;
  vector(16) short unsigned int _40;
  vector(16) short unsigned int _41;
  vector(16) short unsigned int _42;
  vector(16) short unsigned int _43;
  vector(16) short unsigned int _44;
  vector(16) short unsigned int _45;
  vector(16) short unsigned int _46;
  vector(16) short unsigned int _47;
  vector(16) short unsigned int _48;
  vector(16) short unsigned int _50;
  vector(16) short unsigned int _51;
  vector(16) short unsigned int _53;
  vector(16) short unsigned int _54;
  vector(16) short unsigned int _56;
  vector(16) short unsigned int _57;
  vector(16) short unsigned int _59;
  vector(16) short unsigned int _60;
  vector(16) short int _62;
  vector(16) short int _63;
  vector(16) short unsigned int _64;
  vector(16) short unsigned int _65;
  vector(32) unsigned char _66;
  vector(32) char _67;
  vector(16) short int _68;
  vector(16) short int _69;
  vector(16) short unsigned int _70;
  vector(16) short unsigned int _71;
  vector(32) unsigned char _72;
  vector(32) char _73;
  vector(16) short int _74;
  vector(16) short int _75;
  vector(16) short unsigned int _76;
  vector(16) short unsigned int _77;
  vector(32) unsigned char _78;
  vector(32) char _79;
  vector(16) short int _80;
  vector(16) short int _81;
  vector(16) short unsigned int _82;
  vector(16) short unsigned int _83;
  vector(32) unsigned char _84;
  vector(32) char _85;
  vector(16) short int _86;
  vector(16) short int _87;
  vector(16) short unsigned int _88;
  vector(16) short unsigned int _89;
  vector(32) unsigned char _90;
  vector(32) char _91;
  vector(4) long long int _92;
  vector(4) long long int _93;
  vector(4) long long int _94;
  vector(4) long long int _95;
  vector(4) long long int _96;
  vector(4) long long int _97;
  vector(4) long long int _98;
  vector(4) long long int _99;
  vector(4) long long int _100;
  vector(4) long long int _101;
  vector(16) short unsigned int _107;
  vector(16) short unsigned int _108;
  vector(16) short unsigned int _109;
  vector(16) short unsigned int _110;
  vector(16) short unsigned int _111;

  <bb 2> [local count: 1073741824]:
  _101 = MEM[(const __m256i_u * {ref-all})px_2(D)];
  _100 = MEM[(const __m256i_u * {ref-all})py_4(D)];
  _99 = MEM[(const __m256i_u * {ref-all})pz_6(D)];
  _98 = MEM[(const __m256i_u * {ref-all})pw_8(D)];
  _97 = MEM[(const __m256i_u * {ref-all})pa_10(D)];
  _96 = MEM[(const __m256i_u * {ref-all})pb_11(D)];
  _95 = MEM[(const __m256i_u * {ref-all})pc_13(D)];
  _94 = MEM[(const __m256i_u * {ref-all})pd_15(D)];
  _93 = MEM[(const __m256i_u * {ref-all})pc_13(D) + 32B];
  _92 = MEM[(const __m256i_u * {ref-all})pd_15(D) + 32B];
  _86 = VIEW_CONVERT_EXPR<vector(16) short int>(_96);
  _87 = VIEW_CONVERT_EXPR<vector(16) short int>(_101);
  _88 = (vector(16) short unsigned int) _87;
  _89 = (vector(16) short unsigned int) _86;
  _90 = VEC_PACK_SAT_EXPR <_88, _89>;
  _91 = (vector(32) char) _90;
  _80 = VIEW_CONVERT_EXPR<vector(16) short int>(_95);
  _81 = VIEW_CONVERT_EXPR<vector(16) short int>(_100);
  _82 = (vector(16) short unsigned int) _81;
  _83 = (vector(16) short unsigned int) _80;
  _84 = VEC_PACK_SAT_EXPR <_82, _83>;
  _85 = (vector(32) char) _84;
  _74 = VIEW_CONVERT_EXPR<vector(16) short int>(_94);
  _75 = VIEW_CONVERT_EXPR<vector(16) short int>(_99);
  _76 = (vector(16) short unsigned int) _75;
  _77 = (vector(16) short unsigned int) _74;
  _78 = VEC_PACK_SAT_EXPR <_76, _77>;
  _79 = (vector(32) char) _78;
  _68 = VIEW_CONVERT_EXPR<vector(16) short int>(_93);
  _69 = VIEW_CONVERT_EXPR<vector(16) short int>(_98);
  _70 = (vector(16) short unsigned int) _69;
  _71 = (vector(16) short unsigned int) _68;
  _72 = VEC_PACK_SAT_EXPR <_70, _71>;
  _73 = (vector(32) char) _72;
  _62 = VIEW_CONVERT_EXPR<vector(16) short int>(_92);
  _63 = VIEW_CONVERT_EXPR<vector(16) short int>(_97);
  _64 = (vector(16) short unsigned int) _63;
  _65 = (vector(16) short unsigned int) _62;
  _66 = VEC_PACK_SAT_EXPR <_64, _65>;
  _67 = (vector(32) char) _66;
  _59 = VIEW_CONVERT_EXPR<vector(16) short unsigned int>(_91);
  _60 = VIEW_CONVERT_EXPR<vector(16) short unsigned int>(_101);
  _56 = VIEW_CONVERT_EXPR<vector(16) short unsigned int>(_85);
  _57 = VIEW_CONVERT_EXPR<vector(16) short unsigned int>(_100);
  _53 = VIEW_CONVERT_EXPR<vector(16) short unsigned int>(_79);
  _54 = VIEW_CONVERT_EXPR<vector(16) short unsigned int>(_99);
  _50 = VIEW_CONVERT_EXPR<vector(16) short unsigned int>(_73);
  _51 = VIEW_CONVERT_EXPR<vector(16) short unsigned int>(_98);
  _47 = VIEW_CONVERT_EXPR<vector(16) short unsigned int>(_67);
  _48 = VIEW_CONVERT_EXPR<vector(16) short unsigned int>(_97);
  _45 = VIEW_CONVERT_EXPR<vector(16) short unsigned int>(_96);
  _111 = _60 - _45;
  _46 = _59 + _111;
  _43 = VIEW_CONVERT_EXPR<vector(16) short unsigned int>(_95);
  _110 = _57 - _43;
  _44 = _56 + _110;
  _41 = VIEW_CONVERT_EXPR<vector(16) short unsigned int>(_94);
  _109 = _54 - _41;
  _42 = _53 + _109;
  _39 = VIEW_CONVERT_EXPR<vector(16) short unsigned int>(_93);
  _108 = _51 - _39;
  _40 = _50 + _108;
  _37 = VIEW_CONVERT_EXPR<vector(16) short unsigned int>(_92);
  _107 = _48 - _37;
  _38 = _47 + _107;
  _9 = VIEW_CONVERT_EXPR<vector(16) short int>(_44);
  _7 = VIEW_CONVERT_EXPR<vector(16) short int>(_46);
  _5 = (vector(16) short unsigned int) _7;
  _3 = (vector(16) short unsigned int) _9;
  _35 = VEC_PACK_SAT_EXPR <_5, _3>;
  _36 = (vector(32) char) _35;
  _19 = VIEW_CONVERT_EXPR<vector(16) short int>(_42);
  _18 = VIEW_CONVERT_EXPR<vector(16) short int>(_36);
  _17 = (vector(16) short unsigned int) _18;
  _16 = (vector(16) short unsigned int) _19;
  _14 = VEC_PACK_SAT_EXPR <_17, _16>;
  _12 = (vector(32) char) _14;
  _25 = VIEW_CONVERT_EXPR<vector(16) short int>(_40);
  _24 = VIEW_CONVERT_EXPR<vector(16) short int>(_12);
  _23 = (vector(16) short unsigned int) _24;
  _22 = (vector(16) short unsigned int) _25;
  _21 = VEC_PACK_SAT_EXPR <_23, _22>;
  _20 = (vector(32) char) _21;
  _31 = VIEW_CONVERT_EXPR<vector(16) short int>(_38);
  _30 = VIEW_CONVERT_EXPR<vector(16) short int>(_20);
  _29 = (vector(16) short unsigned int) _30;
  _28 = (vector(16) short unsigned int) _31;
  _27 = VEC_PACK_SAT_EXPR <_29, _28>;
  _26 = (vector(32) char) _27;
  _34 = VIEW_CONVERT_EXPR<vector(8) int>(_26);
  _33 = __builtin_ia32_vextractf128_si256 (_34, 0);
  _32 = __builtin_ia32_vec_ext_v4si (_33, 1); [tail call]
  return _32;
}
After folding _mm256_packus_epi16, gimple still doesn't simplify it. I guess GCC only supports VEC_PACK_SAT_EXPR functionally, but does not optimize it.