Bug 77287

Summary: Much worse code generated compared to clang (stack alignment and spills)
Product: gcc
Component: target
Version: 6.1.0
Target: x86_64-*-*, i?86-*-*
Status: NEW
Severity: normal
Priority: P3
Keywords: missed-optimization, ra
Reporter: Petr <kobalicek.petr>
Assignee: Not yet assigned to anyone <unassigned>
CC: crazylht, hjl.tools, vmakarov, wwwhhhyyy333
Target Milestone: ---
Last reconfirmed: 2016-08-18 00:00:00

Description Petr 2016-08-18 10:35:34 UTC
A simple function (artificial code):

#include <immintrin.h>

int fn(
  const int* px, const int* py,
  const int* pz, const int* pw,
  const int* pa, const int* pb,
  const int* pc, const int* pd) {

  __m256i a0 = _mm256_loadu_si256((__m256i*)px);
  __m256i a1 = _mm256_loadu_si256((__m256i*)py);
  __m256i a2 = _mm256_loadu_si256((__m256i*)pz);
  __m256i a3 = _mm256_loadu_si256((__m256i*)pw);
  __m256i a4 = _mm256_loadu_si256((__m256i*)pa);
  __m256i b0 = _mm256_loadu_si256((__m256i*)pb);
  __m256i b1 = _mm256_loadu_si256((__m256i*)pc);
  __m256i b2 = _mm256_loadu_si256((__m256i*)pd);
  __m256i b3 = _mm256_loadu_si256((__m256i*)pc + 1);
  __m256i b4 = _mm256_loadu_si256((__m256i*)pd + 1);
  
  __m256i x0 = _mm256_packus_epi16(a0, b0);
  __m256i x1 = _mm256_packus_epi16(a1, b1);
  __m256i x2 = _mm256_packus_epi16(a2, b2);
  __m256i x3 = _mm256_packus_epi16(a3, b3);
  __m256i x4 = _mm256_packus_epi16(a4, b4);
  
  x0 = _mm256_add_epi16(x0, a0);
  x1 = _mm256_add_epi16(x1, a1);
  x2 = _mm256_add_epi16(x2, a2);
  x3 = _mm256_add_epi16(x3, a3);
  x4 = _mm256_add_epi16(x4, a4);

  x0 = _mm256_sub_epi16(x0, b0);
  x1 = _mm256_sub_epi16(x1, b1);
  x2 = _mm256_sub_epi16(x2, b2);
  x3 = _mm256_sub_epi16(x3, b3);
  x4 = _mm256_sub_epi16(x4, b4);
  
  x0 = _mm256_packus_epi16(x0, x1);
  x0 = _mm256_packus_epi16(x0, x2);
  x0 = _mm256_packus_epi16(x0, x3);
  x0 = _mm256_packus_epi16(x0, x4);
  return _mm256_extract_epi32(x0, 1);
}

Produces the following asm when compiled by GCC (annotated by me):

  ; GCC 6.1 -O2 -Wall -mavx2 -m32 -fomit-frame-pointer
  lea       ecx, [esp+4]                      ; Return address
  and       esp, -32                          ; Align the stack to 32 bytes
  push      DWORD PTR [ecx-4]                 ; Push return address
  push      ebp                               ; Save frame pointer even though I told GCC not to
  mov       ebp, esp
  push      edi                               ; Save GP regs
  push      esi
  push      ebx
  push      ecx
  sub       esp, 296                          ; Reserve stack for YMM spills
  mov       eax, DWORD PTR [ecx+16]           ; LOAD 'pa'
  mov       esi, DWORD PTR [ecx+4]            ; LOAD 'py'
  mov       edi, DWORD PTR [ecx]              ; LOAD 'px'
  mov       ebx, DWORD PTR [ecx+8]            ; LOAD 'pz'
  mov       edx, DWORD PTR [ecx+12]           ; LOAD 'pw'
  mov       DWORD PTR [ebp-120], eax          ; SPILL 'pa'
  mov       eax, DWORD PTR [ecx+20]           ; LOAD 'pb'
  mov       DWORD PTR [ebp-152], eax          ; SPILL 'pb'
  mov       eax, DWORD PTR [ecx+24]           ; LOAD 'pc'
  vmovdqu   ymm4, YMMWORD PTR [esi]
  mov       ecx, DWORD PTR [ecx+28]           ; LOAD 'pd'
  vmovdqu   ymm7, YMMWORD PTR [edi]
  vmovdqa   YMMWORD PTR [ebp-56], ymm4        ; SPILL VEC
  vmovdqu   ymm4, YMMWORD PTR [ebx]
  mov       ebx, DWORD PTR [ebp-152]          ; LOAD 'pb'
  vmovdqa   YMMWORD PTR [ebp-88], ymm4        ; SPILL VEC
  vmovdqu   ymm4, YMMWORD PTR [edx]
  mov       edx, DWORD PTR [ebp-120]          ; LOAD 'pa'
  vmovdqu   ymm6, YMMWORD PTR [edx]
  vmovdqa   YMMWORD PTR [ebp-120], ymm6       ; SPILL VEC
  vmovdqu   ymm0, YMMWORD PTR [ecx]
  vmovdqu   ymm6, YMMWORD PTR [ebx]
  vmovdqa   ymm5, ymm0                        ; Why move anything when using AVX?
  vmovdqu   ymm0, YMMWORD PTR [eax+32]
  vmovdqu   ymm2, YMMWORD PTR [eax]
  vmovdqa   ymm1, ymm0                        ; Why move anything when using AVX?
  vmovdqu   ymm0, YMMWORD PTR [ecx+32]
  vmovdqa   YMMWORD PTR [ebp-152], ymm2
  vmovdqa   ymm3, ymm0                        ; Why move anything when using AVX?
  vpackuswb ymm0, ymm7, ymm6
  vmovdqa   YMMWORD PTR [ebp-184], ymm5       ; SPILL VEC
  vmovdqa   YMMWORD PTR [ebp-248], ymm3       ; SPILL VEC
  vmovdqa   YMMWORD PTR [ebp-280], ymm0       ; SPILL VEC
  vmovdqa   ymm0, YMMWORD PTR [ebp-56]        ; ALLOC VEC
  vmovdqa   YMMWORD PTR [ebp-216], ymm1       ; SPILL VEC
  vpackuswb ymm2, ymm0, YMMWORD PTR [ebp-152] ; Uses SPILL slot
  vmovdqa   ymm0, YMMWORD PTR [ebp-88]        ; ALLOC VEC
  vpackuswb ymm1, ymm4, YMMWORD PTR [ebp-216] ; Uses SPILL slot
  vpackuswb ymm5, ymm0, YMMWORD PTR [ebp-184] ; Uses SPILL slot
  vmovdqa   ymm0, YMMWORD PTR [ebp-120]       ; ALLOC VEC
  vpaddw    ymm2, ymm2, YMMWORD PTR [ebp-56]  ; Uses SPILL slot
  vpsubw    ymm2, ymm2, YMMWORD PTR [ebp-152] ; Uses SPILL slot
  vpackuswb ymm3, ymm0, YMMWORD PTR [ebp-248] ; Uses SPILL slot
  vpaddw    ymm0, ymm7, YMMWORD PTR [ebp-280] ; Uses SPILL slot
  vpsubw    ymm0, ymm0, ymm6
  vmovdqa   ymm7, YMMWORD PTR [ebp-120]       ; ALLOC VEC
  vpackuswb ymm0, ymm0, ymm2
  vpaddw    ymm2, ymm4, ymm1
  vpsubw    ymm2, ymm2, YMMWORD PTR [ebp-216] ; Uses SPILL slot
  vmovdqa   YMMWORD PTR [ebp-312], ymm3       ; SPILL VEC
  vpaddw    ymm3, ymm5, YMMWORD PTR [ebp-88]  ; Uses SPILL slot
  vpsubw    ymm3, ymm3, YMMWORD PTR [ebp-184] ; Uses SPILL slot
  vpackuswb ymm0, ymm0, ymm3
  vpaddw    ymm1, ymm7, YMMWORD PTR [ebp-312] ; Uses SPILL slot
  vpsubw    ymm1, ymm1, YMMWORD PTR [ebp-248] ; Uses SPILL slot
  vpackuswb ymm0, ymm0, ymm2
  vpackuswb ymm0, ymm0, ymm1
  vpextrd   eax, xmm0, 1                      ; Return value
  vzeroupper
  add       esp, 296
  pop       ecx
  pop       ebx
  pop       esi
  pop       edi
  pop       ebp
  lea       esp, [ecx-4]
  ret

While clang produces just this:

  ; Clang 3.8 -O2 -Wall -mavx2 -m32 -fomit-frame-pointer
  mov       eax, dword ptr [esp + 32]     ; LOAD 'pd'
  mov       ecx, dword ptr [esp + 4]      ; LOAD 'px'
  vmovdqu   ymm0, ymmword ptr [ecx]
  mov       ecx, dword ptr [esp + 8]      ; LOAD 'py'
  vmovdqu   ymm1, ymmword ptr [ecx]
  mov       ecx, dword ptr [esp + 12]     ; LOAD 'pz'
  vmovdqu   ymm2, ymmword ptr [ecx]
  mov       ecx, dword ptr [esp + 16]     ; LOAD 'pw'
  vmovdqu   ymm3, ymmword ptr [ecx]
  mov       ecx, dword ptr [esp + 20]     ; LOAD 'pa'
  vmovdqu   ymm4, ymmword ptr [ecx]
  mov       ecx, dword ptr [esp + 24]     ; LOAD 'pb'
  vmovdqu   ymm5, ymmword ptr [ecx]
  mov       ecx, dword ptr [esp + 28]     ; LOAD 'pc'
  vpackuswb ymm6, ymm0, ymm5
  vpsubw    ymm0, ymm0, ymm5
  vmovdqu   ymm5, ymmword ptr [ecx]
  vpaddw    ymm0, ymm0, ymm6
  vpackuswb ymm6, ymm1, ymm5
  vpsubw    ymm1, ymm1, ymm5
  vmovdqu   ymm5, ymmword ptr [eax]
  vpaddw    ymm1, ymm1, ymm6
  vpackuswb ymm6, ymm2, ymm5
  vpsubw    ymm2, ymm2, ymm5
  vmovdqu   ymm5, ymmword ptr [ecx + 32]
  vpaddw    ymm2, ymm2, ymm6
  vpackuswb ymm6, ymm3, ymm5
  vpsubw    ymm3, ymm3, ymm5
  vmovdqu   ymm5, ymmword ptr [eax + 32]
  vpaddw    ymm3, ymm3, ymm6
  vpackuswb ymm6, ymm4, ymm5
  vpsubw    ymm4, ymm4, ymm5
  vpaddw    ymm4, ymm4, ymm6
  vpackuswb ymm0, ymm0, ymm1
  vpackuswb ymm0, ymm0, ymm2
  vpackuswb ymm0, ymm0, ymm3
  vpackuswb ymm0, ymm0, ymm4
  vpextrd   eax, xmm0, 1                  ; Return value
  vzeroupper
  ret

I have written about this on my blog here:
  https://asmbits.blogspot.com/2016/08/comparing-register-allocator-of-gcc-and.html

Problems summary:

  1. Spilling GPRs is not needed at all in this case.
  2. Spilling YMMs is also questionable, as the operations can be reordered to shorten live ranges (see the clang output, and the reordered C sketch at the end of this description).
  3. The frame pointer is preserved even though I compiled with -fomit-frame-pointer.
  4. Using [ebp-X] instead of [esp+Y] produces longer code when `X > 128 && Y < 128` (a 4-byte displacement instead of a 1-byte one).

You can quickly verify the outputs by pasting the source here: https://gcc.godbolt.org/
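
As an editorial illustration of point 2 above (this variant is mine, not part of the original report, and the name fn_reordered is made up): the same computation can be written so that each (a, b) pair is consumed right after it is loaded, keeping only a few YMM values live at a time - roughly the schedule clang converges on:

#include <immintrin.h>

int fn_reordered(
  const int* px, const int* py,
  const int* pz, const int* pw,
  const int* pa, const int* pb,
  const int* pc, const int* pd) {

  // Same inputs and pairing as fn() above: a0..a4 come from px..pa,
  // b0..b4 from pb, pc, pd, pc + 32 bytes, pd + 32 bytes.
  const __m256i* a_src[5] = {
    (const __m256i*)px, (const __m256i*)py, (const __m256i*)pz,
    (const __m256i*)pw, (const __m256i*)pa };
  const __m256i* b_src[5] = {
    (const __m256i*)pb, (const __m256i*)pc, (const __m256i*)pd,
    (const __m256i*)pc + 1, (const __m256i*)pd + 1 };

  __m256i acc = _mm256_setzero_si256();
  for (int i = 0; i < 5; ++i) {
    __m256i a = _mm256_loadu_si256(a_src[i]);
    __m256i b = _mm256_loadu_si256(b_src[i]);
    __m256i x = _mm256_packus_epi16(a, b);              // x_i = packus(a_i, b_i)
    x = _mm256_add_epi16(x, a);                         // x_i += a_i
    x = _mm256_sub_epi16(x, b);                         // x_i -= b_i
    acc = (i == 0) ? x : _mm256_packus_epi16(acc, x);   // fold into the pack chain
  }
  return _mm256_extract_epi32(acc, 1);
}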
Comment 1 Andrew Pinski 2016-08-18 10:48:56 UTC
Try adding -march=intel or -mtune=intel. The default tuning for GCC is generic, which is a combination of Intel and AMD tuning. AMD tuning prefers not to use GPRs and SIMD registers at the same time (spilling is faster there), and generic tuning accounts for that.
Comment 2 Petr 2016-08-18 11:07:24 UTC
With '-mtune=intel' the push/pop sequence is gone, but the YMM register management remains the same - 24 more memory accesses than clang.
Comment 3 Richard Biener 2016-08-18 12:01:56 UTC
-fschedule-insns improves things here - and LRA seems happier to spill/reload rather than rematerialize. But in the end the testcase requires careful scheduling of the operations to reduce register lifetimes and thus allow optimal RA with the limited number of registers available.

We force a frame pointer because we have to re-align the stack for the possible spills.
Comment 4 Petr 2016-08-20 18:09:55 UTC
Adding -fschedule-insns is definitely a huge improvement in this case. I wonder why it isn't enabled by default at -O2 and -Os, as it really improves things and produces shorter output - or is that just in this particular case?

Here is the assembly produced by gcc with -fschedule-insns:
  push      ebp
  mov       ebp, esp
  and       esp, -32
  lea       esp, [esp-32]
  mov       ecx, DWORD PTR [ebp+8]
  mov       edx, DWORD PTR [ebp+32]
  mov       eax, DWORD PTR [ebp+36]
  vmovdqu   ymm5, YMMWORD PTR [ecx]
  mov       ecx, DWORD PTR [ebp+12]
  vmovdqu   ymm3, YMMWORD PTR [edx]
  vmovdqu   ymm6, YMMWORD PTR [eax]
  vmovdqu   ymm2, YMMWORD PTR [ecx]
  mov       ecx, DWORD PTR [ebp+28]
  vpackuswb ymm7, ymm2, ymm3
  vpaddw    ymm7, ymm7, ymm2
  vpsubw    ymm7, ymm7, ymm3
  vmovdqu   ymm4, YMMWORD PTR [ecx]
  mov       ecx, DWORD PTR [ebp+16]
  vpackuswb ymm0, ymm5, ymm4
  vpaddw    ymm0, ymm0, ymm5
  vpsubw    ymm0, ymm0, ymm4
  vmovdqu   ymm1, YMMWORD PTR [ecx]
  vpackuswb ymm0, ymm0, ymm7
  mov       ecx, DWORD PTR [ebp+20]
  vpackuswb ymm2, ymm1, ymm6
  vmovdqu   ymm4, YMMWORD PTR [edx+32]
  vpaddw    ymm1, ymm2, ymm1
  mov       edx, DWORD PTR [ebp+24]
  vpsubw    ymm1, ymm1, ymm6
  vmovdqu   ymm5, YMMWORD PTR [ecx]
  vpackuswb ymm0, ymm0, ymm1
  vpackuswb ymm3, ymm5, ymm4
  vmovdqa   YMMWORD PTR [esp], ymm3
  vmovdqu   ymm2, YMMWORD PTR [eax+32]    ; LOOK HERE
  vpaddw    ymm5, ymm5, YMMWORD PTR [esp]
  vmovdqu   ymm3, YMMWORD PTR [edx]       ; AND HERE
  vpsubw    ymm4, ymm5, ymm4
  vpackuswb ymm7, ymm3, ymm2
  vpackuswb ymm0, ymm0, ymm4
  vpaddw    ymm3, ymm7, ymm3
  vpsubw    ymm2, ymm3, ymm2
  vpackuswb ymm2, ymm0, ymm2
  vpextrd   eax, xmm2, 1
  vzeroupper
  leave
  ret

This is pretty close to clang already; however, look at this part:

  vmovdqa   YMMWORD PTR [esp], ymm3       ; Spill YMM3
  vmovdqu   ymm2, YMMWORD PTR [eax+32]
  vpaddw    ymm5, ymm5, YMMWORD PTR [esp] ; Mem instead of YMM3?
  vmovdqu   ymm3, YMMWORD PTR [edx]       ; Old YMM3 becomes dead here

The spill is completely unnecessary in our case, and it's the only reason the prologue/epilogue needs code to perform dynamic stack alignment. If this one thing were eliminated, GCC would basically generate code comparable to clang's.

But thanks for the -fschedule-insns hint, I didn't know about it.
Comment 5 Andrew Pinski 2021-08-14 02:56:05 UTC
clang can now produce:
        mov     eax, dword ptr [esp + 16]
        mov     ecx, dword ptr [esp + 28]
        vmovdqu xmm0, xmmword ptr [ecx + 32]
        vmovdqu xmm1, xmmword ptr [eax]
        vpackuswb       xmm2, xmm1, xmm0
        vpsubw  xmm0, xmm1, xmm0
        vpaddw  xmm0, xmm0, xmm2
        vpackuswb       xmm0, xmm0, xmm0
        vpackuswb       xmm0, xmm0, xmm0
        vpextrd eax, xmm0, 1
        ret

I suspect that if the back end were able to "fold" the builtins into gimple at the gimple level, GCC would do a much better job.
Currently we have stuff like:
_27 = __builtin_ia32_vextractf128_si256 (_28, 0);
_26 = __builtin_ia32_vec_ext_v4si (_27, 1); [tail call]

I think both are really just a BIT_FIELD_REF, and they can be simplified further to a single bit-field extraction rather than what we do now:
        vpackuswb       %ymm1, %ymm0, %ymm0
        vpextrd $1, %xmm0, %eax

Plus it looks like, with __builtin_ia32_vextractf128_si256 (_28, 0), clang is able to remove half of the code because only the low 128 bits are needed :).
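
A rough editorial sketch of that point (not from the comment; it uses GCC's __v8si typedef and vector subscripting, and the function names are made up). _mm256_extract_epi32(v, 1) currently expands to a low-128-bit extract plus an element extract, but both together just read bits 32..63 of v - a single BIT_FIELD_REF:

#include <immintrin.h>

// Roughly what the two builtins above correspond to today:
// a low-128-bit extract followed by a 32-bit element extract.
int extract_today(__m256i v) {
  __m128i lo = _mm256_extractf128_si256(v, 0);  // ~ __builtin_ia32_vextractf128_si256 (v, 0)
  return _mm_extract_epi32(lo, 1);              // ~ __builtin_ia32_vec_ext_v4si (lo, 1)
}

// What it boils down to: one read of bits 32..63 of v
// (a single BIT_FIELD_REF at the GIMPLE level).
int extract_folded(__m256i v) {
  return ((__v8si)v)[1];
}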
Comment 6 Petr 2021-08-24 23:21:16 UTC
Yes, the code is not really doing anything useful; I only wrote it to demonstrate the spill problem. Clang actually outsmarted me by removing half of the code :)

I think this issue can be closed; I cannot reproduce this with the newest GCC.
Comment 7 Hongtao.liu 2021-08-25 03:25:44 UTC
(In reply to Andrew Pinski from comment #5)
> I suspect that if the back end were able to "fold" the builtins into gimple
> at the gimple level, GCC would do a much better job.

Yes, let me try this.
Comment 8 Hongtao.liu 2021-08-25 03:30:13 UTC
(In reply to Hongtao.liu from comment #7)
> Yes, let me try this.


Do we have an IR for unsigned/signed saturation at the gimple level?
Comment 9 Andrew Pinski 2021-08-25 03:43:33 UTC
(In reply to Hongtao.liu from comment #8)
> Do we have an IR for unsigned/signed saturation at the gimple level?

Not yet. I was just looking for that today because of PR 51492.
Comment 10 Andrew Pinski 2021-08-25 03:46:53 UTC
(In reply to Andrew Pinski from comment #9)
> Not yet. I was just looking for that today because of PR 51492.

But there is an RFC out for it:
https://gcc.gnu.org/pipermail/gcc/2021-May/236015.html
Comment 11 Hongtao.liu 2021-08-25 04:25:32 UTC
(In reply to Andrew Pinski from comment #10)
> But there is an RFC out for it:
> https://gcc.gnu.org/pipermail/gcc/2021-May/236015.html

Oh, VEC_PACK_SAT_EXPR is exactly what I needed for _mm256_packus_epi16, thanks for the pointer.
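
For reference, here is a scalar editorial sketch (not from the thread; sat_u8 and packus_epi16_256 are made-up names) of the operation _mm256_packus_epi16 / vpackuswb performs - the unsigned-saturating pack that VEC_PACK_SAT_EXPR and the vec_pack_usat optab would have to model, apart from the AVX2 within-lane operand ordering:

#include <stdint.h>

// Signed 16-bit value -> unsigned 8-bit value with unsigned saturation.
uint8_t sat_u8(int16_t v) {
  if (v < 0)   return 0;
  if (v > 255) return 255;
  return (uint8_t)v;
}

// Scalar model of vpackuswb on 256-bit vectors: within each 128-bit lane,
// the first 8 output bytes come from 'a' and the next 8 bytes from 'b'.
void packus_epi16_256(const int16_t a[16], const int16_t b[16],
                      uint8_t out[32]) {
  for (int lane = 0; lane < 2; ++lane)
    for (int i = 0; i < 8; ++i) {
      out[lane * 16 + i]     = sat_u8(a[lane * 8 + i]);
      out[lane * 16 + 8 + i] = sat_u8(b[lane * 8 + i]);
    }
}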
Comment 12 Hongtao.liu 2021-08-25 04:27:04 UTC
(In reply to Hongtao.liu from comment #11)
> Oh, VEC_PACK_SAT_EXPR is exactly what I needed for _mm256_packus_epi16,
> thanks for the pointer.

And ‘vec_pack_ssat_m’ / ‘vec_pack_usat_m’ for the optabs.
Comment 13 Hongtao.liu 2021-08-25 06:22:52 UTC
;; Function fn (fn, funcdef_no=5484, decl_uid=32317, cgraph_uid=5485, symbol_order=5484)

int fn (const int * px, const int * py, const int * pz, const int * pw, const int * pa, const int * pb, const int * pc, const int * pd)
{
  vector(16) short unsigned int _3;
  vector(16) short unsigned int _5;
  vector(16) short int _7;
  vector(16) short int _9;
  vector(32) char _12;
  vector(32) unsigned char _14;
  vector(16) short unsigned int _16;
  vector(16) short unsigned int _17;
  vector(16) short int _18;
  vector(16) short int _19;
  vector(32) char _20;
  vector(32) unsigned char _21;
  vector(16) short unsigned int _22;
  vector(16) short unsigned int _23;
  vector(16) short int _24;
  vector(16) short int _25;
  vector(32) char _26;
  vector(32) unsigned char _27;
  vector(16) short unsigned int _28;
  vector(16) short unsigned int _29;
  vector(16) short int _30;
  vector(16) short int _31;
  int _32;
  vector(4) int _33;
  vector(8) int _34;
  vector(32) unsigned char _35;
  vector(32) char _36;
  vector(16) short unsigned int _37;
  vector(16) short unsigned int _38;
  vector(16) short unsigned int _39;
  vector(16) short unsigned int _40;
  vector(16) short unsigned int _41;
  vector(16) short unsigned int _42;
  vector(16) short unsigned int _43;
  vector(16) short unsigned int _44;
  vector(16) short unsigned int _45;
  vector(16) short unsigned int _46;
  vector(16) short unsigned int _47;
  vector(16) short unsigned int _48;
  vector(16) short unsigned int _50;
  vector(16) short unsigned int _51;
  vector(16) short unsigned int _53;
  vector(16) short unsigned int _54;
  vector(16) short unsigned int _56;
  vector(16) short unsigned int _57;
  vector(16) short unsigned int _59;
  vector(16) short unsigned int _60;
  vector(16) short int _62;
  vector(16) short int _63;
  vector(16) short unsigned int _64;
  vector(16) short unsigned int _65;
  vector(32) unsigned char _66;
  vector(32) char _67;
  vector(16) short int _68;
  vector(16) short int _69;
  vector(16) short unsigned int _70;
  vector(16) short unsigned int _71;
  vector(32) unsigned char _72;
  vector(32) char _73;
  vector(16) short int _74;
  vector(16) short int _75;
  vector(16) short unsigned int _76;
  vector(16) short unsigned int _77;
  vector(32) unsigned char _78;
  vector(32) char _79;
  vector(16) short int _80;
  vector(16) short int _81;
  vector(16) short unsigned int _82;
  vector(16) short unsigned int _83;
  vector(32) unsigned char _84;
  vector(32) char _85;
  vector(16) short int _86;
  vector(16) short int _87;
  vector(16) short unsigned int _88;
  vector(16) short unsigned int _89;
  vector(32) unsigned char _90;
  vector(32) char _91;
  vector(4) long long int _92;
  vector(4) long long int _93;
  vector(4) long long int _94;
  vector(4) long long int _95;
  vector(4) long long int _96;
  vector(4) long long int _97;
  vector(4) long long int _98;
  vector(4) long long int _99;
  vector(4) long long int _100;
  vector(4) long long int _101;
  vector(16) short unsigned int _107;
  vector(16) short unsigned int _108;
  vector(16) short unsigned int _109;
  vector(16) short unsigned int _110;
  vector(16) short unsigned int _111;

  <bb 2> [local count: 1073741824]:
  _101 = MEM[(const __m256i_u * {ref-all})px_2(D)];
  _100 = MEM[(const __m256i_u * {ref-all})py_4(D)];
  _99 = MEM[(const __m256i_u * {ref-all})pz_6(D)];
  _98 = MEM[(const __m256i_u * {ref-all})pw_8(D)];
  _97 = MEM[(const __m256i_u * {ref-all})pa_10(D)];
  _96 = MEM[(const __m256i_u * {ref-all})pb_11(D)];
  _95 = MEM[(const __m256i_u * {ref-all})pc_13(D)];
  _94 = MEM[(const __m256i_u * {ref-all})pd_15(D)];
  _93 = MEM[(const __m256i_u * {ref-all})pc_13(D) + 32B];
  _92 = MEM[(const __m256i_u * {ref-all})pd_15(D) + 32B];
  _86 = VIEW_CONVERT_EXPR<vector(16) short int>(_96);
  _87 = VIEW_CONVERT_EXPR<vector(16) short int>(_101);
  _88 = (vector(16) short unsigned int) _87;
  _89 = (vector(16) short unsigned int) _86;
  _90 = VEC_PACK_SAT_EXPR <_88, _89>;
  _91 = (vector(32) char) _90;
  _80 = VIEW_CONVERT_EXPR<vector(16) short int>(_95);
  _81 = VIEW_CONVERT_EXPR<vector(16) short int>(_100);
  _82 = (vector(16) short unsigned int) _81;
  _83 = (vector(16) short unsigned int) _80;
  _84 = VEC_PACK_SAT_EXPR <_82, _83>;
  _85 = (vector(32) char) _84;
  _74 = VIEW_CONVERT_EXPR<vector(16) short int>(_94);
  _75 = VIEW_CONVERT_EXPR<vector(16) short int>(_99);
  _76 = (vector(16) short unsigned int) _75;
  _77 = (vector(16) short unsigned int) _74;
  _78 = VEC_PACK_SAT_EXPR <_76, _77>;
  _79 = (vector(32) char) _78;
  _68 = VIEW_CONVERT_EXPR<vector(16) short int>(_93);
  _69 = VIEW_CONVERT_EXPR<vector(16) short int>(_98);
  _70 = (vector(16) short unsigned int) _69;
  _71 = (vector(16) short unsigned int) _68;
  _72 = VEC_PACK_SAT_EXPR <_70, _71>;
  _73 = (vector(32) char) _72;
  _62 = VIEW_CONVERT_EXPR<vector(16) short int>(_92);
  _63 = VIEW_CONVERT_EXPR<vector(16) short int>(_97);
  _64 = (vector(16) short unsigned int) _63;
  _65 = (vector(16) short unsigned int) _62;
  _66 = VEC_PACK_SAT_EXPR <_64, _65>;
  _67 = (vector(32) char) _66;
  _59 = VIEW_CONVERT_EXPR<vector(16) short unsigned int>(_91);
  _60 = VIEW_CONVERT_EXPR<vector(16) short unsigned int>(_101);
  _56 = VIEW_CONVERT_EXPR<vector(16) short unsigned int>(_85);
  _57 = VIEW_CONVERT_EXPR<vector(16) short unsigned int>(_100);
  _53 = VIEW_CONVERT_EXPR<vector(16) short unsigned int>(_79);
  _54 = VIEW_CONVERT_EXPR<vector(16) short unsigned int>(_99);
  _50 = VIEW_CONVERT_EXPR<vector(16) short unsigned int>(_73);
  _51 = VIEW_CONVERT_EXPR<vector(16) short unsigned int>(_98);
  _47 = VIEW_CONVERT_EXPR<vector(16) short unsigned int>(_67);
  _48 = VIEW_CONVERT_EXPR<vector(16) short unsigned int>(_97);
  _45 = VIEW_CONVERT_EXPR<vector(16) short unsigned int>(_96);
  _111 = _60 - _45;
  _46 = _59 + _111;
  _43 = VIEW_CONVERT_EXPR<vector(16) short unsigned int>(_95);
  _110 = _57 - _43;
  _44 = _56 + _110;
  _41 = VIEW_CONVERT_EXPR<vector(16) short unsigned int>(_94);
  _109 = _54 - _41;
  _42 = _53 + _109;
  _39 = VIEW_CONVERT_EXPR<vector(16) short unsigned int>(_93);
  _108 = _51 - _39;
  _40 = _50 + _108;
  _37 = VIEW_CONVERT_EXPR<vector(16) short unsigned int>(_92);
  _107 = _48 - _37;
  _38 = _47 + _107;
  _9 = VIEW_CONVERT_EXPR<vector(16) short int>(_44);
  _7 = VIEW_CONVERT_EXPR<vector(16) short int>(_46);
  _5 = (vector(16) short unsigned int) _7;
  _3 = (vector(16) short unsigned int) _9;
  _35 = VEC_PACK_SAT_EXPR <_5, _3>;
  _36 = (vector(32) char) _35;
  _19 = VIEW_CONVERT_EXPR<vector(16) short int>(_42);
  _18 = VIEW_CONVERT_EXPR<vector(16) short int>(_36);
  _17 = (vector(16) short unsigned int) _18;
  _16 = (vector(16) short unsigned int) _19;
  _14 = VEC_PACK_SAT_EXPR <_17, _16>;
  _12 = (vector(32) char) _14;
  _25 = VIEW_CONVERT_EXPR<vector(16) short int>(_40);
  _24 = VIEW_CONVERT_EXPR<vector(16) short int>(_12);
  _23 = (vector(16) short unsigned int) _24;
  _22 = (vector(16) short unsigned int) _25;
  _21 = VEC_PACK_SAT_EXPR <_23, _22>;
  _20 = (vector(32) char) _21;
  _31 = VIEW_CONVERT_EXPR<vector(16) short int>(_38);
  _30 = VIEW_CONVERT_EXPR<vector(16) short int>(_20);
  _29 = (vector(16) short unsigned int) _30;
  _28 = (vector(16) short unsigned int) _31;
  _27 = VEC_PACK_SAT_EXPR <_29, _28>;
  _26 = (vector(32) char) _27;
  _34 = VIEW_CONVERT_EXPR<vector(8) int>(_26);
  _33 = __builtin_ia32_vextractf128_si256 (_34, 0);
  _32 = __builtin_ia32_vec_ext_v4si (_33, 1); [tail call]
  return _32;

}

After folding _mm256_packus_epi16, GIMPLE still doesn't simplify it.
I guess GCC only supports VEC_PACK_SAT_EXPR functionally but does not optimize it.
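
As an editorial footnote on why clang can discard so much of the code (the function names below are mine, and the folded version only illustrates the reduction, not anything GCC emits today): _mm256_extract_epi32(p, 1) reads bytes 4..7 of the low 128-bit lane of p, and vpackuswb fills those bytes from its first operand only, so a pack that only feeds such an extract depends on just a few elements of the low lane of its first input:

#include <immintrin.h>

// The pattern at the end of the testcase: a 256-bit saturating pack
// whose result is only used by a dword-1 extract.
int pack_then_extract(__m256i a, __m256i b) {
  __m256i p = _mm256_packus_epi16(a, b);
  return _mm256_extract_epi32(p, 1);      // bytes 4..7 of the low lane
}

// Equivalent reduction: only the low 128 bits of 'a' are needed;
// 'b' and the upper lane are dead.
int pack_then_extract_folded(__m256i a) {
  __m128i lo = _mm256_castsi256_si128(a);
  __m128i p  = _mm_packus_epi16(lo, lo);
  return _mm_extract_epi32(p, 1);
}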