#include <stdint.h> void pack_high8_baseline(uint8_t *__restrict__ dst, const uint16_t *__restrict__ src, size_t bytes) { uint8_t *end_dst = dst + bytes; do{ *dst++ = *src++ >> 8; } while(dst < end_dst); } https://godbolt.org/g/yoJZ3C gcc -O3 auto-vectorizes to this inner loop: .L5: movdqa (%rdx,%rax,2), %xmm0 movdqa 16(%rdx,%rax,2), %xmm1 psrlw $8, %xmm0 pand %xmm2, %xmm0 # Redundant with the shift psrlw $8, %xmm1 pand %xmm2, %xmm1 # Redundant with the shift packuswb %xmm1, %xmm0 movups %xmm0, (%rcx,%rax) addq $16, %rax cmpq %rsi, %rax jne .L5 This is mostly good, but the PAND instructions are redundant, because psrlw by 8 already leaves the high byte of each 16-bit element zeroed. The same extra AND is present when auto-vectorizing for AVX2 (but not AVX512, where it uses a different strategy.) Other than that, the AVX2 vectorization strategy looks very good (packus ymm, then vpermq to fix the result). If the input is 32B-aligned (or 64B aligned for AVX2), one of the load+shifts can be *replaced* with an AND + unaligned load offset by -1. This avoids a bottleneck on shift throughput (at least with unrolling it does; without unrolling we bottleneck on the front-end except on Ryzen). It's even better with AVX, because load+AND can fold into one instruction. See https://stackoverflow.com/a/46477080/224132 for more details. This C source produces the inner loop that I think should be very good across K10, Bulldozer, Ryzen, Nehalem, Sandybridge, HSW/SKL, Jaguar, Atom, and Silvermont. (With SSE2 or AVX.) i.e. this should be great for tune=generic after reaching a 32B boundary. Not great on Core2 or K8 where non-cache-line-crossing movdqu costs more. 
// take both args as uint8_t* so we can offset by 1 byte to replace a shift with an AND
// if src is 32B-aligned, we never have cache-line splits
//
// Pack the high byte of each little-endian uint16 of src into dst.
// `bytes` counts DESTINATION bytes; each iteration reads 32 source bytes
// and writes 16 dest bytes.  Assumes bytes is a non-zero multiple of 16
// (do/while body always runs at least once; there is no scalar cleanup).
//
// BUGFIX vs. the earlier version: the offset load was taken at src+15
// via 1+(__m128i*)(src-1).  The even-offset bytes of that load are
// src[15], src[17], ..., src[29] -- the high bytes of elements 7..14 --
// so dst[8..15] were off by one element.  Loading at src+1 instead puts
// the high bytes of elements 0..7 at the even offsets, the plain shift
// handles elements 8..15, and every read stays inside the 32 source bytes.
void pack_high8_alignhack(uint8_t *restrict dst, const uint8_t *restrict src, size_t bytes) {
  uint8_t *end_dst = dst + bytes;
  do{
     // Unaligned load offset by +1 byte: the high byte of each uint16
     // lands at an even offset, so an AND with 0x00FF selects it and
     // replaces one of the two shifts.
     __m128i v0 = _mm_and_si128(_mm_loadu_si128((__m128i*)(src+1)),
                                _mm_set1_epi16(0x00FF));           // highs of elements 0..7
     // This load is 16B-aligned whenever src is.
     __m128i v1 = _mm_srli_epi16(_mm_loadu_si128((__m128i*)(src+16)), 8); // highs of elements 8..15
     // Every word is in 0..255, so packus's unsigned saturation is a no-op.
     __m128i pack = _mm_packus_epi16(v0, v1);
     _mm_storeu_si128((__m128i*)dst, pack);
     dst += 16; src += 32;  // 32 source bytes per 16 dest bytes
  } while(dst < end_dst);
}
The vectorizer itself just emits <bb 13> [81.00%]: # ivtmp.30_143 = PHI <0(12), ivtmp.30_144(13)> # ivtmp.35_145 = PHI <0(12), ivtmp.35_146(13)> vect__1.15_75 = MEM[base: vectp_src.14_71, index: ivtmp.35_145, step: 2, offset: 0B]; vect__1.16_77 = MEM[base: vectp_src.14_71, index: ivtmp.35_145, step: 2, offset: 16B]; vect__2.17_78 = vect__1.15_75 >> 8; vect__2.17_79 = vect__1.16_77 >> 8; vect__3.18_80 = VEC_PACK_TRUNC_EXPR <vect__2.17_78, vect__2.17_79>; MEM[base: vectp_dst.20_81, index: ivtmp.35_145, offset: 0B] = vect__3.18_80; ivtmp.30_144 = ivtmp.30_143 + 1; ivtmp.35_146 = ivtmp.35_145 + 16; if (bnd.9_65 > ivtmp.30_144) goto <bb 13>; [83.34%] It seems the ANDs come from VEC_PACK_TRUNC_EXPR expansion somehow. (insn 150 149 151 (set (reg:V8HI 219) (and:V8HI (reg:V8HI 221) (reg:V8HI 214 [ vect__2.17 ]))) "t.c":5 -1 (nil)) (insn 151 150 152 (set (reg:V8HI 220) (and:V8HI (reg:V8HI 221) (reg:V8HI 216 [ vect__2.17 ]))) "t.c":5 -1 (nil)) (insn 152 151 153 (set (reg:V16QI 218 [ vect__3.18 ]) (vec_concat:V16QI (us_truncate:V8QI (reg:V8HI 219)) (us_truncate:V8QI (reg:V8HI 220)))) "t.c":5 -1 (nil)) and combine / simplify-rtx are not able to remove them again (no wonder, simplify-rtx is notoriously unaware of vectors).
With -O3 -fno-move-loop-invariants -fno-gcse we get Trying 38, 34 -> 39: 38: r160:V8HI=[`*.LC0'] REG_EQUAL const_vector 34: r154:V8HI=r153:V8HI 0>>0x8 REG_DEAD r153:V8HI 39: r158:V8HI=r160:V8HI&r154:V8HI REG_DEAD r154:V8HI Failed to match this instruction: (parallel [ (set (reg:V8HI 158) (and:V8HI (lshiftrt:V8HI (reg:V8HI 153 [ vect__1.12 ]) (const_int 8 [0x8])) (const_vector:V8HI [ (const_int 255 [0xff]) repeated x8 ]))) (set (reg:V8HI 160) (mem/u/c:V8HI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S16 A128])) ]) Failed to match this instruction: (parallel [ (set (reg:V8HI 158) (and:V8HI (lshiftrt:V8HI (reg:V8HI 153 [ vect__1.12 ]) (const_int 8 [0x8])) (const_vector:V8HI [ (const_int 255 [0xff]) repeated x8 ]))) (set (reg:V8HI 160) (mem/u/c:V8HI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S16 A128])) ]) Successfully matched this instruction: (set (reg:V8HI 160) (mem/u/c:V8HI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S16 A128])) Failed to match this instruction: (set (reg:V8HI 158) (and:V8HI (lshiftrt:V8HI (reg:V8HI 153 [ vect__1.12 ]) (const_int 8 [0x8])) (const_vector:V8HI [ (const_int 255 [0xff]) repeated x8 ]))) This shows we don't recognize that the AND part is not needed. Also, since we don't emit REG_EQUAL when using vector_csts, we needed to use "-fno-move-loop-invariants -fno-gcse".