#include <stdint.h>

void reference_convert(uint8_t * __restrict dest, uint8_t * __restrict src, int n)
{
    for (int i=0; i<n; i++)
    {
        int r = *src++;
        int g = *src++;
        int b = *src++;
        int y = (r*77)+(g*151)+(b*28);
        *dest++ = y/256;
    }
}

$ arm-gcc -march=armv7-a -O3 -ffast-math -fopt-info-vec-omp-optimized-missed

gives the following notes on the loop line:

unsupported data-type int
can't determine vectorization factor.
vector alignment may not be reachable
Aligned load, but unsupported type.
not vectorized: relevant stmt not supported: _1 = *src_31;
bad operation or unsupported loop bound.
vector alignment may not be reachable
no array mode for DI[3]
extract even/odd not supported by target
bad operation or unsupported loop bound.

Vectorization works for x64. https://godbolt.org/z/FPG3k_
Something like -march=armv8-a -mfpu=neon-fp-armv8 does not work either. https://godbolt.org/z/MpBQ0I
On x86_64 we manage to vectorize this with quite abysmal code (for core-avx2) with a vectorization factor of 32:

.L4:
    vmovdqu (%rax), %ymm1
    vmovdqu 64(%rax), %ymm4
    addq $32, %rcx
    addq $96, %rax
    vmovdqu -64(%rax), %ymm5
    vpshufb %ymm14, %ymm1, %ymm0
    vpermq $78, %ymm0, %ymm2
    vpshufb %ymm13, %ymm1, %ymm0
    vpshufb %ymm12, %ymm5, %ymm3
    vpor %ymm2, %ymm0, %ymm0
    vpshufb %ymm11, %ymm4, %ymm2
    vpor %ymm3, %ymm0, %ymm0
    vpermq $78, %ymm2, %ymm3
    vpshufb .LC5(%rip), %ymm4, %ymm2
    vpshufb .LC4(%rip), %ymm0, %ymm0
    vpor %ymm3, %ymm2, %ymm2
    vpshufb .LC6(%rip), %ymm1, %ymm3
    vpermq $78, %ymm3, %ymm15
    vpor %ymm2, %ymm0, %ymm0
    vpshufb .LC7(%rip), %ymm1, %ymm3
    vpshufb .LC8(%rip), %ymm5, %ymm2
    vpor %ymm15, %ymm3, %ymm3
    vpshufb .LC11(%rip), %ymm4, %ymm15
    vpshufb .LC14(%rip), %ymm5, %ymm5
    vpor %ymm2, %ymm3, %ymm3
    vpshufb .LC9(%rip), %ymm4, %ymm2
    vpermq $78, %ymm2, %ymm2
    vpshufb %ymm10, %ymm3, %ymm3
    vpor %ymm2, %ymm15, %ymm2
    vpor %ymm2, %ymm3, %ymm3
    vpshufb .LC12(%rip), %ymm1, %ymm2
    vpshufb .LC13(%rip), %ymm1, %ymm1
    vpermq $78, %ymm2, %ymm2
    vpor %ymm2, %ymm1, %ymm2
    vpshufb .LC15(%rip), %ymm4, %ymm1
    vpshufb .LC16(%rip), %ymm4, %ymm4
    vpermq $78, %ymm1, %ymm1
    vpor %ymm5, %ymm2, %ymm2
    vpor %ymm1, %ymm4, %ymm4
    vpshufb %ymm10, %ymm2, %ymm2
    vpmovzxbw %xmm0, %ymm1
    vpor %ymm4, %ymm2, %ymm2
    vpmovzxbw %xmm3, %ymm4
    vextracti128 $0x1, %ymm0, %xmm0
    vpmullw %ymm7, %ymm4, %ymm4
    vpmullw %ymm8, %ymm1, %ymm1
    vextracti128 $0x1, %ymm3, %xmm3
    vpmovzxbw %xmm0, %ymm0
    vpmovzxbw %xmm3, %ymm3
    vpmullw %ymm8, %ymm0, %ymm0
    vpmullw %ymm7, %ymm3, %ymm3
    vpaddw %ymm4, %ymm1, %ymm1
    vpmovzxbw %xmm2, %ymm4
    vextracti128 $0x1, %ymm2, %xmm2
    vpmovzxbw %xmm2, %ymm2
    vpmullw %ymm6, %ymm4, %ymm4
    vpmullw %ymm6, %ymm2, %ymm2
    vpaddw %ymm3, %ymm0, %ymm0
    vpaddw %ymm4, %ymm1, %ymm1
    vpaddw %ymm2, %ymm0, %ymm0
    vpsrlw $8, %ymm1, %ymm1
    vpsrlw $8, %ymm0, %ymm0
    vpand %ymm1, %ymm9, %ymm1
    vpand %ymm0, %ymm9, %ymm0
    vpackuswb %ymm0, %ymm1, %ymm0
    vpermq $216, %ymm0, %ymm0
    vmovdqu %ymm0, -32(%rcx)
    cmpq %r8, %rcx
    jne .L4

Maybe you can post what you think arm can do better here?
A few NEON instructions are sufficient:
https://web.archive.org/web/20170227190422/http://hilbert-space.de/?p=22

clang seems to generate similar code, see the godbolt links.
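For reference, here is a minimal intrinsics sketch along the lines of the linked article (my own reconstruction, not code from this report; it assumes n is a multiple of 8 and ignores the scalar tail):

#include <arm_neon.h>
#include <stdint.h>

void neon_convert(uint8_t * __restrict dest, uint8_t * __restrict src, int n)
{
    /* per-channel weights broadcast into 8-lane vectors */
    uint8x8_t rfac = vdup_n_u8(77);
    uint8x8_t gfac = vdup_n_u8(151);
    uint8x8_t bfac = vdup_n_u8(28);

    for (int i = 0; i < n; i += 8)
    {
        uint8x8x3_t rgb = vld3_u8(src);                /* de-interleave 8 RGB pixels */
        uint16x8_t  y   = vmull_u8(rgb.val[0], rfac);  /* r*77, widened to u16 */
        y = vmlal_u8(y, rgb.val[1], gfac);             /* + g*151 */
        y = vmlal_u8(y, rgb.val[2], bfac);             /* + b*28  */
        vst1_u8(dest, vshrn_n_u16(y, 8));              /* narrowing shift right = /256 */
        src  += 8 * 3;
        dest += 8;
    }
}

That is roughly the shape the vectoriser should be able to reach: vld3 for the de-interleave, vmull/vmlal for the weighted sum, and a single narrowing shift for the divide.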
On x64 indeed both compilers generate a huge amount of code. https://godbolt.org/z/TH7mqn
I see vectorisation for arm (and aarch64 FWIW):

-O3 -march=armv8-a -mfpu=neon-fp-armv8 -mfloat-abi=hard gives the loop:

.L4:
    mov r3, lr
    add lr, lr, #48
    vld3.8 {d16, d18, d20}, [r3]!
    vld3.8 {d17, d19, d21}, [r3]
    vmull.u8 q12, d16, d30
    vmull.u8 q1, d18, d28
    vmull.u8 q2, d19, d29
    vmull.u8 q11, d17, d31
    vmull.u8 q3, d20, d26
    vadd.i16 q12, q12, q1
    vmull.u8 q10, d21, d27
    vadd.i16 q8, q11, q2
    vadd.i16 q9, q12, q3
    vadd.i16 q8, q8, q10
    vshr.u16 q9, q9, #8
    vshr.u16 q8, q8, #8
    vmovn.i16 d20, q9
    vmovn.i16 d21, q8
    vst1.8 {q10}, [ip]!
    cmp ip, r4
    bne .L4

Though of course it's not as tight as the assembly given in the link.
-mfloat-abi=hard was missing indeed. It's a pity there's no warning, like the one you get when trying to use the intrinsics without it. Still, I see a lot more instructions; maybe that got fixed after v7.2? https://godbolt.org/z/OWzgXi

    vld3.8 {d16, d18, d20}, [r3]
    add ip, r3, #24
    add lr, lr, #1
    add r3, r3, #48
    cmp lr, r5
    vld3.8 {d17, d19, d21}, [ip]
    vmovl.u8 q5, d16
    vmovl.u8 q15, d18
    vmovl.u8 q11, d17
    vmovl.u8 q4, d19
    vmovl.u8 q0, d20
    vmovl.u8 q1, d21
    vmull.s16 q6, d10, d28
    vmull.s16 q3, d22, d28
    vmull.s16 q2, d30, d26
    vmull.s16 q11, d23, d29
    vmull.s16 q15, d31, d27
    vmull.s16 q5, d11, d29
    vmull.s16 q9, d8, d26
    vmull.s16 q8, d9, d27
    vadd.i32 q2, q6, q2
    vadd.i32 q10, q5, q15
    vadd.i32 q9, q3, q9
    vmull.s16 q15, d0, d24
    vadd.i32 q8, q11, q8
    vmull.s16 q3, d2, d24
    vmull.s16 q0, d1, d25
    vmull.s16 q1, d3, d25
    vadd.i32 q11, q2, q15
    vadd.i32 q9, q9, q3
    vadd.i32 q10, q10, q0
    vadd.i32 q8, q8, q1
    vshr.s32 q11, q11, #8
    vshr.s32 q9, q9, #8
    vshr.s32 q10, q10, #8
    vshr.s32 q8, q8, #8
    vmovn.i32 d30, q11
    vmovn.i32 d31, q10
    vmovn.i32 d20, q9
    vmovn.i32 d21, q8
    vmovn.i16 d16, q15
    vmovn.i16 d17, q10
    vst1.8 {q8}, [r4]
I tried current trunk (future GCC 9). GCC 9 learned to avoid the excessive widening during vectorisation that accounts for the large number of instructions you see.
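To illustrate the difference between the two dumps (my sketch, not compiler output): the weighted sum fits in 16 bits, since 255*(77+151+28) = 255*256 is below 65535, so the whole computation can stay at u8*u8 -> u16, whereas the older code first widened every channel to 32 bits:

#include <arm_neon.h>

/* 16-bit path: one vmull.u8 per channel, enough headroom for the sum */
static inline uint16x8_t weigh_16bit(uint8x8_t c, uint8_t w)
{
    return vmull_u8(c, vdup_n_u8(w));
}

/* 32-bit path, roughly what the GCC 7/8 code above does per channel:
   vmovl.u8 to 16 bits, then vmull.s16 to 32 bits */
static inline int32x4_t weigh_32bit_lo(uint8x8_t c, int16_t w)
{
    int16x8_t wide = vreinterpretq_s16_u16(vmovl_u8(c));
    return vmull_s16(vget_low_s16(wide), vdup_n_s16(w));
}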
> vshr.u16 q9, q9, #8
> vshr.u16 q8, q8, #8
> vmovn.i16 d20, q9
> vmovn.i16 d21, q8

Isn't that "just" a missing combine pattern to get us vshrn in both backends?

Ramana
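In intrinsics terms (my sketch of the point above), such a combine would fold the shift-then-narrow pair into the single narrowing shift:

#include <arm_neon.h>

/* what the vectorised loop currently emits: vshr.u16 followed by vmovn.i16 */
static inline uint8x8_t narrow_two_insns(uint16x8_t acc)
{
    return vmovn_u16(vshrq_n_u16(acc, 8));
}

/* the equivalent single instruction: vshrn d, q, #8 */
static inline uint8x8_t narrow_one_insn(uint16x8_t acc)
{
    return vshrn_n_u16(acc, 8);
}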
(In reply to ktkachov from comment #7)
> I tried current trunk (future GCC 9). GCC 9 learned to avoid the excessive
> widening during vectorisation that accounts for the large number of
> instructions you see.

Confirmed, the loop is now as described in comment #5 with trunk gcc, still with vshr+vmovn as mentioned by Ramana.

But by the way, the tail is completely unrolled, 15x the following, which seems quite excessive to me:

    ldrb ip, [r1, #1] @ zero_extendqisi2
    movs r6, #151
    ldrb lr, [r1] @ zero_extendqisi2
    movs r5, #77
    ldrb r7, [r1, #2] @ zero_extendqisi2
    movs r4, #28
    smulbb ip, ip, r6
    smlabb lr, r5, lr, ip
    add ip, r3, #1
    smlabb r7, r4, r7, lr
    cmp ip, r2
    asr r7, r7, #8
    strb r7, [r0]
    bge .L1

assert(n >= 16) helps a bit, but n % 16 == 0 doesn't.
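For completeness, a sketch of how those hints could be expressed (my guess at the shape of the experiment, not code from this report):

#include <assert.h>
#include <stdint.h>

void reference_convert_hinted(uint8_t * __restrict dest, uint8_t * __restrict src, int n)
{
    assert(n >= 16);            /* reportedly trims the unrolled tail a bit */
    /* assert(n % 16 == 0); */  /* reportedly does not help                 */
    for (int i = 0; i < n; i++)
    {
        int r = *src++;
        int g = *src++;
        int b = *src++;
        int y = (r*77)+(g*151)+(b*28);
        *dest++ = y/256;
    }
}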