[hjl@gnu-skx-1 v64-2]$ cat y.i void rsqrt(char* restrict r, char* restrict a){ for (int i = 0; i < 8; i++){ r[i] += a[i]; } } [hjl@gnu-skx-1 v64-2]$ gcc -S -O2 y.i [hjl@gnu-skx-1 v64-2]$ cat y.s .file "y.i" .text .p2align 4,,15 .globl rsqrt .type rsqrt, @function rsqrt: .LFB0: .cfi_startproc xorl %eax, %eax .p2align 4,,10 .p2align 3 .L2: movzbl (%rsi,%rax), %edx addb %dl, (%rdi,%rax) addq $1, %rax cmpq $8, %rax jne .L2 ret .cfi_endproc .LFE0: .size rsqrt, .-rsqrt .ident "GCC: (GNU) 8.2.1 20190109 (Red Hat 8.2.1-7)" .section .note.GNU-stack,"",@progbits [hjl@gnu-skx-1 v64-2]$
Of course we do not vectorize at -O2. At -O3 the issue is that the target doesn't advertise word_mode as a vector size to use, and the vectorizer doesn't support vectorization using half of a vector. If you apply

Index: gcc/config/i386/i386.c
===================================================================
--- gcc/config/i386/i386.c (revision 268010)
+++ gcc/config/i386/i386.c (working copy)
@@ -50153,6 +50153,11 @@ ix86_autovectorize_vector_sizes (vector_
       sizes->safe_push (32);
       sizes->safe_push (16);
     }
+  else
+    {
+      sizes->safe_push (16);
+      sizes->safe_push (8);
+    }
 }

 /* Implemenation of targetm.vectorize.get_mask_mode. */

you get vectorization using DImode regs:

rsqrt:
.LFB0:
        .cfi_startproc
        movabsq $9187201950435737471, %rdx
        movq (%rdi), %rax
        movq (%rsi), %rsi
        movq %rdx, %rcx
        andq %rax, %rcx
        andq %rsi, %rdx
        xorq %rsi, %rax
        addq %rcx, %rdx
        movabsq $-9187201950435737472, %rcx
        andq %rcx, %rax
        xorq %rdx, %rax
        movq %rax, (%rdi)
        ret

Not exactly what you wanted, I guess ;) Anything else would require vectorizer adjustments.
I am working on a patch to generate: [hjl@gnu-hsw-1 pr89028]$ cat x.i void foo (char* restrict r, char* restrict a){ for (int i = 0; i < 8; i++){ r[i] += a[i]; } } [hjl@gnu-hsw-1 pr89028]$ make x.s /export/build/gnu/tools-build/gcc-mmx/build-x86_64-linux/gcc/xgcc -B/export/build/gnu/tools-build/gcc-mmx/build-x86_64-linux/gcc/ -O3 -S x.i [hjl@gnu-hsw-1 pr89028]$ cat x.s .file "x.i" .text .p2align 4 .globl foo .type foo, @function foo: .LFB0: .cfi_startproc movq (%rdi), %xmm0 movq (%rsi), %xmm1 paddb %xmm1, %xmm0 movq %xmm0, (%rdi) ret .cfi_endproc .LFE0: .size foo, .-foo .ident "GCC: (GNU) 9.0.1 20190124 (experimental)" .section .note.GNU-stack,"",@progbits [hjl@gnu-hsw-1 pr89028]$
Fixed in GCC 10 by r10-1361.
Fixed.