Hello, we noticed that there seems to be a missing vectorization for the code below. reduced code: https://godbolt.org/z/YaEPdWM3n int data[100]; void f(int arr1[100], int arr2[100]) { for(int i = 0; i < 100; i++){ arr2[i] = -arr1[i]; int k = arr1[i] + arr2[i]; data[i] = data[i + k] + 1; } } GCC -O3 -fno-vect-cost-model -fwrapv: f(int*, int*): xor eax, eax .L2: mov ecx, DWORD PTR [rdi+rax*4] mov edx, ecx neg edx mov DWORD PTR [rsi+rax*4], edx mov edx, eax sub edx, ecx add edx, DWORD PTR [rdi+rax*4] movsx rdx, edx mov edx, DWORD PTR data[0+rdx*4] add edx, 1 mov DWORD PTR data[0+rax*4], edx add rax, 1 cmp rax, 100 jne .L2 ret Possibly due to omission: _8 = _7 + _26; ==> _8 = i_22 _4 = MEM[(int *)arr1_15(D) + ivtmp.5_21 * 4]; _6 = -_4; MEM[(int *)arr2_16(D) + ivtmp.5_21 * 4] = _6; # DEBUG BEGIN_STMT _7 = MEM[(int *)arr1_15(D) + ivtmp.5_21 * 4]; # DEBUG D#1 => -_4 # DEBUG k => _7 + D#1 # DEBUG BEGIN_STMT _26 = i_22 - _4; _8 = _7 + _26; _9 = data[_8]; Expected code: vectorize the loop body. For example, when assign k=0 directly ('int k = 0;'), gcc: f(int*, int*): lea rdx, [rdi+4] mov eax, OFFSET FLAT:data sub rax, rdx cmp rax, 8 mov rax, rsi seta cl sub rax, rdx cmp rax, 8 seta al test cl, al je .L5 mov rax, rsi sub rax, OFFSET FLAT:data-15 cmp rax, 30 jbe .L5 pcmpeqd xmm1, xmm1 xor eax, eax psrld xmm1, 31 .L3: movdqu xmm3, XMMWORD PTR [rdi+rax] pxor xmm0, xmm0 psubd xmm0, xmm3 movups XMMWORD PTR [rsi+rax], xmm0 movdqa xmm0, XMMWORD PTR data[rax] add rax, 16 paddd xmm0, xmm1 movaps XMMWORD PTR data[rax-16], xmm0 cmp rax, 400 jne .L3 ret .L5: xor eax, eax .L2: mov edx, DWORD PTR [rdi+rax] neg edx mov DWORD PTR [rsi+rax], edx add DWORD PTR data[rax], 1 add rax, 4 cmp rax, 400 jne .L2 ret Thank you very much for your time and effort! We look forward to hearing from you.
As 'k' is arr1[i] + arr2[i] you can at most use vector gather to implement data[i+k] vectorized but then you have t.c:5:17: missed: possible alias involving gather/scatter between *_5 and data[_8] t.c:4:22: missed: bad data dependence. altering the testcase slightly to int data[100]; void f(int *arr1, int * __restrict arr2) { for(int i = 0; i < 100; i++){ arr2[i] = -arr1[i]; int k = arr1[i] + arr2[i]; data[i] = data[i + k] + 1; } } get's you the desired simplification k == 0. Otherwise the store to arr2[i] can alter arr1[i] since arr1 and arr2 can alias.