For the test case
---
double a[1024];
float b[1024];
int c[1024];

void dependence_distance_4_mixed_0 (void)
{
  int i;
  for (i = 0; i < 1020; ++i)
    a[i + 4] = a[i] + a[i + 4] + c[i];
}
---
compiled with -O3 -ffast-math -mavx, the vect256 branch generates:

.L2:
	vmovapd	a(%rax,%rax), %ymm0
	vcvtdq2pd	c(%rax), %ymm1
	vaddpd	a+32(%rax,%rax), %ymm0, %ymm0
	vaddpd	%ymm1, %ymm0, %ymm0
	vmovapd	%ymm0, a+32(%rax,%rax)
	addq	$16, %rax
	cmpq	$4080, %rax
	jne	.L2

Trunk at revision 165455 generates:

.L2:
	vmovapd	16(%rax), %xmm2
	vaddpd	-16(%rax), %xmm2, %xmm2
	vmovdqa	(%rdx), %xmm0
	addq	$16, %rdx
	vpshufd	$238, %xmm0, %xmm1
	vcvtdq2pd	%xmm0, %xmm0
	vcvtdq2pd	%xmm1, %xmm1
	vaddpd	%xmm1, %xmm2, %xmm1
	vmovapd	(%rax), %xmm2
	vaddpd	-32(%rax), %xmm2, %xmm2
	vmovapd	%xmm1, 16(%rax)
	vaddpd	%xmm0, %xmm2, %xmm0
	vmovapd	%xmm0, (%rax)
	addq	$32, %rax
	cmpq	%rax, %rcx
	jne	.L2
Related to PR46011
Link to vectorizer missed-optimization meta-bug.