typedef float real_t;

#define iterations 100000
#define LEN_1D 32000
#define LEN_2D 256

void dummy(void);

real_t a[LEN_1D], b[LEN_1D], aa[LEN_2D][LEN_2D];

void main()
{
  // linear dependence testing
  // no dependence - vectorizable
  for (int nl = 0; nl < 2*iterations; nl++) {
      for (int i = 1; i < LEN_1D; i += 2) {
          a[i] = a[i - 1] + b[i];
      }
      dummy();
  }
}

This takes 0.73s with -march=native -Ofast -mprefer-avx128 and 0.81s with plain -march=native -Ofast.

The 128-bit version is:

main:
.LFB0:
        .cfi_startproc
        pushq %rbx
        .cfi_def_cfa_offset 16
        .cfi_offset 3, -16
        movl $200000, %ebx
.L2:
        xorl %eax, %eax
        .p2align 4
        .p2align 3
.L4:
        vmovaps a(%rax), %xmm2
        vmovups b+4(%rax), %xmm3
        addq $32, %rax
        vshufps $136, a-16(%rax), %xmm2, %xmm0
        vshufps $136, b-12(%rax), %xmm3, %xmm1
        vaddps %xmm1, %xmm0, %xmm0
        vmovss %xmm0, a-28(%rax)
        vextractps $1, %xmm0, a-20(%rax)
        vextractps $2, %xmm0, a-12(%rax)
        vextractps $3, %xmm0, a-4(%rax)
        cmpq $127968, %rax
        jne .L4
        vmovss b+127972(%rip), %xmm0
        xorl %eax, %eax
        vaddss a+127968(%rip), %xmm0, %xmm0
        vmovss %xmm0, a+127972(%rip)
        vmovss a+127976(%rip), %xmm0
        vaddss b+127980(%rip), %xmm0, %xmm0
        vmovss %xmm0, a+127980(%rip)
        vmovss a+127984(%rip), %xmm0
        vaddss b+127988(%rip), %xmm0, %xmm0
        vmovss %xmm0, a+127988(%rip)
        vmovss a+127992(%rip), %xmm0
        vaddss b+127996(%rip), %xmm0, %xmm0
        vmovss %xmm0, a+127996(%rip)
        call dummy

The 256-bit version is:

main:
.LFB0:
        .cfi_startproc
        pushq %rbp
        .cfi_def_cfa_offset 16
        .cfi_offset 6, -16
        movq %rsp, %rbp
        .cfi_def_cfa_register 6
        pushq %rbx
        .cfi_offset 3, -24
        movl $200000, %ebx
        andq $-32, %rsp
        .p2align 4
        .p2align 3
.L2:
        xorl %eax, %eax
        .p2align 4
        .p2align 3
.L4:
        vmovaps a(%rax), %ymm4
        vmovups b+4(%rax), %ymm5
        addq $64, %rax
        vshufps $136, a-32(%rax), %ymm4, %ymm1
        vperm2f128 $3, %ymm1, %ymm1, %ymm2
        vshufps $68, %ymm2, %ymm1, %ymm0
        vshufps $238, %ymm2, %ymm1, %ymm2
        vshufps $136, b-28(%rax), %ymm5, %ymm1
        vinsertf128 $1, %xmm2, %ymm0, %ymm0
        vperm2f128 $3, %ymm1, %ymm1, %ymm2
        vshufps $68, %ymm2, %ymm1, %ymm3
        vshufps $238, %ymm2, %ymm1, %ymm2
        vinsertf128 $1, %xmm2, %ymm3, %ymm1
        vaddps %ymm1, %ymm0, %ymm0
        vmovss %xmm0, a-60(%rax)
        vextractps $1, %xmm0, a-52(%rax)
        vextractps $2, %xmm0, a-44(%rax)
        vextractps $3, %xmm0, a-36(%rax)
        vextractf128 $0x1, %ymm0, %xmm0
        vmovss %xmm0, a-28(%rax)
        vextractps $1, %xmm0, a-20(%rax)
        vextractps $2, %xmm0, a-12(%rax)
        vextractps $3, %xmm0, a-4(%rax)
        cmpq $127936, %rax
        jne .L4
        vmovaps a+127936(%rip), %xmm6
        vmovups b+127940(%rip), %xmm7
        xorl %eax, %eax
        vshufps $136, a+127952(%rip), %xmm6, %xmm0
        vshufps $136, b+127956(%rip), %xmm7, %xmm1
        vaddps %xmm1, %xmm0, %xmm0
        vmovss %xmm0, a+127940(%rip)
        vextractps $1, %xmm0, a+127948(%rip)
        vextractps $2, %xmm0, a+127956(%rip)
        vextractps $3, %xmm0, a+127964(%rip)
        vmovss b+127972(%rip), %xmm0
        vaddss a+127968(%rip), %xmm0, %xmm0
        vmovss %xmm0, a+127972(%rip)
        vmovss b+127980(%rip), %xmm0
        vaddss a+127976(%rip), %xmm0, %xmm0
        vmovss %xmm0, a+127980(%rip)
        vmovss b+127988(%rip), %xmm0
        vaddss a+127984(%rip), %xmm0, %xmm0
        vmovss %xmm0, a+127988(%rip)
        vmovss a+127992(%rip), %xmm0
        vaddss b+127996(%rip), %xmm0, %xmm0
        vmovss %xmm0, a+127996(%rip)
        vzeroupper
        call dummy
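Most of the shuffles in the 256-bit loop are spent reconstructing an "extract even elements" operation that AVX does not provide as a single instruction. A hand-written sketch of that primitive for reference (not from the report; even_128 and even_256 are made-up names):

#include <immintrin.h>

/* SSE: evens of {x0..x3} and {y0..y3} -> {x0, x2, y0, y2}: one vshufps.  */
static __m128 even_128 (__m128 x, __m128 y)
{
  return _mm_shuffle_ps (x, y, _MM_SHUFFLE (2, 0, 2, 0));   /* imm 136 */
}

/* AVX: evens of {x0..x7} and {y0..y7} -> {x0,x2,x4,x6, y0,y2,y4,y6}.
   vshufps only shuffles within each 128-bit lane, so a lane swap, two
   more shuffles and an insert are needed to restore element order --
   the vperm2f128/vshufps/vinsertf128 sequence in the loop above.  */
static __m256 even_256 (__m256 x, __m256 y)
{
  __m256 t    = _mm256_shuffle_ps (x, y, _MM_SHUFFLE (2, 0, 2, 0));
  __m256 swap = _mm256_permute2f128_ps (t, t, 0x01);
  __m256 lo   = _mm256_shuffle_ps (t, swap, _MM_SHUFFLE (1, 0, 1, 0));
  __m256 hi   = _mm256_shuffle_ps (t, swap, _MM_SHUFFLE (3, 2, 3, 2));
  return _mm256_insertf128_ps (lo, _mm256_castps256_ps128 (hi), 1);
}

That is one instruction with SSE against five with AVX, two of them lane-crossing, per extracted vector.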
We're generating quite inefficient vectorized code here, and the lack of cross-lane interleaving permutes hurts AVX vectorization: there is no single-instruction extract-even / extract-odd available, and we do not factor this in when costing. Surprisingly, not vectorizing at all is still slower (1.4s vs. 1.35s with AVX for me), but -funroll-loops without vectorizing is comparable to vectorizing with SSE, and unrolling on top of SSE vectorization doesn't help. In the end what we miss (apart from the bad use of interleaving) is the opportunity to use masked stores (and loads), which would halve the number of usable lanes but likely still provide a speedup over the scalar unrolled code.
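To make the masked-store idea concrete, here is a minimal hand-written sketch (not compiler output; masked_variant and odd are made-up names), reusing a[], b[] and LEN_1D from the reproducer above. Each iteration covers 8 consecutive elements with only the 4 odd lanes active, but it needs no cross-lane permutes and no scalar element stores:

#include <immintrin.h>

/* Only lanes 1, 3, 5 and 7 (the odd indices of each block) get stored.  */
static void masked_variant (void)
{
  const __m256i odd = _mm256_set_epi32 (-1, 0, -1, 0, -1, 0, -1, 0);

  for (int i = 0; i < LEN_1D; i += 8)      /* LEN_1D is a multiple of 8 */
    {
      __m256 va = _mm256_loadu_ps (&a[i]);   /* a[i] .. a[i+7]                */
      __m256 vb = _mm256_loadu_ps (&b[i]);   /* b[i] .. b[i+7]                */
      __m256 vp = _mm256_moveldup_ps (va);   /* odd lane k now holds a[i+k-1] */
      __m256 vs = _mm256_add_ps (vp, vb);    /* odd lane k: a[i+k-1] + b[i+k] */
      _mm256_maskstore_ps (&a[i], odd, vs);  /* write the odd lanes only      */
    }
}

The masked loads mentioned above would be the analogous way of fetching the a[i-1] values directly; the plain load plus vmovsldup here is just the simplest formulation. Whether either variant actually beats the unrolled scalar code is of course the conjecture stated above.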