// Reproducer from a missed-vectorization report.
//
// As reported: in the following example matmul() and matmul2() do not
// vectorize, while the manually unrolled version (matmul3()) does.
//
// Build command used in the report:
//   c++ -std=c++11 -Ofast -S m3x10.cc -march=corei7-avx -fopt-info-vec-all
//
// Compiler used in the report:
//   gcc version 4.9.0 20131011 (experimental) [trunk revision 203426] (GCC)
//
// Source file name in the report: m3x10.cc

// Number of rows of `tmp` and number of entries of `param`.
const int nrow = 3;

// Global operands, each aligned to a 32-byte boundary.
alignas(32) double tmp[nrow][10];  // nrow x 10 matrix
alignas(32) double param[nrow];    // one result slot per row of tmp
alignas(32) double frame[10];      // vector multiplied against each row of tmp

// Adds to each param[j] the dot product of row j of tmp with frame.
// The accumulation is done directly into param[j] inside the inner loop,
// so any value already stored in param[j] is kept and added to.
void matmul() {
  for (int j = 0; j < nrow; ++j)
    for (int i = 0; i < 10; ++i)
      param[j] += tmp[j][i] * frame[i];
}

// Computes the dot product of row j of tmp with frame in a local variable
// and then stores it into param[j].
// Unlike matmul() and matmul3(), this OVERWRITES param[j] instead of
// accumulating into it.
void matmul2() {
  for (int j = 0; j < nrow; ++j) {
    double s = 0;
    for (int i = 0; i < 10; ++i)
      s += tmp[j][i] * frame[i];
    param[j] = s;
  }
}

// matmul3() below spells out rows 0, 1 and 2 by hand; keep it in sync with nrow.
static_assert(nrow == 3, "matmul3() is manually unrolled for exactly 3 rows");

// Manually unrolled form of matmul(): a single loop over the 10 columns that
// updates param[0], param[1] and param[2] in each iteration.
// Accumulates into param, like matmul().
void matmul3() {
  for (int i = 0; i < 10; ++i) {
    param[0] += tmp[0][i] * frame[i];
    param[1] += tmp[1][i] * frame[i];
    param[2] += tmp[2][i] * frame[i];
  }
}

// Returns the dot product of row 0 of tmp with frame. Does not touch param.
double vmul0() {
  double s = 0;
  for (int i = 0; i < 10; ++i)
    s += tmp[0][i] * frame[i];
  return s;
}

// Returns the dot product of row 1 of tmp with frame. Does not touch param.
double vmul1() {
  double s = 0;
  for (int i = 0; i < 10; ++i)
    s += tmp[1][i] * frame[i];
  return s;
}