Best regards,
Bernard
(*)
// g++ -o vect_acc vect_acc.cxx -lstdc++ -std=c++0x
//     -ftree-vectorizer-verbose=99 -O4 -march=native
#include <stdlib.h> // atol, to avoid a dependence on boost::lexical_cast<>
#include <iterator>
#include <iostream>
#include <numeric>
/// Sums the range [b, e) onto `init` using several independent partial
/// accumulators, so that the inner loop is a candidate for auto-vectorization.
///
/// @tparam vect_size  Size in bytes of the accumulator block; the number of
///                    partial accumulators is `vect_size / sizeof(T)`. Must be
///                    at least `sizeof(T)`.
/// @tparam In         Random-access iterator type (the code uses `e - b`,
///                    `b += k` and `b[i]`).
/// @tparam T          Accumulator type; also the return type.
/// @param b     Start of the range.
/// @param e     One past the end of the range.
/// @param init  Initial value of the sum.
/// @return `init` plus every element of [b, e).
///
/// Note: elements are not added in the same order as std::accumulate does, so
/// for floating-point `T` the result may differ by rounding.
template<int vect_size, typename In, typename T>
T acc(In b, In e, T init){
    // A block smaller than one T would give zero accumulators: a zero-length
    // array and a modulo by zero below. Reject that at compile time.
    static_assert(vect_size > 0
                  && static_cast<std::size_t>(vect_size) >= sizeof(T),
                  "vect_size must be at least sizeof(T)");

    std::size_t const vect_elts = static_cast<std::size_t>(vect_size) / sizeof(T);
    std::size_t const n = static_cast<std::size_t>(e - b);
    std::size_t const r = n % vect_elts;

    // Partial accumulators, value-initialized (zero for arithmetic T).
    T vect[vect_elts] = {};

    // Leading remainder: consume the n % vect_elts elements that do not fill
    // a whole block, so the main loop only sees complete blocks.
    // Should also be unrolled at compile time, and should also handle
    // alignment issues, so the r values should be taken from the beginning
    // and end of the sequence accordingly.
    for(std::size_t i(0); i != r; ++i, ++b){ init += *b; }

    // Main loop: the remaining length is a multiple of vect_elts, so `b`
    // lands exactly on `e`.
    for(; b != e; b += vect_elts){
        // should be vectorized
        for(std::size_t i(0); i != vect_elts; ++i){ vect[i] += b[i]; }
    }

    // Horizontal reduction of the partial sums;
    // should be unrolled at compile time.
    for(std::size_t i(0); i != vect_elts; ++i){ init += vect[i]; }
    return init;
}
int main(int argc, char* argv[]){
typedef int data_t;
long n(atol(argv[1])), loops(argc > 2 ? atol(argv[2]):1);
data_t const* const d= new data_t[n];
volatile data_t res;// volatile in case optimizer would want to be
to agressive
if(argc > 3){
for(long i(0); i!= loops; ++i)
{ res= acc<16*8>(d, d+n, 0.); }
}else{
for(long i(0); i!= loops; ++i)
{ res= std::accumulate(d, d+n, 0.); }
}
std::cout<< res <<std::endl;
return 0;
}