This is the mail archive of the
gcc-bugs@gcc.gnu.org
mailing list for the GCC project.
[Bug libgomp/58482] gomp4: user defined reduction produce wrong result
- From: "vincenzo.innocente at cern dot ch" <gcc-bugzilla at gcc dot gnu dot org>
- To: gcc-bugs at gcc dot gnu dot org
- Date: Fri, 20 Sep 2013 13:19:57 +0000
- Subject: [Bug libgomp/58482] gomp4: user defined reduction produce wrong result
- Auto-submitted: auto-generated
- References: <bug-58482-4 at http dot gcc dot gnu dot org/bugzilla/>
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=58482
--- Comment #2 from vincenzo Innocente <vincenzo.innocente at cern dot ch> ---
Thanks Jakub for the clear answer.
The reduction operator should be strictly commutative!
and I now understand the meaning of
omp declare reduction (I hope)
so I modified it as you can see below;
the results are now OK.
but omp does not vectorize while good old -Ofast does…
shall I prepare a simple example with corresponding assembler and submit?
c++ -std=c++11 ured_omp4.cpp -O2 -ftree-vectorizer-verbose=4 -fopenmp
Analyzing loop at ured_omp4.cpp:39
ured_omp4.cpp:39:22: note: ===== analyze_loop_nest =====
ured_omp4.cpp:39:22: note: === vect_analyze_loop_form ===
ured_omp4.cpp:39:22: note: === get_loop_niters ===
ured_omp4.cpp:39:22: note: ==> get_loop_niters:(unsigned int) NN_15(D)
ured_omp4.cpp:39:22: note: Symbolic number of iterations is (unsigned int)
NN_15(D)
ured_omp4.cpp:39:22: note: === vect_analyze_data_refs ===
ured_omp4.cpp:39:22: note: got vectype for stmt: _20 = *_19;
const vector(4) float
ured_omp4.cpp:39:22: note: got vectype for stmt: _30 = MEM[(struct TwoInt
*)&D.63343][_16].a;
vector(4) float
ured_omp4.cpp:39:22: note: not vectorized: not suitable for gather load _30 =
MEM[(struct TwoInt *)&D.63343][_16].a;
ured_omp4.cpp:39:22: note: bad data references.
ured_omp4.cpp:35:8: note: vectorized 0 loops in function.
ured_omp4.cpp:35:8: note: loop turned into non-loop; it never loops
ured_omp4.cpp:35:8: note: loop turned into non-loop; it never loops
I'll spare you the 4 pages of dump in the case of -Ofast
pb-d-128-141-131-94:vectorize innocent$ c++ -std=c++11 ured_omp4.cpp -O2
-ftree-vectorizer-verbose=1 -fopenmp; ./a.out
ured_omp4.cpp:35:8: note: loop turned into non-loop; it never loops
ured_omp4.cpp:35:8: note: loop turned into non-loop; it never loops
523776,-523776
523776,-523776
pb-d-128-141-131-94:vectorize innocent$ c++ -std=c++11 ured_omp4.cpp -Ofast
-ftree-vectorizer-verbose=1; ./a.out
ured_omp4.cpp:38:3: note: loop vectorized
ured_omp4.cpp:38:3: note: loop peeled for vectorization to enhance alignment
ured_omp4.cpp:38:3: note: loop with 3 iterations completely unrolled
ured_omp4.cpp:35:8: note: loop with 6 iterations completely unrolled
ured_omp4.cpp:46:13: note: loop vectorized
ured_omp4.cpp:45:8: note: loop with 2 iterations completely unrolled
ured_omp4.cpp:38:3: note: loop vectorized
ured_omp4.cpp:63:3: note: loop vectorized
ured_omp4.cpp:63:3: note: loop with 4 iterations completely unrolled
523776,-523776
523776,-523776
cat ured_omp4.cpp
#include<algorithm>
// Scalar type used for the TwoInt fields and for the input arrays.
#define Type float

// Pair of accumulators `a` and `b`, both initialised to zero.
//
// Two different combination rules are provided:
//   - operator+= / add(): a += rh.a, b -= rh.b  (per-element accumulation)
//   - reduce():           a += rh.a, b += rh.b  (merging two partial results)
struct TwoInt {
  Type a=0;
  Type b=0;

  // Accumulates `rh` into this object: adds rh.a to `a` and subtracts rh.b
  // from `b` (the same update as add()). Returns *this so calls can be chained.
  #pragma omp declare simd
  TwoInt & operator+=(TwoInt rh) {
    a+=rh.a;
    b-=rh.b;
    // The return was missing; flowing off the end of a function with a
    // non-void return type is undefined behaviour.
    return *this;
  }

  // Accumulates `rh` into this object: adds rh.a to `a` and subtracts rh.b
  // from `b`. Returns *this.
  #pragma omp declare simd
  TwoInt & add(TwoInt rh) {
    a+=rh.a;
    b-=rh.b;
    return *this;
  }

  // Merges the partial result `rh` into this object by adding both fields
  // (note: `b` is added here, not subtracted as in add()). Returns *this.
  #pragma omp declare simd
  TwoInt & reduce(TwoInt rh) {
    a+=rh.a;
    b+=rh.b;
    return *this;
  }
};
// User-defined OpenMP reduction `foo` for TwoInt: two partial results are
// combined by calling omp_out.reduce(omp_in), i.e. both fields are added.
// No initializer clause is given here.
#pragma omp declare reduction (foo:struct TwoInt: omp_out.reduce(omp_in))
// Accumulates the first NN elements of `q` into a TwoInt using an OpenMP
// simd loop with the user-defined reduction `foo`.
//
// Each element q[i] is placed in both fields of a temporary TwoInt, which is
// then folded into the running total with TwoInt::add.
// The `aligned(q: 16)` clause tells OpenMP that `q` is 16-byte aligned.
TwoInt sum(Type const * q, int NN) {
  TwoInt total;
  #pragma omp simd aligned(q: 16) reduction(foo:total)
  for (int idx = 0; idx < NN; ++idx) {
    const Type value = q[idx];
    TwoInt term;
    term.a = value;
    term.b = value;
    total.add(term);
  }
  return total;
}
// Accumulates the first NN elements of `q` using four independent TwoInt
// accumulators, then merges them.
//
// The input is walked in chunks of four; element j of each chunk is folded
// into accumulator j with TwoInt::add (the last chunk may be shorter).
// The four partial results are merged with TwoInt::reduce in the order
// (0 <- 1), (3 <- 2), (3 <- 0), and accumulator 3 is returned.
TwoInt sum4(Type const * q, int NN) {
  TwoInt lanes[4];
  for (int base = 0; base < NN; base += 4) {
    // Number of valid elements in this chunk.
    const int width = std::min(4, NN - base);
    for (int lane = 0; lane < width; ++lane) {
      const Type value = q[base + lane];
      TwoInt term;
      term.a = value;
      term.b = value;
      lanes[lane].add(term);
    }
  }
  lanes[0].reduce(lanes[1]);
  lanes[3].reduce(lanes[2]);
  lanes[3].reduce(lanes[0]);
  return lanes[3];
}
#include<iostream>
int main() {
constexpr int NN=1024;
Type q[NN];
Type a=0;
for (auto & e: q) e=a++;
auto s = sum(q,NN);
std::cout << s.a << "," << s.b << std::endl;
s = sum4(q,NN);
std::cout << s.a << "," << s.b << std::endl;
return 0;
}