[Bug target/87077] missed optimization for horizontal add for x86 SSE
trashyankes at wp dot pl
gcc-bugzilla@gcc.gnu.org
Fri Aug 24 11:43:00 GMT 2018
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=87077
--- Comment #3 from trashyankes at wp dot pl ---
(In reply to Richard Biener from comment #2)
> Can you attach the source please? These stupid Web 2.0 sites do not allow
> to save it to a file.
Code:
```
#include <pmmintrin.h>
#include <immintrin.h>
// Four-component float vector, over-aligned to 32 bytes.
struct alignas(32) Vx {
    float x[4];  // components, indexed 0..3
};
// 4x4 float matrix held as four Vx entries, over-aligned to 32 bytes.
struct alignas(32) Mx {
    Vx x[4];  // element (i, j) is accessed as x[i].x[j]
};
// Function attributes shared by both implementations below: build the
// function for the "fma" target and with -ffast-math enabled.
// The definition spans two physical lines, so the trailing backslash is
// required; without it the macro body is cut off after the first line.
#define M_COMMON_ATTR() \
    __attribute__ ((target("fma"), optimize("-ffast-math")))
// Plain scalar matrix * vector product:
//   res.x[i] = sum over j in [0, 4) of mtx.x[i].x[j] * vec.x[j]
// This is the version left to the compiler's optimizer; the report
// compares its generated code against the hand-written mulSSE.
M_COMMON_ATTR()
Vx mul(const Mx& mtx, const Vx& vec)
{
Vx res;
for (int i = 0; i < 4; ++i)
{
// Accumulator for the dot product of entry i of mtx with vec.
auto r = 0.0f;
for (int j = 0; j < 4; ++j)
{
r += mtx.x[i].x[j] * vec.x[j];
}
res.x[i] = r;
}
return res;
}
// Hand-written SSE version of the 4x4 matrix * vector product: four
// element-wise multiplies followed by three horizontal adds.
M_COMMON_ATTR()
Vx mulSSE(const Mx& mtx, const Vx& vec)
{
    const __m128 v = _mm_load_ps(vec.x);

    // Element-wise product of each matrix entry mtx.x[i] with the vector.
    const __m128 prod0 = _mm_mul_ps(_mm_load_ps(mtx.x[0].x), v);
    const __m128 prod1 = _mm_mul_ps(_mm_load_ps(mtx.x[1].x), v);
    const __m128 prod2 = _mm_mul_ps(_mm_load_ps(mtx.x[2].x), v);
    const __m128 prod3 = _mm_mul_ps(_mm_load_ps(mtx.x[3].x), v);

    // First round of horizontal adds: pairwise partial sums of the products.
    const __m128 sum01 = _mm_hadd_ps(prod0, prod1);
    const __m128 sum23 = _mm_hadd_ps(prod2, prod3);

    // Second round combines the partial sums; the result is stored as res.x.
    Vx res;
    _mm_store_ps(res.x, _mm_hadd_ps(sum01, sum23));
    return res;
}
```
`mul` relies on the GCC optimizer.
`mulSSE` uses hand-written code, which is what I expect GCC to generate for `mul`.
I use `optimize("-ffast-math")` to eliminate the case where the compiler is
forbidden to change the order of summation (because `(a + b) + c != a + (b + c)`).
Similarly, `target("fma")` is needed to enable `_mm_hadd_ps`.
More information about the Gcc-bugs
mailing list