[Bug target/87077] missed optimization for horizontal add for x86 SSE

trashyankes at wp dot pl gcc-bugzilla@gcc.gnu.org
Fri Aug 24 11:43:00 GMT 2018


https://gcc.gnu.org/bugzilla/show_bug.cgi?id=87077

--- Comment #3 from trashyankes at wp dot pl ---
(In reply to Richard Biener from comment #2)
> Can you attach the source please?  These stupid Web 2.0 sites do not allow
> to save it to a file.

Code:

```
#include <pmmintrin.h>
#include <immintrin.h>

// Four packed floats, over-aligned to 32 bytes via alignas.
struct alignas(32) Vx
{
    float x[4];  // vector components
};

// Four Vx elements, over-aligned to 32 bytes. mul()/mulSSE() in this test case
// dot each x[i] with the input vector, i.e. they treat x[i] as row i of a 4x4
// matrix.
struct alignas(32) Mx
{
    Vx x[4];  // x[i] is used as row i
};
// Function attributes shared by both implementations: compile the function for
// the "fma" target and with -ffast-math, so the compiler may reassociate the
// floating-point sums. Kept on a single line: the definition was previously
// split across two lines without a continuation backslash, which truncated the
// macro and left stray tokens at file scope.
#define M_COMMON_ATTR() __attribute__ ((target("fma"), optimize("-ffast-math")))

// Scalar reference implementation of a 4x4 matrix times 4-vector product:
//   res.x[i] = sum over j of mtx.x[i].x[j] * vec.x[j]
// This is the loop nest the report expects GCC to vectorize into the same
// sequence as the hand-written mulSSE.
M_COMMON_ATTR()
Vx mul(const Mx& mtx, const Vx& vec)
{
    Vx res;
    for (int i = 0; i < 4; ++i)
    {
        // Dot product of row i with vec, accumulated in a scalar.
        auto r = 0.0f;
        for (int j = 0; j < 4; ++j)
        {
            r += mtx.x[i].x[j] * vec.x[j];
        }
        res.x[i] = r;
    }
    return res;
}

// Hand-written SSE version of mul: the code the report expects the compiler to
// produce. Computes the four row-by-vector dot products with four packed
// multiplies and three horizontal adds.
M_COMMON_ATTR()
Vx mulSSE(const Mx& mtx, const Vx& vec)
{
    Vx res;
    // _mm_load_ps is an aligned load (needs 16-byte alignment); Vx and Mx are
    // declared alignas(32).
    auto v0 = _mm_load_ps(vec.x);
    auto m0 = _mm_load_ps(mtx.x[0].x);
    auto m1 = _mm_load_ps(mtx.x[1].x);
    auto m2 = _mm_load_ps(mtx.x[2].x);
    auto m3 = _mm_load_ps(mtx.x[3].x);
    // Element-wise products of each row with the vector.
    m0 = _mm_mul_ps(m0, v0);
    m1 = _mm_mul_ps(m1, v0);
    m2 = _mm_mul_ps(m2, v0);
    m3 = _mm_mul_ps(m3, v0);
    // _mm_hadd_ps(a, b) yields {a0+a1, a2+a3, b0+b1, b2+b3}. The first two
    // calls produce pairwise partial sums of rows 0/1 and rows 2/3.
    m0 = _mm_hadd_ps(m0, m1);
    m2 = _mm_hadd_ps(m2, m3);
    // The final hadd completes the sums: m0 = {dot(row0), dot(row1),
    // dot(row2), dot(row3)}.
    m0 = _mm_hadd_ps(m0, m2);
    _mm_store_ps(res.x, m0);  // aligned store into the result
    return res;
}

```

`mul` relies on the GCC optimizer.
`mulSSE` is the hand-written code that I expect GCC to generate for `mul`.

I use `optimize("-ffast-math")` to eliminate the case where the compiler is
forbidden from changing the order of summation (because in floating point
`(a + b) + c != a + (b + c)`).

Similarly, `target("fma")` is needed to enable `_mm_hadd_ps`.


More information about the Gcc-bugs mailing list