Hello GCC team, I wrote a small test case to show the problems with the auto-vectorizer: https://godbolt.org/z/xs35P45MM . In particular, the += operator is not vectorized, while the + operator is vectorized in the same context. I do not understand why. If you reduce the array size in foo from 2 to 1, vectorization no longer works at all - scalar operations are always generated for ARR_2x. In general, my experience is that the auto-vectorizer kicks in much too late. It should always vectorize from 2 values upwards, even if these are much smaller than a SIMD register. This also saves a lot of memory accesses - especially when the data is laid out linearly in memory (as in the example). Usually, however, vectorization is only carried out when the data is at least as large as a SIMD register, and often only when it is twice or even four times as large. I think you should urgently update/optimize the auto-vectorizer. Thanks & regards, Gero
There is an aliasing issue with the += case. I noticed that clang does not auto-vectorize the exe_self_* cases either.
Well, the issue is that we end up with (for the simplest case): <bb 2> [local count: 357878152]: _15 = MEM <const double[2]> [(const value_type &)arg_3(D)][0]; _16 = MEM <const double[2]> [(value_type &)out_2(D)][0]; _17 = _15 + _16; MEM <const double[2]> [(value_type &)out_2(D)][0] = _17; _22 = MEM <const double[2]> [(const value_type &)arg_3(D)][1]; _23 = MEM <const double[2]> [(value_type &)out_2(D)][1]; _24 = _22 + _23; MEM <const double[2]> [(value_type &)out_2(D)][1] = _24; return; and the first store into out[0] can end up writing to arg[1] if out and arg partially overlap, so the load of arg[1] cannot be moved ahead of that store. I don't see what we can easily do here. Path-based disambiguation could maybe argue that partial overlaps of value_type are not allowed.
Compare this with the non-self case, where we see: <bb 2> [local count: 357878152]: _19 = MEM <const double[2]> [(const value_type &)arg1_3(D)][0]; _20 = MEM <const double[2]> [(const value_type &)arg2_4(D)][0]; _21 = _19 + _20; _26 = MEM <const double[2]> [(const value_type &)arg1_3(D)][1]; _27 = MEM <const double[2]> [(const value_type &)arg2_4(D)][1]; _28 = _26 + _27; res ={v} {CLOBBER}; MEM[(struct value_type *)out_2(D)][0].value._M_elems[0] = _21; MEM[(struct value_type *)out_2(D)][0].value._M_elems[1] = _28; return; Here intermediate optimizations have elided 'res', so all four loads happen before the two stores to out.
// testcase:
//
// Reproducer comparing compound assignment (`+=`, the exe_self_* functions)
// with plain addition (`+`, the exe_* functions) on small fixed-size arrays.
// The loop bodies are kept exactly as reported so the compiler sees the same
// code; only the includes and the template spelling were made portable.
#include <array>
#include <cmath>
#include <cstddef>  // std::size_t
#include <cstdint>  // std::int8_t ... std::int64_t

/// Two-element value wrapper with element-wise `+=` and `+`.
///
/// @tparam Type element type stored in the inner std::array.
template <typename Type>
class foo
{
public:
    using array_type = std::array<Type, 2>;

    array_type value;

    /// Adds each element of `arg.value` to the matching element of `value`
    /// in place and returns `*this`. `arg` may refer to `*this`.
    inline constexpr foo& operator+=(const foo& arg) noexcept
    {
        for (std::size_t i = 0; i < value.size(); ++i)
            value[i] += arg.value[i];
        return *this;
    }

    /// Returns a new `foo` whose elements are the element-wise sums of
    /// `value` and `arg.value`; neither operand is modified.
    inline constexpr foo operator+(const foo& arg) const noexcept
    {
        // Left default-initialized as in the original report: every element
        // is overwritten by the loop below before `res` is returned.
        foo res;
        for (std::size_t i = 0; i < res.value.size(); ++i)
            res.value[i] = value[i] + arg.value[i];
        return res;
    }
};

// operator-calls

/// Applies `out[i] += arg[i]` for every index of `out`.
///
/// Written with explicit template parameters (equivalent to the abbreviated
/// `auto` form) so the test case also compiles before C++20.
template <typename Out, typename Arg>
inline constexpr void exe_self(Out& out, const Arg& arg) noexcept
{
    for (std::size_t i = 0; i < out.size(); ++i)
        out[i] += arg[i];
}

/// Applies `out[i] = arg1[i] + arg2[i]` for every index of `out`.
template <typename Out, typename Arg1, typename Arg2>
inline constexpr void exe(Out& out, const Arg1& arg1, const Arg2& arg2) noexcept
{
    for (std::size_t i = 0; i < out.size(); ++i)
        out[i] = arg1[i] + arg2[i];
}

// test-cases
//
// One non-inline entry point per (element type, outer array length) pair so
// the generated code of each instantiation can be inspected separately.

// float64
using ARR_1D = std::array<foo<double>, 1>;
void exe_self_1d(ARR_1D& out, const ARR_1D& arg) noexcept { exe_self(out, arg); }
void exe_1d(ARR_1D& out, const ARR_1D& arg1, const ARR_1D& arg2) noexcept { exe(out, arg1, arg2); }

using ARR_2D = std::array<foo<double>, 2>;
void exe_self_2d(ARR_2D& out, const ARR_2D& arg) noexcept { exe_self(out, arg); }
void exe_2d(ARR_2D& out, const ARR_2D& arg1, const ARR_2D& arg2) noexcept { exe(out, arg1, arg2); }

using ARR_4D = std::array<foo<double>, 4>;
void exe_self_4d(ARR_4D& out, const ARR_4D& arg) noexcept { exe_self(out, arg); }
void exe_4d(ARR_4D& out, const ARR_4D& arg1, const ARR_4D& arg2) noexcept { exe(out, arg1, arg2); }

// float32
using ARR_1F = std::array<foo<float>, 1>;
void exe_self_1f(ARR_1F& out, const ARR_1F& arg) noexcept { exe_self(out, arg); }
void exe_1f(ARR_1F& out, const ARR_1F& arg1, const ARR_1F& arg2) noexcept { exe(out, arg1, arg2); }

using ARR_2F = std::array<foo<float>, 2>;
void exe_self_2f(ARR_2F& out, const ARR_2F& arg) noexcept { exe_self(out, arg); }
void exe_2f(ARR_2F& out, const ARR_2F& arg1, const ARR_2F& arg2) noexcept { exe(out, arg1, arg2); }

using ARR_4F = std::array<foo<float>, 4>;
void exe_self_4f(ARR_4F& out, const ARR_4F& arg) noexcept { exe_self(out, arg); }
void exe_4f(ARR_4F& out, const ARR_4F& arg1, const ARR_4F& arg2) noexcept { exe(out, arg1, arg2); }

// int64
using ARR_1i64 = std::array<foo<std::int64_t>, 1>;
void exe_self_1i64(ARR_1i64& out, const ARR_1i64& arg) noexcept { exe_self(out, arg); }
void exe_1i64(ARR_1i64& out, const ARR_1i64& arg1, const ARR_1i64& arg2) noexcept { exe(out, arg1, arg2); }

using ARR_2i64 = std::array<foo<std::int64_t>, 2>;
void exe_self_2i64(ARR_2i64& out, const ARR_2i64& arg) noexcept { exe_self(out, arg); }
void exe_2i64(ARR_2i64& out, const ARR_2i64& arg1, const ARR_2i64& arg2) noexcept { exe(out, arg1, arg2); }

using ARR_4i64 = std::array<foo<std::int64_t>, 4>;
void exe_self_4i64(ARR_4i64& out, const ARR_4i64& arg) noexcept { exe_self(out, arg); }
void exe_4i64(ARR_4i64& out, const ARR_4i64& arg1, const ARR_4i64& arg2) noexcept { exe(out, arg1, arg2); }

// int32
using ARR_1i32 = std::array<foo<std::int32_t>, 1>;
void exe_self_1i32(ARR_1i32& out, const ARR_1i32& arg) noexcept { exe_self(out, arg); }
void exe_1i32(ARR_1i32& out, const ARR_1i32& arg1, const ARR_1i32& arg2) noexcept { exe(out, arg1, arg2); }

using ARR_2i32 = std::array<foo<std::int32_t>, 2>;
void exe_self_2i32(ARR_2i32& out, const ARR_2i32& arg) noexcept { exe_self(out, arg); }
void exe_2i32(ARR_2i32& out, const ARR_2i32& arg1, const ARR_2i32& arg2) noexcept { exe(out, arg1, arg2); }

using ARR_4i32 = std::array<foo<std::int32_t>, 4>;
void exe_self_4i32(ARR_4i32& out, const ARR_4i32& arg) noexcept { exe_self(out, arg); }
void exe_4i32(ARR_4i32& out, const ARR_4i32& arg1, const ARR_4i32& arg2) noexcept { exe(out, arg1, arg2); }

// int16
using ARR_1i16 = std::array<foo<std::int16_t>, 1>;
void exe_self_1i16(ARR_1i16& out, const ARR_1i16& arg) noexcept { exe_self(out, arg); }
void exe_1i16(ARR_1i16& out, const ARR_1i16& arg1, const ARR_1i16& arg2) noexcept { exe(out, arg1, arg2); }

using ARR_2i16 = std::array<foo<std::int16_t>, 2>;
void exe_self_2i16(ARR_2i16& out, const ARR_2i16& arg) noexcept { exe_self(out, arg); }
void exe_2i16(ARR_2i16& out, const ARR_2i16& arg1, const ARR_2i16& arg2) noexcept { exe(out, arg1, arg2); }

using ARR_4i16 = std::array<foo<std::int16_t>, 4>;
void exe_self_4i16(ARR_4i16& out, const ARR_4i16& arg) noexcept { exe_self(out, arg); }
void exe_4i16(ARR_4i16& out, const ARR_4i16& arg1, const ARR_4i16& arg2) noexcept { exe(out, arg1, arg2); }

// int8
using ARR_1i8 = std::array<foo<std::int8_t>, 1>;
void exe_self_1i8(ARR_1i8& out, const ARR_1i8& arg) noexcept { exe_self(out, arg); }
void exe_1i8(ARR_1i8& out, const ARR_1i8& arg1, const ARR_1i8& arg2) noexcept { exe(out, arg1, arg2); }

using ARR_2i8 = std::array<foo<std::int8_t>, 2>;
void exe_self_2i8(ARR_2i8& out, const ARR_2i8& arg) noexcept { exe_self(out, arg); }
void exe_2i8(ARR_2i8& out, const ARR_2i8& arg1, const ARR_2i8& arg2) noexcept { exe(out, arg1, arg2); }

using ARR_4i8 = std::array<foo<std::int8_t>, 4>;
void exe_self_4i8(ARR_4i8& out, const ARR_4i8& arg) noexcept { exe_self(out, arg); }
void exe_4i8(ARR_4i8& out, const ARR_4i8& arg1, const ARR_4i8& arg2) noexcept { exe(out, arg1, arg2); }