[Bug target/85048] New: [missed optimization] vector conversions
kretz at kde dot org
gcc-bugzilla@gcc.gnu.org
Fri Mar 23 11:25:00 GMT 2018
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=85048
Bug ID: 85048
Summary: [missed optimization] vector conversions
Product: gcc
Version: 8.0.1
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: target
Assignee: unassigned at gcc dot gnu.org
Reporter: kretz at kde dot org
Target Milestone: ---
The following testcase lists all integer and/or float conversions applied to
vector builtins of the same number of elements. All of those functions can be
compiled to a single instruction (the function's name plus `ret`) when
`-march=skylake-avx512` is active. As far as I can see, many of the conversion
instructions in the SSE and AVX ISA extensions are likewise not used by the compiler.
I would expect this code to compile to optimal conversion sequences even on -O2
(and lower) since the conversion is applied directly on vector builtins. If
this is not in scope, I'd like to open a feature request for something like
clang's __builtin_convertvector (could even be exposed via static_cast) that
produces optimal conversion instruction sequences on vector builtins without
the auto-vectorizer.
#include <cstdint>
// Fixed-width SIMD vector alias: V<T, N> is a GCC vector of N lanes of T.
// Size is the total byte width (N * sizeof(T)) fed to the vector_size
// attribute; the attribute must appear on the alias-declaration itself.
template <class T, int N, int Size = N * sizeof(T)>
using V [[gnu::vector_size(Size)]] = T;
template <class From, class To> V<To, 2> cvt2(V<From, 2> x) {
  // Element-wise conversion of a 2-lane vector to the destination type.
  V<To, 2> out{};
  for (int i = 0; i < 2; ++i)
    out[i] = static_cast<To>(x[i]);
  return out;
}
template <class From, class To> V<To, 4> cvt4(V<From, 4> x) {
  // Element-wise conversion of a 4-lane vector to the destination type.
  V<To, 4> out{};
  for (int i = 0; i < 4; ++i)
    out[i] = static_cast<To>(x[i]);
  return out;
}
template <class From, class To> V<To, 8> cvt8(V<From, 8> x) {
  // Element-wise conversion of an 8-lane vector to the destination type.
  V<To, 8> out{};
  for (int i = 0; i < 8; ++i)
    out[i] = static_cast<To>(x[i]);
  return out;
}
template <class From, class To> V<To, 16> cvt16(V<From, 16> x) {
  // Element-wise conversion of a 16-lane vector to the destination type.
  V<To, 16> out{};
  for (int i = 0; i < 16; ++i)
    out[i] = static_cast<To>(x[i]);
  return out;
}
template <class From, class To> V<To, 32> cvt32(V<From, 32> x) {
  // Element-wise conversion of a 32-lane vector to the destination type.
  V<To, 32> out{};
  for (int i = 0; i < 32; ++i)
    out[i] = static_cast<To>(x[i]);
  return out;
}
template <class From, class To> V<To, 64> cvt64(V<From, 64> x) {
  // Element-wise conversion of a 64-lane vector to the destination type.
  V<To, 64> out{};
  for (int i = 0; i < 64; ++i)
    out[i] = static_cast<To>(x[i]);
  return out;
}
// Instantiates the size-matching cvt template for a from->to conversion.
// `name` is the single x86 instruction the resulting function is expected
// to compile to; a trailing underscore distinguishes the variant whose
// destination signedness differs (same instruction, different C++ type).
#define _(name, from, to, size) \
auto name(V<from, size> x) { return cvt##size<from, to>(x); }
// Exhaustive instantiation list: every integer/float element conversion
// between vectors of equal lane count. Each function is named after the
// AVX-512 (or SSE/AVX, for the names without the leading `v`) instruction
// the reporter expects it to compile down to under -march=skylake-avx512.
// integral -> integral; truncation
_(vpmovqd , uint64_t, uint32_t, 2)
_(vpmovqd , uint64_t, uint32_t, 4)
_(vpmovqd , uint64_t, uint32_t, 8)
_(vpmovqd , int64_t, uint32_t, 2)
_(vpmovqd , int64_t, uint32_t, 4)
_(vpmovqd , int64_t, uint32_t, 8)
_(vpmovqd_, uint64_t, int32_t, 2)
_(vpmovqd_, uint64_t, int32_t, 4)
_(vpmovqd_, uint64_t, int32_t, 8)
_(vpmovqd_, int64_t, int32_t, 2)
_(vpmovqd_, int64_t, int32_t, 4)
_(vpmovqd_, int64_t, int32_t, 8)
_(vpmovqw , uint64_t, uint16_t, 2)
_(vpmovqw , uint64_t, uint16_t, 4)
_(vpmovqw , uint64_t, uint16_t, 8)
_(vpmovqw , int64_t, uint16_t, 2)
_(vpmovqw , int64_t, uint16_t, 4)
_(vpmovqw , int64_t, uint16_t, 8)
_(vpmovqw_, uint64_t, int16_t, 2)
_(vpmovqw_, uint64_t, int16_t, 4)
_(vpmovqw_, uint64_t, int16_t, 8)
_(vpmovqw_, int64_t, int16_t, 2)
_(vpmovqw_, int64_t, int16_t, 4)
_(vpmovqw_, int64_t, int16_t, 8)
_(vpmovqb , uint64_t, uint8_t, 2)
_(vpmovqb , uint64_t, uint8_t, 4)
_(vpmovqb , uint64_t, uint8_t, 8)
_(vpmovqb , int64_t, uint8_t, 2)
_(vpmovqb , int64_t, uint8_t, 4)
_(vpmovqb , int64_t, uint8_t, 8)
_(vpmovqb_, uint64_t, int8_t, 2)
_(vpmovqb_, uint64_t, int8_t, 4)
_(vpmovqb_, uint64_t, int8_t, 8)
_(vpmovqb_, int64_t, int8_t, 2)
_(vpmovqb_, int64_t, int8_t, 4)
_(vpmovqb_, int64_t, int8_t, 8)
_(vpmovdw , uint32_t, uint16_t, 4)
_(vpmovdw , uint32_t, uint16_t, 8)
_(vpmovdw , uint32_t, uint16_t, 16)
_(vpmovdw , int32_t, uint16_t, 4)
_(vpmovdw , int32_t, uint16_t, 8)
_(vpmovdw , int32_t, uint16_t, 16)
_(vpmovdw_, uint32_t, int16_t, 4)
_(vpmovdw_, uint32_t, int16_t, 8)
_(vpmovdw_, uint32_t, int16_t, 16)
_(vpmovdw_, int32_t, int16_t, 4)
_(vpmovdw_, int32_t, int16_t, 8)
_(vpmovdw_, int32_t, int16_t, 16)
_(vpmovdb , uint32_t, uint8_t, 4)
_(vpmovdb , uint32_t, uint8_t, 8)
_(vpmovdb , uint32_t, uint8_t, 16)
_(vpmovdb , int32_t, uint8_t, 4)
_(vpmovdb , int32_t, uint8_t, 8)
_(vpmovdb , int32_t, uint8_t, 16)
_(vpmovdb_, uint32_t, int8_t, 4)
_(vpmovdb_, uint32_t, int8_t, 8)
_(vpmovdb_, uint32_t, int8_t, 16)
_(vpmovdb_, int32_t, int8_t, 4)
_(vpmovdb_, int32_t, int8_t, 8)
_(vpmovdb_, int32_t, int8_t, 16)
_(vpmovwb , uint16_t, uint8_t, 8)
_(vpmovwb , uint16_t, uint8_t, 16)
_(vpmovwb , uint16_t, uint8_t, 32)
_(vpmovwb , int16_t, uint8_t, 8)
_(vpmovwb , int16_t, uint8_t, 16)
_(vpmovwb , int16_t, uint8_t, 32)
_(vpmovwb_, uint16_t, int8_t, 8)
_(vpmovwb_, uint16_t, int8_t, 16)
_(vpmovwb_, uint16_t, int8_t, 32)
_(vpmovwb_, int16_t, int8_t, 8)
_(vpmovwb_, int16_t, int8_t, 16)
_(vpmovwb_, int16_t, int8_t, 32)
// integral -> integral; zero extension
_(vpmovzxbw , uint8_t, int16_t, 8)
_(vpmovzxbw , uint8_t, int16_t, 16)
_(vpmovzxbw , uint8_t, int16_t, 32)
_(vpmovzxbw_, uint8_t, uint16_t, 8)
_(vpmovzxbw_, uint8_t, uint16_t, 16)
_(vpmovzxbw_, uint8_t, uint16_t, 32)
_(vpmovzxbd , uint8_t, int32_t, 4)
_(vpmovzxbd , uint8_t, int32_t, 8)
_(vpmovzxbd , uint8_t, int32_t, 16)
_(vpmovzxwd , uint16_t, int32_t, 4)
_(vpmovzxwd , uint16_t, int32_t, 8)
_(vpmovzxwd , uint16_t, int32_t, 16)
_(vpmovzxbd_, uint8_t, uint32_t, 4)
_(vpmovzxbd_, uint8_t, uint32_t, 8)
_(vpmovzxbd_, uint8_t, uint32_t, 16)
_(vpmovzxwd_, uint16_t, uint32_t, 4)
_(vpmovzxwd_, uint16_t, uint32_t, 8)
_(vpmovzxwd_, uint16_t, uint32_t, 16)
_(vpmovzxbq , uint8_t, int64_t, 2)
_(vpmovzxbq , uint8_t, int64_t, 4)
_(vpmovzxbq , uint8_t, int64_t, 8)
_(vpmovzxwq , uint16_t, int64_t, 2)
_(vpmovzxwq , uint16_t, int64_t, 4)
_(vpmovzxwq , uint16_t, int64_t, 8)
_(vpmovzxdq , uint32_t, int64_t, 2)
_(vpmovzxdq , uint32_t, int64_t, 4)
_(vpmovzxdq , uint32_t, int64_t, 8)
_(vpmovzxbq_, uint8_t, uint64_t, 2)
_(vpmovzxbq_, uint8_t, uint64_t, 4)
_(vpmovzxbq_, uint8_t, uint64_t, 8)
_(vpmovzxwq_, uint16_t, uint64_t, 2)
_(vpmovzxwq_, uint16_t, uint64_t, 4)
_(vpmovzxwq_, uint16_t, uint64_t, 8)
_(vpmovzxdq_, uint32_t, uint64_t, 2)
_(vpmovzxdq_, uint32_t, uint64_t, 4)
_(vpmovzxdq_, uint32_t, uint64_t, 8)
// integral -> integral; sign extension
_(vpmovsxbw , int8_t, int16_t, 8)
_(vpmovsxbw , int8_t, int16_t, 16)
_(vpmovsxbw , int8_t, int16_t, 32)
_(vpmovsxbw_, int8_t, uint16_t, 8)
_(vpmovsxbw_, int8_t, uint16_t, 16)
_(vpmovsxbw_, int8_t, uint16_t, 32)
_(vpmovsxbd , int8_t, int32_t, 4)
_(vpmovsxbd , int8_t, int32_t, 8)
_(vpmovsxbd , int8_t, int32_t, 16)
_(vpmovsxwd , int16_t, int32_t, 4)
_(vpmovsxwd , int16_t, int32_t, 8)
_(vpmovsxwd , int16_t, int32_t, 16)
_(vpmovsxbd_, int8_t, uint32_t, 4)
_(vpmovsxbd_, int8_t, uint32_t, 8)
_(vpmovsxbd_, int8_t, uint32_t, 16)
_(vpmovsxwd_, int16_t, uint32_t, 4)
_(vpmovsxwd_, int16_t, uint32_t, 8)
_(vpmovsxwd_, int16_t, uint32_t, 16)
_(vpmovsxbq , int8_t, int64_t, 2)
_(vpmovsxbq , int8_t, int64_t, 4)
_(vpmovsxbq , int8_t, int64_t, 8)
_(vpmovsxwq , int16_t, int64_t, 2)
_(vpmovsxwq , int16_t, int64_t, 4)
_(vpmovsxwq , int16_t, int64_t, 8)
_(vpmovsxdq , int32_t, int64_t, 2)
_(vpmovsxdq , int32_t, int64_t, 4)
_(vpmovsxdq , int32_t, int64_t, 8)
_(vpmovsxbq_, int8_t, uint64_t, 2)
_(vpmovsxbq_, int8_t, uint64_t, 4)
_(vpmovsxbq_, int8_t, uint64_t, 8)
_(vpmovsxwq_, int16_t, uint64_t, 2)
_(vpmovsxwq_, int16_t, uint64_t, 4)
_(vpmovsxwq_, int16_t, uint64_t, 8)
_(vpmovsxdq_, int32_t, uint64_t, 2)
_(vpmovsxdq_, int32_t, uint64_t, 4)
_(vpmovsxdq_, int32_t, uint64_t, 8)
// integral -> double
_(vcvtdq2pd , int32_t, double, 2)
_(vcvtdq2pd , int32_t, double, 4)
_(vcvtdq2pd , int32_t, double, 8)
_(vcvtudq2pd, uint32_t, double, 2)
_(vcvtudq2pd, uint32_t, double, 4)
_(vcvtudq2pd, uint32_t, double, 8)
_(vcvtqq2pd , int64_t, double, 2)
_(vcvtqq2pd , int64_t, double, 4)
_(vcvtqq2pd , int64_t, double, 8)
_(vcvtuqq2pd, uint64_t, double, 2)
_(vcvtuqq2pd, uint64_t, double, 4)
_(vcvtuqq2pd, uint64_t, double, 8)
// integral -> float
_(vcvtdq2ps , int32_t, float, 4)
_(vcvtdq2ps , int32_t, float, 8)
_(vcvtdq2ps , int32_t, float, 16)
_(vcvtudq2ps, uint32_t, float, 4)
_(vcvtudq2ps, uint32_t, float, 8)
_(vcvtudq2ps, uint32_t, float, 16)
_(vcvtqq2ps , int64_t, float, 4)
_(vcvtqq2ps , int64_t, float, 8)
_(vcvtqq2ps , int64_t, float, 16)
_(vcvtuqq2ps, uint64_t, float, 4)
_(vcvtuqq2ps, uint64_t, float, 8)
_(vcvtuqq2ps, uint64_t, float, 16)
// float <-> double
_( cvttpd2ps, double, float, 2)
_(vcvttpd2ps, double, float, 4)
_(vcvttpd2ps, double, float, 8)
_( cvttps2pd, float, double, 2)
_(vcvttps2pd, float, double, 4)
_(vcvttps2pd, float, double, 8)
// float -> integral
_( cvttps2dq, float, int32_t, 4)
_(vcvttps2dq, float, int32_t, 8)
_(vcvttps2dq, float, int32_t, 16)
_( cvttps2qq, float, int64_t, 4)
_(vcvttps2qq, float, int64_t, 8)
_(vcvttps2qq, float, int64_t, 16)
_( cvttps2udq, float, uint32_t, 4)
_(vcvttps2udq, float, uint32_t, 8)
_(vcvttps2udq, float, uint32_t, 16)
_( cvttps2uqq, float, uint64_t, 4)
_(vcvttps2uqq, float, uint64_t, 8)
_(vcvttps2uqq, float, uint64_t, 16)
// double -> integral
_( cvttpd2dq, double, int32_t, 2)
_(vcvttpd2dq, double, int32_t, 4)
_(vcvttpd2dq, double, int32_t, 8)
_(vcvttpd2qq, double, int64_t, 2)
_(vcvttpd2qq, double, int64_t, 4)
_(vcvttpd2qq, double, int64_t, 8)
_(vcvttpd2udq, double, uint32_t, 2)
_(vcvttpd2udq, double, uint32_t, 4)
_(vcvttpd2udq, double, uint32_t, 8)
_(vcvttpd2uqq, double, uint64_t, 2)
_(vcvttpd2uqq, double, uint64_t, 4)
_(vcvttpd2uqq, double, uint64_t, 8)
// no change in type; nop
_(nop, int8_t, int8_t, 16)
_(nop, uint8_t, uint8_t, 16)
_(nop, int8_t, int8_t, 32)
_(nop, uint8_t, uint8_t, 32)
_(nop, int8_t, int8_t, 64)
_(nop, uint8_t, uint8_t, 64)
_(nop, int16_t, int16_t, 8)
_(nop, uint16_t, uint16_t, 8)
_(nop, int16_t, int16_t, 16)
_(nop, uint16_t, uint16_t, 16)
_(nop, int16_t, int16_t, 32)
_(nop, uint16_t, uint16_t, 32)
_(nop, int32_t, int32_t, 4)
_(nop, uint32_t, uint32_t, 4)
_(nop, int32_t, int32_t, 8)
_(nop, uint32_t, uint32_t, 8)
_(nop, int32_t, int32_t, 16)
_(nop, uint32_t, uint32_t, 16)
_(nop, int64_t, int64_t, 2)
_(nop, uint64_t, uint64_t, 2)
_(nop, int64_t, int64_t, 4)
_(nop, uint64_t, uint64_t, 4)
_(nop, int64_t, int64_t, 8)
_(nop, uint64_t, uint64_t, 8)
_(nop, double, double, 2)
_(nop, double, double, 4)
_(nop, double, double, 8)
_(nop, float, float, 4)
_(nop, float, float, 8)
_(nop, float, float, 16)
More information about the Gcc-bugs
mailing list