[Bug target/85048] New: [missed optimization] vector conversions

kretz at kde dot org gcc-bugzilla@gcc.gnu.org
Fri Mar 23 11:25:00 GMT 2018


https://gcc.gnu.org/bugzilla/show_bug.cgi?id=85048

            Bug ID: 85048
           Summary: [missed optimization] vector conversions
           Product: gcc
           Version: 8.0.1
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: target
          Assignee: unassigned at gcc dot gnu.org
          Reporter: kretz at kde dot org
  Target Milestone: ---

The following testcase lists all integer and/or float conversions applied to
vector builtins of the same number of elements. All of those functions can be
compiled to a single instruction (the instruction each function is named
after, followed by `ret`) when `-march=skylake-avx512` is active. As far as I
can see, many conversion instructions in the SSE and AVX ISA extensions are
likewise never emitted.

I would expect this code to compile to optimal conversion sequences even on -O2
(and lower) since the conversion is applied directly on vector builtins. If
this is not in scope, I'd like this to be treated as a feature request for
something like clang's __builtin_convertvector (it could even be exposed via
static_cast) that produces optimal conversion instruction sequences on vector
builtins without relying on the auto-vectorizer.

#include <cstdint>

// V<T, N> is a GNU vector-extension type holding N elements of T; the
// [[gnu::vector_size]] attribute takes the total size in bytes, which the
// defaulted Size parameter computes as N * sizeof(T).
template <class T, int N, int Size = N * sizeof(T)>
using V [[gnu::vector_size(Size)]] = T;

// Convert each of the 2 lanes of x from `From` to `To`, lane by lane.
template <class From, class To> V<To, 2> cvt2(V<From, 2> x) {
    V<To, 2> result{};
    for (int lane = 0; lane != 2; ++lane)
        result[lane] = static_cast<To>(x[lane]);
    return result;
}
// Convert each of the 4 lanes of x from `From` to `To`, lane by lane.
template <class From, class To> V<To, 4> cvt4(V<From, 4> x) {
    V<To, 4> result{};
    for (int lane = 0; lane != 4; ++lane)
        result[lane] = static_cast<To>(x[lane]);
    return result;
}
// Convert each of the 8 lanes of x from `From` to `To`, lane by lane.
template <class From, class To> V<To, 8> cvt8(V<From, 8> x) {
    V<To, 8> result{};
    for (int lane = 0; lane != 8; ++lane)
        result[lane] = static_cast<To>(x[lane]);
    return result;
}
// Convert each of the 16 lanes of x from `From` to `To`, lane by lane.
template <class From, class To> V<To, 16> cvt16(V<From, 16> x) {
    V<To, 16> result{};
    for (int lane = 0; lane != 16; ++lane)
        result[lane] = static_cast<To>(x[lane]);
    return result;
}
// Convert each of the 32 lanes of x from `From` to `To`, lane by lane.
template <class From, class To> V<To, 32> cvt32(V<From, 32> x) {
    V<To, 32> result{};
    for (int lane = 0; lane != 32; ++lane)
        result[lane] = static_cast<To>(x[lane]);
    return result;
}
// Convert each of the 64 lanes of x from `From` to `To`, lane by lane.
template <class From, class To> V<To, 64> cvt64(V<From, 64> x) {
    V<To, 64> result{};
    for (int lane = 0; lane != 64; ++lane)
        result[lane] = static_cast<To>(x[lane]);
    return result;
}

// Instantiates cvt<size> for one From -> To conversion, wrapped in a function
// named after the single x86 instruction the conversion is expected to
// compile to.  NOTE(review): `_` at global scope is a reserved identifier and
// also collides with the common gettext macro -- acceptable for a throwaway
// testcase, but worth renaming if this is ever reused.
#define _(name, from, to, size) \
auto name(V<from, size> x) { return cvt##size<from, to>(x); }
// integral -> integral; truncation
// Each wrapper is named after the AVX-512 down-convert instruction it should
// compile to; the trailing '_' variants differ only in the signedness of the
// destination element type (same instruction expected for both).
// 64-bit -> 32-bit elements
_(vpmovqd , uint64_t, uint32_t,  2)
_(vpmovqd , uint64_t, uint32_t,  4)
_(vpmovqd , uint64_t, uint32_t,  8)
_(vpmovqd ,  int64_t, uint32_t,  2)
_(vpmovqd ,  int64_t, uint32_t,  4)
_(vpmovqd ,  int64_t, uint32_t,  8)
_(vpmovqd_, uint64_t,  int32_t,  2)
_(vpmovqd_, uint64_t,  int32_t,  4)
_(vpmovqd_, uint64_t,  int32_t,  8)
_(vpmovqd_,  int64_t,  int32_t,  2)
_(vpmovqd_,  int64_t,  int32_t,  4)
_(vpmovqd_,  int64_t,  int32_t,  8)

// 64-bit -> 16-bit elements
_(vpmovqw , uint64_t, uint16_t,  2)
_(vpmovqw , uint64_t, uint16_t,  4)
_(vpmovqw , uint64_t, uint16_t,  8)
_(vpmovqw ,  int64_t, uint16_t,  2)
_(vpmovqw ,  int64_t, uint16_t,  4)
_(vpmovqw ,  int64_t, uint16_t,  8)
_(vpmovqw_, uint64_t,  int16_t,  2)
_(vpmovqw_, uint64_t,  int16_t,  4)
_(vpmovqw_, uint64_t,  int16_t,  8)
_(vpmovqw_,  int64_t,  int16_t,  2)
_(vpmovqw_,  int64_t,  int16_t,  4)
_(vpmovqw_,  int64_t,  int16_t,  8)

// 64-bit -> 8-bit elements
_(vpmovqb , uint64_t,  uint8_t,  2)
_(vpmovqb , uint64_t,  uint8_t,  4)
_(vpmovqb , uint64_t,  uint8_t,  8)
_(vpmovqb ,  int64_t,  uint8_t,  2)
_(vpmovqb ,  int64_t,  uint8_t,  4)
_(vpmovqb ,  int64_t,  uint8_t,  8)
_(vpmovqb_, uint64_t,   int8_t,  2)
_(vpmovqb_, uint64_t,   int8_t,  4)
_(vpmovqb_, uint64_t,   int8_t,  8)
_(vpmovqb_,  int64_t,   int8_t,  2)
_(vpmovqb_,  int64_t,   int8_t,  4)
_(vpmovqb_,  int64_t,   int8_t,  8)

// 32-bit -> 16-bit elements
_(vpmovdw , uint32_t, uint16_t,  4)
_(vpmovdw , uint32_t, uint16_t,  8)
_(vpmovdw , uint32_t, uint16_t, 16)
_(vpmovdw ,  int32_t, uint16_t,  4)
_(vpmovdw ,  int32_t, uint16_t,  8)
_(vpmovdw ,  int32_t, uint16_t, 16)
_(vpmovdw_, uint32_t,  int16_t,  4)
_(vpmovdw_, uint32_t,  int16_t,  8)
_(vpmovdw_, uint32_t,  int16_t, 16)
_(vpmovdw_,  int32_t,  int16_t,  4)
_(vpmovdw_,  int32_t,  int16_t,  8)
_(vpmovdw_,  int32_t,  int16_t, 16)

// 32-bit -> 8-bit elements
_(vpmovdb , uint32_t,  uint8_t,  4)
_(vpmovdb , uint32_t,  uint8_t,  8)
_(vpmovdb , uint32_t,  uint8_t, 16)
_(vpmovdb ,  int32_t,  uint8_t,  4)
_(vpmovdb ,  int32_t,  uint8_t,  8)
_(vpmovdb ,  int32_t,  uint8_t, 16)
_(vpmovdb_, uint32_t,   int8_t,  4)
_(vpmovdb_, uint32_t,   int8_t,  8)
_(vpmovdb_, uint32_t,   int8_t, 16)
_(vpmovdb_,  int32_t,   int8_t,  4)
_(vpmovdb_,  int32_t,   int8_t,  8)
_(vpmovdb_,  int32_t,   int8_t, 16)

// 16-bit -> 8-bit elements
_(vpmovwb , uint16_t,  uint8_t,  8)
_(vpmovwb , uint16_t,  uint8_t, 16)
_(vpmovwb , uint16_t,  uint8_t, 32)
_(vpmovwb ,  int16_t,  uint8_t,  8)
_(vpmovwb ,  int16_t,  uint8_t, 16)
_(vpmovwb ,  int16_t,  uint8_t, 32)
_(vpmovwb_, uint16_t,   int8_t,  8)
_(vpmovwb_, uint16_t,   int8_t, 16)
_(vpmovwb_, uint16_t,   int8_t, 32)
_(vpmovwb_,  int16_t,   int8_t,  8)
_(vpmovwb_,  int16_t,   int8_t, 16)
_(vpmovwb_,  int16_t,   int8_t, 32)

// integral -> integral; zero extension
// Source element type is unsigned, so widening is a zero extension regardless
// of destination signedness; the trailing '_' variants differ only in the
// signedness of the destination element type.
// 8-bit -> 16-bit elements
_(vpmovzxbw , uint8_t,  int16_t,  8)
_(vpmovzxbw , uint8_t,  int16_t, 16)
_(vpmovzxbw , uint8_t,  int16_t, 32)
_(vpmovzxbw_, uint8_t, uint16_t,  8)
_(vpmovzxbw_, uint8_t, uint16_t, 16)
_(vpmovzxbw_, uint8_t, uint16_t, 32)

// 8/16-bit -> 32-bit elements
_(vpmovzxbd ,  uint8_t,  int32_t,  4)
_(vpmovzxbd ,  uint8_t,  int32_t,  8)
_(vpmovzxbd ,  uint8_t,  int32_t, 16)
_(vpmovzxwd , uint16_t,  int32_t,  4)
_(vpmovzxwd , uint16_t,  int32_t,  8)
_(vpmovzxwd , uint16_t,  int32_t, 16)
_(vpmovzxbd_,  uint8_t, uint32_t,  4)
_(vpmovzxbd_,  uint8_t, uint32_t,  8)
_(vpmovzxbd_,  uint8_t, uint32_t, 16)
_(vpmovzxwd_, uint16_t, uint32_t,  4)
_(vpmovzxwd_, uint16_t, uint32_t,  8)
_(vpmovzxwd_, uint16_t, uint32_t, 16)

// 8/16/32-bit -> 64-bit elements
_(vpmovzxbq ,  uint8_t,  int64_t, 2)
_(vpmovzxbq ,  uint8_t,  int64_t, 4)
_(vpmovzxbq ,  uint8_t,  int64_t, 8)
_(vpmovzxwq , uint16_t,  int64_t, 2)
_(vpmovzxwq , uint16_t,  int64_t, 4)
_(vpmovzxwq , uint16_t,  int64_t, 8)
_(vpmovzxdq , uint32_t,  int64_t, 2)
_(vpmovzxdq , uint32_t,  int64_t, 4)
_(vpmovzxdq , uint32_t,  int64_t, 8)
_(vpmovzxbq_,  uint8_t, uint64_t, 2)
_(vpmovzxbq_,  uint8_t, uint64_t, 4)
_(vpmovzxbq_,  uint8_t, uint64_t, 8)
_(vpmovzxwq_, uint16_t, uint64_t, 2)
_(vpmovzxwq_, uint16_t, uint64_t, 4)
_(vpmovzxwq_, uint16_t, uint64_t, 8)
_(vpmovzxdq_, uint32_t, uint64_t, 2)
_(vpmovzxdq_, uint32_t, uint64_t, 4)
_(vpmovzxdq_, uint32_t, uint64_t, 8)

// integral -> integral; sign extension
// Source element type is signed, so widening is a sign extension regardless
// of destination signedness; the trailing '_' variants differ only in the
// signedness of the destination element type.
// 8-bit -> 16-bit elements
_(vpmovsxbw , int8_t,  int16_t,  8)
_(vpmovsxbw , int8_t,  int16_t, 16)
_(vpmovsxbw , int8_t,  int16_t, 32)
_(vpmovsxbw_, int8_t, uint16_t,  8)
_(vpmovsxbw_, int8_t, uint16_t, 16)
_(vpmovsxbw_, int8_t, uint16_t, 32)

// 8/16-bit -> 32-bit elements
_(vpmovsxbd ,  int8_t,  int32_t,  4)
_(vpmovsxbd ,  int8_t,  int32_t,  8)
_(vpmovsxbd ,  int8_t,  int32_t, 16)
_(vpmovsxwd , int16_t,  int32_t,  4)
_(vpmovsxwd , int16_t,  int32_t,  8)
_(vpmovsxwd , int16_t,  int32_t, 16)
_(vpmovsxbd_,  int8_t, uint32_t,  4)
_(vpmovsxbd_,  int8_t, uint32_t,  8)
_(vpmovsxbd_,  int8_t, uint32_t, 16)
_(vpmovsxwd_, int16_t, uint32_t,  4)
_(vpmovsxwd_, int16_t, uint32_t,  8)
_(vpmovsxwd_, int16_t, uint32_t, 16)

// 8/16/32-bit -> 64-bit elements
_(vpmovsxbq ,  int8_t,  int64_t, 2)
_(vpmovsxbq ,  int8_t,  int64_t, 4)
_(vpmovsxbq ,  int8_t,  int64_t, 8)
_(vpmovsxwq , int16_t,  int64_t, 2)
_(vpmovsxwq , int16_t,  int64_t, 4)
_(vpmovsxwq , int16_t,  int64_t, 8)
_(vpmovsxdq , int32_t,  int64_t, 2)
_(vpmovsxdq , int32_t,  int64_t, 4)
_(vpmovsxdq , int32_t,  int64_t, 8)
_(vpmovsxbq_,  int8_t, uint64_t, 2)
_(vpmovsxbq_,  int8_t, uint64_t, 4)
_(vpmovsxbq_,  int8_t, uint64_t, 8)
_(vpmovsxwq_, int16_t, uint64_t, 2)
_(vpmovsxwq_, int16_t, uint64_t, 4)
_(vpmovsxwq_, int16_t, uint64_t, 8)
_(vpmovsxdq_, int32_t, uint64_t, 2)
_(vpmovsxdq_, int32_t, uint64_t, 4)
_(vpmovsxdq_, int32_t, uint64_t, 8)

// integral -> double
// Signed and unsigned 32/64-bit integer to double; the unsigned and 64-bit
// source forms require AVX-512 (vcvtudq2pd / vcvtqq2pd / vcvtuqq2pd).
_(vcvtdq2pd ,  int32_t, double, 2)
_(vcvtdq2pd ,  int32_t, double, 4)
_(vcvtdq2pd ,  int32_t, double, 8)
_(vcvtudq2pd, uint32_t, double, 2)
_(vcvtudq2pd, uint32_t, double, 4)
_(vcvtudq2pd, uint32_t, double, 8)
_(vcvtqq2pd ,  int64_t, double, 2)
_(vcvtqq2pd ,  int64_t, double, 4)
_(vcvtqq2pd ,  int64_t, double, 8)
_(vcvtuqq2pd, uint64_t, double, 2)
_(vcvtuqq2pd, uint64_t, double, 4)
_(vcvtuqq2pd, uint64_t, double, 8)

// integral -> float
// Signed and unsigned 32/64-bit integer to float.
_(vcvtdq2ps ,  int32_t, float,  4)
_(vcvtdq2ps ,  int32_t, float,  8)
_(vcvtdq2ps ,  int32_t, float, 16)
_(vcvtudq2ps, uint32_t, float,  4)
_(vcvtudq2ps, uint32_t, float,  8)
_(vcvtudq2ps, uint32_t, float, 16)
_(vcvtqq2ps ,  int64_t, float,  4)
_(vcvtqq2ps ,  int64_t, float,  8)
_(vcvtqq2ps ,  int64_t, float, 16)
_(vcvtuqq2ps, uint64_t, float,  4)
_(vcvtuqq2ps, uint64_t, float,  8)
_(vcvtuqq2ps, uint64_t, float, 16)

// float <-> double
// NOTE(review): float<->double narrowing/widening has no truncating form;
// the expected instructions are (v)cvtpd2ps / (v)cvtps2pd despite the 'tt'
// in these wrapper names -- confirm against the intended mnemonics.
_( cvttpd2ps, double, float,  2)
_(vcvttpd2ps, double, float,  4)
_(vcvttpd2ps, double, float,  8)
_( cvttps2pd, float, double,  2)
_(vcvttps2pd, float, double,  4)
_(vcvttps2pd, float, double,  8)

// float -> integral
// C++ float-to-int conversion truncates toward zero, matching the 'tt'
// (truncating) instruction forms.
_( cvttps2dq, float, int32_t,  4)
_(vcvttps2dq, float, int32_t,  8)
_(vcvttps2dq, float, int32_t, 16)
_( cvttps2qq, float, int64_t,  4)
_(vcvttps2qq, float, int64_t,  8)
_(vcvttps2qq, float, int64_t, 16)

_( cvttps2udq, float, uint32_t,  4)
_(vcvttps2udq, float, uint32_t,  8)
_(vcvttps2udq, float, uint32_t, 16)
_( cvttps2uqq, float, uint64_t,  4)
_(vcvttps2uqq, float, uint64_t,  8)
_(vcvttps2uqq, float, uint64_t, 16)

// double -> integral
_( cvttpd2dq, double, int32_t, 2)
_(vcvttpd2dq, double, int32_t, 4)
_(vcvttpd2dq, double, int32_t, 8)
_(vcvttpd2qq, double, int64_t, 2)
_(vcvttpd2qq, double, int64_t, 4)
_(vcvttpd2qq, double, int64_t, 8)

_(vcvttpd2udq, double, uint32_t, 2)
_(vcvttpd2udq, double, uint32_t, 4)
_(vcvttpd2udq, double, uint32_t, 8)
_(vcvttpd2uqq, double, uint64_t, 2)
_(vcvttpd2uqq, double, uint64_t, 4)
_(vcvttpd2uqq, double, uint64_t, 8)

// no change in type; nop
// Source and destination element types are identical, so the elementwise
// "conversion" should compile to no instruction at all (just `ret`).
_(nop,   int8_t,   int8_t, 16)
_(nop,  uint8_t,  uint8_t, 16)
_(nop,   int8_t,   int8_t, 32)
_(nop,  uint8_t,  uint8_t, 32)
_(nop,   int8_t,   int8_t, 64)
_(nop,  uint8_t,  uint8_t, 64)
_(nop,  int16_t,  int16_t,  8)
_(nop, uint16_t, uint16_t,  8)
_(nop,  int16_t,  int16_t, 16)
_(nop, uint16_t, uint16_t, 16)
_(nop,  int16_t,  int16_t, 32)
_(nop, uint16_t, uint16_t, 32)
_(nop,  int32_t,  int32_t,  4)
_(nop, uint32_t, uint32_t,  4)
_(nop,  int32_t,  int32_t,  8)
_(nop, uint32_t, uint32_t,  8)
_(nop,  int32_t,  int32_t, 16)
_(nop, uint32_t, uint32_t, 16)
_(nop,  int64_t,  int64_t,  2)
_(nop, uint64_t, uint64_t,  2)
_(nop,  int64_t,  int64_t,  4)
_(nop, uint64_t, uint64_t,  4)
_(nop,  int64_t,  int64_t,  8)
_(nop, uint64_t, uint64_t,  8)
_(nop,   double,   double,  2)
_(nop,   double,   double,  4)
_(nop,   double,   double,  8)
_(nop,    float,    float,  4)
_(nop,    float,    float,  8)
_(nop,    float,    float, 16)


More information about the Gcc-bugs mailing list