The following testcase, reduced from gromacs-2018/src/gromacs/simd/tests/bootstrap_loadstore.cpp, aborts starting with r256656 on powerpc64le-linux with -O2 -m64 -mcpu=power8 -mtune=power8 -mvsx -std=c++11:

typedef float V __attribute__((altivec(vector__)));
struct SimdFloatTag {};
struct SimdFloat
{
  SimdFloat () {}
  SimdFloat (float f) : simdInternal_(__builtin_vec_splats (f)) {}
  SimdFloat (V simd) : simdInternal_(simd) {}
  V simdInternal_;
};
template <typename T> struct SimdTraits {};
template <> struct SimdTraits <SimdFloat>
{
  using type = float;
  static constexpr int width = 4;
  using tag = SimdFloatTag;
};
static inline SimdFloat simdLoad (const float *m, SimdFloatTag = {})
{
  return { *reinterpret_cast <const V *>(m) };
}
static inline SimdFloat simdLoadU (const float *m, SimdFloatTag = {})
{
  return { *reinterpret_cast <const V *>(m) };
}
static inline void store (float *m, SimdFloat a)
{
  *reinterpret_cast <V *>(m) = a.simdInternal_;
}
template <typename T>
static inline T load (const typename SimdTraits <T>::type *m)
{
  return simdLoad (m, typename SimdTraits <T>::tag ());
}
template <typename T>
static inline T loadU (const typename SimdTraits <T>::type *m)
{
  return simdLoadU (m, typename SimdTraits <T>::tag ());
}
template <typename T, typename TSimd, int simdWidth>
void loadStoreTester (TSimd loadFn (const T *mem), void storeFn (T *mem, TSimd),
		      const int loadOffset, const int storeOffset)
{
  alignas(16) T src[simdWidth * 4];
  alignas(16) T dst[simdWidth * 4];
  T *pCopySrc = src + simdWidth + loadOffset;
  T *pCopyDst = dst + simdWidth + storeOffset;
  int i;
  for (i = 0; i < simdWidth * 4; i++)
    {
      src[i] = 1 + i;
      dst[i] = -1 - i;
    }
  storeFn (pCopyDst, loadFn (pCopySrc));
  for (i = 0; i < simdWidth; i++)
    if (pCopySrc[i] != pCopyDst[i])
      __builtin_abort ();
  for (i = 0; i < simdWidth * 4; i++)
    {
      if (src[i] != (T) (1 + i))
	__builtin_abort ();
      if (dst + i < pCopyDst || dst + i >= pCopyDst + simdWidth)
	if (dst[i] != (T) (-1 - i))
	  __builtin_abort ();
    }
}
template <typename T, typename TSimd>
TSimd loadWrapper (const T *m) { return load <TSimd> (m); }
template <typename T, typename TSimd>
TSimd loadUWrapper (const T *m) { return loadU <TSimd> (m); }
int main ()
{
  loadStoreTester <float, SimdFloat, 4> (loadWrapper, store, 0, 0);
  for (int i = 0; i < 4; i++)
    loadStoreTester <float, SimdFloat, 4> (loadUWrapper, store, i, 0);
}

I'm not really sure whether this isn't actually a gromacs bug, i.e. whether

static inline SimdFloat gmx_simdcall
simdLoadU(const float *m, SimdFloatTag = {})
{
    return {
        *reinterpret_cast<const __vector float *>(m)
    };
}

from gromacs-2018/src/gromacs/simd/impl_ibm_vsx/impl_ibm_vsx_simd_float.h is actually a well-defined unaligned load (if aligned to __alignof__(float), but not __alignof__(__vector float)). If not, the gromacs folks should be told.
> is actually a well-defined unaligned load (if aligned to __alignof__(float), but not __alignof__(__vector float)).

I don't think it is a well-defined unaligned load. I bet if you use -fsanitize=undefined you will get an error about it. Also, on older PowerPC the vector load instructions always ignored the low (unaligned) address bits; I suspect the compiler is still using one of those instructions here.
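For reference, a minimal sketch (not taken from the report) of the cast in question; assuming the alignment check that is part of -fsanitize=undefined handles vector types the same way as other over-aligned types, building this with -O2 -mvsx -fsanitize=alignment should report a misaligned load at runtime:

// Hypothetical reduced example, not part of the original testcase.
typedef __vector float V;

static V badLoadU (const float *m)
{
  // Only valid if m is 16-byte aligned; undefined behaviour (and what
  // the alignment sanitizer should flag) when m is merely float-aligned.
  return *reinterpret_cast <const V *>(m);
}

int main ()
{
  alignas(16) float buf[8] = { 0 };
  V v = badLoadU (buf + 1);	// deliberately misaligned by 4 bytes
  (void) v;
}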
I've tried
  return { *reinterpret_cast <const V __attribute__((aligned (__alignof__(float)))) *>(m) };
but that doesn't work either. The difference between r256653 and r256656 is e.g. in loadUWrapper:

-	lxvd2x 0,0,3
+	lvx 0,0,3
 	addi 10,1,-16
 	li 9,32
-	xxpermdi 0,0,0,2
-	xxpermdi 0,0,0,2
+	xxpermdi 0,32,32,2
 	stxvd2x 0,0,10
 	addi 10,1,-48
-	lxvd2x 34,10,9
-	xxpermdi 34,34,34,2
+	lvx 2,10,9
The Power backend will assume that any pointer declared as a pointer to a vector type is aligned to the vector size. Coercing a (float *) to a (vector float *) violates this assumption. The code as written is not valid.
Agreed. The supported way to do an unaligned vector load on Power is with the vec_xl intrinsic.
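Something like the following should be the supported spelling (a sketch only; it assumes <altivec.h> and the vec_xl/vec_xst forms that take a long offset plus a float pointer, and it reuses the SimdFloat wrapper from the testcase above — the unaligned store overload is my addition, not part of the testcase):

#include <altivec.h>

static inline SimdFloat simdLoadU (const float *m, SimdFloatTag = {})
{
  // vec_xl performs an unaligned VSX load; no 16-byte alignment required.
  return { vec_xl (0, m) };
}

static inline void storeU (float *m, SimdFloat a)
{
  // vec_xst is the matching unaligned store.
  vec_xst (a.simdInternal_, 0, m);
}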
Ok, confirmed that it works with
  return { __builtin_vec_xl (0, m) };
as the body of simdLoadU.