[PATCH 2/4] [AARCH64,NEON] Convert arm_neon.h to use new builtins for vld[234](q?)_lane_*

Tejas Belagod tejas.belagod@arm.com
Fri Sep 19 11:21:00 GMT 2014


On 18/09/14 20:38, Charles Baylis wrote:
> This patch replaces the inline assembler implementations of the
> vld[234](q?)_lane_* intrinsics with new versions which exploit the new builtin
> functions added in patch 1.
>
> Tested (with the rest of the patch series) with make check on aarch64-oe-linux
> with qemu, and also causes no regressions in clyon's NEON intrinsics tests.
>
> <DATE>  Charles Baylis  <charles.baylis@linaro.org>
>
>          * config/aarch64/arm_neon.h (__LD2_LANE_FUNC): Rewrite using builtins,
>          update uses to use new macro arguments.
>          (__LD3_LANE_FUNC): Likewise.
>          (__LD4_LANE_FUNC): Likewise.
>
> Change-Id: I3bd5934b5c4f6127088193c1ab12848144d5540a
> ---
>   gcc/config/aarch64/arm_neon.h | 359 ++++++++++++++++++++++++++++--------------
>   1 file changed, 237 insertions(+), 122 deletions(-)
>
> diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
> index e62c783..c1fcb47 100644
> --- a/gcc/config/aarch64/arm_neon.h
> +++ b/gcc/config/aarch64/arm_neon.h
> @@ -11805,47 +11805,79 @@ __LD2R_FUNC (uint16x8x2_t, uint16x2_t, uint16_t, 8h, u16, q)
>   __LD2R_FUNC (uint32x4x2_t, uint32x2_t, uint32_t, 4s, u32, q)
>   __LD2R_FUNC (uint64x2x2_t, uint64x2_t, uint64_t, 2d, u64, q)
>
> -#define __LD2_LANE_FUNC(rettype, ptrtype, regsuffix,                   \
> -                       lnsuffix, funcsuffix, Q)                        \
> -  __extension__ static __inline rettype                                        \
> -  __attribute__ ((__always_inline__))                                  \
> -  vld2 ## Q ## _lane_ ## funcsuffix (const ptrtype *ptr,               \
> -                                    rettype b, const int c)            \
> -  {                                                                    \
> -    rettype result;                                                    \
> -    __asm__ ("ld1 {v16." #regsuffix ", v17." #regsuffix "}, %1\n\t"    \
> -            "ld2 {v16." #lnsuffix ", v17." #lnsuffix "}[%3], %2\n\t"   \
> -            "st1 {v16." #regsuffix ", v17." #regsuffix "}, %0\n\t"     \
> -            : "=Q"(result)                                             \
> -            : "Q"(b), "Q"(*(const rettype *)ptr), "i"(c)               \
> -            : "memory", "v16", "v17");                                 \
> -    return result;                                                     \
> -  }
> -
> -__LD2_LANE_FUNC (int8x8x2_t, uint8_t, 8b, b, s8,)
> -__LD2_LANE_FUNC (float32x2x2_t, float32_t, 2s, s, f32,)
> -__LD2_LANE_FUNC (float64x1x2_t, float64_t, 1d, d, f64,)
> -__LD2_LANE_FUNC (poly8x8x2_t, poly8_t, 8b, b, p8,)
> -__LD2_LANE_FUNC (poly16x4x2_t, poly16_t, 4h, h, p16,)
> -__LD2_LANE_FUNC (int16x4x2_t, int16_t, 4h, h, s16,)
> -__LD2_LANE_FUNC (int32x2x2_t, int32_t, 2s, s, s32,)
> -__LD2_LANE_FUNC (int64x1x2_t, int64_t, 1d, d, s64,)
> -__LD2_LANE_FUNC (uint8x8x2_t, uint8_t, 8b, b, u8,)
> -__LD2_LANE_FUNC (uint16x4x2_t, uint16_t, 4h, h, u16,)
> -__LD2_LANE_FUNC (uint32x2x2_t, uint32_t, 2s, s, u32,)
> -__LD2_LANE_FUNC (uint64x1x2_t, uint64_t, 1d, d, u64,)
> -__LD2_LANE_FUNC (float32x4x2_t, float32_t, 4s, s, f32, q)
> -__LD2_LANE_FUNC (float64x2x2_t, float64_t, 2d, d, f64, q)
> -__LD2_LANE_FUNC (poly8x16x2_t, poly8_t, 16b, b, p8, q)
> -__LD2_LANE_FUNC (poly16x8x2_t, poly16_t, 8h, h, p16, q)
> -__LD2_LANE_FUNC (int8x16x2_t, int8_t, 16b, b, s8, q)
> -__LD2_LANE_FUNC (int16x8x2_t, int16_t, 8h, h, s16, q)
> -__LD2_LANE_FUNC (int32x4x2_t, int32_t, 4s, s, s32, q)
> -__LD2_LANE_FUNC (int64x2x2_t, int64_t, 2d, d, s64, q)
> -__LD2_LANE_FUNC (uint8x16x2_t, uint8_t, 16b, b, u8, q)
> -__LD2_LANE_FUNC (uint16x8x2_t, uint16_t, 8h, h, u16, q)
> -__LD2_LANE_FUNC (uint32x4x2_t, uint32_t, 4s, s, u32, q)
> -__LD2_LANE_FUNC (uint64x2x2_t, uint64_t, 2d, d, u64, q)
> +#define __LD2_LANE_FUNC(intype, vectype, largetype, ptrtype,              \
> +                        mode, ptrmode, funcsuffix, signedtype)            \
> +__extension__ static __inline intype __attribute__ ((__always_inline__))   \
> +vld2_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c)  \
> +{                                                                         \
> +  __builtin_aarch64_simd_oi __o;                                          \
> +  largetype __temp;                                                       \
> +  __temp.val[0] =                                                         \
> +    vcombine_##funcsuffix (__b.val[0], vcreate_##funcsuffix (0));         \
> +  __temp.val[1] =                                                         \
> +    vcombine_##funcsuffix (__b.val[1], vcreate_##funcsuffix (0));         \
> +  __o = __builtin_aarch64_set_qregoi##mode (__o,                          \
> +                                          (signedtype) __temp.val[0],     \
> +                                          0);                             \
> +  __o = __builtin_aarch64_set_qregoi##mode (__o,                          \
> +                                          (signedtype) __temp.val[1],     \
> +                                          1);                             \
> +  __o =        __builtin_aarch64_ld2_lane##mode (                                 \
> +         (__builtin_aarch64_simd_##ptrmode *) __ptr, __o, __c);           \
> +  __b.val[0] = (vectype) __builtin_aarch64_get_dregoidi (__o, 0);         \
> +  __b.val[1] = (vectype) __builtin_aarch64_get_dregoidi (__o, 1);         \
> +  return __b;                                                             \
> +}
> +
> +__LD2_LANE_FUNC (float32x2x2_t, float32x2_t, float32x4x2_t, float32_t, v4sf,
> +                sf, f32, float32x4_t)
> +__LD2_LANE_FUNC (float64x1x2_t, float64x1_t, float64x2x2_t, float64_t, v2df,
> +                df, f64, float64x2_t)
> +__LD2_LANE_FUNC (poly8x8x2_t, poly8x8_t, poly8x16x2_t, poly8_t, v16qi, qi, p8,
> +                int8x16_t)
> +__LD2_LANE_FUNC (poly16x4x2_t, poly16x4_t, poly16x8x2_t, poly16_t, v8hi, hi,
> +                p16, int16x8_t)
> +__LD2_LANE_FUNC (int8x8x2_t, int8x8_t, int8x16x2_t, int8_t, v16qi, qi, s8,
> +                int8x16_t)
> +__LD2_LANE_FUNC (int16x4x2_t, int16x4_t, int16x8x2_t, int16_t, v8hi, hi, s16,
> +                int16x8_t)
> +__LD2_LANE_FUNC (int32x2x2_t, int32x2_t, int32x4x2_t, int32_t, v4si, si, s32,
> +                int32x4_t)
> +__LD2_LANE_FUNC (int64x1x2_t, int64x1_t, int64x2x2_t, int64_t, v2di, di, s64,
> +                int64x2_t)
> +__LD2_LANE_FUNC (uint8x8x2_t, uint8x8_t, uint8x16x2_t, uint8_t, v16qi, qi, u8,
> +                int8x16_t)
> +__LD2_LANE_FUNC (uint16x4x2_t, uint16x4_t, uint16x8x2_t, uint16_t, v8hi, hi,
> +                u16, int16x8_t)
> +__LD2_LANE_FUNC (uint32x2x2_t, uint32x2_t, uint32x4x2_t, uint32_t, v4si, si,
> +                u32, int32x4_t)
> +__LD2_LANE_FUNC (uint64x1x2_t, uint64x1_t, uint64x2x2_t, uint64_t, v2di, di,
> +                u64, int64x2_t)
> +
> +#undef __LD2_LANE_FUNC
> +#define __LD2_LANE_FUNC(intype, ptrtype, mode, ptrmode, funcsuffix)       \
> +__extension__ static __inline intype __attribute__ ((__always_inline__))   \
> +vld2q_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c) \
> +{                                                                         \
> +  union { intype __i;                                                     \
> +         __builtin_aarch64_simd_oi __o; } __temp = { __b };               \
> +  __temp.__o = __builtin_aarch64_ld2_lane##mode (                         \
> +       (__builtin_aarch64_simd_##ptrmode *) __ptr, __temp.__o, __c);      \
> +  return __temp.__i;                                                      \
> +}
> +
> +__LD2_LANE_FUNC (float32x4x2_t, float32_t, v4sf, sf, f32)
> +__LD2_LANE_FUNC (float64x2x2_t, float64_t, v2df, df, f64)
> +__LD2_LANE_FUNC (poly8x16x2_t, poly8_t, v16qi, qi, p8)
> +__LD2_LANE_FUNC (poly16x8x2_t, poly16_t, v8hi, hi, p16)
> +__LD2_LANE_FUNC (int8x16x2_t, int8_t, v16qi, qi, s8)
> +__LD2_LANE_FUNC (int16x8x2_t, int16_t, v8hi, hi, s16)
> +__LD2_LANE_FUNC (int32x4x2_t, int32_t, v4si, si, s32)
> +__LD2_LANE_FUNC (int64x2x2_t, int64_t, v2di, di, s64)
> +__LD2_LANE_FUNC (uint8x16x2_t, uint8_t, v16qi, qi, u8)
> +__LD2_LANE_FUNC (uint16x8x2_t, uint16_t, v8hi, hi, u16)
> +__LD2_LANE_FUNC (uint32x4x2_t, uint32_t, v4si, si, u32)
> +__LD2_LANE_FUNC (uint64x2x2_t, uint64_t, v2di, di, u64)
>
>   #define __LD3R_FUNC(rettype, structtype, ptrtype,                      \
>                      regsuffix, funcsuffix, Q)                           \
> @@ -11887,47 +11919,85 @@ __LD3R_FUNC (uint16x8x3_t, uint16x3_t, uint16_t, 8h, u16, q)
>   __LD3R_FUNC (uint32x4x3_t, uint32x3_t, uint32_t, 4s, u32, q)
>   __LD3R_FUNC (uint64x2x3_t, uint64x3_t, uint64_t, 2d, u64, q)
>
> -#define __LD3_LANE_FUNC(rettype, ptrtype, regsuffix,                   \
> -                       lnsuffix, funcsuffix, Q)                        \
> -  __extension__ static __inline rettype                                        \
> -  __attribute__ ((__always_inline__))                                  \
> -  vld3 ## Q ## _lane_ ## funcsuffix (const ptrtype *ptr,               \
> -                                    rettype b, const int c)            \
> -  {                                                                    \
> -    rettype result;                                                    \
> -    __asm__ ("ld1 {v16." #regsuffix " - v18." #regsuffix "}, %1\n\t"   \
> -            "ld3 {v16." #lnsuffix " - v18." #lnsuffix "}[%3], %2\n\t"  \
> -            "st1 {v16." #regsuffix " - v18." #regsuffix "}, %0\n\t"    \
> -            : "=Q"(result)                                             \
> -            : "Q"(b), "Q"(*(const rettype *)ptr), "i"(c)               \
> -            : "memory", "v16", "v17", "v18");                          \
> -    return result;                                                     \
> -  }
> -
> -__LD3_LANE_FUNC (int8x8x3_t, uint8_t, 8b, b, s8,)
> -__LD3_LANE_FUNC (float32x2x3_t, float32_t, 2s, s, f32,)
> -__LD3_LANE_FUNC (float64x1x3_t, float64_t, 1d, d, f64,)
> -__LD3_LANE_FUNC (poly8x8x3_t, poly8_t, 8b, b, p8,)
> -__LD3_LANE_FUNC (poly16x4x3_t, poly16_t, 4h, h, p16,)
> -__LD3_LANE_FUNC (int16x4x3_t, int16_t, 4h, h, s16,)
> -__LD3_LANE_FUNC (int32x2x3_t, int32_t, 2s, s, s32,)
> -__LD3_LANE_FUNC (int64x1x3_t, int64_t, 1d, d, s64,)
> -__LD3_LANE_FUNC (uint8x8x3_t, uint8_t, 8b, b, u8,)
> -__LD3_LANE_FUNC (uint16x4x3_t, uint16_t, 4h, h, u16,)
> -__LD3_LANE_FUNC (uint32x2x3_t, uint32_t, 2s, s, u32,)
> -__LD3_LANE_FUNC (uint64x1x3_t, uint64_t, 1d, d, u64,)
> -__LD3_LANE_FUNC (float32x4x3_t, float32_t, 4s, s, f32, q)
> -__LD3_LANE_FUNC (float64x2x3_t, float64_t, 2d, d, f64, q)
> -__LD3_LANE_FUNC (poly8x16x3_t, poly8_t, 16b, b, p8, q)
> -__LD3_LANE_FUNC (poly16x8x3_t, poly16_t, 8h, h, p16, q)
> -__LD3_LANE_FUNC (int8x16x3_t, int8_t, 16b, b, s8, q)
> -__LD3_LANE_FUNC (int16x8x3_t, int16_t, 8h, h, s16, q)
> -__LD3_LANE_FUNC (int32x4x3_t, int32_t, 4s, s, s32, q)
> -__LD3_LANE_FUNC (int64x2x3_t, int64_t, 2d, d, s64, q)
> -__LD3_LANE_FUNC (uint8x16x3_t, uint8_t, 16b, b, u8, q)
> -__LD3_LANE_FUNC (uint16x8x3_t, uint16_t, 8h, h, u16, q)
> -__LD3_LANE_FUNC (uint32x4x3_t, uint32_t, 4s, s, u32, q)
> -__LD3_LANE_FUNC (uint64x2x3_t, uint64_t, 2d, d, u64, q)
> +#define __LD3_LANE_FUNC(intype, vectype, largetype, ptrtype,              \
> +                        mode, ptrmode, funcsuffix, signedtype)            \
> +__extension__ static __inline intype __attribute__ ((__always_inline__))   \
> +vld3_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c)  \
> +{                                                                         \
> +  __builtin_aarch64_simd_ci __o;                                          \
> +  largetype __temp;                                                       \
> +  __temp.val[0] =                                                         \
> +    vcombine_##funcsuffix (__b.val[0], vcreate_##funcsuffix (0));         \
> +  __temp.val[1] =                                                         \
> +    vcombine_##funcsuffix (__b.val[1], vcreate_##funcsuffix (0));         \
> +  __temp.val[2] =                                                         \
> +    vcombine_##funcsuffix (__b.val[2], vcreate_##funcsuffix (0));         \
> +  __o = __builtin_aarch64_set_qregci##mode (__o,                          \
> +                                          (signedtype) __temp.val[0],     \
> +                                          0);                             \
> +  __o = __builtin_aarch64_set_qregci##mode (__o,                          \
> +                                          (signedtype) __temp.val[1],     \
> +                                          1);                             \
> +  __o = __builtin_aarch64_set_qregci##mode (__o,                          \
> +                                          (signedtype) __temp.val[2],     \
> +                                          2);                             \
> +  __o =        __builtin_aarch64_ld3_lane##mode (                                 \
> +         (__builtin_aarch64_simd_##ptrmode *) __ptr, __o, __c);           \
> +  __b.val[0] = (vectype) __builtin_aarch64_get_dregcidi (__o, 0);         \
> +  __b.val[1] = (vectype) __builtin_aarch64_get_dregcidi (__o, 1);         \
> +  __b.val[2] = (vectype) __builtin_aarch64_get_dregcidi (__o, 2);         \
> +  return __b;                                                             \
> +}
> +
> +__LD3_LANE_FUNC (float32x2x3_t, float32x2_t, float32x4x3_t, float32_t, v4sf,
> +                sf, f32, float32x4_t)
> +__LD3_LANE_FUNC (float64x1x3_t, float64x1_t, float64x2x3_t, float64_t, v2df,
> +                df, f64, float64x2_t)
> +__LD3_LANE_FUNC (poly8x8x3_t, poly8x8_t, poly8x16x3_t, poly8_t, v16qi, qi, p8,
> +                int8x16_t)
> +__LD3_LANE_FUNC (poly16x4x3_t, poly16x4_t, poly16x8x3_t, poly16_t, v8hi, hi,
> +                p16, int16x8_t)
> +__LD3_LANE_FUNC (int8x8x3_t, int8x8_t, int8x16x3_t, int8_t, v16qi, qi, s8,
> +                int8x16_t)
> +__LD3_LANE_FUNC (int16x4x3_t, int16x4_t, int16x8x3_t, int16_t, v8hi, hi, s16,
> +                int16x8_t)
> +__LD3_LANE_FUNC (int32x2x3_t, int32x2_t, int32x4x3_t, int32_t, v4si, si, s32,
> +                int32x4_t)
> +__LD3_LANE_FUNC (int64x1x3_t, int64x1_t, int64x2x3_t, int64_t, v2di, di, s64,
> +                int64x2_t)
> +__LD3_LANE_FUNC (uint8x8x3_t, uint8x8_t, uint8x16x3_t, uint8_t, v16qi, qi, u8,
> +                int8x16_t)
> +__LD3_LANE_FUNC (uint16x4x3_t, uint16x4_t, uint16x8x3_t, uint16_t, v8hi, hi,
> +                u16, int16x8_t)
> +__LD3_LANE_FUNC (uint32x2x3_t, uint32x2_t, uint32x4x3_t, uint32_t, v4si, si,
> +                u32, int32x4_t)
> +__LD3_LANE_FUNC (uint64x1x3_t, uint64x1_t, uint64x2x3_t, uint64_t, v2di, di,
> +                u64, int64x2_t)
> +
> +#undef __LD3_LANE_FUNC
> +#define __LD3_LANE_FUNC(intype, ptrtype, mode, ptrmode, funcsuffix)       \
> +__extension__ static __inline intype __attribute__ ((__always_inline__))   \
> +vld3q_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c) \
> +{                                                                         \
> +  union { intype __i;                                                     \
> +         __builtin_aarch64_simd_xi __o; } __temp = { __b };               \
> +  __temp.__o = __builtin_aarch64_ld4_lane##mode (                         \
> +       (__builtin_aarch64_simd_##ptrmode *) __ptr, __temp.__o, __c);      \
> +  return __temp.__i;                                                      \
> +}
> +
> +__LD3_LANE_FUNC (float32x4x3_t, float32_t, v4sf, sf, f32)
> +__LD3_LANE_FUNC (float64x2x3_t, float64_t, v2df, df, f64)
> +__LD3_LANE_FUNC (poly8x16x3_t, poly8_t, v16qi, qi, p8)
> +__LD3_LANE_FUNC (poly16x8x3_t, poly16_t, v8hi, hi, p16)
> +__LD3_LANE_FUNC (int8x16x3_t, int8_t, v16qi, qi, s8)
> +__LD3_LANE_FUNC (int16x8x3_t, int16_t, v8hi, hi, s16)
> +__LD3_LANE_FUNC (int32x4x3_t, int32_t, v4si, si, s32)
> +__LD3_LANE_FUNC (int64x2x3_t, int64_t, v2di, di, s64)
> +__LD3_LANE_FUNC (uint8x16x3_t, uint8_t, v16qi, qi, u8)
> +__LD3_LANE_FUNC (uint16x8x3_t, uint16_t, v8hi, hi, u16)
> +__LD3_LANE_FUNC (uint32x4x3_t, uint32_t, v4si, si, u32)
> +__LD3_LANE_FUNC (uint64x2x3_t, uint64_t, v2di, di, u64)
>
>   #define __LD4R_FUNC(rettype, structtype, ptrtype,                      \
>                      regsuffix, funcsuffix, Q)                           \
> @@ -11969,47 +12039,92 @@ __LD4R_FUNC (uint16x8x4_t, uint16x4_t, uint16_t, 8h, u16, q)
>   __LD4R_FUNC (uint32x4x4_t, uint32x4_t, uint32_t, 4s, u32, q)
>   __LD4R_FUNC (uint64x2x4_t, uint64x4_t, uint64_t, 2d, u64, q)
>
> -#define __LD4_LANE_FUNC(rettype, ptrtype, regsuffix,                   \
> -                       lnsuffix, funcsuffix, Q)                        \
> -  __extension__ static __inline rettype                                        \
> -  __attribute__ ((__always_inline__))                                  \
> -  vld4 ## Q ## _lane_ ## funcsuffix (const ptrtype *ptr,               \
> -                                    rettype b, const int c)            \
> -  {                                                                    \
> -    rettype result;                                                    \
> -    __asm__ ("ld1 {v16." #regsuffix " - v19." #regsuffix "}, %1\n\t"   \
> -            "ld4 {v16." #lnsuffix " - v19." #lnsuffix "}[%3], %2\n\t"  \
> -            "st1 {v16." #regsuffix " - v19." #regsuffix "}, %0\n\t"    \
> -            : "=Q"(result)                                             \
> -            : "Q"(b), "Q"(*(const rettype *)ptr), "i"(c)               \
> -            : "memory", "v16", "v17", "v18", "v19");                   \
> -    return result;                                                     \
> -  }
>
> -__LD4_LANE_FUNC (int8x8x4_t, uint8_t, 8b, b, s8,)
> -__LD4_LANE_FUNC (float32x2x4_t, float32_t, 2s, s, f32,)
> -__LD4_LANE_FUNC (float64x1x4_t, float64_t, 1d, d, f64,)
> -__LD4_LANE_FUNC (poly8x8x4_t, poly8_t, 8b, b, p8,)
> -__LD4_LANE_FUNC (poly16x4x4_t, poly16_t, 4h, h, p16,)
> -__LD4_LANE_FUNC (int16x4x4_t, int16_t, 4h, h, s16,)
> -__LD4_LANE_FUNC (int32x2x4_t, int32_t, 2s, s, s32,)
> -__LD4_LANE_FUNC (int64x1x4_t, int64_t, 1d, d, s64,)
> -__LD4_LANE_FUNC (uint8x8x4_t, uint8_t, 8b, b, u8,)
> -__LD4_LANE_FUNC (uint16x4x4_t, uint16_t, 4h, h, u16,)
> -__LD4_LANE_FUNC (uint32x2x4_t, uint32_t, 2s, s, u32,)
> -__LD4_LANE_FUNC (uint64x1x4_t, uint64_t, 1d, d, u64,)
> -__LD4_LANE_FUNC (float32x4x4_t, float32_t, 4s, s, f32, q)
> -__LD4_LANE_FUNC (float64x2x4_t, float64_t, 2d, d, f64, q)
> -__LD4_LANE_FUNC (poly8x16x4_t, poly8_t, 16b, b, p8, q)
> -__LD4_LANE_FUNC (poly16x8x4_t, poly16_t, 8h, h, p16, q)
> -__LD4_LANE_FUNC (int8x16x4_t, int8_t, 16b, b, s8, q)
> -__LD4_LANE_FUNC (int16x8x4_t, int16_t, 8h, h, s16, q)
> -__LD4_LANE_FUNC (int32x4x4_t, int32_t, 4s, s, s32, q)
> -__LD4_LANE_FUNC (int64x2x4_t, int64_t, 2d, d, s64, q)
> -__LD4_LANE_FUNC (uint8x16x4_t, uint8_t, 16b, b, u8, q)
> -__LD4_LANE_FUNC (uint16x8x4_t, uint16_t, 8h, h, u16, q)
> -__LD4_LANE_FUNC (uint32x4x4_t, uint32_t, 4s, s, u32, q)
> -__LD4_LANE_FUNC (uint64x2x4_t, uint64_t, 2d, d, u64, q)
> +#define __LD4_LANE_FUNC(intype, vectype, largetype, ptrtype,              \
> +                        mode, ptrmode, funcsuffix, signedtype)            \
> +__extension__ static __inline intype __attribute__ ((__always_inline__))   \
> +vld4_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c)  \
> +{                                                                         \
> +  __builtin_aarch64_simd_xi __o;                                          \
> +  largetype __temp;                                                       \
> +  __temp.val[0] =                                                         \
> +    vcombine_##funcsuffix (__b.val[0], vcreate_##funcsuffix (0));         \
> +  __temp.val[1] =                                                         \
> +    vcombine_##funcsuffix (__b.val[1], vcreate_##funcsuffix (0));         \
> +  __temp.val[2] =                                                         \
> +    vcombine_##funcsuffix (__b.val[2], vcreate_##funcsuffix (0));         \
> +  __temp.val[3] =                                                         \
> +    vcombine_##funcsuffix (__b.val[3], vcreate_##funcsuffix (0));         \
> +  __o = __builtin_aarch64_set_qregxi##mode (__o,                          \
> +                                          (signedtype) __temp.val[0],     \
> +                                          0);                             \
> +  __o = __builtin_aarch64_set_qregxi##mode (__o,                          \
> +                                          (signedtype) __temp.val[1],     \
> +                                          1);                             \
> +  __o = __builtin_aarch64_set_qregxi##mode (__o,                          \
> +                                          (signedtype) __temp.val[2],     \
> +                                          2);                             \
> +  __o = __builtin_aarch64_set_qregxi##mode (__o,                          \
> +                                          (signedtype) __temp.val[3],     \
> +                                          3);                             \
> +  __o =        __builtin_aarch64_ld4_lane##mode (                                 \
> +         (__builtin_aarch64_simd_##ptrmode *) __ptr, __o, __c);           \
> +  __b.val[0] = (vectype) __builtin_aarch64_get_dregxidi (__o, 0);         \
> +  __b.val[1] = (vectype) __builtin_aarch64_get_dregxidi (__o, 1);         \
> +  __b.val[2] = (vectype) __builtin_aarch64_get_dregxidi (__o, 2);         \
> +  __b.val[3] = (vectype) __builtin_aarch64_get_dregxidi (__o, 3);         \
> +  return __b;                                                             \
> +}
> +
> +__LD4_LANE_FUNC (float32x2x4_t, float32x2_t, float32x4x4_t, float32_t, v4sf,
> +                sf, f32, float32x4_t)
> +__LD4_LANE_FUNC (float64x1x4_t, float64x1_t, float64x2x4_t, float64_t, v2df,
> +                df, f64, float64x2_t)
> +__LD4_LANE_FUNC (poly8x8x4_t, poly8x8_t, poly8x16x4_t, poly8_t, v16qi, qi, p8,
> +                int8x16_t)
> +__LD4_LANE_FUNC (poly16x4x4_t, poly16x4_t, poly16x8x4_t, poly16_t, v8hi, hi,
> +                p16, int16x8_t)
> +__LD4_LANE_FUNC (int8x8x4_t, int8x8_t, int8x16x4_t, int8_t, v16qi, qi, s8,
> +                int8x16_t)
> +__LD4_LANE_FUNC (int16x4x4_t, int16x4_t, int16x8x4_t, int16_t, v8hi, hi, s16,
> +                int16x8_t)
> +__LD4_LANE_FUNC (int32x2x4_t, int32x2_t, int32x4x4_t, int32_t, v4si, si, s32,
> +                int32x4_t)
> +__LD4_LANE_FUNC (int64x1x4_t, int64x1_t, int64x2x4_t, int64_t, v2di, di, s64,
> +                int64x2_t)
> +__LD4_LANE_FUNC (uint8x8x4_t, uint8x8_t, uint8x16x4_t, uint8_t, v16qi, qi, u8,
> +                int8x16_t)
> +__LD4_LANE_FUNC (uint16x4x4_t, uint16x4_t, uint16x8x4_t, uint16_t, v8hi, hi,
> +                u16, int16x8_t)
> +__LD4_LANE_FUNC (uint32x2x4_t, uint32x2_t, uint32x4x4_t, uint32_t, v4si, si,
> +                u32, int32x4_t)
> +__LD4_LANE_FUNC (uint64x1x4_t, uint64x1_t, uint64x2x4_t, uint64_t, v2di, di,
> +                u64, int64x2_t)
> +
> +#undef __LD4_LANE_FUNC
> +#define __LD4_LANE_FUNC(intype, ptrtype, mode, ptrmode, funcsuffix)       \
> +__extension__ static __inline intype __attribute__ ((__always_inline__))   \
> +vld4q_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c) \
> +{                                                                         \
> +  union { intype __i;                                                     \
> +         __builtin_aarch64_simd_xi __o; } __temp = { __b };               \
> +  __temp.__o = __builtin_aarch64_ld4_lane##mode (                         \
> +       (__builtin_aarch64_simd_##ptrmode *) __ptr, __temp.__o, __c);      \
> +  return __temp.__i;                                                      \
> +}
> +

The reason we avoided using type-punning using unions was that reload 
would get confused with potential subreg(mem) that could be introduced 
because of memory xfer caused by unions and large int modes. As a 
result, we would get incorrect or sub-optimal code. But this seems to 
have fixed itself. :-)

Because this involves xfers between large int modes and 
CANNOT_CHANGE_MODE_CLASS has some impact on it, it would be good to test 
what impact your patch has with C_C_M_C removed, so that it will be 
easier to fix the fallout once we remove C_C_M_C eventually. To test 
this you will need Richard's patch set 
https://gcc.gnu.org/ml/gcc-patches/2014-09/msg01440.html.

Same for your other 2 patches in this series(3,4).

Thanks,
Tejas.

> +__LD4_LANE_FUNC (float32x4x4_t, float32_t, v4sf, sf, f32)
> +__LD4_LANE_FUNC (float64x2x4_t, float64_t, v2df, df, f64)
> +__LD4_LANE_FUNC (poly8x16x4_t, poly8_t, v16qi, qi, p8)
> +__LD4_LANE_FUNC (poly16x8x4_t, poly16_t, v8hi, hi, p16)
> +__LD4_LANE_FUNC (int8x16x4_t, int8_t, v16qi, qi, s8)
> +__LD4_LANE_FUNC (int16x8x4_t, int16_t, v8hi, hi, s16)
> +__LD4_LANE_FUNC (int32x4x4_t, int32_t, v4si, si, s32)
> +__LD4_LANE_FUNC (int64x2x4_t, int64_t, v2di, di, s64)
> +__LD4_LANE_FUNC (uint8x16x4_t, uint8_t, v16qi, qi, u8)
> +__LD4_LANE_FUNC (uint16x8x4_t, uint16_t, v8hi, hi, u16)
> +__LD4_LANE_FUNC (uint32x4x4_t, uint32_t, v4si, si, u32)
> +__LD4_LANE_FUNC (uint64x2x4_t, uint64_t, v2di, di, u64)
>
>   #define __ST2_LANE_FUNC(intype, largetype, ptrtype,                         \
>                          mode, ptr_mode, funcsuffix, signedtype)              \
> --
> 1.9.1
>
>




More information about the Gcc-patches mailing list