[PATCH 1/2] [ARM] PR68532: Fix up vuzp for big endian
Kyrill Tkachov
kyrylo.tkachov@foss.arm.com
Mon Feb 8 11:42:00 GMT 2016
Hi Charles,
On 03/02/16 18:59, charles.baylis@linaro.org wrote:
> From: Charles Baylis <charles.baylis@linaro.org>
>
> gcc/ChangeLog:
>
> 2016-02-03 Charles Baylis <charles.baylis@linaro.org>
>
> PR target/68532
> * config/arm/arm.c (neon_endian_lane_map): New function.
> (neon_vector_pair_endian_lane_map): New function.
> (arm_evpc_neon_vuzp): Allow for big endian lane order.
> * config/arm/arm_neon.h (vuzpq_s8): Adjust shuffle patterns for big
> endian.
> (vuzpq_s16): Likewise.
> (vuzpq_s32): Likewise.
> (vuzpq_f32): Likewise.
> (vuzpq_u8): Likewise.
> (vuzpq_u16): Likewise.
> (vuzpq_u32): Likewise.
> (vuzpq_p8): Likewise.
> (vuzpq_p16): Likewise.
>
> gcc/testsuite/ChangeLog:
>
> 2015-12-15 Charles Baylis <charles.baylis@linaro.org>
>
> PR target/68532
> * gcc.c-torture/execute/pr68532.c: New test.
>
> Change-Id: Ifd35d79bd42825f05403a1b96d8f34ef0f21dac3
>
> diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c
> index d8a2745..e9aa982 100644
> --- a/gcc/config/arm/arm.c
> +++ b/gcc/config/arm/arm.c
> @@ -28208,6 +28208,35 @@ arm_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
> arm_expand_vec_perm_1 (target, op0, op1, sel);
> }
>
> +/* map lane ordering between architectural lane order, and GCC lane order,
> + taking into account ABI. See comment above output_move_neon for details. */
> +static int
> +neon_endian_lane_map (machine_mode mode, int lane)
s/map/Map/
Also, add a blank line between the comment and the function signature.
> +{
> + if (BYTES_BIG_ENDIAN)
> + {
> + int nelems = GET_MODE_NUNITS (mode);
> + /* Reverse lane order. */
> + lane = (nelems - 1 - lane);
> + /* Reverse D register order, to match ABI. */
> + if (GET_MODE_SIZE (mode) == 16)
> + lane = lane ^ (nelems / 2);
> + }
> + return lane;
> +}
> +
> +/* some permutations index into pairs of vectors, this is a helper function
> + to map indexes into those pairs of vectors. */
> +static int
> +neon_pair_endian_lane_map (machine_mode mode, int lane)
Similarly, s/some/Some/ here, and add a blank line after the comment.
> +{
> + int nelem = GET_MODE_NUNITS (mode);
> + if (BYTES_BIG_ENDIAN)
> + lane =
> + neon_endian_lane_map (mode, lane & (nelem - 1)) + (lane & nelem);
> + return lane;
> +}
> +
> /* Generate or test for an insn that supports a constant permutation. */
>
> /* Recognize patterns for the VUZP insns. */
> @@ -28218,14 +28247,22 @@ arm_evpc_neon_vuzp (struct expand_vec_perm_d *d)
> unsigned int i, odd, mask, nelt = d->nelt;
> rtx out0, out1, in0, in1;
> rtx (*gen)(rtx, rtx, rtx, rtx);
> + int first_elem;
> + int swap;
>
Just make this a bool.
> if (GET_MODE_UNIT_SIZE (d->vmode) >= 8)
> return false;
>
> - /* Note that these are little-endian tests. Adjust for big-endian later. */
> - if (d->perm[0] == 0)
> + /* arm_expand_vec_perm_const_1 () helpfully swaps the operands for the
> + big endian pattern on 64 bit vectors, so we correct for that. */
> + swap = BYTES_BIG_ENDIAN && !d->one_vector_p
> + && GET_MODE_SIZE (d->vmode) == 8 ? d->nelt : 0;
> +
> + first_elem = d->perm[neon_endian_lane_map (d->vmode, 0)] ^ swap;
> +
> + if (first_elem == neon_endian_lane_map (d->vmode, 0))
> odd = 0;
> - else if (d->perm[0] == 1)
> + else if (first_elem == neon_endian_lane_map (d->vmode, 1))
> odd = 1;
> else
> return false;
> @@ -28233,8 +28270,9 @@ arm_evpc_neon_vuzp (struct expand_vec_perm_d *d)
>
> for (i = 0; i < nelt; i++)
> {
> - unsigned elt = (i * 2 + odd) & mask;
> - if (d->perm[i] != elt)
> + unsigned elt =
> + (neon_pair_endian_lane_map (d->vmode, i) * 2 + odd) & mask;
> + if ((d->perm[i] ^ swap) != neon_pair_endian_lane_map (d->vmode, elt))
> return false;
> }
>
> @@ -28258,10 +28296,9 @@ arm_evpc_neon_vuzp (struct expand_vec_perm_d *d)
>
> in0 = d->op0;
> in1 = d->op1;
> - if (BYTES_BIG_ENDIAN)
> + if (swap)
> {
> std::swap (in0, in1);
> - odd = !odd;
> }
Remove the braces around the std::swap call — a single-statement body doesn't need them.
>
> out0 = d->target;
> diff --git a/gcc/config/arm/arm_neon.h b/gcc/config/arm/arm_neon.h
> index 47816d5..2e014b6 100644
> --- a/gcc/config/arm/arm_neon.h
> +++ b/gcc/config/arm/arm_neon.h
> @@ -8741,9 +8741,9 @@ vuzpq_s8 (int8x16_t __a, int8x16_t __b)
> int8x16x2_t __rv;
> #ifdef __ARM_BIG_ENDIAN
> __rv.val[0] = __builtin_shuffle (__a, __b, (uint8x16_t)
> - { 17, 19, 21, 23, 25, 27, 29, 31, 1, 3, 5, 7, 9, 11, 13, 15 });
> + { 9, 11, 13, 15, 1, 3, 5, 7, 25, 27, 29, 31, 17, 19, 21, 23 });
> __rv.val[1] = __builtin_shuffle (__a, __b, (uint8x16_t)
> - { 16, 18, 20, 22, 24, 26, 28, 30, 0, 2, 4, 6, 8, 10, 12, 14 });
> + { 8, 10, 12, 14, 0, 2, 4, 6, 24, 26, 28, 30, 16, 18, 20, 22 });
> #else
> __rv.val[0] = __builtin_shuffle (__a, __b, (uint8x16_t)
> { 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 });
> @@ -8759,9 +8759,9 @@ vuzpq_s16 (int16x8_t __a, int16x8_t __b)
> int16x8x2_t __rv;
> #ifdef __ARM_BIG_ENDIAN
> __rv.val[0] = __builtin_shuffle (__a, __b, (uint16x8_t)
> - { 9, 11, 13, 15, 1, 3, 5, 7 });
> + { 5, 7, 1, 3, 13, 15, 9, 11 });
> __rv.val[1] = __builtin_shuffle (__a, __b, (uint16x8_t)
> - { 8, 10, 12, 14, 0, 2, 4, 6 });
> + { 4, 6, 0, 2, 12, 14, 8, 10 });
> #else
> __rv.val[0] = __builtin_shuffle (__a, __b, (uint16x8_t)
> { 0, 2, 4, 6, 8, 10, 12, 14 });
> @@ -8776,8 +8776,8 @@ vuzpq_s32 (int32x4_t __a, int32x4_t __b)
> {
> int32x4x2_t __rv;
> #ifdef __ARM_BIG_ENDIAN
> - __rv.val[0] = __builtin_shuffle (__a, __b, (uint32x4_t) { 5, 7, 1, 3 });
> - __rv.val[1] = __builtin_shuffle (__a, __b, (uint32x4_t) { 4, 6, 0, 2 });
> + __rv.val[0] = __builtin_shuffle (__a, __b, (uint32x4_t) { 3, 1, 7, 5 });
> + __rv.val[1] = __builtin_shuffle (__a, __b, (uint32x4_t) { 2, 0, 6, 4 });
> #else
> __rv.val[0] = __builtin_shuffle (__a, __b, (uint32x4_t) { 0, 2, 4, 6 });
> __rv.val[1] = __builtin_shuffle (__a, __b, (uint32x4_t) { 1, 3, 5, 7 });
> @@ -8790,8 +8790,8 @@ vuzpq_f32 (float32x4_t __a, float32x4_t __b)
> {
> float32x4x2_t __rv;
> #ifdef __ARM_BIG_ENDIAN
> - __rv.val[0] = __builtin_shuffle (__a, __b, (uint32x4_t) { 5, 7, 1, 3 });
> - __rv.val[1] = __builtin_shuffle (__a, __b, (uint32x4_t) { 4, 6, 0, 2 });
> + __rv.val[0] = __builtin_shuffle (__a, __b, (uint32x4_t) { 3, 1, 7, 5 });
> + __rv.val[1] = __builtin_shuffle (__a, __b, (uint32x4_t) { 2, 0, 6, 4 });
> #else
> __rv.val[0] = __builtin_shuffle (__a, __b, (uint32x4_t) { 0, 2, 4, 6 });
> __rv.val[1] = __builtin_shuffle (__a, __b, (uint32x4_t) { 1, 3, 5, 7 });
> @@ -8805,9 +8805,9 @@ vuzpq_u8 (uint8x16_t __a, uint8x16_t __b)
> uint8x16x2_t __rv;
> #ifdef __ARM_BIG_ENDIAN
> __rv.val[0] = __builtin_shuffle (__a, __b, (uint8x16_t)
> - { 17, 19, 21, 23, 25, 27, 29, 31, 1, 3, 5, 7, 9, 11, 13, 15 });
> + { 9, 11, 13, 15, 1, 3, 5, 7, 25, 27, 29, 31, 17, 19, 21, 23 });
> __rv.val[1] = __builtin_shuffle (__a, __b, (uint8x16_t)
> - { 16, 18, 20, 22, 24, 26, 28, 30, 0, 2, 4, 6, 8, 10, 12, 14 });
> + { 8, 10, 12, 14, 0, 2, 4, 6, 24, 26, 28, 30, 16, 18, 20, 22 });
> #else
> __rv.val[0] = __builtin_shuffle (__a, __b, (uint8x16_t)
> { 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 });
> @@ -8823,9 +8823,9 @@ vuzpq_u16 (uint16x8_t __a, uint16x8_t __b)
> uint16x8x2_t __rv;
> #ifdef __ARM_BIG_ENDIAN
> __rv.val[0] = __builtin_shuffle (__a, __b, (uint16x8_t)
> - { 9, 11, 13, 15, 1, 3, 5, 7 });
> + { 5, 7, 1, 3, 13, 15, 9, 11 });
> __rv.val[1] = __builtin_shuffle (__a, __b, (uint16x8_t)
> - { 8, 10, 12, 14, 0, 2, 4, 6 });
> + { 4, 6, 0, 2, 12, 14, 8, 10 });
> #else
> __rv.val[0] = __builtin_shuffle (__a, __b, (uint16x8_t)
> { 0, 2, 4, 6, 8, 10, 12, 14 });
> @@ -8840,8 +8840,8 @@ vuzpq_u32 (uint32x4_t __a, uint32x4_t __b)
> {
> uint32x4x2_t __rv;
> #ifdef __ARM_BIG_ENDIAN
> - __rv.val[0] = __builtin_shuffle (__a, __b, (uint32x4_t) { 5, 7, 1, 3 });
> - __rv.val[1] = __builtin_shuffle (__a, __b, (uint32x4_t) { 4, 6, 0, 2 });
> + __rv.val[0] = __builtin_shuffle (__a, __b, (uint32x4_t) { 3, 1, 7, 5 });
> + __rv.val[1] = __builtin_shuffle (__a, __b, (uint32x4_t) { 2, 0, 6, 4 });
> #else
> __rv.val[0] = __builtin_shuffle (__a, __b, (uint32x4_t) { 0, 2, 4, 6 });
> __rv.val[1] = __builtin_shuffle (__a, __b, (uint32x4_t) { 1, 3, 5, 7 });
> @@ -8855,9 +8855,9 @@ vuzpq_p8 (poly8x16_t __a, poly8x16_t __b)
> poly8x16x2_t __rv;
> #ifdef __ARM_BIG_ENDIAN
> __rv.val[0] = __builtin_shuffle (__a, __b, (uint8x16_t)
> - { 17, 19, 21, 23, 25, 27, 29, 31, 1, 3, 5, 7, 9, 11, 13, 15 });
> + { 9, 11, 13, 15, 1, 3, 5, 7, 25, 27, 29, 31, 17, 19, 21, 23 });
> __rv.val[1] = __builtin_shuffle (__a, __b, (uint8x16_t)
> - { 16, 18, 20, 22, 24, 26, 28, 30, 0, 2, 4, 6, 8, 10, 12, 14 });
> + { 8, 10, 12, 14, 0, 2, 4, 6, 24, 26, 28, 30, 16, 18, 20, 22 });
> #else
> __rv.val[0] = __builtin_shuffle (__a, __b, (uint8x16_t)
> { 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 });
> @@ -8873,9 +8873,9 @@ vuzpq_p16 (poly16x8_t __a, poly16x8_t __b)
> poly16x8x2_t __rv;
> #ifdef __ARM_BIG_ENDIAN
> __rv.val[0] = __builtin_shuffle (__a, __b, (uint16x8_t)
> - { 9, 11, 13, 15, 1, 3, 5, 7 });
> + { 5, 7, 1, 3, 13, 15, 9, 11 });
> __rv.val[1] = __builtin_shuffle (__a, __b, (uint16x8_t)
> - { 8, 10, 12, 14, 0, 2, 4, 6 });
> + { 4, 6, 0, 2, 12, 14, 8, 10 });
> #else
> __rv.val[0] = __builtin_shuffle (__a, __b, (uint16x8_t)
> { 0, 2, 4, 6, 8, 10, 12, 14 });
> diff --git a/gcc/testsuite/gcc.c-torture/execute/pr68532.c b/gcc/testsuite/gcc.c-torture/execute/pr68532.c
> new file mode 100644
> index 0000000..3c40aa8
> --- /dev/null
> +++ b/gcc/testsuite/gcc.c-torture/execute/pr68532.c
> @@ -0,0 +1,24 @@
> +/* { dg-options "-O2 -ftree-vectorize -fno-vect-cost-model" } */
> +
> +#define SIZE 128
> +unsigned short _Alignas (16) in[SIZE];
> +
> +extern void abort (void);
> +
> +__attribute__ ((noinline)) int
> +test (unsigned short sum, unsigned short *in, int x)
> +{
> + for (int j = 0; j < SIZE; j += 8)
> + sum += in[j] * x;
> + return sum;
> +}
> +
> +int
> +main ()
> +{
> + for (int i = 0; i < SIZE; i++)
> + in[i] = i;
> + if (test (0, in, 1) != 960)
> + abort ();
AFAIK tests here usually prefer __builtin_abort ();
That way you don't have to declare the abort prototype at the beginning of the file.
Ok with those changes.
Thanks,
Kyrill
More information about the Gcc-patches
mailing list