This is the mail archive of the
gcc-patches@gcc.gnu.org
mailing list for the GCC project.
Re: Update interface to TARGET_VECTORIZE_VEC_PERM_CONST_OK
- From: Richard Biener <richard dot guenther at gmail dot com>
- To: GCC Patches <gcc-patches at gcc dot gnu dot org>, Richard Sandiford <richard dot sandiford at linaro dot org>
- Date: Mon, 25 Sep 2017 13:43:05 +0200
- Subject: Re: Update interface to TARGET_VECTORIZE_VEC_PERM_CONST_OK
- Authentication-results: sourceware.org; auth=none
- References: <87a81mr8th.fsf@linaro.org>
On Fri, Sep 22, 2017 at 6:34 PM, Richard Sandiford
<richard.sandiford@linaro.org> wrote:
> This patch makes TARGET_VECTORIZE_VEC_PERM_CONST_OK take the permute
> vector in the form of a vec_perm_indices instead of an unsigned char *.
> It follows on from the recent patch that did the same in target-independent
> code.
>
> It was easy to make ARM and AArch64 use vec_perm_indices internally
> as well, and converting AArch64 helps with SVE. I did try doing the same
> for the other ports, but the surgery needed was much more invasive and
> much less obviously correct.
>
> Tested on aarch64-linux-gnu, x86_64-linux-gnu and powerpc64le-linux-gnu.
> Also tested by comparing the testsuite assembly output on at least one
> target per CPU directory. OK to install?
Ok.
Thanks,
Richard.
> Richard
>
>
> 2017-09-22  Richard Sandiford  <richard.sandiford@linaro.org>
>
> gcc/
> * target.def (vec_perm_const_ok): Change sel parameter to
> vec_perm_indices.
> * optabs-query.c (can_vec_perm_p): Update accordingly.
> * doc/tm.texi: Regenerate.
> * config/aarch64/aarch64.c (expand_vec_perm_d): Change perm to
> auto_vec_perm_indices and remove separate nelt field.
> (aarch64_evpc_trn, aarch64_evpc_uzp, aarch64_evpc_zip)
> (aarch64_evpc_ext, aarch64_evpc_rev, aarch64_evpc_dup)
> (aarch64_evpc_tbl, aarch64_expand_vec_perm_const_1)
> (aarch64_expand_vec_perm_const): Update accordingly.
> (aarch64_vectorize_vec_perm_const_ok): Likewise. Change sel
> to vec_perm_indices.
> * config/arm/arm.c (expand_vec_perm_d): Change perm to
> auto_vec_perm_indices and remove separate nelt field.
> (arm_evpc_neon_vuzp, arm_evpc_neon_vzip, arm_evpc_neon_vrev)
> (arm_evpc_neon_vtrn, arm_evpc_neon_vext, arm_evpc_neon_vtbl)
> (arm_expand_vec_perm_const_1, arm_expand_vec_perm_const): Update
> accordingly.
> (arm_vectorize_vec_perm_const_ok): Likewise. Change sel
> to vec_perm_indices.
> * config/i386/i386.c (ix86_vectorize_vec_perm_const_ok): Change
> sel to vec_perm_indices.
> * config/ia64/ia64.c (ia64_vectorize_vec_perm_const_ok): Likewise.
> * config/mips/mips.c (mips_vectorize_vec_perm_const_ok): Likewise.
> * config/powerpcspe/powerpcspe.c (rs6000_vectorize_vec_perm_const_ok):
> Likewise.
> * config/rs6000/rs6000.c (rs6000_vectorize_vec_perm_const_ok):
> Likewise.
>
> Index: gcc/target.def
> ===================================================================
> --- gcc/target.def 2017-09-22 17:31:36.935337179 +0100
> +++ gcc/target.def 2017-09-22 17:31:56.428954480 +0100
> @@ -1847,7 +1847,7 @@ DEFHOOK
> DEFHOOK
> (vec_perm_const_ok,
> "Return true if a vector created for @code{vec_perm_const} is valid.",
> - bool, (machine_mode, const unsigned char *sel),
> + bool, (machine_mode, vec_perm_indices),
> NULL)
>
> /* Return true if the target supports misaligned store/load of a
> Index: gcc/optabs-query.c
> ===================================================================
> --- gcc/optabs-query.c 2017-09-14 17:04:19.080694343 +0100
> +++ gcc/optabs-query.c 2017-09-22 17:31:56.428006577 +0100
> @@ -367,7 +367,7 @@ can_vec_perm_p (machine_mode mode, bool
> if (direct_optab_handler (vec_perm_const_optab, mode) != CODE_FOR_nothing
> && (sel == NULL
> || targetm.vectorize.vec_perm_const_ok == NULL
> - || targetm.vectorize.vec_perm_const_ok (mode, &(*sel)[0])))
> + || targetm.vectorize.vec_perm_const_ok (mode, *sel)))
> return true;
> }
>
> Index: gcc/doc/tm.texi
> ===================================================================
> --- gcc/doc/tm.texi 2017-09-22 17:31:36.933441374 +0100
> +++ gcc/doc/tm.texi 2017-09-22 17:31:56.428006577 +0100
> @@ -5774,7 +5774,7 @@ correct for most targets.
> Return true if vector alignment is reachable (by peeling N iterations) for the given scalar type @var{type}. @var{is_packed} is false if the scalar access using @var{type} is known to be naturally aligned.
> @end deftypefn
>
> -@deftypefn {Target Hook} bool TARGET_VECTORIZE_VEC_PERM_CONST_OK (machine_mode, const unsigned char *@var{sel})
> +@deftypefn {Target Hook} bool TARGET_VECTORIZE_VEC_PERM_CONST_OK (machine_mode, @var{vec_perm_indices})
> Return true if a vector created for @code{vec_perm_const} is valid.
> @end deftypefn
>
> Index: gcc/config/aarch64/aarch64.c
> ===================================================================
> --- gcc/config/aarch64/aarch64.c 2017-09-21 11:53:16.681759682 +0100
> +++ gcc/config/aarch64/aarch64.c 2017-09-22 17:31:56.412840135 +0100
> @@ -141,8 +141,8 @@ static void aarch64_elf_asm_constructor
> static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
> static void aarch64_override_options_after_change (void);
> static bool aarch64_vector_mode_supported_p (machine_mode);
> -static bool aarch64_vectorize_vec_perm_const_ok (machine_mode vmode,
> - const unsigned char *sel);
> +static bool aarch64_vectorize_vec_perm_const_ok (machine_mode,
> + vec_perm_indices);
> static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
> static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
> const_tree type,
> @@ -13146,9 +13146,8 @@ #define MAX_VECT_LEN 16
> struct expand_vec_perm_d
> {
> rtx target, op0, op1;
> - unsigned char perm[MAX_VECT_LEN];
> + auto_vec_perm_indices perm;
> machine_mode vmode;
> - unsigned char nelt;
> bool one_vector_p;
> bool testing_p;
> };
> @@ -13231,7 +13230,7 @@ aarch64_expand_vec_perm (rtx target, rtx
> static bool
> aarch64_evpc_trn (struct expand_vec_perm_d *d)
> {
> - unsigned int i, odd, mask, nelt = d->nelt;
> + unsigned int i, odd, mask, nelt = d->perm.length ();
> rtx out, in0, in1, x;
> rtx (*gen) (rtx, rtx, rtx);
> machine_mode vmode = d->vmode;
> @@ -13319,7 +13318,7 @@ aarch64_evpc_trn (struct expand_vec_perm
> static bool
> aarch64_evpc_uzp (struct expand_vec_perm_d *d)
> {
> - unsigned int i, odd, mask, nelt = d->nelt;
> + unsigned int i, odd, mask, nelt = d->perm.length ();
> rtx out, in0, in1, x;
> rtx (*gen) (rtx, rtx, rtx);
> machine_mode vmode = d->vmode;
> @@ -13406,7 +13405,7 @@ aarch64_evpc_uzp (struct expand_vec_perm
> static bool
> aarch64_evpc_zip (struct expand_vec_perm_d *d)
> {
> - unsigned int i, high, mask, nelt = d->nelt;
> + unsigned int i, high, mask, nelt = d->perm.length ();
> rtx out, in0, in1, x;
> rtx (*gen) (rtx, rtx, rtx);
> machine_mode vmode = d->vmode;
> @@ -13499,7 +13498,7 @@ aarch64_evpc_zip (struct expand_vec_perm
> static bool
> aarch64_evpc_ext (struct expand_vec_perm_d *d)
> {
> - unsigned int i, nelt = d->nelt;
> + unsigned int i, nelt = d->perm.length ();
> rtx (*gen) (rtx, rtx, rtx, rtx);
> rtx offset;
>
> @@ -13563,7 +13562,7 @@ aarch64_evpc_ext (struct expand_vec_perm
> static bool
> aarch64_evpc_rev (struct expand_vec_perm_d *d)
> {
> - unsigned int i, j, diff, nelt = d->nelt;
> + unsigned int i, j, diff, nelt = d->perm.length ();
> rtx (*gen) (rtx, rtx);
>
> if (!d->one_vector_p)
> @@ -13641,7 +13640,7 @@ aarch64_evpc_dup (struct expand_vec_perm
> rtx out = d->target;
> rtx in0;
> machine_mode vmode = d->vmode;
> - unsigned int i, elt, nelt = d->nelt;
> + unsigned int i, elt, nelt = d->perm.length ();
> rtx lane;
>
> elt = d->perm[0];
> @@ -13686,7 +13685,7 @@ aarch64_evpc_tbl (struct expand_vec_perm
> {
> rtx rperm[MAX_VECT_LEN], sel;
> machine_mode vmode = d->vmode;
> - unsigned int i, nelt = d->nelt;
> + unsigned int i, nelt = d->perm.length ();
>
> if (d->testing_p)
> return true;
> @@ -13720,12 +13719,11 @@ aarch64_expand_vec_perm_const_1 (struct
> /* The pattern matching functions above are written to look for a small
> number to begin the sequence (0, 1, N/2). If we begin with an index
> from the second operand, we can swap the operands. */
> - if (d->perm[0] >= d->nelt)
> + unsigned int nelt = d->perm.length ();
> + if (d->perm[0] >= nelt)
> {
> - unsigned i, nelt = d->nelt;
> -
> gcc_assert (nelt == (nelt & -nelt));
> - for (i = 0; i < nelt; ++i)
> + for (unsigned int i = 0; i < nelt; ++i)
> d->perm[i] ^= nelt; /* Keep the same index, but in the other vector. */
>
> std::swap (d->op0, d->op1);
> @@ -13764,15 +13762,16 @@ aarch64_expand_vec_perm_const (rtx targe
>
> d.vmode = GET_MODE (target);
> gcc_assert (VECTOR_MODE_P (d.vmode));
> - d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
> d.testing_p = false;
>
> + nelt = GET_MODE_NUNITS (d.vmode);
> + d.perm.reserve (nelt);
> for (i = which = 0; i < nelt; ++i)
> {
> rtx e = XVECEXP (sel, 0, i);
> int ei = INTVAL (e) & (2 * nelt - 1);
> which |= (ei < nelt ? 1 : 2);
> - d.perm[i] = ei;
> + d.perm.quick_push (ei);
> }
>
> switch (which)
> @@ -13807,19 +13806,18 @@ aarch64_expand_vec_perm_const (rtx targe
> }
>
> static bool
> -aarch64_vectorize_vec_perm_const_ok (machine_mode vmode,
> - const unsigned char *sel)
> +aarch64_vectorize_vec_perm_const_ok (machine_mode vmode, vec_perm_indices sel)
> {
> struct expand_vec_perm_d d;
> unsigned int i, nelt, which;
> bool ret;
>
> d.vmode = vmode;
> - d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
> d.testing_p = true;
> - memcpy (d.perm, sel, nelt);
> + d.perm.safe_splice (sel);
>
> /* Calculate whether all elements are in one vector. */
> + nelt = sel.length ();
> for (i = which = 0; i < nelt; ++i)
> {
> unsigned char e = d.perm[i];
> Index: gcc/config/arm/arm.c
> ===================================================================
> --- gcc/config/arm/arm.c 2017-09-22 17:22:08.191305805 +0100
> +++ gcc/config/arm/arm.c 2017-09-22 17:31:56.414735941 +0100
> @@ -287,8 +287,7 @@ static int arm_cortex_a5_branch_cost (bo
> static int arm_cortex_m_branch_cost (bool, bool);
> static int arm_cortex_m7_branch_cost (bool, bool);
>
> -static bool arm_vectorize_vec_perm_const_ok (machine_mode vmode,
> - const unsigned char *sel);
> +static bool arm_vectorize_vec_perm_const_ok (machine_mode, vec_perm_indices);
>
> static bool aarch_macro_fusion_pair_p (rtx_insn*, rtx_insn*);
>
> @@ -28657,9 +28656,8 @@ #define MAX_VECT_LEN 16
> struct expand_vec_perm_d
> {
> rtx target, op0, op1;
> - unsigned char perm[MAX_VECT_LEN];
> + auto_vec_perm_indices perm;
> machine_mode vmode;
> - unsigned char nelt;
> bool one_vector_p;
> bool testing_p;
> };
> @@ -28766,7 +28764,7 @@ neon_pair_endian_lane_map (machine_mode
> static bool
> arm_evpc_neon_vuzp (struct expand_vec_perm_d *d)
> {
> - unsigned int i, odd, mask, nelt = d->nelt;
> + unsigned int i, odd, mask, nelt = d->perm.length ();
> rtx out0, out1, in0, in1;
> rtx (*gen)(rtx, rtx, rtx, rtx);
> int first_elem;
> @@ -28778,7 +28776,7 @@ arm_evpc_neon_vuzp (struct expand_vec_pe
> /* arm_expand_vec_perm_const_1 () helpfully swaps the operands for the
> big endian pattern on 64 bit vectors, so we correct for that. */
> swap_nelt = BYTES_BIG_ENDIAN && !d->one_vector_p
> - && GET_MODE_SIZE (d->vmode) == 8 ? d->nelt : 0;
> + && GET_MODE_SIZE (d->vmode) == 8 ? nelt : 0;
>
> first_elem = d->perm[neon_endian_lane_map (d->vmode, 0)] ^ swap_nelt;
>
> @@ -28837,7 +28835,7 @@ arm_evpc_neon_vuzp (struct expand_vec_pe
> static bool
> arm_evpc_neon_vzip (struct expand_vec_perm_d *d)
> {
> - unsigned int i, high, mask, nelt = d->nelt;
> + unsigned int i, high, mask, nelt = d->perm.length ();
> rtx out0, out1, in0, in1;
> rtx (*gen)(rtx, rtx, rtx, rtx);
> int first_elem;
> @@ -28912,7 +28910,7 @@ arm_evpc_neon_vzip (struct expand_vec_pe
> static bool
> arm_evpc_neon_vrev (struct expand_vec_perm_d *d)
> {
> - unsigned int i, j, diff, nelt = d->nelt;
> + unsigned int i, j, diff, nelt = d->perm.length ();
> rtx (*gen)(rtx, rtx);
>
> if (!d->one_vector_p)
> @@ -28988,7 +28986,7 @@ arm_evpc_neon_vrev (struct expand_vec_pe
> static bool
> arm_evpc_neon_vtrn (struct expand_vec_perm_d *d)
> {
> - unsigned int i, odd, mask, nelt = d->nelt;
> + unsigned int i, odd, mask, nelt = d->perm.length ();
> rtx out0, out1, in0, in1;
> rtx (*gen)(rtx, rtx, rtx, rtx);
>
> @@ -29054,7 +29052,7 @@ arm_evpc_neon_vtrn (struct expand_vec_pe
> static bool
> arm_evpc_neon_vext (struct expand_vec_perm_d *d)
> {
> - unsigned int i, nelt = d->nelt;
> + unsigned int i, nelt = d->perm.length ();
> rtx (*gen) (rtx, rtx, rtx, rtx);
> rtx offset;
>
> @@ -29128,7 +29126,7 @@ arm_evpc_neon_vtbl (struct expand_vec_pe
> {
> rtx rperm[MAX_VECT_LEN], sel;
> machine_mode vmode = d->vmode;
> - unsigned int i, nelt = d->nelt;
> + unsigned int i, nelt = d->perm.length ();
>
> /* TODO: ARM's VTBL indexing is little-endian. In order to handle GCC's
> numbering of elements for big-endian, we must reverse the order. */
> @@ -29165,11 +29163,10 @@ arm_expand_vec_perm_const_1 (struct expa
> /* The pattern matching functions above are written to look for a small
> number to begin the sequence (0, 1, N/2). If we begin with an index
> from the second operand, we can swap the operands. */
> - if (d->perm[0] >= d->nelt)
> + unsigned int nelt = d->perm.length ();
> + if (d->perm[0] >= nelt)
> {
> - unsigned i, nelt = d->nelt;
> -
> - for (i = 0; i < nelt; ++i)
> + for (unsigned int i = 0; i < nelt; ++i)
> d->perm[i] = (d->perm[i] + nelt) & (2 * nelt - 1);
>
> std::swap (d->op0, d->op1);
> @@ -29204,15 +29201,16 @@ arm_expand_vec_perm_const (rtx target, r
>
> d.vmode = GET_MODE (target);
> gcc_assert (VECTOR_MODE_P (d.vmode));
> - d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
> d.testing_p = false;
>
> + nelt = GET_MODE_NUNITS (d.vmode);
> + d.perm.reserve (nelt);
> for (i = which = 0; i < nelt; ++i)
> {
> rtx e = XVECEXP (sel, 0, i);
> int ei = INTVAL (e) & (2 * nelt - 1);
> which |= (ei < nelt ? 1 : 2);
> - d.perm[i] = ei;
> + d.perm.quick_push (ei);
> }
>
> switch (which)
> @@ -29249,19 +29247,18 @@ arm_expand_vec_perm_const (rtx target, r
> /* Implement TARGET_VECTORIZE_VEC_PERM_CONST_OK. */
>
> static bool
> -arm_vectorize_vec_perm_const_ok (machine_mode vmode,
> - const unsigned char *sel)
> +arm_vectorize_vec_perm_const_ok (machine_mode vmode, vec_perm_indices sel)
> {
> struct expand_vec_perm_d d;
> unsigned int i, nelt, which;
> bool ret;
>
> d.vmode = vmode;
> - d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
> d.testing_p = true;
> - memcpy (d.perm, sel, nelt);
> + d.perm.safe_splice (sel);
>
> /* Categorize the set of elements in the selector. */
> + nelt = GET_MODE_NUNITS (d.vmode);
> for (i = which = 0; i < nelt; ++i)
> {
> unsigned char e = d.perm[i];
> Index: gcc/config/i386/i386.c
> ===================================================================
> --- gcc/config/i386/i386.c 2017-09-22 17:22:08.149305815 +0100
> +++ gcc/config/i386/i386.c 2017-09-22 17:31:56.418527551 +0100
> @@ -50024,8 +50024,7 @@ ix86_expand_vec_perm_const (rtx operands
> /* Implement targetm.vectorize.vec_perm_const_ok. */
>
> static bool
> -ix86_vectorize_vec_perm_const_ok (machine_mode vmode,
> - const unsigned char *sel)
> +ix86_vectorize_vec_perm_const_ok (machine_mode vmode, vec_perm_indices sel)
> {
> struct expand_vec_perm_d d;
> unsigned int i, nelt, which;
> @@ -50096,11 +50095,11 @@ ix86_vectorize_vec_perm_const_ok (machin
>
> /* Extract the values from the vector CST into the permutation
> array in D. */
> - memcpy (d.perm, sel, nelt);
> for (i = which = 0; i < nelt; ++i)
> {
> - unsigned char e = d.perm[i];
> + unsigned char e = sel[i];
> gcc_assert (e < 2 * nelt);
> + d.perm[i] = e;
> which |= (e < nelt ? 1 : 2);
> }
>
> Index: gcc/config/ia64/ia64.c
> ===================================================================
> --- gcc/config/ia64/ia64.c 2017-09-21 11:53:16.654742357 +0100
> +++ gcc/config/ia64/ia64.c 2017-09-22 17:31:56.419475454 +0100
> @@ -333,8 +333,7 @@ static machine_mode ia64_get_reg_raw_mod
> static section * ia64_hpux_function_section (tree, enum node_frequency,
> bool, bool);
>
> -static bool ia64_vectorize_vec_perm_const_ok (machine_mode vmode,
> - const unsigned char *sel);
> +static bool ia64_vectorize_vec_perm_const_ok (machine_mode, vec_perm_indices);
>
> static unsigned int ia64_hard_regno_nregs (unsigned int, machine_mode);
> static bool ia64_hard_regno_mode_ok (unsigned int, machine_mode);
> @@ -11824,8 +11823,7 @@ ia64_expand_vec_perm_const (rtx operands
> /* Implement targetm.vectorize.vec_perm_const_ok. */
>
> static bool
> -ia64_vectorize_vec_perm_const_ok (machine_mode vmode,
> - const unsigned char *sel)
> +ia64_vectorize_vec_perm_const_ok (machine_mode vmode, vec_perm_indices sel)
> {
> struct expand_vec_perm_d d;
> unsigned int i, nelt, which;
> @@ -11837,10 +11835,10 @@ ia64_vectorize_vec_perm_const_ok (machin
>
> /* Extract the values from the vector CST into the permutation
> array in D. */
> - memcpy (d.perm, sel, nelt);
> for (i = which = 0; i < nelt; ++i)
> {
> - unsigned char e = d.perm[i];
> + unsigned char e = sel[i];
> + d.perm[i] = e;
> gcc_assert (e < 2 * nelt);
> which |= (e < nelt ? 1 : 2);
> }
> Index: gcc/config/mips/mips.c
> ===================================================================
> --- gcc/config/mips/mips.c 2017-09-21 11:53:16.776320319 +0100
> +++ gcc/config/mips/mips.c 2017-09-22 17:31:56.421371259 +0100
> @@ -21470,8 +21470,7 @@ mips_sched_reassociation_width (unsigned
> /* Implement TARGET_VECTORIZE_VEC_PERM_CONST_OK. */
>
> static bool
> -mips_vectorize_vec_perm_const_ok (machine_mode vmode,
> - const unsigned char *sel)
> +mips_vectorize_vec_perm_const_ok (machine_mode vmode, vec_perm_indices sel)
> {
> struct expand_vec_perm_d d;
> unsigned int i, nelt, which;
> @@ -21480,12 +21479,12 @@ mips_vectorize_vec_perm_const_ok (machin
> d.vmode = vmode;
> d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
> d.testing_p = true;
> - memcpy (d.perm, sel, nelt);
>
> /* Categorize the set of elements in the selector. */
> for (i = which = 0; i < nelt; ++i)
> {
> - unsigned char e = d.perm[i];
> + unsigned char e = sel[i];
> + d.perm[i] = e;
> gcc_assert (e < 2 * nelt);
> which |= (e < nelt ? 1 : 2);
> }
> Index: gcc/config/powerpcspe/powerpcspe.c
> ===================================================================
> --- gcc/config/powerpcspe/powerpcspe.c 2017-09-21 11:53:16.643935427 +0100
> +++ gcc/config/powerpcspe/powerpcspe.c 2017-09-22 17:31:56.424214967 +0100
> @@ -38731,8 +38731,7 @@ rs6000_expand_vec_perm_const (rtx operan
> /* Test whether a constant permutation is supported. */
>
> static bool
> -rs6000_vectorize_vec_perm_const_ok (machine_mode vmode,
> - const unsigned char *sel)
> +rs6000_vectorize_vec_perm_const_ok (machine_mode vmode, vec_perm_indices sel)
> {
> /* AltiVec (and thus VSX) can handle arbitrary permutations. */
> if (TARGET_ALTIVEC)
> Index: gcc/config/rs6000/rs6000.c
> ===================================================================
> --- gcc/config/rs6000/rs6000.c 2017-09-21 11:53:16.730390867 +0100
> +++ gcc/config/rs6000/rs6000.c 2017-09-22 17:31:56.427058675 +0100
> @@ -35594,8 +35594,7 @@ rs6000_expand_vec_perm_const (rtx operan
> /* Test whether a constant permutation is supported. */
>
> static bool
> -rs6000_vectorize_vec_perm_const_ok (machine_mode vmode,
> - const unsigned char *sel)
> +rs6000_vectorize_vec_perm_const_ok (machine_mode vmode, vec_perm_indices sel)
> {
> /* AltiVec (and thus VSX) can handle arbitrary permutations. */
> if (TARGET_ALTIVEC)