This is the mail archive of the
gcc-patches@gcc.gnu.org
mailing list for the GCC project.
[13/13] [AArch64] Use vec_perm_indices helper routines
- From: Richard Sandiford <richard dot sandiford at linaro dot org>
- To: gcc-patches at gcc dot gnu dot org
- Cc: richard dot earnshaw at arm dot com, james dot greenhalgh at arm dot com, marcus dot shawcroft at arm dot com
- Date: Sat, 09 Dec 2017 23:27:47 +0000
- Subject: [13/13] [AArch64] Use vec_perm_indices helper routines
- Authentication-results: sourceware.org; auth=none
- References: <87indfmrgt.fsf@linaro.org>
This patch makes the AArch64 vec_perm_const code use the new
vec_perm_indices routines, instead of checking each element individually.
This means that they extend naturally to variable-length vectors.
Also, aarch64_evpc_dup was the only function that generated rtl when
testing_p is true, and that looked accidental. The patch adds the
missing check and then replaces the gen_rtx_REG/start_sequence/
end_sequence stuff with an assert that no rtl is generated.
Tested on aarch64-linux-gnu. Also tested by making sure that there
were no assembly output differences for aarch64_be-linux-gnu or
aarch64_be-linux-gnu. OK to install?
Richard
2017-12-09 Richard Sandiford <richard.sandiford@linaro.org>
gcc/
* config/aarch64/aarch64.c (aarch64_evpc_trn): Use d.perm.series_p
instead of checking each element individually.
(aarch64_evpc_uzp): Likewise.
(aarch64_evpc_zip): Likewise.
(aarch64_evpc_ext): Likewise.
(aarch64_evpc_rev): Likewise.
(aarch64_evpc_dup): Test the encoding for a single duplicated element,
instead of checking each element individually. Return true without
generating rtl if
(aarch64_vectorize_vec_perm_const): Use all_from_input_p to test
whether all selected elements come from the same input, instead of
checking each element individually. Remove calls to gen_rtx_REG,
start_sequence and end_sequence and instead assert that no rtl is
generated.
Index: gcc/config/aarch64/aarch64.c
===================================================================
--- gcc/config/aarch64/aarch64.c 2017-12-09 22:48:47.535824832 +0000
+++ gcc/config/aarch64/aarch64.c 2017-12-09 22:49:00.139270410 +0000
@@ -13295,7 +13295,7 @@ aarch64_expand_vec_perm (rtx target, rtx
static bool
aarch64_evpc_trn (struct expand_vec_perm_d *d)
{
- unsigned int i, odd, mask, nelt = d->perm.length ();
+ unsigned int odd, nelt = d->perm.length ();
rtx out, in0, in1, x;
machine_mode vmode = d->vmode;
@@ -13304,21 +13304,11 @@ aarch64_evpc_trn (struct expand_vec_perm
/* Note that these are little-endian tests.
We correct for big-endian later. */
- if (d->perm[0] == 0)
- odd = 0;
- else if (d->perm[0] == 1)
- odd = 1;
- else
+ odd = d->perm[0];
+ if ((odd != 0 && odd != 1)
+ || !d->perm.series_p (0, 2, odd, 2)
+ || !d->perm.series_p (1, 2, nelt + odd, 2))
return false;
- mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
-
- for (i = 0; i < nelt; i += 2)
- {
- if (d->perm[i] != i + odd)
- return false;
- if (d->perm[i + 1] != ((i + nelt + odd) & mask))
- return false;
- }
/* Success! */
if (d->testing_p)
@@ -13342,7 +13332,7 @@ aarch64_evpc_trn (struct expand_vec_perm
static bool
aarch64_evpc_uzp (struct expand_vec_perm_d *d)
{
- unsigned int i, odd, mask, nelt = d->perm.length ();
+ unsigned int odd;
rtx out, in0, in1, x;
machine_mode vmode = d->vmode;
@@ -13351,20 +13341,10 @@ aarch64_evpc_uzp (struct expand_vec_perm
/* Note that these are little-endian tests.
We correct for big-endian later. */
- if (d->perm[0] == 0)
- odd = 0;
- else if (d->perm[0] == 1)
- odd = 1;
- else
+ odd = d->perm[0];
+ if ((odd != 0 && odd != 1)
+ || !d->perm.series_p (0, 1, odd, 2))
return false;
- mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
-
- for (i = 0; i < nelt; i++)
- {
- unsigned elt = (i * 2 + odd) & mask;
- if (d->perm[i] != elt)
- return false;
- }
/* Success! */
if (d->testing_p)
@@ -13388,7 +13368,7 @@ aarch64_evpc_uzp (struct expand_vec_perm
static bool
aarch64_evpc_zip (struct expand_vec_perm_d *d)
{
- unsigned int i, high, mask, nelt = d->perm.length ();
+ unsigned int high, nelt = d->perm.length ();
rtx out, in0, in1, x;
machine_mode vmode = d->vmode;
@@ -13397,25 +13377,11 @@ aarch64_evpc_zip (struct expand_vec_perm
/* Note that these are little-endian tests.
We correct for big-endian later. */
- high = nelt / 2;
- if (d->perm[0] == high)
- /* Do Nothing. */
- ;
- else if (d->perm[0] == 0)
- high = 0;
- else
+ high = d->perm[0];
+ if ((high != 0 && high * 2 != nelt)
+ || !d->perm.series_p (0, 2, high, 1)
+ || !d->perm.series_p (1, 2, high + nelt, 1))
return false;
- mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
-
- for (i = 0; i < nelt / 2; i++)
- {
- unsigned elt = (i + high) & mask;
- if (d->perm[i * 2] != elt)
- return false;
- elt = (elt + nelt) & mask;
- if (d->perm[i * 2 + 1] != elt)
- return false;
- }
/* Success! */
if (d->testing_p)
@@ -13440,23 +13406,14 @@ aarch64_evpc_zip (struct expand_vec_perm
static bool
aarch64_evpc_ext (struct expand_vec_perm_d *d)
{
- unsigned int i, nelt = d->perm.length ();
+ unsigned int nelt = d->perm.length ();
rtx offset;
unsigned int location = d->perm[0]; /* Always < nelt. */
/* Check if the extracted indices are increasing by one. */
- for (i = 1; i < nelt; i++)
- {
- unsigned int required = location + i;
- if (d->one_vector_p)
- {
- /* We'll pass the same vector in twice, so allow indices to wrap. */
- required &= (nelt - 1);
- }
- if (d->perm[i] != required)
- return false;
- }
+ if (!d->perm.series_p (0, 1, location, 1))
+ return false;
/* Success! */
if (d->testing_p)
@@ -13488,7 +13445,7 @@ aarch64_evpc_ext (struct expand_vec_perm
static bool
aarch64_evpc_rev (struct expand_vec_perm_d *d)
{
- unsigned int i, j, diff, size, unspec, nelt = d->perm.length ();
+ unsigned int i, diff, size, unspec;
if (!d->one_vector_p)
return false;
@@ -13504,18 +13461,10 @@ aarch64_evpc_rev (struct expand_vec_perm
else
return false;
- for (i = 0; i < nelt ; i += diff + 1)
- for (j = 0; j <= diff; j += 1)
- {
- /* This is guaranteed to be true as the value of diff
- is 7, 3, 1 and we should have enough elements in the
- queue to generate this. Getting a vector mask with a
- value of diff other than these values implies that
- something is wrong by the time we get here. */
- gcc_assert (i + j < nelt);
- if (d->perm[i + j] != i + diff - j)
- return false;
- }
+ unsigned int step = diff + 1;
+ for (i = 0; i < step; ++i)
+ if (!d->perm.series_p (i, step, diff - i, step))
+ return false;
/* Success! */
if (d->testing_p)
@@ -13532,15 +13481,17 @@ aarch64_evpc_dup (struct expand_vec_perm
rtx out = d->target;
rtx in0;
machine_mode vmode = d->vmode;
- unsigned int i, elt, nelt = d->perm.length ();
+ unsigned int elt;
rtx lane;
+ if (d->perm.encoding ().encoded_nelts () != 1)
+ return false;
+
+ /* Success! */
+ if (d->testing_p)
+ return true;
+
elt = d->perm[0];
- for (i = 1; i < nelt; i++)
- {
- if (elt != d->perm[i])
- return false;
- }
/* The generic preparation in aarch64_expand_vec_perm_const_1
swaps the operand order and the permute indices if it finds
@@ -13628,61 +13579,37 @@ aarch64_vectorize_vec_perm_const (machin
rtx op1, const vec_perm_indices &sel)
{
struct expand_vec_perm_d d;
- unsigned int i, which;
- d.vmode = vmode;
- d.target = target;
- d.op0 = op0;
- d.op1 = op1;
- d.testing_p = !target;
-
- /* Calculate whether all elements are in one vector. */
- unsigned int nelt = sel.length ();
- for (i = which = 0; i < nelt; ++i)
+ /* Check whether the mask can be applied to a single vector. */
+ if (op0 && rtx_equal_p (op0, op1))
+ d.one_vector_p = true;
+ else if (sel.all_from_input_p (0))
{
- unsigned int ei = sel[i] & (2 * nelt - 1);
- which |= (ei < nelt ? 1 : 2);
+ d.one_vector_p = true;
+ op1 = op0;
}
-
- switch (which)
+ else if (sel.all_from_input_p (1))
{
- default:
- gcc_unreachable ();
-
- case 3:
- d.one_vector_p = false;
- if (d.testing_p || !rtx_equal_p (op0, op1))
- break;
-
- /* The elements of PERM do not suggest that only the first operand
- is used, but both operands are identical. Allow easier matching
- of the permutation by folding the permutation into the single
- input vector. */
- /* Fall Through. */
- case 2:
- d.op0 = op1;
- d.one_vector_p = true;
- break;
-
- case 1:
- d.op1 = op0;
d.one_vector_p = true;
- break;
+ op0 = op1;
}
+ else
+ d.one_vector_p = false;
- d.perm.new_vector (sel.encoding (), d.one_vector_p ? 1 : 2, nelt);
+ d.perm.new_vector (sel.encoding (), d.one_vector_p ? 1 : 2,
+ sel.nelts_per_input ());
+ d.vmode = vmode;
+ d.target = target;
+ d.op0 = op0;
+ d.op1 = op1;
+ d.testing_p = !target;
if (!d.testing_p)
return aarch64_expand_vec_perm_const_1 (&d);
- d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
- d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
- if (!d.one_vector_p)
- d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
-
- start_sequence ();
+ rtx_insn *last = get_last_insn ();
bool ret = aarch64_expand_vec_perm_const_1 (&d);
- end_sequence ();
+ gcc_assert (last == get_last_insn ());
return ret;
}