[Patch AArch64] GCC 6 regression in vector performance. - Fix vector initialization to happen with lane load instructions.
James Greenhalgh
james.greenhalgh@arm.com
Mon Feb 8 10:56:00 GMT 2016
On Tue, Feb 02, 2016 at 10:29:29AM +0000, James Greenhalgh wrote:
> On Wed, Jan 20, 2016 at 03:22:11PM +0000, James Greenhalgh wrote:
> >
> > Hi,
> >
> > In a number of cases where we try to create vectors we end up spilling to the
> > stack and then filling. This is one example distilled from a couple of
> > micro-benchmrks where the issue shows up. The reason for the extra cost
> > in this case is the unnecessary use of the stack. The patch attempts to
> > finesse this by using lane loads or vector inserts to produce the right
> > results.
> >
> > This patch is mostly Ramana's work, I've just cleaned it up a little.
> >
> > This has been in a number of our trees lately, and we haven't seen any
> > regressions. I've also bootstrapped and tested it, and run a set of
> > benchmarks to show no regressions on Cortex-A57 or Cortex-A53.
> >
> > The patch fixes some regressions caused by the more agressive vectorization
> > in GCC6, so I'd like to propose it to go in even though we are in Stage 4.
> >
> > OK?
>
> *Ping*
*ping^2*
Cheers,
James
> > 2016-01-20 James Greenhalgh <james.greenhalgh@arm.com>
> > Ramana Radhakrishnan <ramana.radhakrishnan@arm.com>
> >
> > * config/aarch64/aarch64.c (aarch64_expand_vector_init): Refactor,
> > always use lane loads to construct non-constant vectors.
> >
> > gcc/testsuite/
> >
> > 2016-01-20 James Greenhalgh <james.greenhalgh@arm.com>
> > Ramana Radhakrishnan <ramana.radhakrishnan@arm.com>
> >
> > * gcc.target/aarch64/vector_initialization_nostack.c: New.
> >
>
> > diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
> > index 03bc1b9..3787b38 100644
> > --- a/gcc/config/aarch64/aarch64.c
> > +++ b/gcc/config/aarch64/aarch64.c
> > @@ -10985,28 +10985,37 @@ aarch64_simd_make_constant (rtx vals)
> > return NULL_RTX;
> > }
> >
> > +/* Expand a vector initialisation sequence, such that TARGET is
> > + initialised to contain VALS. */
> > +
> > void
> > aarch64_expand_vector_init (rtx target, rtx vals)
> > {
> > machine_mode mode = GET_MODE (target);
> > machine_mode inner_mode = GET_MODE_INNER (mode);
> > + /* The number of vector elements. */
> > int n_elts = GET_MODE_NUNITS (mode);
> > + /* The number of vector elements which are not constant. */
> > int n_var = 0;
> > rtx any_const = NULL_RTX;
> > + /* The first element of vals. */
> > + rtx v0 = XVECEXP (vals, 0, 0);
> > bool all_same = true;
> >
> > + /* Count the number of variable elements to initialise. */
> > for (int i = 0; i < n_elts; ++i)
> > {
> > rtx x = XVECEXP (vals, 0, i);
> > - if (!CONST_INT_P (x) && !CONST_DOUBLE_P (x))
> > + if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
> > ++n_var;
> > else
> > any_const = x;
> >
> > - if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
> > - all_same = false;
> > + all_same &= rtx_equal_p (x, v0);
> > }
> >
> > + /* No variable elements, hand off to aarch64_simd_make_constant which knows
> > + how best to handle this. */
> > if (n_var == 0)
> > {
> > rtx constant = aarch64_simd_make_constant (vals);
> > @@ -11020,14 +11029,15 @@ aarch64_expand_vector_init (rtx target, rtx vals)
> > /* Splat a single non-constant element if we can. */
> > if (all_same)
> > {
> > - rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, 0));
> > + rtx x = copy_to_mode_reg (inner_mode, v0);
> > aarch64_emit_move (target, gen_rtx_VEC_DUPLICATE (mode, x));
> > return;
> > }
> >
> > - /* Half the fields (or less) are non-constant. Load constant then overwrite
> > - varying fields. Hope that this is more efficient than using the stack. */
> > - if (n_var <= n_elts/2)
> > + /* Initialise a vector which is part-variable. We want to first try
> > + to build those lanes which are constant in the most efficient way we
> > + can. */
> > + if (n_var != n_elts)
> > {
> > rtx copy = copy_rtx (vals);
> >
> > @@ -11054,31 +11064,21 @@ aarch64_expand_vector_init (rtx target, rtx vals)
> > XVECEXP (copy, 0, i) = subst;
> > }
> > aarch64_expand_vector_init (target, copy);
> > + }
> >
> > - /* Insert variables. */
> > - enum insn_code icode = optab_handler (vec_set_optab, mode);
> > - gcc_assert (icode != CODE_FOR_nothing);
> > + /* Insert the variable lanes directly. */
> >
> > - for (int i = 0; i < n_elts; i++)
> > - {
> > - rtx x = XVECEXP (vals, 0, i);
> > - if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
> > - continue;
> > - x = copy_to_mode_reg (inner_mode, x);
> > - emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
> > - }
> > - return;
> > - }
> > + enum insn_code icode = optab_handler (vec_set_optab, mode);
> > + gcc_assert (icode != CODE_FOR_nothing);
> >
> > - /* Construct the vector in memory one field at a time
> > - and load the whole vector. */
> > - rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
> > for (int i = 0; i < n_elts; i++)
> > - emit_move_insn (adjust_address_nv (mem, inner_mode,
> > - i * GET_MODE_SIZE (inner_mode)),
> > - XVECEXP (vals, 0, i));
> > - emit_move_insn (target, mem);
> > -
> > + {
> > + rtx x = XVECEXP (vals, 0, i);
> > + if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
> > + continue;
> > + x = copy_to_mode_reg (inner_mode, x);
> > + emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
> > + }
> > }
> >
> > static unsigned HOST_WIDE_INT
> > diff --git a/gcc/testsuite/gcc.target/aarch64/vector_initialization_nostack.c b/gcc/testsuite/gcc.target/aarch64/vector_initialization_nostack.c
> > new file mode 100644
> > index 0000000..bbad04d
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/aarch64/vector_initialization_nostack.c
> > @@ -0,0 +1,53 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O3 -ftree-vectorize -fno-vect-cost-model" } */
> > +float arr_f[100][100];
> > +float
> > +f9 (void)
> > +{
> > +
> > + int i;
> > + float sum = 0;
> > + for (i = 0; i < 100; i++)
> > + sum += arr_f[i][0] * arr_f[0][i];
> > + return sum;
> > +
> > +}
> > +
> > +
> > +int arr[100][100];
> > +int
> > +f10 (void)
> > +{
> > +
> > + int i;
> > + int sum = 0;
> > + for (i = 0; i < 100; i++)
> > + sum += arr[i][0] * arr[0][i];
> > + return sum;
> > +
> > +}
> > +
> > +double arr_d[100][100];
> > +double
> > +f11 (void)
> > +{
> > + int i;
> > + double sum = 0;
> > + for (i = 0; i < 100; i++)
> > + sum += arr_d[i][0] * arr_d[0][i];
> > + return sum;
> > +}
> > +
> > +char arr_c[100][100];
> > +char
> > +f12 (void)
> > +{
> > + int i;
> > + char sum = 0;
> > + for (i = 0; i < 100; i++)
> > + sum += arr_c[i][0] * arr_c[0][i];
> > + return sum;
> > +}
> > +
> > +
> > +/* { dg-final { scan-assembler-not "sp" } } */
>
More information about the Gcc-patches
mailing list