[PATCH 2/3] [ARM] Refactor costs calculation for MEM.

Wed Sep 13 09:02:00 GMT 2017

Hi Charles,

On 12/09/17 09:34, charles.baylis@linaro.org wrote:
> From: Charles Baylis <charles.baylis@linaro.org>
>
> This patch moves the calculation of costs for MEM into a
> separate function, and reforms the calculation into two
> parts. Firstly any additional cost of the addressing mode
> is calculated, and then the cost of the memory access itself
> is added.
>
> In this patch, the calculation of the cost of the addressing
> mode is left as a placeholder, to be added in a subsequent
> patch.
>

Can you please mention how has this series been tested?
A bootstrap and test run on arm-none-linux-gnueabihf is required at least.
Also, do you have any benchmarking results for this?
I agree that generating the addressing modes in the new tests is desirable.
So I'm not objecting to the goal of this patch, but a check to make sure 
that this doesn't regress SPEC
would be great.  Further comments on the patch inline.

> gcc/ChangeLog:
>
> <date>  Charles Baylis <charles.baylis@linaro.org>
>
>         * config/arm/arm.c (arm_mem_costs): New function.
>         (arm_rtx_costs_internal): Use arm_mem_costs.
>
> gcc/testsuite/ChangeLog:
>
> <date>  Charles Baylis <charles.baylis@linaro.org>
>
>         * gcc.target/arm/addr-modes-float.c: New test.
>         * gcc.target/arm/addr-modes-int.c: New test.
>         * gcc.target/arm/addr-modes.h: New header.
>
> Change-Id: I99e93406ea39ee31f71c7bf428ad3e127b7a618e
> ---
>  gcc/config/arm/arm.c                            | 67 
> ++++++++++++++++---------
>  gcc/testsuite/gcc.target/arm/addr-modes-float.c | 42 ++++++++++++++++
>  gcc/testsuite/gcc.target/arm/addr-modes-int.c   | 46 +++++++++++++++++
>  gcc/testsuite/gcc.target/arm/addr-modes.h       | 53 +++++++++++++++++++
>  4 files changed, 183 insertions(+), 25 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/arm/addr-modes-float.c
>  create mode 100644 gcc/testsuite/gcc.target/arm/addr-modes-int.c
>  create mode 100644 gcc/testsuite/gcc.target/arm/addr-modes.h
>
> diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c
> index 32001e5..b8dbed6 100644
> --- a/gcc/config/arm/arm.c
> +++ b/gcc/config/arm/arm.c
> @@ -9228,8 +9228,48 @@ arm_unspec_cost (rtx x, enum rtx_code /* 
> outer_code */, bool speed_p, int *cost)
> } \
>          while (0);
>
> +/* Helper function for arm_rtx_costs_internal.  Calculates the cost 
> of a MEM,
> +   considering the costs of the addressing mode and memory access
> +   separately.  */
> +static bool
> +arm_mem_costs (rtx x, const struct cpu_cost_table *extra_cost,
> +              int *cost, bool speed_p)
> +{
> +  machine_mode mode = GET_MODE (x);
> +  if (flag_pic
> +      && GET_CODE (XEXP (x, 0)) == PLUS
> +      && will_be_in_index_register (XEXP (XEXP (x, 0), 1)))
> +    /* This will be split into two instructions.  Add the cost of the
> +       additional instruction here.  The cost of the memory access is 
> computed
> +       below.  See arm.md:calculate_pic_address.  */
> +    *cost = COSTS_N_INSNS (1);
> +  else
> +    *cost = 0;

For speed_p we want the size cost of the insn (COSTS_N_INSNS (1) for a 
each insn)
plus the appropriate field in extra_cost. So you should unconditionally 
initialise the cost
to COSTS_N_INSNS (1), conditionally increment it by COSTS_N_INSNS (1) 
with the condition above.

> +
> +  /* Calculate cost of the addressing mode.  */
> +  if (speed_p)
> +    {
> +      /* TODO: Add table-driven costs for addressing modes.  (See 
> patch 2) */
> +    }

You mean "patch 3". I recommend you just remove this conditional from 
this patch and add the logic
in patch 3 entirely.

> +
> +  /* Calculate cost of memory access.  */
> +  if (speed_p)
> +    {
> +      /* data transfer is transfer size divided by bus width.  */
> +      int bus_width_bytes = current_tune->bus_width / 4;

This should be bus_width / BITS_PER_UNIT to get the size in bytes.
BITS_PER_UNIT is 8 though, so you'll have to double check to make sure the
cost calculation and generated code is still appropriate.

> +      *cost += CEIL (GET_MODE_SIZE (mode), bus_width_bytes);
> +      *cost += extra_cost->ldst.load;
> +    }
> +  else
> +    {
> +      *cost += COSTS_N_INSNS (1);
> +    }

Given my first comment above this else would be deleted.

Thanks,
Kyrill

> +
> +  return true;
> +}
> +
>  /* RTX costs.  Make an estimate of the cost of executing the operation
> -   X, which is contained with an operation with code OUTER_CODE.
> +   X, which is contained within an operation with code OUTER_CODE.
>     SPEED_P indicates whether the cost desired is the performance cost,
>     or the size cost.  The estimate is stored in COST and the return
>     value is TRUE if the cost calculation is final, or FALSE if the
> @@ -9308,30 +9348,7 @@ arm_rtx_costs_internal (rtx x, enum rtx_code 
> code, enum rtx_code outer_code,
>        return false;
>
>      case MEM:
> -      /* A memory access costs 1 insn if the mode is small, or the 
> address is
> -        a single register, otherwise it costs one insn per word.  */
> -      if (REG_P (XEXP (x, 0)))
> -       *cost = COSTS_N_INSNS (1);
> -      else if (flag_pic
> -              && GET_CODE (XEXP (x, 0)) == PLUS
> -              && will_be_in_index_register (XEXP (XEXP (x, 0), 1)))
> -       /* This will be split into two instructions.
> -          See arm.md:calculate_pic_address.  */
> -       *cost = COSTS_N_INSNS (2);
> -      else
> -       *cost = COSTS_N_INSNS (ARM_NUM_REGS (mode));
> -
> -      /* For speed optimizations, add the costs of the address and
> -        accessing memory.  */
> -      if (speed_p)
> -#ifdef NOT_YET
> -       *cost += (extra_cost->ldst.load
> -                 + arm_address_cost (XEXP (x, 0), mode,
> -                                     ADDR_SPACE_GENERIC, speed_p));
> -#else
> -        *cost += extra_cost->ldst.load;
> -#endif
> -      return true;
> +      return arm_mem_costs (x, extra_cost, cost, speed_p);
>
>      case PARALLEL:
>      {
> diff --git a/gcc/testsuite/gcc.target/arm/addr-modes-float.c 
> b/gcc/testsuite/gcc.target/arm/addr-modes-float.c
> new file mode 100644
> index 0000000..3b4235c
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/arm/addr-modes-float.c
> @@ -0,0 +1,42 @@
> +/* { dg-options "-O2" } */
> +/* { dg-add-options arm_neon } */
> +/* { dg-require-effective-target arm_neon_ok } */
> +/* { dg-do compile } */
> +
> +#include <arm_neon.h>
> +
> +#include "addr-modes.h"
> +
> +POST_STORE(float)
> +/* { dg-final { scan-assembler "vstmia.32" } } */
> +POST_STORE(double)
> +/* { dg-final { scan-assembler "vstmia.64" } } */
> +
> +POST_LOAD(float)
> +/* { dg-final { scan-assembler "vldmia.32" } } */
> +POST_LOAD(double)
> +/* { dg-final { scan-assembler "vldmia.64" } } */
> +
> +POST_STORE_VEC (int8_t, int8x8_t, vst1_s8)
> +/* { dg-final { scan-assembler "vst1.8\t\{.*\}, \\\[r\[0-9\]+\\\]!" } 
> } */
> +POST_STORE_VEC (int8_t, int8x16_t, vst1q_s8)
> +/* { dg-final { scan-assembler "vst1.8\t\{.*\[-,\]d.*\}, 
> \\\[r\[0-9\]+\\\]!" } } */
> +
> +POST_STORE_VEC (int8_t, int8x8x2_t, vst2_s8)
> +/* { dg-final { scan-assembler "vst2.8\t\{.*\}, \\\[r\[0-9\]+\\\]!" } 
> } */
> +POST_STORE_VEC (int8_t, int8x16x2_t, vst2q_s8)
> +/* { dg-final { scan-assembler "vst2.8\t\{.*-d.*\}, 
> \\\[r\[0-9\]+\\\]!" } } */
> +
> +POST_STORE_VEC (int8_t, int8x8x3_t, vst3_s8)
> +/* { dg-final { scan-assembler "vst3.8\t\{.*\}, \\\[r\[0-9\]+\\\]!" } 
> } */
> +POST_STORE_VEC (int8_t, int8x16x3_t, vst3q_s8)
> +/* { dg-final { scan-assembler "vst3.8\t\{d\[02468\], d\[02468\], 
> d\[02468\]\}, \\\[r\[0-9\]+\\\]!" } } */
> +/* { dg-final { scan-assembler "vst3.8\t\{d\[13579\], d\[13579\], 
> d\[13579\]\}, \\\[r\[0-9\]+\\\]!" { xfail *-*-* } } } */
> +
> +POST_STORE_VEC (int8_t, int8x8x4_t, vst4_s8)
> +/* { dg-final { scan-assembler "vst4.8\t\{.*\}, \\\[r\[0-9\]+\\\]!" } 
> } */
> +POST_STORE_VEC (int8_t, int8x16x4_t, vst4q_s8)
> +/* { dg-final { scan-assembler "vst4.8\t\{d\[02468\], d\[02468\], 
> d\[02468\], d\[02468\]\}, \\\[r\[0-9\]+\\\]!" } } */
> +/* { dg-final { scan-assembler "vst4.8\t\{d\[13579\], d\[13579\], 
> d\[13579\], d\[13579\]\}, \\\[r\[0-9\]+\\\]!" { xfail *-*-* } } } */
> +
> +/* { dg-final { scan-assembler-not "add" { xfail *-*-* } } } */
> diff --git a/gcc/testsuite/gcc.target/arm/addr-modes-int.c 
> b/gcc/testsuite/gcc.target/arm/addr-modes-int.c
> new file mode 100644
> index 0000000..e3e1e6a
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/arm/addr-modes-int.c
> @@ -0,0 +1,46 @@
> +/* { dg-options "-O2 -march=armv7-a" } */
> +/* { dg-add-options arm_neon } */
> +/* { dg-require-effective-target arm_neon_ok } */
> +/* { dg-do compile } */
> +
> +#include "addr-modes.h"
> +
> +typedef long long ll;
> +
> +PRE_STORE(char)
> +/* { dg-final { scan-assembler "strb.*#1]!" } } */
> +PRE_STORE(short)
> +/* { dg-final { scan-assembler "strh.*#2]!" } } */
> +PRE_STORE(int)
> +/* { dg-final { scan-assembler "str.*#4]!" } } */
> +PRE_STORE(ll)
> +/* { dg-final { scan-assembler "strd.*#8]!" } } */
> +
> +POST_STORE(char)
> +/* { dg-final { scan-assembler "strb.*], #1" } } */
> +POST_STORE(short)
> +/* { dg-final { scan-assembler "strh.*], #2" } } */
> +POST_STORE(int)
> +/* { dg-final { scan-assembler "str.*], #4" } } */
> +POST_STORE(ll)
> +/* { dg-final { scan-assembler "strd.*], #8" } } */
> +
> +PRE_LOAD(char)
> +/* { dg-final { scan-assembler "ldrb.*#1]!" } } */
> +PRE_LOAD(short)
> +/* { dg-final { scan-assembler "ldrsh.*#2]!" } } */
> +PRE_LOAD(int)
> +/* { dg-final { scan-assembler "ldr.*#4]!" } } */
> +PRE_LOAD(ll)
> +/* { dg-final { scan-assembler "ldrd.*#8]!" } } */
> +
> +POST_LOAD(char)
> +/* { dg-final { scan-assembler "ldrb.*], #1" } } */
> +POST_LOAD(short)
> +/* { dg-final { scan-assembler "ldrsh.*], #2" } } */
> +POST_LOAD(int)
> +/* { dg-final { scan-assembler "ldr.*], #4" } } */
> +POST_LOAD(ll)
> +/* { dg-final { scan-assembler "ldrd.*], #8" } } */
> +
> +/* { dg-final { scan-assembler-not "\tadd" } } */
> diff --git a/gcc/testsuite/gcc.target/arm/addr-modes.h 
> b/gcc/testsuite/gcc.target/arm/addr-modes.h
> new file mode 100644
> index 0000000..eac4678
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/arm/addr-modes.h
> @@ -0,0 +1,53 @@
> +
> +#define PRE_STORE(T)                   \
> +  T *                                  \
> +  T ## _pre_store (T *p, T v)          \
> +  {                                    \
> +    *++p = v;                          \
> +    return p;                          \
> +  }                                    \
> +
> +#define POST_STORE(T)                  \
> +  T *                                  \
> +  T ## _post_store (T *p, T v)         \
> +  {                                    \
> +    *p++ = v;                          \
> +    return p;                          \
> +  }
> +
> +#define POST_STORE_VEC(T, VT, OP)      \
> +  T *                                  \
> +  VT ## _post_store (T * p, VT v)      \
> +  {                                    \
> +    OP (p, v);                         \
> +    p += sizeof (VT) / sizeof (T);     \
> +    return p;                          \
> +  }
> +
> +#define PRE_LOAD(T)                    \
> +  void                                 \
> +  T ## _pre_load (T *p)                        \
> +  {                                    \
> +    extern void f ## T (T*,T);         \
> +    T x = *++p;                                \
> +    f ## T (p, x);                     \
> +  }
> +
> +#define POST_LOAD(T)                   \
> +  void                                 \
> +  T ## _post_load (T *p)               \
> +  {                                    \
> +    extern void f ## T (T*,T);         \
> +    T x = *p++;                                \
> +    f ## T (p, x);                     \
> +  }
> +
> +#define POST_LOAD_VEC(T, VT, OP)       \
> +  void                                 \
> +  VT ## _post_load (T * p)             \
> +  {                                    \
> +    extern void f ## T (T*,T);         \
> +    VT x = OP (p, v);                  \
> +    p += sizeof (VT) / sizeof (T);     \
> +    f ## T (p, x);                     \
> +  }
> -- 
> 2.7.4
>