This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

Re: [PATCH/AARCH64] Enable software prefetching (-fprefetch-loop-arrays) for ThunderX 88xxx


> On Jan 26, 2017, at 11:56 PM, Andrew Pinski <apinski@cavium.com> wrote:
> 
> Hi,
>  This patch enables -fprefetch-loop-arrays for -mcpu=thunderxt88 and
> -mcpu=thunderxt88p1.  I filled out the tuning structures for both
> thunderx and thunderx2t99.  No other core current enables software
> prefetching so I set them to 0 which does not change the default
> parameters.
> 
> OK?  Bootstrapped and tested on both ThunderX2 CN99xx and ThunderX
> CN88xx with no regressions.  I got a 2x improvement for 462.libquantum
> on CN88xx, overall a 10% improvement on SPEC INT on CN88xx at -Ofast.
> CN99xx's SPEC did not change.

Below are several comments.

> Index: config/aarch64/aarch64-cores.def
> ===================================================================
> --- config/aarch64/aarch64-cores.def	(revision 244917)
> +++ config/aarch64/aarch64-cores.def	(working copy)
> @@ -63,8 +63,8 @@ AARCH64_CORE("qdf24xx",     qdf24xx,   c
>  AARCH64_CORE("thunderx",      thunderx,      thunderx,  8A,    AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC | AARCH64_FL_CRYPTO | AARCH64_FL_LSE, thunderx,  0x43, 0x0a0, -1)
>  /* Do not swap around "thunderxt88p1" and "thunderxt88",
>     this order is required to handle variant correctly. */
> -AARCH64_CORE("thunderxt88p1", thunderxt88p1, thunderx,  8A,    AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC | AARCH64_FL_CRYPTO,		   thunderx,  0x43, 0x0a1, 0)
> -AARCH64_CORE("thunderxt88",   thunderxt88,   thunderx,  8A,    AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC | AARCH64_FL_CRYPTO | AARCH64_FL_LSE, thunderx,  0x43, 0x0a1, -1)
> +AARCH64_CORE("thunderxt88p1", thunderxt88p1, thunderx,  8A,    AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC | AARCH64_FL_CRYPTO,		   thunderxt88,  0x43, 0x0a1, 0)
> +AARCH64_CORE("thunderxt88",   thunderxt88,   thunderx,  8A,    AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC | AARCH64_FL_CRYPTO | AARCH64_FL_LSE, thunderxt88,  0x43, 0x0a1, -1)
>  AARCH64_CORE("thunderxt81",   thunderxt81,   thunderx,  8_1A,  AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC | AARCH64_FL_CRYPTO | AARCH64_FL_LSE, thunderx,  0x43, 0x0a2, -1)
>  AARCH64_CORE("thunderxt83",   thunderxt83,   thunderx,  8_1A,  AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC | AARCH64_FL_CRYPTO | AARCH64_FL_LSE, thunderx,  0x43, 0x0a3, -1)

IMO, this should be in a separate patch that adds thunderxt88p1 tunings.

>  
> Index: config/aarch64/aarch64-protos.h
> ===================================================================
> --- config/aarch64/aarch64-protos.h	(revision 244917)
> +++ config/aarch64/aarch64-protos.h	(working copy)
> @@ -220,10 +220,19 @@ struct tune_params
>    unsigned int max_case_values;
>    /* Value for PARAM_L1_CACHE_LINE_SIZE; or 0 to use the default.  */
>    unsigned int cache_line_size;
> +  /* Value for PARAM_PREFETCH_LATENCY; or 0 to use the default.  */
> +  unsigned int prefetch_latency;
> +  /* Value for PARAM_SIMULTANEOUS_PREFETCHES; or 0 to use the default.  */
> +  unsigned int simultaneous_prefetches;
> +  /* Value for PARAM_L1_CACHE_SIZE; or 0 to use the default.  */
> +  unsigned int l1_cache_size;
> +  /* Value for PARAM_L2_CACHE_SIZE; or 0 to use the default.  */
> +  unsigned int l2_cache_size;
>  
>  /* An enum specifying how to take into account CPU autoprefetch capabilities
>     during instruction scheduling:
>     - AUTOPREFETCHER_OFF: Do not take autoprefetch capabilities into account.
> +   - AUTOPREFETCHER_SW: Turn on software based prefetching.
>     - AUTOPREFETCHER_WEAK: Attempt to sort sequences of loads/store in order of
>     offsets but allow the pipeline hazard recognizer to alter that order to
>     maximize multi-issue opportunities.
> @@ -233,6 +242,7 @@ struct tune_params
>    enum aarch64_autoprefetch_model
>    {
>      AUTOPREFETCHER_OFF,
> +    AUTOPREFETCHER_SW,
>      AUTOPREFETCHER_WEAK,
>      AUTOPREFETCHER_STRONG
>    } autoprefetcher_model;

As I explain below, it is not a good idea to mix loop array prefetching with scheduler's HW autoprefetcher model. 

> Index: config/aarch64/aarch64.c
> ===================================================================
> --- config/aarch64/aarch64.c	(revision 244917)
> +++ config/aarch64/aarch64.c	(working copy)
> @@ -535,6 +535,10 @@ static const struct tune_params generic_
>    2,	/* min_div_recip_mul_df.  */
>    0,	/* max_case_values.  */
>    0,	/* cache_line_size.  */
> +  0,	/* prefetch_latency. */
> +  0,	/* simultaneous_prefetches. */
> +  0,	/* l1_cache_size. */
> +  0,	/* l2_cache_size. */
>    tune_params::AUTOPREFETCHER_OFF,	/* autoprefetcher_model.  */
>    (AARCH64_EXTRA_TUNE_NONE)	/* tune_flags.  */
>  };
> @@ -561,6 +565,10 @@ static const struct tune_params cortexa3
>    2,	/* min_div_recip_mul_df.  */
>    0,	/* max_case_values.  */
>    0,	/* cache_line_size.  */
> +  0,	/* prefetch_latency. */
> +  0,	/* simultaneous_prefetches. */
> +  0,	/* l1_cache_size. */
> +  0,	/* l2_cache_size. */
>    tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
>    (AARCH64_EXTRA_TUNE_NONE)	/* tune_flags.  */
>  };
> @@ -587,6 +595,10 @@ static const struct tune_params cortexa5
>    2,	/* min_div_recip_mul_df.  */
>    0,	/* max_case_values.  */
>    0,	/* cache_line_size.  */
> +  0,	/* prefetch_latency. */
> +  0,	/* simultaneous_prefetches. */
> +  0,	/* l1_cache_size. */
> +  0,	/* l2_cache_size. */
>    tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
>    (AARCH64_EXTRA_TUNE_NONE)	/* tune_flags.  */
>  };
> @@ -613,6 +625,10 @@ static const struct tune_params cortexa5
>    2,	/* min_div_recip_mul_df.  */
>    0,	/* max_case_values.  */
>    0,	/* cache_line_size.  */
> +  0,	/* prefetch_latency. */
> +  0,	/* simultaneous_prefetches. */
> +  0,	/* l1_cache_size. */
> +  0,	/* l2_cache_size. */
>    tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
>    (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS)	/* tune_flags.  */
>  };
> @@ -639,6 +655,10 @@ static const struct tune_params cortexa7
>    2,	/* min_div_recip_mul_df.  */
>    0,	/* max_case_values.  */
>    0,	/* cache_line_size.  */
> +  0,	/* prefetch_latency. */
> +  0,	/* simultaneous_prefetches. */
> +  0,	/* l1_cache_size. */
> +  0,	/* l2_cache_size. */
>    tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
>    (AARCH64_EXTRA_TUNE_NONE)	/* tune_flags.  */
>  };
> @@ -665,6 +685,10 @@ static const struct tune_params cortexa7
>    2,	/* min_div_recip_mul_df.  */
>    0,	/* max_case_values.  */
>    0,	/* cache_line_size.  */
> +  0,	/* prefetch_latency. */
> +  0,	/* simultaneous_prefetches. */
> +  0,	/* l1_cache_size. */
> +  0,	/* l2_cache_size. */
>    tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
>    (AARCH64_EXTRA_TUNE_NONE)	/* tune_flags.  */
>  };
> @@ -690,6 +714,10 @@ static const struct tune_params exynosm1
>    2,	/* min_div_recip_mul_df.  */
>    48,	/* max_case_values.  */
>    64,	/* cache_line_size.  */
> +  0,	/* prefetch_latency. */
> +  0,	/* simultaneous_prefetches. */
> +  0,	/* l1_cache_size. */
> +  0,	/* l2_cache_size. */
>    tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model.  */
>    (AARCH64_EXTRA_TUNE_NONE) /* tune_flags.  */
>  };
> @@ -714,11 +742,45 @@ static const struct tune_params thunderx
>    2,	/* min_div_recip_mul_sf.  */
>    2,	/* min_div_recip_mul_df.  */
>    0,	/* max_case_values.  */
> -  0,	/* cache_line_size.  */
> +  128,	/* cache_line_size.  */
> +  0,    /* prefetch_latency. */
> +  8,    /* simultaneous_prefetches. */
> +  32,   /* l1_cache_size. */
> +  0,     /* l2_cache_size. */
>    tune_params::AUTOPREFETCHER_OFF,	/* autoprefetcher_model.  */
>    (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)	/* tune_flags.  */
>  };
>  
> +/* Tunings for ThunderX CN88xx */
> +static const struct tune_params thunderxt88_tunings =
> +{
> +  &thunderx_extra_costs,
> +  &generic_addrcost_table,
> +  &thunderx_regmove_cost,
> +  &thunderx_vector_cost,
> +  &generic_branch_cost,
> +  &generic_approx_modes,
> +  6, /* memmov_cost  */
> +  2, /* issue_rate  */
> +  AARCH64_FUSE_CMP_BRANCH, /* fusible_ops  */
> +  8,    /* function_align.  */
> +  8,    /* jump_align.  */
> +  8,    /* loop_align.  */
> +  2,    /* int_reassoc_width.  */
> +  4,    /* fp_reassoc_width.  */
> +  1,    /* vec_reassoc_width.  */
> +  2,    /* min_div_recip_mul_sf.  */
> +  2,    /* min_div_recip_mul_df.  */
> +  0,    /* max_case_values.  */
> +  128,    /* cache_line_size.  */
> +  0,    /* prefetch_latency. */
> +  8,    /* simultaneous_prefetches. */
> +  32,   /* l1_cache_size. */
> +  16*1024,     /* l2_cache_size. */
> +  tune_params::AUTOPREFETCHER_SW,      /* autoprefetcher_model.  */
> +  (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)      /* tune_flags.  */
> +};

I think this belongs into a separate patch.

> +
>  static const struct tune_params xgene1_tunings =
>  {
>    &xgene1_extra_costs,
> @@ -740,6 +802,10 @@ static const struct tune_params xgene1_t
>    2,	/* min_div_recip_mul_df.  */
>    0,	/* max_case_values.  */
>    0,	/* cache_line_size.  */
> +  0,	/* prefetch_latency. */
> +  0,	/* simultaneous_prefetches. */
> +  0,	/* l1_cache_size. */
> +  0,	/* l2_cache_size. */
>    tune_params::AUTOPREFETCHER_OFF,	/* autoprefetcher_model.  */
>    (AARCH64_EXTRA_TUNE_NONE)	/* tune_flags.  */
>  };
> @@ -766,6 +832,10 @@ static const struct tune_params qdf24xx_
>    2,	/* min_div_recip_mul_df.  */
>    0,	/* max_case_values.  */
>    64,	/* cache_line_size.  */
> +  0,	/* prefetch_latency. */
> +  0,	/* simultaneous_prefetches. */
> +  0,	/* l1_cache_size. */
> +  0,	/* l2_cache_size. */
>    tune_params::AUTOPREFETCHER_STRONG,	/* autoprefetcher_model.  */
>    (AARCH64_EXTRA_TUNE_NONE)		/* tune_flags.  */
>  };
> @@ -791,7 +861,11 @@ static const struct tune_params thunderx
>    2,	/* min_div_recip_mul_df.  */
>    0,	/* max_case_values.  */
>    64,	/* cache_line_size.  */
> -  tune_params::AUTOPREFETCHER_OFF,	/* autoprefetcher_model.  */
> +  0,	/* prefetch_latency. */
> +  8,	/* simultaneous_prefetches. */
> +  32,	/* l1_cache_size. */
> +  256,	/* l2_cache_size. */
> +  tune_params::AUTOPREFETCHER_NONE,	/* autoprefetcher_model.  */
>    (AARCH64_EXTRA_TUNE_NONE)	/* tune_flags.  */
>  };
>  
> @@ -8646,6 +8720,7 @@ aarch64_override_options_internal (struc
>    switch (aarch64_tune_params.autoprefetcher_model)
>      {
>        case tune_params::AUTOPREFETCHER_OFF:
> +      case tune_params::AUTOPREFETCHER_SW:
>  	queue_depth = -1;
>  	break;
>        case tune_params::AUTOPREFETCHER_WEAK:
> @@ -8672,6 +8747,42 @@ aarch64_override_options_internal (struc
>  			   opts->x_param_values,
>  			   global_options_set.x_param_values);
>  
> +  /* Set the prefetch latncy.  */
> +  if (selected_cpu->tune->prefetch_latency != 0)
> +    maybe_set_param_value (PARAM_PREFETCH_LATENCY,
> +			   selected_cpu->tune->prefetch_latency,
> +			   opts->x_param_values,
> +			   global_options_set.x_param_values);
> +
> +  /* Set the simultaneous prefetches.  */
> +  if (selected_cpu->tune->simultaneous_prefetches != 0)
> +    maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
> +			   selected_cpu->tune->simultaneous_prefetches,
> +			   opts->x_param_values,
> +			   global_options_set.x_param_values);
> +
> +  /* Set the l1 cache size.  */
> +  if (selected_cpu->tune->l1_cache_size != 0)
> +    maybe_set_param_value (PARAM_L1_CACHE_SIZE,
> +			   selected_cpu->tune->l1_cache_size,
> +			   opts->x_param_values,
> +			   global_options_set.x_param_values);
> +
> +  /* Set the l2 cache size.  */
> +  if (selected_cpu->tune->l2_cache_size != 0)
> +    maybe_set_param_value (PARAM_L2_CACHE_SIZE,
> +			   selected_cpu->tune->l2_cache_size,
> +			   opts->x_param_values,
> +			   global_options_set.x_param_values);
> +
> +
> +  /* Enable sw prefetching at -O3 for CPUS that prefetching is helpful.  */
> +  if (opts->x_flag_prefetch_loop_arrays < 0
> +      && (opts->x_optimize >= 3 || opts->x_flag_profile_use)
> +      && !opts->x_optimize_size
> +      && aarch64_tune_params.autoprefetcher_model == tune_params::AUTOPREFETCHER_SW)
> +    opts->x_flag_prefetch_loop_arrays = 1;

AUTOPREFETCHER_* setting is for a completely different optimization done during instruction scheduling.  It controls model of hardware auto-prefetcher, which some cores have.  Autoprefetch HW does prefetches itself, and it is tangential to loop array prefetching.  There are cores that benefit from both optimizations simultaneously, and there is no need to allow only one or the other.

I think AArch64 should use (simultaneous_prefetches > 0) as the condition to enable loop array prefetching.

--
Maxim Kuvyrkov
www.linaro.org




Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]