This is the mail archive of the
gcc-patches@gcc.gnu.org
mailing list for the GCC project.
Re: [PATCH][GCC][AArch64] optimize float immediate moves (1 /4) - infrastructure.
- From: James Greenhalgh <james dot greenhalgh at arm dot com>
- To: Tamar Christina <Tamar dot Christina at arm dot com>
- Cc: Richard Sandiford <richard dot sandiford at linaro dot org>, GCC Patches <gcc-patches at gcc dot gnu dot org>, nd <nd at arm dot com>, Marcus Shawcroft <Marcus dot Shawcroft at arm dot com>, Richard Earnshaw <Richard dot Earnshaw at arm dot com>
- Date: Tue, 13 Jun 2017 17:39:35 +0100
- Subject: Re: [PATCH][GCC][AArch64] optimize float immediate moves (1 /4) - infrastructure.
- Authentication-results: sourceware.org; auth=none
- Authentication-results: spf=pass (sender IP is 217.140.96.140) smtp.mailfrom=arm.com; gcc.gnu.org; dkim=none (message not signed) header.d=none;gcc.gnu.org; dmarc=bestguesspass action=none header.from=arm.com;
- Nodisclaimer: True
- References: <VI1PR0801MB203164A5E1F6B6EDA0F2074AFFC80@VI1PR0801MB2031.eurprd08.prod.outlook.com> <8760g6bwig.fsf@linaro.org> <VI1PR0801MB2031CA473713EDC999DF103CFFCD0@VI1PR0801MB2031.eurprd08.prod.outlook.com>
- Spamdiagnosticmetadata: NSPM
- Spamdiagnosticoutput: 1:99
This patch is pretty huge, are there any opportunities to further split
it to aid review?
I have some comments in line.
> diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
> index a069427f576f6bd7336bbe4497249773bd33d138..2ab2d96e40e80a79b5648046ca2d6e202d3939a2 100644
> --- a/gcc/config/aarch64/aarch64.c
> +++ b/gcc/config/aarch64/aarch64.c
> @@ -147,6 +147,8 @@ static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
> const_tree type,
> int misalignment,
> bool is_packed);
> +static machine_mode
> +aarch64_simd_container_mode (machine_mode mode, unsigned width);
>
> /* Major revision number of the ARM Architecture implemented by the target. */
> unsigned aarch64_architecture_version;
> @@ -4613,6 +4615,66 @@ aarch64_legitimize_address_displacement (rtx *disp, rtx *off, machine_mode mode)
> return true;
> }
>
> +/* Return the binary representation of floating point constant VALUE in INTVAL.
> + If the value cannot be converted, return false without setting INTVAL.
> + The conversion is done in the given MODE. */
> +bool
> +aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *intval)
> +{
> + machine_mode mode = GET_MODE (value);
> + if (GET_CODE (value) != CONST_DOUBLE
> + || !SCALAR_FLOAT_MODE_P (mode)
> + || GET_MODE_BITSIZE (mode) > HOST_BITS_PER_WIDE_INT)
> + return false;
> +
> + unsigned HOST_WIDE_INT ival = 0;
> +
> + /* Only support up to DF mode. */
> + gcc_assert (GET_MODE_BITSIZE (mode) <= 64);
> + int needed = GET_MODE_BITSIZE (mode) == 64 ? 2 : 1;
> +
> + long res[2];
> + real_to_target (res,
> + CONST_DOUBLE_REAL_VALUE (value),
> + REAL_MODE_FORMAT (mode));
> +
> + ival = zext_hwi (res[needed - 1], 32);
> + for (int i = needed - 2; i >= 0; i--)
> + {
> + ival <<= 32;
> + ival |= zext_hwi (res[i], 32);
> + }
> +
> + *intval = ival;
???
Two cases here, needed is either 2 if GET_MODE_BITSIZE (mode) == 64, or it
is 1 otherwise. So i starts at either -1 or 0. So this for loop either runs
0 or 1 times. What am I missing? I'm sure this is all an indirect way of
writing:
*intval = 0;
if (GET_MODE_BITSIZE (mode) == 64)
*intval = zext_hwi (res[1], 32) << 32;
*intval |= zext_hwi (res[0], 32);
> + return true;
> +}
> +
> +/* Return TRUE if rtx X is an immediate constant that can be moved using a
> + single MOV(+MOVK) followed by an FMOV. */
> +bool
> +aarch64_float_const_rtx_p (rtx x)
> +{
> + machine_mode mode = GET_MODE (x);
> + if (mode == VOIDmode)
> + return false;
> +
> + /* Determine whether it's cheaper to write float constants as
> + mov/movk pairs over ldr/adrp pairs. */
> + unsigned HOST_WIDE_INT ival;
> +
> + if (GET_CODE (x) == CONST_DOUBLE
> + && SCALAR_FLOAT_MODE_P (mode)
> + && aarch64_reinterpret_float_as_int (x, &ival))
> + {
> + machine_mode imode = mode == HFmode ? SImode : int_mode_for_mode (mode);
> + int num_instr = aarch64_internal_mov_immediate
> + (NULL_RTX, gen_int_mode (ival, imode), false, imode);
> + return num_instr < 3;
Should this cost model be static on a magic number? Is it not the case that
the decision should be based on the relative speeds of a memory access
compared with mov/movk/fmov ?
> + }
> +
> + return false;
> +}
> +
> /* Return TRUE if rtx X is immediate constant 0.0 */
> bool
> aarch64_float_const_zero_rtx_p (rtx x)
> @@ -4625,6 +4687,46 @@ aarch64_float_const_zero_rtx_p (rtx x)
> return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
> }
>
> +/* Return TRUE if rtx X is immediate constant that fits in a single
> + MOVI immediate operation. */
> +bool
> +aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode)
> +{
> + if (!TARGET_SIMD)
> + return false;
> +
> + machine_mode vmode, imode;
> + unsigned HOST_WIDE_INT ival;
> +
> + /* Don't write float constants out to memory. */
> + if (GET_CODE (x) == CONST_DOUBLE
> + && SCALAR_FLOAT_MODE_P (mode))
> + {
> + if (!aarch64_reinterpret_float_as_int (x, &ival))
> + return false;
> +
> + imode = int_mode_for_mode (mode);
> + }
> + else if (GET_CODE (x) == CONST_INT
> + && SCALAR_INT_MODE_P (mode))
> + {
> + imode = mode;
> + ival = INTVAL (x);
> + }
> + else
> + return false;
> +
> + unsigned width = GET_MODE_BITSIZE (mode) * 2;
Why * 2? It isn't obvious to me from my understanding of movi why that would
be better than just clamping to 64-bit?
> + if (width < GET_MODE_BITSIZE (DFmode))
> + width = GET_MODE_BITSIZE (DFmode);
> +
> + vmode = aarch64_simd_container_mode (imode, width);
> + rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, ival);
> +
> + return aarch64_simd_valid_immediate (v_op, vmode, false, NULL);
> +}
> +
> +
> /* Return the fixed registers used for condition codes. */
>
> static bool
> @@ -5758,12 +5860,6 @@ aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
> return NO_REGS;
> }
>
> - /* If it's an integer immediate that MOVI can't handle, then
> - FP_REGS is not an option, so we return NO_REGS instead. */
> - if (CONST_INT_P (x) && reg_class_subset_p (regclass, FP_REGS)
> - && !aarch64_simd_imm_scalar_p (x, GET_MODE (x)))
> - return NO_REGS;
> -
> /* Register eliminiation can result in a request for
> SP+constant->FP_REGS. We cannot support such operations which
> use SP as source and an FP_REG as destination, so reject out
> @@ -6674,26 +6770,44 @@ aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
> return true;
>
> case CONST_DOUBLE:
> +
> + /* First determine number of instructions to do the move
> + as an integer constant. */
> + if (!aarch64_float_const_representable_p (x)
> + && !aarch64_can_const_movi_rtx_p (x, mode)
> + && aarch64_float_const_rtx_p (x))
> + {
> + unsigned HOST_WIDE_INT ival;
> + bool succeed = aarch64_reinterpret_float_as_int (x, &ival);
> + gcc_assert (succeed);
Just:
gcc_assert (aarch64_reinterpret_float_as_int (x, &ival));
There's not much extra information in the name "succeed", so no extra value
in the variable assignment.
> +
> + machine_mode imode = mode == HFmode ? SImode : int_mode_for_mode (mode);
> + int ncost = aarch64_internal_mov_immediate
> + (NULL_RTX, gen_int_mode (ival, imode), false, imode);
> + *cost += COSTS_N_INSNS (ncost);
> + return true;
> + }
> +
> if (speed)
> {
> - /* mov[df,sf]_aarch64. */
> - if (aarch64_float_const_representable_p (x))
> - /* FMOV (scalar immediate). */
> - *cost += extra_cost->fp[mode == DFmode].fpconst;
> - else if (!aarch64_float_const_zero_rtx_p (x))
> - {
> - /* This will be a load from memory. */
> - if (mode == DFmode)
> + /* mov[df,sf]_aarch64. */
> + if (aarch64_float_const_representable_p (x))
> + /* FMOV (scalar immediate). */
> + *cost += extra_cost->fp[mode == DFmode].fpconst;
> + else if (!aarch64_float_const_zero_rtx_p (x))
> + {
> + /* This will be a load from memory. */
> + if (mode == DFmode)
> *cost += extra_cost->ldst.loadd;
> - else
> + else
> *cost += extra_cost->ldst.loadf;
> - }
> - else
> - /* Otherwise this is +0.0. We get this using MOVI d0, #0
> - or MOV v0.s[0], wzr - neither of which are modeled by the
> - cost tables. Just use the default cost. */
> - {
> - }
> + }
> + else
> + /* Otherwise this is +0.0. We get this using MOVI d0, #0
> + or MOV v0.s[0], wzr - neither of which are modeled by the
> + cost tables. Just use the default cost. */
> + {
> + }
> }
>
> return true;
> @@ -12599,15 +12698,28 @@ aarch64_output_simd_mov_immediate (rtx const_vector,
> }
>
> char*
> -aarch64_output_scalar_simd_mov_immediate (rtx immediate,
> - machine_mode mode)
> +aarch64_output_scalar_simd_mov_immediate (rtx immediate, machine_mode mode)
> {
> +
> + /* If a floating point number was passed and we desire to use it in an
> + integer mode do the conversion to integer. */
> + if (CONST_DOUBLE_P (immediate) && GET_MODE_CLASS (mode) == MODE_INT)
> + {
> + unsigned HOST_WIDE_INT ival;
> + if (!aarch64_reinterpret_float_as_int (immediate, &ival))
> + gcc_unreachable ();
> + immediate = gen_int_mode (ival, mode);
> + }
> +
> machine_mode vmode;
> + int width = GET_MODE_BITSIZE (mode) * 2;
Dubious * 2 again!
> + if (width < 64)
> + width = 64;
>
> gcc_assert (!VECTOR_MODE_P (mode));
> - vmode = aarch64_simd_container_mode (mode, 64);
> + vmode = aarch64_simd_container_mode (mode, width);
> rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
> - return aarch64_output_simd_mov_immediate (v_op, vmode, 64);
> + return aarch64_output_simd_mov_immediate (v_op, vmode, width);
> }
>
> /* Split operands into moves from op[1] + op[2] into op[0]. */
Thanks,
James