[PATCH 2/3] aarch64: Add support for moving fpm system register
Kyrylo Tkachov
ktkachov@nvidia.com
Mon Jul 22 09:51:46 GMT 2024
Hi Claudio,
> On 22 Jul 2024, at 11:30, Claudio Bantaloukas <claudio.bantaloukas@arm.com> wrote:
>
> External email: Use caution opening links or attachments
>
>
> Unlike most system registers, fpmr can be heavily written to in code that
> exercises the fp8 functionality. That is because every fp8 instrinsic call
> can potentially change the value of fpmr.
> Rather than just use a an unspec, we treat the fpmr system register like
> all other registers and use a move operation to read and write to it.
>
> We introduce a new class of moveable system registers that, currently,
> only accepts fpmr and a new constraint, Umv, that allows us to
> selectively use mrs and msr instructions when expanding rtl for them.
> Given that there is code that depends on "real" registers coming before
> "fake" ones, we introduce a new constant FPM_REGNUM that uses an
> existing value and renumber registers below that.
> This requires us to update the bitmaps that describe which registers
> belong to each register class.
>
This approach sounds reasonable to me. Some comments inline below.
> gcc/ChangeLog:
>
> * config/aarch64/aarch64.cc (aarch64_hard_regno_nregs): Add
> support for MOVEABLE_SYSREGS class.
> (aarch64_hard_regno_mode_ok): Only allow 64 bit reads and writes
> to fpmr.
> (aarch64_regno_regclass): Support MOVEABLE_SYSREGS class.
> (aarch64_class_max_nregs): Likewise.
> * config/aarch64/aarch64.h (FIXED_REGISTERS): add fpmr.
> (CALL_REALLY_USED_REGISTERS): Likewise.
> (REGISTER_NAMES): Likewise.
> (enum reg_class): Add MOVEABLE_SYSREGS class.
> (REG_CLASS_NAMES): Likewise.
> (REG_CLASS_CONTENTS): Update class bitmaps to deal with fpmr,
> the new MOVEABLE_REGS class and renumbering of registers.
> * config/aarch64/aarch64.md: (FPM_REGNUM): added new register
> number, reusing old value.
> (FFR_REGNUM): Renumber.
> (FFRT_REGNUM): Likewise.
> (LOWERING_REGNUM): Likewise.
> (TPIDR2_BLOCK_REGNUM): Likewise.
> (SME_STATE_REGNUM): Likewise.
> (TPIDR2_SETUP_REGNUM): Likewise.
> (ZA_FREE_REGNUM): Likewise.
> (ZA_SAVED_REGNUM): Likewise.
> (ZA_REGNUM): Likewise.
> (ZT0_REGNUM): Likewise.
> (*mov<mode>_aarch64): Add support for moveable sysregs.
> (*movsi_aarch64): Likewise.
> (*movdi_aarch64): Likewise.
> * config/aarch64/constraints.md (MOVEABLE_SYSREGS): New constraint.
>
> gcc/testsuite/ChangeLog:
>
> * gcc.target/aarch64/acle/fp8.c: New tests.
> ---
> gcc/config/aarch64/aarch64.cc | 9 ++
> gcc/config/aarch64/aarch64.h | 14 ++-
> gcc/config/aarch64/aarch64.md | 30 ++++--
> gcc/config/aarch64/constraints.md | 3 +
> gcc/testsuite/gcc.target/aarch64/acle/fp8.c | 107 +++++++++++++++++++-
> 5 files changed, 146 insertions(+), 17 deletions(-)
>
> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> index 0d9e80d85b2..fa526836c6a 100644
> --- a/gcc/config/aarch64/aarch64.cc
> +++ b/gcc/config/aarch64/aarch64.cc
> @@ -2018,6 +2018,7 @@ aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
> case PR_HI_REGS:
> return mode == VNx32BImode ? 2 : 1;
>
> + case MOVEABLE_SYSREGS:
> case FFR_REGS:
> case PR_AND_FFR_REGS:
> case FAKE_REGS:
> @@ -2045,6 +2046,10 @@ aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
> /* This must have the same size as _Unwind_Word. */
> return mode == DImode;
>
> + if (regno == FPM_REGNUM)
> + /* This must have the same size as the FPMR register. */
> + return mode == QImode || mode == HImode || mode == SImode || mode == DImode;
> +
> unsigned int vec_flags = aarch64_classify_vector_mode (mode);
> if (vec_flags == VEC_SVE_PRED)
> return pr_or_ffr_regnum_p (regno);
> @@ -12682,6 +12687,9 @@ aarch64_regno_regclass (unsigned regno)
> if (PR_REGNUM_P (regno))
> return PR_LO_REGNUM_P (regno) ? PR_LO_REGS : PR_HI_REGS;
>
> + if (regno == FPM_REGNUM)
> + return MOVEABLE_SYSREGS;
> +
> if (regno == FFR_REGNUM || regno == FFRT_REGNUM)
> return FFR_REGS;
>
> @@ -13070,6 +13078,7 @@ aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
> case PR_HI_REGS:
> return mode == VNx32BImode ? 2 : 1;
>
> + case MOVEABLE_SYSREGS:
> case STACK_REG:
> case FFR_REGS:
> case PR_AND_FFR_REGS:
> diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
> index 40793aab814..7385bdaa456 100644
> --- a/gcc/config/aarch64/aarch64.h
> +++ b/gcc/config/aarch64/aarch64.h
> @@ -522,6 +522,7 @@ constexpr auto AARCH64_FL_DEFAULT_ISA_MODE ATTRIBUTE_UNUSED
> 1, 1, 1, 1, /* SFP, AP, CC, VG */ \
> 0, 0, 0, 0, 0, 0, 0, 0, /* P0 - P7 */ \
> 0, 0, 0, 0, 0, 0, 0, 0, /* P8 - P15 */ \
> + 1, /* FPMR */ \
> 1, 1, /* FFR and FFRT */ \
> 1, 1, 1, 1, 1, 1, 1, 1 /* Fake registers */ \
> }
> @@ -546,6 +547,7 @@ constexpr auto AARCH64_FL_DEFAULT_ISA_MODE ATTRIBUTE_UNUSED
> 1, 1, 1, 0, /* SFP, AP, CC, VG */ \
> 1, 1, 1, 1, 1, 1, 1, 1, /* P0 - P7 */ \
> 1, 1, 1, 1, 1, 1, 1, 1, /* P8 - P15 */ \
> + 1, /* FPMR */ \
> 1, 1, /* FFR and FFRT */ \
> 0, 0, 0, 0, 0, 0, 0, 0 /* Fake registers */ \
> }
> @@ -563,6 +565,7 @@ constexpr auto AARCH64_FL_DEFAULT_ISA_MODE ATTRIBUTE_UNUSED
> "sfp", "ap", "cc", "vg", \
> "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", \
> "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", \
> + "fpmr", \
> "ffr", "ffrt", \
> "lowering", "tpidr2_block", "sme_state", "tpidr2_setup", \
> "za_free", "za_saved", "za", "zt0" \
> @@ -774,6 +777,7 @@ enum reg_class
> PR_REGS,
> FFR_REGS,
> PR_AND_FFR_REGS,
> + MOVEABLE_SYSREGS,
> FAKE_REGS,
> ALL_REGS,
> LIM_REG_CLASSES /* Last */
> @@ -800,6 +804,7 @@ enum reg_class
> "PR_REGS", \
> "FFR_REGS", \
> "PR_AND_FFR_REGS", \
> + "MOVEABLE_SYSREGS", \
> "FAKE_REGS", \
> "ALL_REGS" \
> }
> @@ -821,10 +826,11 @@ enum reg_class
> { 0x00000000, 0x00000000, 0x00000ff0 }, /* PR_LO_REGS */ \
> { 0x00000000, 0x00000000, 0x000ff000 }, /* PR_HI_REGS */ \
> { 0x00000000, 0x00000000, 0x000ffff0 }, /* PR_REGS */ \
> - { 0x00000000, 0x00000000, 0x00300000 }, /* FFR_REGS */ \
> - { 0x00000000, 0x00000000, 0x003ffff0 }, /* PR_AND_FFR_REGS */ \
> - { 0x00000000, 0x00000000, 0x3fc00000 }, /* FAKE_REGS */ \
> - { 0xffffffff, 0xffffffff, 0x000fffff } /* ALL_REGS */ \
> + { 0x00000000, 0x00000000, 0x00600000 }, /* FFR_REGS */ \
> + { 0x00000000, 0x00000000, 0x006ffff0 }, /* PR_AND_FFR_REGS */ \
> + { 0x00000000, 0x00000000, 0x00100000 }, /* MOVEABLE_SYSREGS */ \
> + { 0x00000000, 0x00000000, 0x7f800000 }, /* FAKE_REGS */ \
> + { 0xffffffff, 0xffffffff, 0x001fffff } /* ALL_REGS */ \
> }
>
> #define REGNO_REG_CLASS(REGNO) aarch64_regno_regclass (REGNO)
> diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
> index 94ff0eefa77..b654c0b9bb8 100644
> --- a/gcc/config/aarch64/aarch64.md
> +++ b/gcc/config/aarch64/aarch64.md
> @@ -107,10 +107,14 @@ (define_constants
> (P14_REGNUM 82)
> (P15_REGNUM 83)
> (LAST_SAVED_REGNUM 83)
> - (FFR_REGNUM 84)
> +
> + ;; Floating Point Mode Register, used in FP8 insns.
> + (FPM_REGNUM 84)
> +
Why not define FPM_REGNUM...
> + (FFR_REGNUM 85)
> ;; "FFR token": a fake register used for representing the scheduling
> ;; restrictions on FFR-related operations.
> - (FFRT_REGNUM 85)
> + (FFRT_REGNUM 86)
>
> ;; ----------------------------------------------------------------
> ;; Fake registers
> @@ -122,17 +126,17 @@ (define_constants
> ;; ABI-related lowering is needed. These placeholders read and
> ;; write this register. Instructions that depend on the lowering
> ;; read the register.
> - (LOWERING_REGNUM 86)
> + (LOWERING_REGNUM 87)
>
> ;; Represents the contents of the current function's TPIDR2 block,
> ;; in abstract form.
> - (TPIDR2_BLOCK_REGNUM 87)
> + (TPIDR2_BLOCK_REGNUM 88)
>
> ;; Holds the value that the current function wants PSTATE.ZA to be.
> ;; The actual value can sometimes vary, because it does not track
> ;; changes to PSTATE.ZA that happen during a lazy save and restore.
> ;; Those effects are instead tracked by ZA_SAVED_REGNUM.
> - (SME_STATE_REGNUM 88)
> + (SME_STATE_REGNUM 89)
>
> ;; Instructions write to this register if they set TPIDR2_EL0 to a
> ;; well-defined value. Instructions read from the register if they
> @@ -140,14 +144,14 @@ (define_constants
> ;;
> ;; The register does not model the architected TPIDR2_ELO, just the
> ;; current function's management of it.
> - (TPIDR2_SETUP_REGNUM 89)
> + (TPIDR2_SETUP_REGNUM 90)
>
> ;; Represents the property "has an incoming lazy save been committed?".
> - (ZA_FREE_REGNUM 90)
> + (ZA_FREE_REGNUM 91)
>
> ;; Represents the property "are the current function's ZA contents
> ;; stored in the lazy save buffer, rather than in ZA itself?".
> - (ZA_SAVED_REGNUM 91)
> + (ZA_SAVED_REGNUM 92)
>
> ;; Represents the contents of the current function's ZA state in
> ;; abstract form. At various times in the function, these contents
> @@ -155,10 +159,10 @@ (define_constants
> ;;
> ;; The contents persist even when the architected ZA is off. Private-ZA
> ;; functions have no effect on its contents.
> - (ZA_REGNUM 92)
> + (ZA_REGNUM 93)
>
> ;; Similarly represents the contents of the current function's ZT0 state.
> - (ZT0_REGNUM 93)
> + (ZT0_REGNUM 94)
>
…. Here as 95. Then you’d avoid having to update all the other regnums to new values. I don’t think we aim to have these register names in alphabetical order...
> (FIRST_FAKE_REGNUM LOWERING_REGNUM)
> (LAST_FAKE_REGNUM ZT0_REGNUM)
> @@ -1405,6 +1409,8 @@ (define_insn "*mov<mode>_aarch64"
> [w, r Z ; neon_from_gp<q>, nosimd ] fmov\t%s0, %w1
> [w, w ; neon_dup , simd ] dup\t%<Vetype>0, %1.<v>[0]
> [w, w ; neon_dup , nosimd ] fmov\t%s0, %s1
> + [Umv, r ; mrs , * ] msr\t%0, %x1
> + [r, Umv ; mrs , * ] mrs\t%x0, %1
> }
> )
>
> @@ -1467,6 +1473,8 @@ (define_insn_and_split "*movsi_aarch64"
> [r , w ; f_mrc , fp , 4] fmov\t%w0, %s1
> [w , w ; fmov , fp , 4] fmov\t%s0, %s1
> [w , Ds ; neon_move, simd, 4] << aarch64_output_scalar_simd_mov_immediate (operands[1], SImode);
> + [Umv, r ; mrs , * , 8] msr\t%0, %x1
> + [r, Umv ; mrs , * , 8] mrs\t%x0, %1
> }
> "CONST_INT_P (operands[1]) && !aarch64_move_imm (INTVAL (operands[1]), SImode)
> && REG_P (operands[0]) && GP_REGNUM_P (REGNO (operands[0]))"
> @@ -1505,6 +1513,8 @@ (define_insn_and_split "*movdi_aarch64"
> [w, w ; fmov , fp , 4] fmov\t%d0, %d1
> [w, Dd ; neon_move, simd, 4] << aarch64_output_scalar_simd_mov_immediate (operands[1], DImode);
> [w, Dx ; neon_move, simd, 8] #
> + [Umv, r; mrs , * , 8] msr\t%0, %x1
> + [r, Umv; mrs , * , 8] mrs\t%x0, %1
> }
> "CONST_INT_P (operands[1])
> && REG_P (operands[0])
> diff --git a/gcc/config/aarch64/constraints.md b/gcc/config/aarch64/constraints.md
> index a2569cea510..0c81fb28f7e 100644
> --- a/gcc/config/aarch64/constraints.md
> +++ b/gcc/config/aarch64/constraints.md
> @@ -77,6 +77,9 @@ (define_register_constraint "Upl" "PR_LO_REGS"
> (define_register_constraint "Uph" "PR_HI_REGS"
> "SVE predicate registers p8 - p15.")
>
> +(define_register_constraint "Umv" "MOVEABLE_SYSREGS"
> + "@internal System Registers suitable for moving rather than requiring an unspec msr")
> +
> (define_constraint "c"
> "@internal The condition code register."
> (match_operand 0 "cc_register"))
> diff --git a/gcc/testsuite/gcc.target/aarch64/acle/fp8.c b/gcc/testsuite/gcc.target/aarch64/acle/fp8.c
> index b774f28c9f0..10fd128d8f9 100644
> --- a/gcc/testsuite/gcc.target/aarch64/acle/fp8.c
> +++ b/gcc/testsuite/gcc.target/aarch64/acle/fp8.c
> @@ -1,14 +1,14 @@
> /* Test the fp8 ACLE intrinsics family. */
> /* { dg-do compile } */
> /* { dg-options "-O1 -march=armv8-a" } */
> -/* { dg-final { check-function-bodies "**" "" } } */
> +/* { dg-final { check-function-bodies "**" "" "" } } */
>
> #ifdef __ARM_FEATURE_FP8
> #error "__ARM_FEATURE_FP8 feature macro defined."
> #endif
>
> #pragma GCC push_options
> -#pragma GCC target ("arch=armv9.4-a+fp8")
> +#pragma GCC target("arch=armv9.4-a+fp8")
>
Some unnecessary whitespace changes?
Anyway, this feature macro test should be at the end of the FP8 support as discussed in patch 1/3.
Thanks,
Kyrill
> #include <arm_acle.h>
>
> @@ -16,4 +16,105 @@
> #error "__ARM_FEATURE_FP8 feature macro not defined."
> #endif
>
> -#pragma GCC pop_options
> +/*
> +**test_write_fpmr_sysreg_asm_64:
> +** msr fpmr, x0
> +** ret
> +*/
> +void
> +test_write_fpmr_sysreg_asm_64 (uint64_t val)
> +{
> + register uint64_t fpmr asm ("fpmr") = val;
> + asm volatile ("" ::"Umv"(fpmr));
> +}
> +
> +/*
> +**test_write_fpmr_sysreg_asm_32:
> +** uxtw x0, w0
> +** msr fpmr, x0
> +** ret
> +*/
> +void
> +test_write_fpmr_sysreg_asm_32 (uint32_t val)
> +{
> + register uint64_t fpmr asm ("fpmr") = val;
> + asm volatile ("" ::"Umv"(fpmr));
> +}
> +
> +/*
> +**test_write_fpmr_sysreg_asm_16:
> +** and x0, x0, 65535
> +** msr fpmr, x0
> +** ret
> +*/
> +void
> +test_write_fpmr_sysreg_asm_16 (uint16_t val)
> +{
> + register uint64_t fpmr asm ("fpmr") = val;
> + asm volatile ("" ::"Umv"(fpmr));
> +}
> +
> +/*
> +**test_write_fpmr_sysreg_asm_8:
> +** and x0, x0, 255
> +** msr fpmr, x0
> +** ret
> +*/
> +void
> +test_write_fpmr_sysreg_asm_8 (uint8_t val)
> +{
> + register uint64_t fpmr asm ("fpmr") = val;
> + asm volatile ("" ::"Umv"(fpmr));
> +}
> +
> +/*
> +**test_read_fpmr_sysreg_asm_64:
> +** mrs x0, fpmr
> +** ret
> +*/
> +uint64_t
> +test_read_fpmr_sysreg_asm_64 ()
> +{
> + register uint64_t fpmr asm ("fpmr");
> + asm volatile ("" : "=Umv"(fpmr) :);
> + return fpmr;
> +}
> +
> +/*
> +**test_read_fpmr_sysreg_asm_32:
> +** mrs x0, fpmr
> +** ret
> +*/
> +uint32_t
> +test_read_fpmr_sysreg_asm_32 ()
> +{
> + register uint32_t fpmr asm ("fpmr");
> + asm volatile ("" : "=Umv"(fpmr) :);
> + return fpmr;
> +}
> +
> +/*
> +**test_read_fpmr_sysreg_asm_16:
> +** mrs x0, fpmr
> +** ret
> +*/
> +uint16_t
> +test_read_fpmr_sysreg_asm_16 ()
> +{
> + register uint16_t fpmr asm ("fpmr");
> + asm volatile ("" : "=Umv"(fpmr) :);
> + return fpmr;
> +}
> +
> +/*
> +**test_read_fpmr_sysreg_asm_8:
> +** mrs x0, fpmr
> +** ret
> +*/
> +uint8_t
> +test_read_fpmr_sysreg_asm_8 ()
> +{
> + register uint8_t fpmr asm ("fpmr");
> + asm volatile ("" : "=Umv"(fpmr) :);
> + return fpmr;
> +}
More information about the Gcc-patches
mailing list