RE: [PATCH][AArch64] Enable fusion of AES instructions
- From: Wilco Dijkstra <Wilco.Dijkstra@arm.com>
- To: Kyrylo Tkachov <Kyrylo.Tkachov@arm.com>
- Cc: 'GCC Patches' <gcc-patches@gcc.gnu.org>, nd <nd@arm.com>
- Date: Tue, 15 Dec 2015 11:05:22 +0000
- Subject: RE: [PATCH][AArch64] Enable fusion of AES instructions
- References: <000001d1067c$1c5021e0$54f065a0$@com> <56267012.90509@arm.com>
Kyrill Tkachov wrote:
> On 14/10/15 13:30, Wilco Dijkstra wrote:
> > Enable instruction fusion of dependent AESE; AESMC and AESD; AESIMC pairs. This can give up to 2x
> > speedup on many AArch64 implementations. Also model the crypto instructions on Cortex-A57 according
> > to the Optimization Guide.
> >
> > Passes regression tests.
>
> arm-wise this is OK, but I'd like a follow-up patch to enable this fusion
> for the arm port as well. It should be fairly simple.
> Just add a new enum value to fuse_ops inside tune_params in arm-protos.h
> and update the arm implementation in aarch_macro_fusion_pair_p similar
> to your aarch64 implementation.
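
(A rough sketch of the AArch32 hook Kyrill describes, assuming arm's
tune_params carries a fusible_ops bitmask like the aarch64 one; the
FUSE_AES_AESMC name and bit value are illustrative, not the committed
patch:)

/* In arm-protos.h, add a new bit to the fuse_ops values, e.g.
   FUSE_AES_AESMC = (1 << 1).  In the arm implementation of
   aarch_macro_fusion_pair_p, test it the same way the aarch64
   port does below:  */

if ((current_tune->fusible_ops & FUSE_AES_AESMC)
    && aarch_crypto_can_dual_issue (prev, curr))
  return true;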
I sent out a patch for AArch32 as well. Assuming you're still OK, could you commit this please?
Wilco
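
For reference, the dependent AESE;AESMC chains this fusion targets can be
written with the ACLE crypto intrinsics (an illustration, not part of the
patch; build with something like -march=armv8-a+crypto):

#include <arm_neon.h>

/* Two AES rounds: each vaeseq_u8 result feeds vaesmcq_u8 directly, so
   the scheduler can keep every AESE;AESMC pair adjacent and the
   define_bypass in the patch below gives the pair a zero-cycle
   forward.  */
uint8x16_t
aes_two_rounds (uint8x16_t block, uint8x16_t key0, uint8x16_t key1)
{
  block = vaesmcq_u8 (vaeseq_u8 (block, key0));	/* aese; aesmc */
  block = vaesmcq_u8 (vaeseq_u8 (block, key1));	/* aese; aesmc */
  return block;
}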
> > ChangeLog:
> > 2015-10-14 Wilco Dijkstra <wdijkstr@arm.com>
> >
> > * gcc/config/aarch64/aarch64.c (cortexa53_tunings): Add AES fusion.
> > (cortexa57_tunings): Likewise.
> > (cortexa72_tunings): Likewise.
> > (aarch_macro_fusion_pair_p): Add support for AES fusion.
> > * gcc/config/aarch64/aarch64-fusion-pairs.def: Add AES_AESMC entry.
> > * gcc/config/arm/aarch-common.c (aarch_crypto_can_dual_issue):
> > Allow virtual registers before reload so early scheduling works.
> > * gcc/config/arm/cortex-a57.md (cortex_a57_crypto_simple): Use
> > correct latency and pipeline.
> > (cortex_a57_crypto_complex): Likewise.
> > (cortex_a57_crypto_xor): Likewise.
> > (define_bypass): Add AES bypass.
> >
> >
> > ---
> > gcc/config/aarch64/aarch64-fusion-pairs.def | 1 +
> > gcc/config/aarch64/aarch64.c | 10 +++++++---
> > gcc/config/arm/aarch-common.c | 7 +++++--
> > gcc/config/arm/cortex-a57.md | 17 +++++++++++------
> > 4 files changed, 24 insertions(+), 11 deletions(-)
> >
> > diff --git a/gcc/config/aarch64/aarch64-fusion-pairs.def
> > b/gcc/config/aarch64/aarch64-fusion-pairs.def
> > index 53bbef4..fea79fc 100644
> > --- a/gcc/config/aarch64/aarch64-fusion-pairs.def
> > +++ b/gcc/config/aarch64/aarch64-fusion-pairs.def
> > @@ -33,4 +33,5 @@ AARCH64_FUSION_PAIR ("adrp+add", ADRP_ADD)
> > AARCH64_FUSION_PAIR ("movk+movk", MOVK_MOVK)
> > AARCH64_FUSION_PAIR ("adrp+ldr", ADRP_LDR)
> > AARCH64_FUSION_PAIR ("cmp+branch", CMP_BRANCH)
> > +AARCH64_FUSION_PAIR ("aes+aesmc", AES_AESMC)
> >
> > diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
> > index 230902d..96368c6 100644
> > --- a/gcc/config/aarch64/aarch64.c
> > +++ b/gcc/config/aarch64/aarch64.c
> > @@ -376,7 +376,7 @@ static const struct tune_params cortexa53_tunings =
> > &generic_branch_cost,
> > 4, /* memmov_cost */
> > 2, /* issue_rate */
> > - (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
> > + (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
> > | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
> > 8, /* function_align. */
> > 8, /* jump_align. */
> > @@ -398,7 +398,7 @@ static const struct tune_params cortexa57_tunings =
> > &generic_branch_cost,
> > 4, /* memmov_cost */
> > 3, /* issue_rate */
> > - (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
> > + (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
> > | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
> > 16, /* function_align. */
> > 8, /* jump_align. */
> > @@ -420,7 +420,7 @@ static const struct tune_params cortexa72_tunings =
> > &generic_branch_cost,
> > 4, /* memmov_cost */
> > 3, /* issue_rate */
> > - (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
> > + (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
> > | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
> > 16, /* function_align. */
> > 8, /* jump_align. */
> > @@ -12843,6 +12843,10 @@ aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
> > }
> > }
> >
> > + if ((aarch64_tune_params.fusible_ops & AARCH64_FUSE_AES_AESMC)
> > + && aarch_crypto_can_dual_issue (prev, curr))
> > + return true;
> > +
> > if ((aarch64_tune_params.fusible_ops & AARCH64_FUSE_CMP_BRANCH)
> > && any_condjump_p (curr))
> > {
> > diff --git a/gcc/config/arm/aarch-common.c b/gcc/config/arm/aarch-common.c
> > index 5dd8222..e191ab6 100644
> > --- a/gcc/config/arm/aarch-common.c
> > +++ b/gcc/config/arm/aarch-common.c
> > @@ -63,8 +63,11 @@ aarch_crypto_can_dual_issue (rtx_insn *producer_insn, rtx_insn *consumer_insn)
> > {
> > unsigned int regno = REGNO (SET_DEST (producer_set));
> >
> > - return REGNO (SET_DEST (consumer_set)) == regno
> > - && REGNO (XVECEXP (consumer_src, 0, 0)) == regno;
> > + /* Before reload the registers are virtual, so the destination of
> > + consumer_set doesn't need to match. */
> > +
> > + return (REGNO (SET_DEST (consumer_set)) == regno || !reload_completed)
> > + && REGNO (XVECEXP (consumer_src, 0, 0)) == regno;
> > }
> >
> > return 0;
> > diff --git a/gcc/config/arm/cortex-a57.md b/gcc/config/arm/cortex-a57.md
> > index a32c848..eab9d99 100644
> > --- a/gcc/config/arm/cortex-a57.md
> > +++ b/gcc/config/arm/cortex-a57.md
> > @@ -745,20 +745,20 @@
> > neon_fp_sqrt_s_q, neon_fp_sqrt_d_q"))
> > "ca57_cx2_block*3")
> >
> > -(define_insn_reservation "cortex_a57_crypto_simple" 4
> > +(define_insn_reservation "cortex_a57_crypto_simple" 3
> > (and (eq_attr "tune" "cortexa57")
> > (eq_attr "type" "crypto_aese,crypto_aesmc,crypto_sha1_fast,crypto_sha256_fast"))
> > - "ca57_cx2")
> > + "ca57_cx1")
> >
> > -(define_insn_reservation "cortex_a57_crypto_complex" 7
> > +(define_insn_reservation "cortex_a57_crypto_complex" 6
> > (and (eq_attr "tune" "cortexa57")
> > (eq_attr "type" "crypto_sha1_slow,crypto_sha256_slow"))
> > - "ca57_cx2+(ca57_cx2_issue,ca57_cx2)")
> > + "ca57_cx1*2")
> >
> > -(define_insn_reservation "cortex_a57_crypto_xor" 7
> > +(define_insn_reservation "cortex_a57_crypto_xor" 6
> > (and (eq_attr "tune" "cortexa57")
> > (eq_attr "type" "crypto_sha1_xor"))
> > - "(ca57_cx1+ca57_cx2)")
> > + "(ca57_cx1*2)|(ca57_cx2*2)")
> >
> > ;; We lie with calls. They take up all issue slots, but are otherwise
> > ;; not harmful.
> > @@ -795,3 +795,8 @@
> > (define_bypass 1 "cortex_a57_*"
> > "cortex_a57_call,cortex_a57_branch")
> >
> > +;; AESE+AESMC and AESD+AESIMC pairs forward with zero latency
> > +(define_bypass 0 "cortex_a57_crypto_simple"
> > + "cortex_a57_crypto_simple"
> > + "aarch_crypto_can_dual_issue")
> > +
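
With the fusion, the scheduling model and the bypass above, the scheduler
should keep each dependent pair adjacent, producing an instruction stream
along these lines (a hand-written illustration, not compiler output):

	aese	v0.16b, v1.16b	// round 1: AESE ...
	aesmc	v0.16b, v0.16b	// ... kept adjacent to its dependent AESMC
	aese	v0.16b, v2.16b	// round 2 pairs up the same way
	aesmc	v0.16b, v0.16b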