This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.


Re: [PATCH][aarch64] Avoid tag collisions for loads on falkor


Hi Siddhesh,

On 02/07/18 10:15, Siddhesh Poyarekar wrote:
Hi,

This is a rewrite of the tag collision avoidance patch that Kugan had
written as a machine reorg pass back in February[1].

The falkor hardware prefetching system uses a combination of the
source, destination and offset to decide which prefetcher unit to
train with the load.  This is great when loads in a loop are
sequential but sub-optimal if there are unrelated loads in a loop that
tag to the same prefetcher unit.

This pass attempts to rename the destination register of such colliding
loads using routines available in regrename.c so that their tags do
not collide.  This shows some performance gains with mcf and xalancbmk
(~5% each) and will be tweaked further.  The pass is placed near the
tail end of the pass list so that subsequent passes don't inadvertently
end up undoing the renames.

A full gcc bootstrap and testsuite ran successfully on aarch64, i.e. it
did not introduce any new regressions.  I also did a make check with
-mcpu=falkor to ensure that there were no regressions.  The couple of
regressions I found were target-specific, related to scheduling and
cost differences rather than correctness.


Nice! What were the regressions though? Would be nice to adjust the tests
to make them more robust so that we have as clean a testsuite as possible.

[1] https://patchwork.ozlabs.org/patch/872532/

2018-07-02  Siddhesh Poyarekar  <siddhesh@sourceware.org>
            Kugan Vivekanandarajah  <kugan.vivekanandarajah@linaro.org>

        * config/aarch64/falkor-tag-collision-avoidance.c: New file.
        * config.gcc (extra_objs): Build it.
        * config/aarch64/t-aarch64 (falkor-tag-collision-avoidance.o):
        Likewise.
        * config/aarch64/aarch64-passes.def
        (pass_tag_collision_avoidance): New pass.
        * config/aarch64/aarch64.c (qdf24xx_tunings): Add
        AARCH64_EXTRA_TUNE_RENAME_LOAD_REGS to tuning_flags.
        (aarch64_classify_address): Remove static qualifier.
        (aarch64_address_info, aarch64_address_type): Move to...
        * config/aarch64/aarch64-protos.h: ... here.
        (make_pass_tag_collision_avoidance): New function.
        * config/aarch64/aarch64-tuning-flags.def (rename_load_regs):
        New tuning flag.


More comments inline, but a general observation:
in the function comments for the new functions can you please include a description
of the function arguments and the meaning of the return value (for example, some functions return -1; what does that mean?).
It really does make it much easier to maintain the code after some time has passed.
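
For example, for rename_chain below, something along these lines (just a sketch, reword as appropriate):

/* Try to rename the def/use chain HEAD, which contains the destination
   register of the load described by INSN_INFO, picking a register whose
   tag does not collide with those recorded in TAG_MAP.  Return the new
   register number on success, or -1 if the chain could not be renamed.  */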

Thanks,
Kyrill

---
 gcc/config.gcc                                |   2 +-
 gcc/config/aarch64/aarch64-passes.def         |   1 +
 gcc/config/aarch64/aarch64-protos.h           |  49 ++
 gcc/config/aarch64/aarch64-tuning-flags.def   |   2 +
 gcc/config/aarch64/aarch64.c                  |  48 +-
 .../aarch64/falkor-tag-collision-avoidance.c  | 821 ++++++++++++++++++
 gcc/config/aarch64/t-aarch64                  |   9 +
 8 files changed, 891 insertions(+), 46 deletions(-)
 create mode 100644 gcc/config/aarch64/falkor-tag-collision-avoidance.c

diff --git a/gcc/config.gcc b/gcc/config.gcc
index 4d9f9c6ea29..b78a30f5d69 100644
--- a/gcc/config.gcc
+++ b/gcc/config.gcc
@@ -304,7 +304,7 @@ aarch64*-*-*)
         extra_headers="arm_fp16.h arm_neon.h arm_acle.h"
         c_target_objs="aarch64-c.o"
         cxx_target_objs="aarch64-c.o"
-       extra_objs="aarch64-builtins.o aarch-common.o cortex-a57-fma-steering.o"
+       extra_objs="aarch64-builtins.o aarch-common.o cortex-a57-fma-steering.o falkor-tag-collision-avoidance.o"
         target_gtfiles="\$(srcdir)/config/aarch64/aarch64-builtins.c"
         target_has_targetm_common=yes
         ;;
diff --git a/gcc/config/aarch64/aarch64-passes.def b/gcc/config/aarch64/aarch64-passes.def
index 87747b420b0..f61a8870aa1 100644
--- a/gcc/config/aarch64/aarch64-passes.def
+++ b/gcc/config/aarch64/aarch64-passes.def
@@ -19,3 +19,4 @@
    <http://www.gnu.org/licenses/>. */

 INSERT_PASS_AFTER (pass_regrename, 1, pass_fma_steering);
+INSERT_PASS_AFTER (pass_machine_reorg, 1, pass_tag_collision_avoidance);
diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index 4ea50acaa59..175a3faf057 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -283,6 +283,49 @@ struct tune_params
   const struct cpu_prefetch_tune *prefetch;
 };

+/* Classifies an address.
+
+   ADDRESS_REG_IMM
+       A simple base register plus immediate offset.
+
+   ADDRESS_REG_WB
+       A base register indexed by immediate offset with writeback.
+
+   ADDRESS_REG_REG
+       A base register indexed by (optionally scaled) register.
+
+   ADDRESS_REG_UXTW
+       A base register indexed by (optionally scaled) zero-extended register.
+
+   ADDRESS_REG_SXTW
+       A base register indexed by (optionally scaled) sign-extended register.
+
+   ADDRESS_LO_SUM
+       A LO_SUM rtx with a base register and "LO12" symbol relocation.
+
+   ADDRESS_SYMBOLIC:
+       A constant symbolic address, in pc-relative literal pool.  */
+
+enum aarch64_address_type {
+  ADDRESS_REG_IMM,
+  ADDRESS_REG_WB,
+  ADDRESS_REG_REG,
+  ADDRESS_REG_UXTW,
+  ADDRESS_REG_SXTW,
+  ADDRESS_LO_SUM,
+  ADDRESS_SYMBOLIC
+};
+
+/* Address information.  */
+struct aarch64_address_info {
+  enum aarch64_address_type type;
+  rtx base;
+  rtx offset;
+  poly_int64 const_offset;
+  int shift;
+  enum aarch64_symbol_type symbol_type;
+};
+
 #define AARCH64_FUSION_PAIR(x, name) \
   AARCH64_FUSE_##name##_index,
 /* Supported fusion operations.  */
@@ -546,6 +589,11 @@ void aarch64_swap_ldrstr_operands (rtx *, bool);
 extern void aarch64_asm_output_pool_epilogue (FILE *, const char *,
                                               tree, HOST_WIDE_INT);

+
+extern bool aarch64_classify_address (struct aarch64_address_info *, rtx,
+                                     machine_mode, bool,
+                                     aarch64_addr_query_type = ADDR_QUERY_M);
+
 /* Defined in common/config/aarch64-common.c.  */
 bool aarch64_handle_option (struct gcc_options *, struct gcc_options *,
                              const struct cl_decoded_option *, location_t);
@@ -556,6 +604,7 @@ std::string aarch64_get_extension_string_for_isa_flags (unsigned long,
                                                         unsigned long);

 rtl_opt_pass *make_pass_fma_steering (gcc::context *ctxt);
+rtl_opt_pass *make_pass_tag_collision_avoidance (gcc::context *ctxt);

 poly_uint64 aarch64_regmode_natural_size (machine_mode);

diff --git a/gcc/config/aarch64/aarch64-tuning-flags.def b/gcc/config/aarch64/aarch64-tuning-flags.def
index ea9ead234cb..2bfb470d605 100644
--- a/gcc/config/aarch64/aarch64-tuning-flags.def
+++ b/gcc/config/aarch64/aarch64-tuning-flags.def
@@ -41,4 +41,6 @@ AARCH64_EXTRA_TUNING_OPTION ("slow_unaligned_ldpw", SLOW_UNALIGNED_LDPW)
 /* Disallow load/store pair instructions on Q-registers. */
 AARCH64_EXTRA_TUNING_OPTION ("no_ldp_stp_qregs", NO_LDP_STP_QREGS)

+AARCH64_EXTRA_TUNING_OPTION ("rename_load_regs", RENAME_LOAD_REGS)
+
 #undef AARCH64_EXTRA_TUNING_OPTION
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index afc91850d6f..1452ec71803 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -78,48 +78,6 @@
 /* Defined for convenience.  */
 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)

-/* Classifies an address.
-
-   ADDRESS_REG_IMM
-       A simple base register plus immediate offset.
-
-   ADDRESS_REG_WB
-       A base register indexed by immediate offset with writeback.
-
-   ADDRESS_REG_REG
-       A base register indexed by (optionally scaled) register.
-
-   ADDRESS_REG_UXTW
-       A base register indexed by (optionally scaled) zero-extended register.
-
-   ADDRESS_REG_SXTW
-       A base register indexed by (optionally scaled) sign-extended register.
-
-   ADDRESS_LO_SUM
-       A LO_SUM rtx with a base register and "LO12" symbol relocation.
-
-   ADDRESS_SYMBOLIC:
-       A constant symbolic address, in pc-relative literal pool.  */
-
-enum aarch64_address_type {
-  ADDRESS_REG_IMM,
-  ADDRESS_REG_WB,
-  ADDRESS_REG_REG,
-  ADDRESS_REG_UXTW,
-  ADDRESS_REG_SXTW,
-  ADDRESS_LO_SUM,
-  ADDRESS_SYMBOLIC
-};
-
-struct aarch64_address_info {
-  enum aarch64_address_type type;
-  rtx base;
-  rtx offset;
-  poly_int64 const_offset;
-  int shift;
-  enum aarch64_symbol_type symbol_type;
-};
-
 /* Information about a legitimate vector immediate operand.  */
 struct simd_immediate_info
 {
@@ -906,7 +864,7 @@ static const struct tune_params qdf24xx_tunings =
   2,   /* min_div_recip_mul_df.  */
   0,   /* max_case_values.  */
   tune_params::AUTOPREFETCHER_WEAK,    /* autoprefetcher_model.  */
-  (AARCH64_EXTRA_TUNE_NONE),           /* tune_flags.  */
+  AARCH64_EXTRA_TUNE_RENAME_LOAD_REGS, /* tune_flags.  */
   &qdf24xx_prefetch_tune
 };

@@ -5697,10 +5655,10 @@ virt_or_elim_regno_p (unsigned regno)
    If it is, fill in INFO appropriately.  STRICT_P is true if
    REG_OK_STRICT is in effect.  */

-static bool
+bool
 aarch64_classify_address (struct aarch64_address_info *info,
                           rtx x, machine_mode mode, bool strict_p,
-                         aarch64_addr_query_type type = ADDR_QUERY_M)
+                         aarch64_addr_query_type type)
 {
   enum rtx_code code = GET_CODE (x);
   rtx op0, op1;
diff --git a/gcc/config/aarch64/falkor-tag-collision-avoidance.c b/gcc/config/aarch64/falkor-tag-collision-avoidance.c
new file mode 100644
index 00000000000..b31c13077b0
--- /dev/null
+++ b/gcc/config/aarch64/falkor-tag-collision-avoidance.c
@@ -0,0 +1,821 @@
+/* Tag Collision Avoidance pass for Falkor.
+   Copyright (C) 2018 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+   General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with GCC; see the file COPYING3.  If not see
+   <http://www.gnu.org/licenses/>. */
+
+#define IN_TARGET_CODE 1
+
+#include "config.h"
+#define INCLUDE_LIST
+#include "system.h"
+#include "coretypes.h"
+#include "backend.h"
+#include "target.h"
+#include "rtl.h"
+#include "tree.h"
+#include "tree-pass.h"
+#include "aarch64-protos.h"
+#include "hash-map.h"
+#include "cfgloop.h"
+#include "cfgrtl.h"
+#include "rtl-iter.h"
+#include "df.h"
+#include "memmodel.h"
+#include "optabs.h"
+#include "regs.h"
+#include "recog.h"
+#include "regrename.h"
+#include "print-rtl.h"
+
+/* The Falkor hardware prefetching system uses the encoding of the registers and
+   offsets of loads to decide which of the multiple hardware prefetchers to
+   assign the load to.  This has the positive effect of accelerating prefetches
+   when all related loads with uniform strides are assigned to the same
+   prefetcher unit.  The down side is that because of the way the assignment
+   works, multiple unrelated loads may end up on the same prefetch unit, thus
+   causing the unit to bounce between different sets of addresses and never
+   train correctly.  The point of this pass is to avoid such collisions so that
+   unrelated loads are spread out to different prefetchers.  It also makes a
+   rudimentarny attempt to ensure that related loads with the same tags don't
+   get moved out unnecessarily.

s/rudimentarny/rudimentary/

+
+   Perhaps a future enhancement would be to make a more concerted attempt to
+   get related loads under the same tag.  See the memcpy/memset implementation
+   for falkor in glibc to understand the kind of impact this can have on
+   falkor.
+
+   The assignment of loads is based on a tag that is computed from the encoding
+   of the first destination register (the only destination in case of LDR), the
+   base register and the offset (either the register or the immediate value, as
+   encoded in the instruction).  This is what the 14 bit tag looks like:
+
+   |<- 6 bits ->|<- 4b ->|<- 4b ->|
+   --------------------------------
+   |  OFFSET    |  SRC   |  DST   |
+   --------------------------------
+
+   For all cases, the SRC and DST are the 4 LSB of the encoding of the register
+   in the instruction.  Offset computation is more involved and is as follows:
+
+   - For register offset addressing: 4 LSB of the offset register with the MSB
+     of the 6 bits set to 1.
+
+   - For immediate offset: 4 LSB of the encoded immediate offset.  The encoding
+     depends on the width of the load and is expressed as multiples of the
+     width.
+
+   - For loads with update: 4 LSB of the offset.  The encoding here is the
+     exact number by which the base is offset and incremented.
+
+   Based on the above it is clear that registers 0 and 16 will result in
+   collisions, 1 and 17 and so on.  This pass detects such collisions within a
+   def/use chain of the source register in a loop and tries to resolve the
+   collision by renaming one of the destination registers. */
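
To make the encoding above concrete, a worked example (my own arithmetic from
the rules quoted above): for a register-offset load such as ldr x1, [x2, x3],
DST is 1, SRC is 2 and OFFSET is (1 << 5) | 3 = 0x23, giving the tag
(0x23 << 8) | (2 << 4) | 1 == 0x2321.  The same load with x17 as the
destination produces the identical tag, since only the 4 LSB of the register
number are encoded -- exactly the 0/16, 1/17 collision described.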
+
+/* Get the destination part of the tag.  */
+#define TAG_GET_DEST(__tag) ((__tag) & 0xf)
+
+/* Get the tag with the destination part updated.  */
+#define TAG_UPDATE_DEST(__tag, __dest) (((__tag) & ~0xf) | ((__dest) & 0xf))
+
+/* The instruction information structure.  This is used to cache information
+   about the INSN that we derive when traversing through all of the insns in
+   loops.  */
+class tag_insn_info
+{
+public:
+  rtx_insn *insn;
+  rtx base;
+  rtx dest;
+  rtx offset;
+  bool writeback;
+  bool ldp;
+
+  tag_insn_info (rtx_insn *insn, rtx dest, rtx base, rtx offset,
+                bool writeback, bool ldp)
+    {
+      this->insn = insn;
+      this->dest = dest;
+      this->base = base;
+      this->offset = offset;
+      this->writeback = writeback;
+      this->ldp = ldp;
+    }
+

Since this is C++ you can write it as the more idiomatic constructor initialiser list (I think that's what it's called):
tag_insn_info (rtx_insn *i, rtx b, rtx d, rtx o, bool wr, bool l) : insn (i), base (b), dest (d) etc.
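
i.e. something like (untested, keeping the patch's argument order):

  tag_insn_info (rtx_insn *insn_, rtx dest_, rtx base_, rtx offset_,
                 bool writeback_, bool ldp_)
    : insn (insn_), dest (dest_), base (base_), offset (offset_),
      writeback (writeback_), ldp (ldp_)
  {}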

+  /* Compute the tag based on BASE, DEST and OFFSET of the load.  */
+  unsigned tag ()
+    {
+      unsigned int_offset = 0;
+      rtx offset = this->offset;
+      unsigned dest = REGNO (this->dest);
+      unsigned base = REGNO (this->base);
+      machine_mode dest_mode = GET_MODE (this->dest);
+      unsigned dest_mode_size = GET_MODE_SIZE (dest_mode).to_constant ();
+

I appreciate this pass is unlikely to be used with SVE code but it would be nice if we could make it
variable-width-mode-proof. Current practice is to add a comment to .to_constant () calls explaining why
we guarantee that the size is constant, or otherwise check is_constant () and have appropriate fallbacks.
Check other uses of to_constant () and is_constant () in aarch64.c for examples. This applies to all uses
of to_constant () in this file.
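
Something like this shape, for instance (a sketch only; pick whatever fallback
is right for the tag computation):

  unsigned int dest_mode_size;
  if (!GET_MODE_SIZE (dest_mode).is_constant (&dest_mode_size))
    /* Variable-width (SVE) modes can't be tagged meaningfully; punt.  */
    return 0;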

+      /* For loads larger than 16 bytes, the DEST part of the tag is 0.  */
+      if ((dest_mode_size << this->ldp) > 16)
+       dest = 0;
+
+      if (offset && REG_P (offset))
+       int_offset = (1 << 5) | REGNO (offset);
+      else if (offset && CONST_INT_P (offset))
+       {
+         int_offset = INTVAL (offset);
+         int_offset /= GET_MODE_SIZE (dest_mode).to_constant ();
+         if (!this->writeback)
+           int_offset >>= 2;
+       }
+      return ((dest & 0xf)
+             | ((base & 0xf) << 4)
+             | ((int_offset & 0x3f) << 8));
+    }
+};
+
+/* Hash map to traverse and process instructions with colliding tags.  */
+typedef hash_map <rtx, auto_vec <tag_insn_info *> > tag_map_t;
+
+/* Vector of instructions with colliding tags.  */
+typedef auto_vec <tag_insn_info *> insn_info_list_t;
+
+/* Pair of instruction information and unavailable register set to pass to
+   CHECK_COLLIDING_TAGS.  */
+typedef std::pair <tag_insn_info *, HARD_REG_SET *> arg_pair_t;
+
+
+/* Callback to free all tag_insn_info objects.  */
+bool
+free_insn_info (const rtx &t ATTRIBUTE_UNUSED, insn_info_list_t *v,
+               void *arg ATTRIBUTE_UNUSED)
+{
+  while (v->length () > 0)
+    delete v->pop ();
+
+  return true;
+}
+
+
+/* Add all aliases of the register to the unavailable register set.  */
+static void
+ignore_all_aliases (HARD_REG_SET *unavailable, machine_mode mode, unsigned reg)
+{
+  add_to_hard_reg_set (unavailable, mode, reg);
+  add_to_hard_reg_set (unavailable, mode, reg + 16);
+  add_to_hard_reg_set (unavailable, mode, reg + 32);
+  add_to_hard_reg_set (unavailable, mode, reg + 48);
+}
+
+
+/* Callback to check which destination registers are unavailable to us for
+   renaming because of the base and offset colliding.  */
+bool
+check_colliding_tags (const rtx &t, const insn_info_list_t &v, arg_pair_t *arg)
+{
+  HARD_REG_SET *unavailable = arg->second;
+  unsigned orig_tag = arg->first->tag ();
+  unsigned tag = INTVAL (t);
+  machine_mode mode = GET_MODE (arg->first->dest);
+
+  /* Can't collide with emptiness.  */
+  if (v.length () == 0)
+    return true;
+
+  /* Drop all aliased destination registers that result in the same
+     tag.  It is not necessary to drop all of them but we do anyway
+     because it is quicker than checking ranges.  */
+  if (TAG_UPDATE_DEST (tag, 0) == TAG_UPDATE_DEST (orig_tag, 0))
+    ignore_all_aliases (unavailable, mode, TAG_GET_DEST (tag));
+
+  return true;
+}
+
+
+/* Initialize and build a set of hard register numbers to avoid for
+   renaming.  */
+static enum reg_class
+init_unavailable (tag_insn_info *insn_info, tag_map_t &tag_map, du_head_p head,
+                 HARD_REG_SET *unavailable)
+{
+  unsigned dest = head->regno;
+  enum reg_class super_class = NO_REGS;
+  machine_mode mode = GET_MODE (insn_info->dest);
+
+  CLEAR_HARD_REG_SET (*unavailable);
+
+  for (struct du_chain *tmp = head->first; tmp; tmp = tmp->next_use)
+    {
+      if (DEBUG_INSN_P (tmp->insn))
+       continue;
+
+      IOR_COMPL_HARD_REG_SET (*unavailable, reg_class_contents[tmp->cl]);
+      super_class = reg_class_superunion[(int) super_class][(int) tmp->cl];
+    }
+
+  for (unsigned i = 0; i < FIRST_PSEUDO_REGISTER; i++)
+    if (fixed_regs[i] || global_regs[i])
+      add_to_hard_reg_set (unavailable, mode, i);
+
+  arg_pair_t arg = arg_pair_t (insn_info, unavailable);
+
+  /* Exclude all registers that would lead to collisions with other loads.  */
+  tag_map.traverse <arg_pair_t *, check_colliding_tags> (&arg);
+
+  /* Finally, also ignore all aliases of the current reg. */
+  ignore_all_aliases (unavailable, mode, dest & 0xf);
+
+  return super_class;
+}
+
+
+/* Find a suitable and available register and rename the chain of occurrences
+   HEAD of the register in which the insn in INSN_INFO exists.  TAG_MAP is
+   used to determine which registers are unavailable due to a potential
+   collision caused by the rename.  */
+static int
+rename_chain (tag_insn_info *insn_info, tag_map_t &tag_map, du_head_p head)
+{
+  unsigned dest_regno = head->regno;
+
+  if (head->cannot_rename || head->renamed)
+    return -1;
+
+  HARD_REG_SET unavailable;
+
+  enum reg_class super_class = init_unavailable (insn_info, tag_map, head,
+                                                 &unavailable);
+
+  unsigned new_regno = find_rename_reg (head, super_class, &unavailable,
+                                       dest_regno, false);
+
+  /* Attempt to rename as long as regrename doesn't just throw the same
+     register at us.  */
+  if (new_regno != dest_regno && regrename_do_replace (head, new_regno))
+    {
+      if (dump_file && (dump_flags & TDF_DETAILS))
+         fprintf (dump_file, "\tInsn %d: Renamed %d to %d\n",
+                  INSN_UID (insn_info->insn), dest_regno, new_regno);
+
+      return new_regno;
+    }
+
+  return -1;
+}
+
+
+/* Go through the def/use chains for the register and find the chain for this
+   insn to rename.  */
+static int
+rename_dest (tag_insn_info *insn_info, tag_map_t &tag_map)
+{
+  struct du_chain *chain = NULL;
+  du_head_p head = NULL;
+  int i;
+
+  /* Search the chain where this instruction is (one of) the root.  */
+  rtx_insn *insn = insn_info->insn;
+  operand_rr_info *dest_op_info = insn_rr[INSN_UID (insn)].op_info;
+  unsigned dest_regno = REGNO (insn_info->dest);
+
+  for (i = 0; i < dest_op_info->n_chains; i++)
+    {
+      /* The register tracked by this chain does not match the
+        destination register of insn.  */
+      if (dest_op_info->heads[i]->regno != dest_regno)
+       continue;
+
+      head = dest_op_info->heads[i];
+      /* The chain was merged in another, find the new head.  */
+      if (!head->first)
+       head = regrename_chain_from_id (head->id);
+
+      for (chain = head->first; chain; chain = chain->next_use)
+       /* Found the insn in the chain, so try renaming the register in this
+          chain.  */
+       if (chain->insn == insn)
+         return rename_chain (insn_info, tag_map, head);
+    }
+
+  return -1;
+}
+
+
+/* Flag to track if the map has changed.  */
+static bool map_changed = false;
+
+/* The actual reallocation logic.  For each vector of collisions, try to
+   resolve the collision by attempting to rename the destination register of
+   all but one of the loads.  */
+bool
+avoid_collisions_1 (const rtx &t, insn_info_list_t *v, tag_map_t *tag_map)
+{
+  /* We need at least two loads to cause a tag collision; if there are
+     fewer, return unchanged.  */
+  if (v->length () < 2)
+    return true;
+
+  tag_insn_info *vec_start = v->pop ();
+  tag_insn_info *insn_info = vec_start;
+
+  /* Try to rename at least one register to reduce the collision.  If we
+     iterate all the way through, we end up dropping one of the loads from the
+     list.  This is fine because we want at most one element to ensure that a
+     subsequent rename attempt does not end up worsening the collision.  */
+  do
+    {
+      int new_regno;
+
+      if ((new_regno = rename_dest (insn_info, *tag_map)) != -1)
+       {
+         rtx new_tag = GEN_INT (TAG_UPDATE_DEST (INTVAL (t), new_regno));
+
+         tag_map->get_or_insert (new_tag).safe_push (insn_info);
+         df_set_regs_ever_live (new_regno, true);
+         map_changed = true;
+         return false;
+       }
+
+      v->safe_insert (0, insn_info);
+      insn_info = v->pop ();
+    }
+  while (insn_info != vec_start);
+
+  if (dump_file)
+    fprintf (dump_file, "\t>> Failed to rename destination in insn %d\n\t>>",
+            INSN_UID (insn_info->insn));
+
+  /* Drop the last element and move on to the next tag.  */
+  delete insn_info;
+  return true;
+}
+
+
+/* For each set of collisions, attempt to rename the destination registers
+   to avoid the collision.  The actual implementation is in
+   AVOID_COLLISIONS_1, which is called repeatedly until it results in no
+   change to the state of the collision sets.  */
+static void
+avoid_collisions (tag_map_t &tag_map)
+{
+  do
+    {
+      map_changed = false;
+      tag_map.traverse <tag_map_t *, avoid_collisions_1> (&tag_map);
+    }
+  while (map_changed);
+}
+
+
+
+/* Find the use-def chain in which INSN exists and then see if there is a
+   definition inside the loop and outside it.  We use this as a simple
+   approximation to determine whether the base register is an IV.  The basic
+   idea is to find INSN in the use-def chains for its base register and find
+   all definitions that reach it.  Of all these definitions, there should be at
+   least one definition that is a simple addition of a constant value, either
+   as a binary operation or a pre or post update.  */
+static bool
+iv_p (rtx_insn *insn, rtx reg, struct loop *loop)
+{
+  df_ref ause;
+  unsigned regno = REGNO (reg);
+
+  /* Ignore loads from the stack.  */
+  if (regno == SP_REGNUM)
+    return false;
+
+  for (ause = DF_REG_USE_CHAIN (regno); ause; ause = DF_REF_NEXT_REG (ause))
+    {
+      if (!DF_REF_INSN_INFO (ause)
+         || !NONDEBUG_INSN_P (DF_REF_INSN (ause)))
+       continue;
+
+      if (insn != DF_REF_INSN (ause))
+       continue;
+
+      struct df_insn_info *insn_info = DF_INSN_INFO_GET (insn);
+      df_ref def_rec;
+
+      FOR_EACH_INSN_INFO_DEF (def_rec, insn_info)
+       {
+         rtx_insn *insn = DF_REF_INSN (def_rec);
+         basic_block bb = BLOCK_FOR_INSN (insn);
+
+         if (dominated_by_p (CDI_DOMINATORS, bb, loop->header)
+             && bb->loop_father == loop)
+           {
+             recog_memoized (insn);

Did you mean to continue here if recog_memoized (insn) < 0 ?
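
i.e. something like:

  if (recog_memoized (insn) < 0)
    continue;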

+             rtx pat = PATTERN (insn);
+
+             /* Prefetch or clobber; unlikely to be a constant stride.  The
+                falkor software prefetcher tuning is pretty conservative, so
+                its presence indicates that the access pattern is probably
+                strided but most likely with an unknown stride size or a
+                stride size that is quite large.  */
+             if (GET_CODE (pat) != SET)
+               continue;
+
+             rtx x = SET_SRC (pat);
+             if (GET_CODE (x) == ZERO_EXTRACT
+                 || GET_CODE (x) == ZERO_EXTEND
+                 || GET_CODE (x) == SIGN_EXTEND)
+               x = XEXP (x, 0);
+
+             /* Loading the value from memory; unlikely to be a constant stride.  */
+             if (MEM_P (x))
+               continue;
+
+             /* An increment or decrement by a constant MODE_SIZE amount or the
+                result of a binary expression is likely to be an IV.  */
+             if (GET_CODE (x) == POST_INC
+                 || GET_CODE (x) == POST_DEC
+                 || GET_CODE (x) == PRE_INC
+                 || GET_CODE (x) == PRE_DEC)
+               return true;
+             else if (BINARY_P (x)
+                      && (CONST_INT_P (XEXP (x, 0)) || CONST_INT_P (XEXP (x, 1))))
+               {
+                 rtx stride = (CONST_INT_P (XEXP (x, 0))
+                               ? XEXP (x, 0) : XEXP (x, 1));
+
+                 /* Don't bother with very long strides because the prefetcher
+                    is unable to train on them anyway.  */
+                 if (INTVAL (stride) < 2048)
+                   return true;

I appreciate this is a core-specific but can you please at least make it a #define constant with
a meaningful name and use that?
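
e.g. something like (name invented here):

  /* The hardware prefetcher is assumed to be unable to train on strides
     this long or longer.  */
  #define MAX_PREFETCH_STRIDE 2048

and then compare INTVAL (stride) < MAX_PREFETCH_STRIDE above.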

+               }
+           }
+       }
+      return false;
+    }
+  return false;
+}
+
+
+/* Return true if SRC is a strided load in the LOOP, false otherwise.
+   If it is a strided load, set the BASE and OFFSET.  Also, if this is
+   a pre/post increment load, set PRE_POST to true.  */
+static bool
+valid_src_p (rtx src, rtx_insn *insn, struct loop *loop, bool *pre_post,
+            rtx *base, rtx *offset, bool load_pair)
+{
+  subrtx_var_iterator::array_type array;
+  rtx x = NULL_RTX;
+
+  FOR_EACH_SUBRTX_VAR (iter, array, src, NONCONST)
+    if (MEM_P (*iter))
+      {
+       x = *iter;
+       break;
+      }
+
+  if (!x)
+    return false;
+
+  struct aarch64_address_info addr;
+  machine_mode mode = GET_MODE (x);
+
+  if (!aarch64_classify_address (&addr, XEXP (x, 0), mode, true))
+    return false;
+
+  unsigned regno = REGNO (addr.base);
+  if (global_regs[regno] || fixed_regs[regno])
+    return false;
+
+  if (addr.type == ADDRESS_REG_WB)
+    {
+      unsigned code = GET_CODE (XEXP (x, 0));
+
+      *pre_post = true;
+      *base = addr.base;
+
+      if (code == PRE_MODIFY || code == POST_MODIFY)
+       *offset = addr.offset;
+      else
+       {
+         unsigned int_offset = GET_MODE_SIZE (mode).to_constant ();
+
+         /* For post-incremented load pairs we would increment the base twice
+            over, so make that adjustment.  */
+         if (load_pair && (code == POST_INC || code == POST_DEC))
+           int_offset *= 2;
+
+         *offset = GEN_INT (int_offset);
+       }
+      return true;
+    }
+  else if (addr.type == ADDRESS_REG_IMM || addr.type == ADDRESS_REG_REG)
+    {
+      /* Check if the load is strided.  */
+      if (!iv_p (insn, addr.base, loop))
+       return false;
+
+      *base = addr.base;
+      *offset = addr.offset;
+      return true;
+    }
+
+  return false;
+}
+
+
+/* Return true if INSN is a strided load in LOOP.  If it is a strided load, set
+   the DEST, BASE and OFFSET.  Also, if this is a pre/post increment load, set
+   PRE_POST to true.
+
+   The routine does checks on the destination of the insn and depends on
+   VALID_SRC_P to check the source and fill in the BASE and OFFSET.  */
+static bool
+get_load_info (rtx_insn *insn, struct loop *loop, rtx *dest, rtx *base,
+              rtx *offset, bool *pre_post, bool *ldp)
+{
+  if (!INSN_P (insn) || recog_memoized (insn) < 0)
+    return false;
+
+  rtx pat = PATTERN (insn);
+  unsigned code = GET_CODE (pat);
+  bool load_pair = (code == PARALLEL);
+
+  /* For a load pair we need only the first base and destination
+     registers.  We however need to ensure that our pre/post increment
+     offset is doubled; we do that in VALID_SRC_P.  */
+  if (load_pair)
+    {
+      pat = XVECEXP (pat, 0, 0);
+      code = GET_CODE (pat);
+    }
+
+  if (code != SET)
+    return false;
+
+  rtx dest_rtx = SET_DEST (pat);
+
+  if (!REG_P (dest_rtx))
+    return false;
+
+  unsigned regno = REGNO (dest_rtx);
+  machine_mode mode = GET_MODE (dest_rtx);
+  machine_mode inner_mode = GET_MODE_INNER (mode);
+
+  /* Ignore vector struct or lane loads.  */
+  if (GET_MODE_SIZE (mode).to_constant ()
+      != GET_MODE_SIZE (inner_mode).to_constant ())
+    return false;
+
+  /* The largest width we want to bother with is a load of a pair of qud-words.  */

"quad-words"

+  if ((GET_MODE_SIZE (mode).to_constant () << load_pair) > GET_MODE_SIZE (OImode))
+    return false;
+
+  /* Ignore loads into the stack pointer because it is unlikely to be a
+     stream.  */
+  if (regno == SP_REGNUM)
+    return false;
+
+  if (valid_src_p (SET_SRC (pat), insn, loop, pre_post, base, offset, load_pair))
+    {
+      *dest = dest_rtx;
+      *ldp = load_pair;
+
+      return true;
+    }
+
+  return false;
+}
+
+
+/* Return whether INSN and CAND are in the same def/use chain.  */
+static bool
+in_same_chain (rtx_insn *insn, rtx_insn *cand, unsigned regno)
+{
+  struct du_chain *chain = NULL;
+  du_head_p head = NULL;
+  int i;
+
+  /* Search the chain where this instruction is (one of) the root.  */
+  operand_rr_info *op_info = insn_rr[INSN_UID (insn)].op_info;
+
+  for (i = 0; i < op_info->n_chains; i++)
+    {
+      /* The register tracked by this chain does not match the
+        dest register of insn.  */
+      if (op_info->heads[i]->regno != regno)
+       continue;
+
+      head = op_info->heads[i];
+      /* The chain was merged in another, find the new head.  */
+      if (!head->first)
+       head = regrename_chain_from_id (head->id);
+
+      bool found_insn = false, found_cand = false;
+
+      for (chain = head->first; chain; chain = chain->next_use)
+       {
+         rtx *loc = &SET_DEST (PATTERN (chain->insn));
+
+         if (chain->loc != loc)
+           continue;
+
+         if (chain->insn == insn)
+           found_insn = true;
+
+         if (chain->insn == cand)
+           found_cand = true;
+
+         if (found_insn && found_cand)
+           return true;
+       }
+    }
+
+  return false;
+}
+
+
+/* Callback function to traverse the tag map and drop loads that have the same
+   destination and are in the same chain of occurrence.  */
+bool
+single_dest_per_chain (const rtx &t ATTRIBUTE_UNUSED, insn_info_list_t *v,
+                      void *arg ATTRIBUTE_UNUSED)
+{
+  for (int i = v->length () - 1; i >= 1; i--)
+    {
+      tag_insn_info *insn_info = (*v)[i];
+
+      for (int j = v->length () - 2; j >= 0; j--)
+       {
+         /* Filter out destinations in the same chain.  */
+         if (in_same_chain (insn_info->insn, (*v)[j]->insn,
+                            REGNO (insn_info->dest)))
+           {
+             v->ordered_remove (j);
+             i = v->length ();
+             break;
+           }
+       }
+    }
+
+  return true;
+}
+
+
+bool
+dump_insn_list (const rtx &t, const insn_info_list_t &insn_info,
+               void *unused ATTRIBUTE_UNUSED)
+{
+  gcc_assert (dump_file);
+  fprintf (dump_file, "Tag 0x%lx ::\n", INTVAL (t));
+
+  for (unsigned i = 0; i < insn_info.length (); i++)
+    dump_insn_slim (dump_file, insn_info[i]->insn);
+
+  fprintf (dump_file, "\n");
+
+  return true;
+}
+
+
+/* Record all loads into a map indexed by memory tags generated based on the
+   destination register, base register and the offset.  */
+static void
+record_loads (tag_map_t &tag_map, struct loop *loop)
+{
+  rtx_insn *insn;
+  basic_block *body, bb;
+
+  body = get_loop_body (loop);
+
+  for (unsigned i = 0; i < loop->num_nodes; i++)
+    {
+      bb = body[i];
+      FOR_BB_INSNS (bb, insn)
+       {
+         rtx base = NULL_RTX;
+         rtx dest = NULL_RTX;
+         rtx offset = NULL_RTX;
+         bool writeback = false;
+         bool ldp = false;
+
+         if (!INSN_P (insn) || DEBUG_INSN_P (insn))
+           continue;
+
+         if (get_load_info (insn, loop, &dest, &base, &offset, &writeback,
+                            &ldp))
+           {
+             tag_insn_info *i = new tag_insn_info (insn, dest, base, offset,
+                                               writeback, ldp);
+             rtx tag = GEN_INT (i->tag ());
+             tag_map.get_or_insert (tag).safe_push (i);
+           }
+       }
+    }
+
+  if (dump_file)
+    {
+      fprintf (dump_file, "Loop %d: Tag map generated.\n", loop->num);
+      tag_map.traverse <void *, dump_insn_list> (NULL);
+    }
+
+  /* Try to reduce the dataset before launching into the rename attempt.  Drop
+     destinations in the same collision chain that appear in the same def/use
+     chain, all as defs.  These chains will move together in a rename so
+     there's no point in keeping both in there.  */
+  tag_map.traverse <void *, single_dest_per_chain> (NULL);
+}
+
+
+/* Tag collision avoidance pass for Falkor.  The pass runs in two phases for
+   each loop; the first phase collects all loads that we consider as
+   interesting for renaming into a tag-indexed map of lists.  The second phase
+   renames the destination register of the loads in an attempt to spread out
+   the loads into different tags.  */
+void
+execute_tag_collision_avoidance ()
+{
+  struct loop *loop;
+
+  df_set_flags (DF_RD_PRUNE_DEAD_DEFS);
+  df_chain_add_problem (DF_UD_CHAIN);
+  df_compute_regs_ever_live (true);
+  df_analyze ();
+  df_set_flags (DF_DEFER_INSN_RESCAN);
+
+  regrename_init (true);
+  regrename_analyze (NULL);
+
+  compute_bb_for_insn ();
+  calculate_dominance_info (CDI_DOMINATORS);
+  loop_optimizer_init (AVOID_CFG_MODIFICATIONS);
+
+  FOR_EACH_LOOP (loop, LI_FROM_INNERMOST)
+    {
+      tag_map_t tag_map (512);
+
+      record_loads (tag_map, loop);
+      avoid_collisions (tag_map);
+      if (dump_file)
+       {
+         fprintf (dump_file, "Loop %d: Completed rename.\n", loop->num);
+         tag_map.traverse <void *, dump_insn_list> (NULL);
+       }
+      tag_map.traverse <void *, free_insn_info> (NULL);
+    }
+
+  loop_optimizer_finalize ();
+  free_dominance_info (CDI_DOMINATORS);
+  regrename_finish ();
+}
+
+
+const pass_data pass_data_tag_collision_avoidance =
+{
+  RTL_PASS, /* type */
+  "tag_collision_avoidance", /* name */
+  OPTGROUP_NONE, /* optinfo_flags */
+  TV_NONE, /* tv_id */
+  0, /* properties_required */
+  0, /* properties_provided */
+  0, /* properties_destroyed */
+  0, /* todo_flags_start */
+  TODO_df_finish, /* todo_flags_finish */
+};
+
+
+class pass_tag_collision_avoidance : public rtl_opt_pass
+{
+public:
+  pass_tag_collision_avoidance (gcc::context *ctxt)
+    : rtl_opt_pass (pass_data_tag_collision_avoidance, ctxt)
+  {}
+
+  /* opt_pass methods: */
+  virtual bool gate (function *)
+    {
+      return ((aarch64_tune_params.extra_tuning_flags
+              & AARCH64_EXTRA_TUNE_RENAME_LOAD_REGS)
+             && optimize >= 2);
+    }
+
+  virtual unsigned int execute (function *)
+    {
+      execute_tag_collision_avoidance ();
+      return 0;
+    }
+
+}; // class pass_tag_collision_avoidance
+
+
+/* Create a new pass instance.  */
+rtl_opt_pass *
+make_pass_tag_collision_avoidance (gcc::context *ctxt)
+{
+  return new pass_tag_collision_avoidance (ctxt);
+}
diff --git a/gcc/config/aarch64/t-aarch64 b/gcc/config/aarch64/t-aarch64
index 0be1f0d63aa..f185b404ce6 100644
--- a/gcc/config/aarch64/t-aarch64
+++ b/gcc/config/aarch64/t-aarch64
@@ -67,6 +67,15 @@ cortex-a57-fma-steering.o: $(srcdir)/config/aarch64/cortex-a57-fma-steering.c \
         $(COMPILER) -c $(ALL_COMPILERFLAGS) $(ALL_CPPFLAGS) $(INCLUDES) \
                $(srcdir)/config/aarch64/cortex-a57-fma-steering.c

+falkor-tag-collision-avoidance.o: $(srcdir)/config/aarch64/falkor-tag-collision-avoidance.c \
+    $(CONFIG_H) $(SYSTEM_H) $(TM_H) $(REGS_H) insn-config.h $(RTL_BASE_H) \
+    dominance.h cfg.h cfganal.h $(BASIC_BLOCK_H) $(INSN_ATTR_H) $(RECOG_H) \
+    output.h hash-map.h $(DF_H) $(OBSTACK_H) $(TARGET_H) $(RTL_H) \
+    $(CONTEXT_H) $(TREE_PASS_H) regrename.h \
+    $(srcdir)/config/aarch64/aarch64-protos.h
+       $(COMPILER) -c $(ALL_COMPILERFLAGS) $(ALL_CPPFLAGS) $(INCLUDES) \
+               $(srcdir)/config/aarch64/falkor-tag-collision-avoidance.c
+
 comma=,
 MULTILIB_OPTIONS    = $(subst $(comma),/, $(patsubst %, mabi=%, $(subst $(comma),$(comma)mabi=,$(TM_MULTILIB_CONFIG))))
 MULTILIB_DIRNAMES   = $(subst $(comma), ,$(TM_MULTILIB_CONFIG))
--
2.17.1


