[PATCH] Cancel bswap opt when intermediate stmts are reused


Hi all,

Currently, when an expression implementing a manual load or bswap is
detected, the bswap optimization replaces it with an actual load and/or
bswap instruction without considering whether the intermediate values
computed in the expression are reused later.  If they are, the
construction of these values has to be retained, and the combination of
the load and/or bswap instruction with the instructions constructing
those values might be more expensive than the initial fully manual
expression.  This patch detects such cases and cancels the bswap
optimization.  In addition, it takes advantage of the infrastructure
necessary for the detection to also clean up the stmts that become
useless when the bswap optimization is performed.

*** gcc/ChangeLog ***

2014-08-01  Thomas Preud'homme  <thomas.preudhomme@arm.com>

	* tree-ssa-math-opts.c (struct usedtree): New.
	(find_bswap_or_nop_1): Change prototype to take a hashtable and a list
	of struct usedtree.  Fill these respectively with the visited stmts
	and with the trees (along with the stmts where they are defined) that
	would no longer be defined if the bswap optimization is applied.
	Adapt recursive calls to the prototype change.
	(find_bswap_or_nop): Adapt to find_bswap_or_nop_1 prototype change.
	Pass the hashtable and the list of struct usedtree from
	pass_optimize_bswap::execute ().
	(do_bswap_p): New.
	(bswap_replace): Fix typo in heading comment. Stop returning whether
	the bswap optimization was performed since this is now handled by
	do_bswap_p (). Move alignment check to do_bswap_p ().
	(free_usedtrees_list): New.
	(pass_optimize_bswap::execute): Add allocation and deallocation
	handling of the hashtable of browsed stmts. Free the list of struct
	usedtree at the end of each iteration using free_usedtrees_list () and
	the new bswap_check_end_iter label. Move some of the logic to perform
	the bswap optimization to do_bswap_p (). Set gsi after performing the
	bswap optimization and loop manually via the new
	bswap_check_start_iter label so that all stmts are checked for
	load/bswap even when cur_stmt is moved around by bswap_replace ().

*** gcc/testsuite/ChangeLog ***

2014-08-01  Thomas Preud'homme  <thomas.preudhomme@arm.com>

	* gcc.dg/optimize-bswapsi-2.c (read_le32_1): Add an intermediate
	variable to check that the expression is still optimized when this
	variable has no use outside the expression computing a load/bswap.
	(read_be32_1): Likewise.
	* gcc.dg/optimize-bswapsi-3.c: New.  Create read_le32_1 () and
	read_be32_1 () based on the identically named functions in
	gcc.dg/optimize-bswapsi-2.c but reuse the intermediate variable so as
	to check that the bswap optimization is not performed in these cases.

diff --git a/gcc/testsuite/gcc.dg/optimize-bswapsi-2.c b/gcc/testsuite/gcc.dg/optimize-bswapsi-2.c
index de6e697..df7856b 100644
--- a/gcc/testsuite/gcc.dg/optimize-bswapsi-2.c
+++ b/gcc/testsuite/gcc.dg/optimize-bswapsi-2.c
@@ -14,7 +14,9 @@ struct uint32_st {
 
 uint32_t read_le32_1 (void)
 {
-  return data[0] | (data[1] << 8) | (data[2] << 16) | (data[3] << 24);
+  uint32_t low = data[0] | (data[1] << 8);
+  uint32_t ret = low | (data[2] << 16) | (data[3] << 24);
+  return ret;
 }
 
 uint32_t read_le32_2 (struct uint32_st data)
@@ -30,7 +32,9 @@ uint32_t read_le32_3 (unsigned char *data)
 
 uint32_t read_be32_1 (void)
 {
-  return data[3] | (data[2] << 8) | (data[1] << 16) | (data[0] << 24);
+  uint32_t low = data[3] | (data[2] << 8);
+  uint32_t ret = low | (data[1] << 16) | (data[0] << 24);
+  return ret;
 }
 
 uint32_t read_be32_2 (struct uint32_st data)
diff --git a/gcc/testsuite/gcc.dg/optimize-bswapsi-3.c b/gcc/testsuite/gcc.dg/optimize-bswapsi-3.c
new file mode 100644
index 0000000..dc48d9d
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/optimize-bswapsi-3.c
@@ -0,0 +1,31 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target bswap32 } */
+/* { dg-require-effective-target stdint_types } */
+/* { dg-options "-O2 -fdump-tree-bswap" } */
+/* { dg-additional-options "-march=z900" { target s390-*-* } } */
+
+#include <stdint.h>
+
+extern unsigned char data[4];
+
+/* No "bswap" optimization as low is reused.  */
+uint32_t read_le32_1 (unsigned char *data, uint32_t *low_neg)
+{
+  uint32_t low = data[0] | (data[1] << 8);
+  uint32_t ret = low | (data[2] << 16) | (data[3] << 24);
+  *low_neg = low;
+  return ret;
+}
+
+/* No "bswap" optimization as low is reused.  */
+uint32_t read_be32_1 (unsigned char *data, uint32_t *low_neg)
+{
+  uint32_t low = data[3] | (data[2] << 8);
+  uint32_t ret = low | (data[1] << 16) | (data[0] << 24);
+  *low_neg = low;
+  return ret;
+}
+
+/* { dg-final { scan-tree-dump-not "32 bit load in target endianness found at" "bswap" } } */
+/* { dg-final { scan-tree-dump-not "32 bit bswap implementation found at" "bswap" } } */
+/* { dg-final { cleanup-tree-dump "bswap" } } */
diff --git a/gcc/tree-ssa-math-opts.c b/gcc/tree-ssa-math-opts.c
index 705793b..9f71145 100644
--- a/gcc/tree-ssa-math-opts.c
+++ b/gcc/tree-ssa-math-opts.c
@@ -1641,6 +1641,18 @@ struct symbolic_number {
 #define CMPXCHG (sizeof (int64_t) < 8 ? 0 : \
   (uint64_t)0x01020304 << 32 | 0x05060708)
 
+/* Structure to memorize all the intermediate trees created to compute a bswap
+   or a load, along with the stmt where each is used.  This makes it possible
+   to check whether some trees are reused outside the bswap or load being
+   performed, and to avoid the optimization in that case since it might lead
+   to more computation.  It also makes it possible to remove all the stmts
+   that create these trees when the optimization is performed.  */
+struct usedtree {
+  tree t;
+  gimple stmt;
+  struct usedtree *next;
+};
+
 /* Perform a SHIFT or ROTATE operation by COUNT bits on symbolic
    number N.  Return false if the requested operation is not permitted
    on a symbolic number.  */
@@ -1803,19 +1815,26 @@ find_bswap_or_nop_load (gimple stmt, tree ref, struct symbolic_number *n)
   return true;
 }
 
-/* find_bswap_or_nop_1 invokes itself recursively with N and tries to perform
-   the operation given by the rhs of STMT on the result.  If the operation
-   could successfully be executed the function returns a gimple stmt whose
-   rhs's first tree is the expression of the source operand and NULL
-   otherwise.  */
+/* find_bswap_or_nop_1 invokes itself recursively trying to perform the
+   operation given by the rhs of STMT on the symbolic number represented by N
+   until LIMIT recursions have been done or the variable that is the source of
+   the successive expressions has been found.  The function also keeps track of
+   the intermediate trees defined in the expression with the stmt where they
+   are defined (in USED_TREES) and records the stmts that were visited (in
+   BROWSED_STMTS_HTAB).  If the operation could successfully be executed the
+   function returns a gimple stmt whose rhs's first tree is the expression of
+   the source operand and NULL otherwise.  */
 
 static gimple
-find_bswap_or_nop_1 (gimple stmt, struct symbolic_number *n, int limit)
+find_bswap_or_nop_1 (gimple stmt, struct symbolic_number *n, int limit,
+		     htab_t browsed_stmts_htab, struct usedtree **used_trees)
 {
+  void **slot;
   enum tree_code code;
   tree rhs1, rhs2 = NULL;
   gimple rhs1_stmt, rhs2_stmt, source_stmt1;
   enum gimple_rhs_class rhs_class;
+  struct usedtree *new_used;
 
   if (!limit || !is_gimple_assign (stmt))
     return NULL;
@@ -1823,7 +1842,20 @@ find_bswap_or_nop_1 (gimple stmt, struct symbolic_number *n, int limit)
   rhs1 = gimple_assign_rhs1 (stmt);
 
   if (find_bswap_or_nop_load (stmt, rhs1, n))
-    return stmt;
+    {
+      slot = htab_find_slot (browsed_stmts_htab, stmt, INSERT);
+      gcc_assert (slot);
+      if (!*slot)
+	{
+	  *slot = stmt;
+	  new_used = (struct usedtree *) xmalloc (sizeof (*new_used));
+	  new_used->t = gimple_assign_lhs (stmt);
+	  new_used->stmt = stmt;
+	  new_used->next = *used_trees;
+	  *used_trees = new_used;
+	}
+      return stmt;
+    }
 
   if (TREE_CODE (rhs1) != SSA_NAME)
     return NULL;
@@ -1835,6 +1867,18 @@ find_bswap_or_nop_1 (gimple stmt, struct symbolic_number *n, int limit)
   if (rhs_class == GIMPLE_BINARY_RHS)
     rhs2 = gimple_assign_rhs2 (stmt);
 
+  slot = htab_find_slot (browsed_stmts_htab, stmt, INSERT);
+  gcc_assert (slot);
+  if (!*slot)
+    {
+      *slot = stmt;
+      new_used = (struct usedtree *) xmalloc (sizeof (*new_used));
+      new_used->t = gimple_assign_lhs (stmt);
+      new_used->stmt = stmt;
+      new_used->next = *used_trees;
+      *used_trees = new_used;
+    }
+
   /* Handle unary rhs and binary rhs with integer constants as second
      operand.  */
 
@@ -1851,12 +1895,14 @@ find_bswap_or_nop_1 (gimple stmt, struct symbolic_number *n, int limit)
 	  && code != CONVERT_EXPR)
 	return NULL;
 
-      source_stmt1 = find_bswap_or_nop_1 (rhs1_stmt, n, limit - 1);
+      source_stmt1 = find_bswap_or_nop_1 (rhs1_stmt, n, limit - 1,
+					  browsed_stmts_htab, used_trees);
 
       /* If find_bswap_or_nop_1 returned NULL, STMT is a leaf node and
 	 we have to initialize the symbolic number.  */
       if (!source_stmt1)
 	{
+
 	  if (gimple_assign_load_p (stmt)
 	      || !init_symbolic_number (n, rhs1))
 	    return NULL;
@@ -1942,12 +1988,14 @@ find_bswap_or_nop_1 (gimple stmt, struct symbolic_number *n, int limit)
       switch (code)
 	{
 	case BIT_IOR_EXPR:
-	  source_stmt1 = find_bswap_or_nop_1 (rhs1_stmt, &n1, limit - 1);
+	  source_stmt1 = find_bswap_or_nop_1 (rhs1_stmt, &n1, limit - 1,
+					      browsed_stmts_htab, used_trees);
 
 	  if (!source_stmt1)
 	    return NULL;
 
-	  source_stmt2 = find_bswap_or_nop_1 (rhs2_stmt, &n2, limit - 1);
+	  source_stmt2 = find_bswap_or_nop_1 (rhs2_stmt, &n2, limit - 1,
+					      browsed_stmts_htab, used_trees);
 
 	  if (!source_stmt2)
 	    return NULL;
@@ -2044,16 +2092,19 @@ find_bswap_or_nop_1 (gimple stmt, struct symbolic_number *n, int limit)
   return NULL;
 }
 
-/* Check if STMT completes a bswap implementation or a read in a given
-   endianness consisting of ORs, SHIFTs and ANDs and sets *BSWAP
-   accordingly.  It also sets N to represent the kind of operations
-   performed: size of the resulting expression and whether it works on
-   a memory source, and if so alias-set and vuse.  At last, the
-   function returns a stmt whose rhs's first tree is the source
-   expression.  */
+/* Check if STMT completes a bswap implementation or a load in a given
+   endianness consisting of ORs, SHIFTs and ANDs and sets *BSWAP accordingly.
+   It also sets N to represent the kind of operations performed: size of the
+   resulting expression and whether it works on a memory source, and if so
+   alias-set and vuse.  All the intermediate trees defined in such an
+   implementation are recorded along the stmt where they are defined (in
+   USED_TREES) as well as the stmts that are part of that implementation (in
+   BROWSED_STMTS_HTAB).  At last, the function returns a stmt whose rhs's first
+   tree is the source expression.  */
 
 static gimple
-find_bswap_or_nop (gimple stmt, struct symbolic_number *n, bool *bswap)
+find_bswap_or_nop (gimple stmt, struct symbolic_number *n, bool *bswap,
+		   htab_t browsed_stmts_htab, struct usedtree **used_trees)
 {
 /* The number which the find_bswap_or_nop_1 result should match in order
    to have a full byte swap.  The number is shifted to the right
@@ -2071,7 +2122,8 @@ find_bswap_or_nop (gimple stmt, struct symbolic_number *n, bool *bswap)
      in libgcc, and for initial shift/and operation of the src operand.  */
   limit = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (gimple_expr_type (stmt)));
   limit += 1 + (int) ceil_log2 ((unsigned HOST_WIDE_INT) limit);
-  source_stmt =  find_bswap_or_nop_1 (stmt, n, limit);
+  source_stmt =  find_bswap_or_nop_1 (stmt, n, limit, browsed_stmts_htab,
+				      used_trees);
 
   if (!source_stmt)
     return NULL;
@@ -2146,16 +2198,73 @@ public:
 
 }; // class pass_optimize_bswap
 
+/* Tests whether the bswap optimization can be performed and is likely to bring
+   performance improvements.  This decision is made based on:
+   - whether the operation performed is a BSWAP or a simple load
+   - whether the operation can be replaced by an instruction (FNDECL is
+     non-NULL)
+   - the stmt reading/loading from the source operand (SRC_STMT)
+   - whether the source is in memory (given by N->base_addr)
+   - whether intermediate trees defined in the expression being optimized are
+     reused elsewhere.  This is determined from the list of trees defined
+     along with the stmts where they are defined (in USED_TREES), a hashtable
+     of the stmts that are part of the expression (in BROWSED_STMTS_HTAB) and
+     the outermost stmt of the expression (in STMT).  */
+
+static bool
+do_bswap_p (gimple stmt, gimple src_stmt, tree fndecl,
+	    tree load_type ATTRIBUTE_UNUSED, struct symbolic_number *n,
+	    bool bswap, htab_t browsed_stmts_htab, struct usedtree *used_trees)
+{
+  unsigned align ATTRIBUTE_UNUSED;
+  struct usedtree *used;
+  bool ret;
+
+  if (bswap && !fndecl)
+    return false;
+
+  if (n->base_addr)
+    {
+      tree src = gimple_assign_rhs1 (src_stmt);
+      align = get_object_alignment (src);
+      if (bswap
+	  && align < GET_MODE_ALIGNMENT (TYPE_MODE (load_type))
+	  && SLOW_UNALIGNED_ACCESS (TYPE_MODE (load_type), align))
+	return false;
+    }
+
+  for (ret = true, used = used_trees; ret && used; used = used->next)
+    {
+      gimple using_stmt;
+      imm_use_iterator imm_use;
+
+      if (used->stmt == stmt)
+	continue;
+
+	FOR_EACH_IMM_USE_STMT (using_stmt, imm_use, used->t)
+	{
+	  if (!is_gimple_debug (using_stmt)
+	      && !htab_find_slot (browsed_stmts_htab, using_stmt, NO_INSERT))
+	    {
+	      ret = false;
+	      BREAK_FROM_IMM_USE_STMT (imm_use);
+	    }
+	}
+    }
+
+  return ret;
+}
+
 /* Perform the bswap optimization: replace the statement CUR_STMT at
    GSI with a load of type, VUSE and set-alias as described by N if a
    memory source is involved (N->base_addr is non null), followed by
    the builtin bswap invocation in FNDECL if BSWAP is true.  SRC_STMT
    gives where should the replacement be made.  It also gives the
-   source on which CUR_STMT is operating via its rhs's first tree nad
+   source on which CUR_STMT is operating via its rhs's first tree and
    N->range gives the size of the expression involved for maintaining
    some statistics.  */
 
-static bool
+static void
 bswap_replace (gimple cur_stmt, gimple_stmt_iterator gsi, gimple src_stmt,
 	       tree fndecl, tree bswap_type, tree load_type,
 	       struct symbolic_number *n, bool bswap)
@@ -2175,12 +2284,6 @@ bswap_replace (gimple cur_stmt, gimple_stmt_iterator gsi, gimple src_stmt,
       gimple addr_stmt, load_stmt;
       unsigned align;
 
-      align = get_object_alignment (src);
-      if (bswap
-	  && align < GET_MODE_ALIGNMENT (TYPE_MODE (load_type))
-	  && SLOW_UNALIGNED_ACCESS (TYPE_MODE (load_type), align))
-	return false;
-
       gsi_move_before (&gsi, &gsi_ins);
       gsi = gsi_for_stmt (cur_stmt);
 
@@ -2198,6 +2301,7 @@ bswap_replace (gimple cur_stmt, gimple_stmt_iterator gsi, gimple src_stmt,
 	}
 
       /* Perform the load.  */
+      align = get_object_alignment (src);
       aligned_load_type = load_type;
       if (align < TYPE_ALIGN (load_type))
 	aligned_load_type = build_aligned_type (load_type, align);
@@ -2243,7 +2347,7 @@ bswap_replace (gimple cur_stmt, gimple_stmt_iterator gsi, gimple src_stmt,
 		       (int)n->range);
 	      print_gimple_stmt (dump_file, cur_stmt, 0, 0);
 	    }
-	  return true;
+	  return;
 	}
       else
 	{
@@ -2300,7 +2404,32 @@ bswap_replace (gimple cur_stmt, gimple_stmt_iterator gsi, gimple src_stmt,
 
   gsi_insert_after (&gsi, call, GSI_SAME_STMT);
   gsi_remove (&gsi, true);
-  return true;
+}
+
+/* Free the struct usedtree elements from the list USED_TREES.  If REMOVE_STMTS
+   is true, the stmts they reference (CUR_STMT excepted) are removed and the
+   SSA names of their lhs are released from the function FUN.  */
+
+static void
+free_usedtrees_list (struct usedtree *used_trees, bool remove_stmts,
+		    gimple cur_stmt, function *fun)
+{
+  gimple_stmt_iterator gsi_rm;
+  struct usedtree *used;
+
+  while ((used = used_trees))
+    {
+      /* Do not mark stmt for removal if it's the replaced one.  */
+      if (remove_stmts && used->stmt != cur_stmt)
+	{
+	  tree lhs = gimple_assign_lhs (used->stmt);
+	  gsi_rm = gsi_for_stmt (used->stmt);
+	  gsi_remove (&gsi_rm, true);
+	  release_ssa_name_fn (fun, lhs);
+	}
+      used_trees = used->next;
+      free (used);
+    }
 }
 
 /* Find manual byte swap implementations as well as load in a given
@@ -2315,6 +2444,7 @@ pass_optimize_bswap::execute (function *fun)
   bool bswap16_p, bswap32_p, bswap64_p;
   bool changed = false;
   tree bswap16_type = NULL_TREE, bswap32_type = NULL_TREE, bswap64_type = NULL_TREE;
+  htab_t browsed_stmts_htab;
 
   if (BITS_PER_UNIT != 8)
     return 0;
@@ -2350,6 +2480,9 @@ pass_optimize_bswap::execute (function *fun)
   memset (&nop_stats, 0, sizeof (nop_stats));
   memset (&bswap_stats, 0, sizeof (bswap_stats));
 
+  browsed_stmts_htab = htab_create_alloc (16, htab_hash_pointer,
+					  htab_eq_pointer, 0, xcalloc, free);
+
   FOR_EACH_BB_FN (bb, fun)
     {
       gimple_stmt_iterator gsi;
@@ -2360,19 +2493,29 @@ pass_optimize_bswap::execute (function *fun)
 	 patterns, the wider variant wouldn't be detected.  */
       for (gsi = gsi_last_bb (bb); !gsi_end_p (gsi); gsi_prev (&gsi))
         {
-	  gimple src_stmt, cur_stmt = gsi_stmt (gsi);
+	  gimple_stmt_iterator gsi_cont;
+	  gimple src_stmt, cur_stmt;
 	  tree fndecl = NULL_TREE, bswap_type = NULL_TREE, load_type;
 	  struct symbolic_number n;
-	  bool bswap;
+	  struct usedtree *used_trees;
+	  bool bswap, rm_stmts;
+
+bswap_check_start_iter:
+	  rm_stmts = false;
+	  cur_stmt = gsi_stmt (gsi);
 
 	  if (!is_gimple_assign (cur_stmt)
 	      || gimple_assign_rhs_code (cur_stmt) != BIT_IOR_EXPR)
 	    continue;
 
-	  src_stmt = find_bswap_or_nop (cur_stmt, &n, &bswap);
+	  htab_empty (browsed_stmts_htab);
+	  used_trees = NULL;
+
+	  src_stmt = find_bswap_or_nop (cur_stmt, &n, &bswap,
+					browsed_stmts_htab, &used_trees);
 
 	  if (!src_stmt)
-	    continue;
+	    goto bswap_check_end_iter;
 
 	  switch (n.range)
 	    {
@@ -2401,15 +2544,37 @@ pass_optimize_bswap::execute (function *fun)
 		}
 	      break;
 	    default:
-	      continue;
+	      goto bswap_check_end_iter;
 	    }
 
-	  if (bswap && !fndecl)
-	    continue;
-
-	  if (bswap_replace (cur_stmt, gsi, src_stmt, fndecl, bswap_type,
-			     load_type, &n, bswap))
-	    changed = true;
+	  if (!do_bswap_p (cur_stmt, src_stmt, fndecl, load_type, &n, bswap,
+			   browsed_stmts_htab, used_trees))
+	    goto bswap_check_end_iter;
+
+	  changed = true;
+	  gsi_cont = gsi_for_stmt (cur_stmt);
+	  gsi_next (&gsi_cont);
+	  bswap_replace (cur_stmt, gsi, src_stmt, fndecl, bswap_type,
+			 load_type, &n, bswap);
+	  rm_stmts = true;
+
+bswap_check_end_iter:
+	  free_usedtrees_list (used_trees, rm_stmts, cur_stmt, fun);
+
+	  /* cur_stmt may have been moved in bswap_replace (in case of a
+	     load) and/or replaced (in case of a bswap).  Ideally, the next
+	     iteration would check the first stmt that initially preceded
+	     cur_stmt and is not an unused stmt.  However, it is simpler to
+	     let the next iteration check the changed stmt.  */
+	  if (rm_stmts)
+	    {
+	      if (gsi_end_p (gsi_cont))
+		gsi_cont = gsi_last_bb (bb);
+	      else
+		gsi_prev (&gsi_cont);
+	      gsi = gsi_cont;
+	      goto bswap_check_start_iter;
+	    }
 	}
     }

Tested via bootstrap on x86_64-linux-gnu with no regression in the
testsuite (new tests passing), as well as by running the testsuite
compiled with the arm-none-eabi-gcc cross compiler under QEMU emulating
a Cortex-M3, again without any regression (new tests passing).

Ok for trunk?

Best regards,

Thomas


