[PATCH] Use bswap framework in store-merging (PR tree-optimization/78821)


Hi!

This patch uses the bswap pass framework inside the store merging
pass to handle adjacent stores which together produce a 16/32/64-bit
store of a byte-swapped value (loaded or from an SSA_NAME) or of an
identity (usually only from an SSA_NAME; for an identity load the code
prefers the existing store merging path, because that can e.g. handle
arbitrary sizes, not just 16/32/64 bits).
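
As a concrete illustration (a 32-bit variant of the pattern used as an
example in the new code's comments; the function name is made up for this
mail), on a little-endian target the following should now be merged into a
single 32-bit store of the byte-swapped value, using __builtin_bswap32
where available:

void
store_be32 (unsigned char *p, unsigned int x)
{
  /* Illustrative only; on a big-endian target this is instead the
     identity (nop_expr) case.  */
  p[0] = x >> 24;
  p[1] = x >> 16;
  p[2] = x >> 8;
  p[3] = x;
}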

There are small tweaks to the bswap code to make it usable inside
the store merging pass.  Then, when processing the stores, we record
what find_bswap_or_nop_1 returns and do a small sanity check on it,
and when doing coalesce_immediate_stores (i.e. the splitting into
groups) we try the 64-bit, 32-bit and 16-bit sizes to see whether we can
extend/shift (according to endianness) and perform_symbolic_merge them
together.  If that is possible, we turn those 2+ adjacent stores that
together make up {64,32,16} bits into a separate group and process it
specially later (we need to treat it as a single store rather than
multiple stores, so split_group is only very lightweight for that case).
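
For the loaded case this covers byte-reversed copies like the one below
(essentially f4 from the new testcase; the function name is just for this
mail); after the alias and alignment checks mentioned above it should be
emitted as a single 32-bit load, a bswap and a single 32-bit store:

void
swap_copy (unsigned char *__restrict p, unsigned char *__restrict q)
{
  /* Same shape as f4 in gcc.dg/store_merging_16.c.  */
  p[0] = q[3];
  p[1] = q[2];
  p[2] = q[1];
  p[3] = q[0];
}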

Bootstrapped/regtested on {x86_64,i686,powerpc64le,powerpc64}-linux, ok for trunk?

The cases this patch can handle are less common than rhs_code INTEGER_CST
(stores of constants to adjacent memory) or MEM_REF (adjacent memory
copying), but are more common than the bitwise ops.  During combined
x86_64+i686 bootstraps/regtests it triggered:
lrotate_expr   974   2528
nop_expr       720   1711
(lrotate_expr stands for bswap, nop_expr for identity; the first column is
the actual count of such new stores, the second the original number of
stores that have been optimized this way).
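
To make the nop_expr (identity) case concrete, f1 from the new testcase
stores a 64-bit value byte by byte in target byte order, so on a
little-endian target the whole sequence should simply become one plain
64-bit store, with no bswap needed:

__attribute__((noipa)) void
f1 (unsigned char *p, unsigned long long q)
{
  /* On little-endian this is an identity (nop_expr) pattern.  */
  p[0] = q;
  p[1] = q >> 8;
  p[2] = q >> 16;
  p[3] = q >> 24;
  p[4] = q >> 32;
  p[5] = q >> 40;
  p[6] = q >> 48;
  p[7] = q >> 56;
}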

2017-11-16  Jakub Jelinek  <jakub@redhat.com>

	PR tree-optimization/78821
	* gimple-ssa-store-merging.c (find_bswap_or_nop_load): Give up
	if base is TARGET_MEM_REF.  If base is not MEM_REF, set base_addr
	to the address of the base rather than the base itself.
	(find_bswap_or_nop_1): Just use pointer comparison for vuse check.
	(find_bswap_or_nop_finalize): New function.
	(find_bswap_or_nop): Use it.
	(bswap_replace): Return a tree rather than bool, change first
	argument from gimple * to gimple_stmt_iterator, allow inserting
	into an empty sequence, allow ins_stmt to be NULL - then emit
	all stmts into gsi.  Fix up MEM_REF address gimplification.
	(pass_optimize_bswap::execute): Adjust bswap_replace caller.
	Formatting fix.
	(struct store_immediate_info): Add N and INS_STMT non-static
	data members.
	(store_immediate_info::store_immediate_info): Initialize them
	from newly added ctor args.
	(merged_store_group::apply_stores): Formatting fixes.  Sort by
	bitpos at the end.
	(stmts_may_clobber_ref_p): For stores call also
	refs_anti_dependent_p.
	(gather_bswap_load_refs): New function.
	(imm_store_chain_info::try_coalesce_bswap): New method.
	(imm_store_chain_info::coalesce_immediate_stores): Use it.
	(split_group): Handle LROTATE_EXPR and NOP_EXPR rhs_code specially.
	(imm_store_chain_info::output_merged_store): Fail if the estimated
	number of new stmts is bigger than or equal to the old number.
	Handle LROTATE_EXPR and NOP_EXPR rhs_code.
	(pass_store_merging::process_store): Compute n and ins_stmt, if
	ins_stmt is non-NULL and the store rhs is otherwise invalid, use
	LROTATE_EXPR rhs_code.  Pass n and ins_stmt to store_immediate_info
	ctor.
	(pass_store_merging::execute): Calculate dominators.

	* gcc.dg/store_merging_16.c: New test.

--- gcc/gimple-ssa-store-merging.c.jj	2017-11-16 10:45:09.239185205 +0100
+++ gcc/gimple-ssa-store-merging.c	2017-11-16 15:34:08.560080214 +0100
@@ -369,7 +369,10 @@ find_bswap_or_nop_load (gimple *stmt, tr
   base_addr = get_inner_reference (ref, &bitsize, &bitpos, &offset, &mode,
 				   &unsignedp, &reversep, &volatilep);
 
-  if (TREE_CODE (base_addr) == MEM_REF)
+  if (TREE_CODE (base_addr) == TARGET_MEM_REF)
+    /* Do not rewrite TARGET_MEM_REF.  */
+    return false;
+  else if (TREE_CODE (base_addr) == MEM_REF)
     {
       offset_int bit_offset = 0;
       tree off = TREE_OPERAND (base_addr, 1);
@@ -401,6 +404,8 @@ find_bswap_or_nop_load (gimple *stmt, tr
 
       bitpos += bit_offset.to_shwi ();
     }
+  else
+    base_addr = build_fold_addr_expr (base_addr);
 
   if (bitpos % BITS_PER_UNIT)
     return false;
@@ -743,8 +748,7 @@ find_bswap_or_nop_1 (gimple *stmt, struc
 	  if (TYPE_PRECISION (n1.type) != TYPE_PRECISION (n2.type))
 	    return NULL;
 
-	  if (!n1.vuse != !n2.vuse
-	      || (n1.vuse && !operand_equal_p (n1.vuse, n2.vuse, 0)))
+	  if (n1.vuse != n2.vuse)
 	    return NULL;
 
 	  source_stmt
@@ -765,39 +769,21 @@ find_bswap_or_nop_1 (gimple *stmt, struc
   return NULL;
 }
 
-/* Check if STMT completes a bswap implementation or a read in a given
-   endianness consisting of ORs, SHIFTs and ANDs and sets *BSWAP
-   accordingly.  It also sets N to represent the kind of operations
-   performed: size of the resulting expression and whether it works on
-   a memory source, and if so alias-set and vuse.  At last, the
-   function returns a stmt whose rhs's first tree is the source
-   expression.  */
+/* Helper for find_bswap_or_nop and try_coalesce_bswap to compute
+   *CMPXCHG, *CMPNOP and adjust *N.  */
 
-gimple *
-find_bswap_or_nop (gimple *stmt, struct symbolic_number *n, bool *bswap)
+void
+find_bswap_or_nop_finalize (struct symbolic_number *n, uint64_t *cmpxchg,
+			    uint64_t *cmpnop)
 {
   unsigned rsize;
   uint64_t tmpn, mask;
-/* The number which the find_bswap_or_nop_1 result should match in order
-   to have a full byte swap.  The number is shifted to the right
-   according to the size of the symbolic number before using it.  */
-  uint64_t cmpxchg = CMPXCHG;
-  uint64_t cmpnop = CMPNOP;
-
-  gimple *ins_stmt;
-  int limit;
 
-  /* The last parameter determines the depth search limit.  It usually
-     correlates directly to the number n of bytes to be touched.  We
-     increase that number by log2(n) + 1 here in order to also
-     cover signed -> unsigned conversions of the src operand as can be seen
-     in libgcc, and for initial shift/and operation of the src operand.  */
-  limit = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (gimple_expr_type (stmt)));
-  limit += 1 + (int) ceil_log2 ((unsigned HOST_WIDE_INT) limit);
-  ins_stmt = find_bswap_or_nop_1 (stmt, n, limit);
-
-  if (!ins_stmt)
-    return NULL;
+  /* The number which the find_bswap_or_nop_1 result should match in order
+     to have a full byte swap.  The number is shifted to the right
+     according to the size of the symbolic number before using it.  */
+  *cmpxchg = CMPXCHG;
+  *cmpnop = CMPNOP;
 
   /* Find real size of result (highest non-zero byte).  */
   if (n->base_addr)
@@ -810,8 +796,8 @@ find_bswap_or_nop (gimple *stmt, struct
   if (n->range < (int) sizeof (int64_t))
     {
       mask = ((uint64_t) 1 << (n->range * BITS_PER_MARKER)) - 1;
-      cmpxchg >>= (64 / BITS_PER_MARKER - n->range) * BITS_PER_MARKER;
-      cmpnop &= mask;
+      *cmpxchg >>= (64 / BITS_PER_MARKER - n->range) * BITS_PER_MARKER;
+      *cmpnop &= mask;
     }
 
   /* Zero out the bits corresponding to unused bytes in the result of the
@@ -821,18 +807,47 @@ find_bswap_or_nop (gimple *stmt, struct
       if (BYTES_BIG_ENDIAN)
 	{
 	  mask = ((uint64_t) 1 << (rsize * BITS_PER_MARKER)) - 1;
-	  cmpxchg &= mask;
-	  cmpnop >>= (n->range - rsize) * BITS_PER_MARKER;
+	  *cmpxchg &= mask;
+	  *cmpnop >>= (n->range - rsize) * BITS_PER_MARKER;
 	}
       else
 	{
 	  mask = ((uint64_t) 1 << (rsize * BITS_PER_MARKER)) - 1;
-	  cmpxchg >>= (n->range - rsize) * BITS_PER_MARKER;
-	  cmpnop &= mask;
+	  *cmpxchg >>= (n->range - rsize) * BITS_PER_MARKER;
+	  *cmpnop &= mask;
 	}
       n->range = rsize;
     }
 
+  n->range *= BITS_PER_UNIT;
+}
+
+/* Check if STMT completes a bswap implementation or a read in a given
+   endianness consisting of ORs, SHIFTs and ANDs and sets *BSWAP
+   accordingly.  It also sets N to represent the kind of operations
+   performed: size of the resulting expression and whether it works on
+   a memory source, and if so alias-set and vuse.  At last, the
+   function returns a stmt whose rhs's first tree is the source
+   expression.  */
+
+gimple *
+find_bswap_or_nop (gimple *stmt, struct symbolic_number *n, bool *bswap)
+{
+  /* The last parameter determines the depth search limit.  It usually
+     correlates directly to the number n of bytes to be touched.  We
+     increase that number by log2(n) + 1 here in order to also
+     cover signed -> unsigned conversions of the src operand as can be seen
+     in libgcc, and for initial shift/and operation of the src operand.  */
+  int limit = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (gimple_expr_type (stmt)));
+  limit += 1 + (int) ceil_log2 ((unsigned HOST_WIDE_INT) limit);
+  gimple *ins_stmt = find_bswap_or_nop_1 (stmt, n, limit);
+
+  if (!ins_stmt)
+    return NULL;
+
+  uint64_t cmpxchg, cmpnop;
+  find_bswap_or_nop_finalize (n, &cmpxchg, &cmpnop);
+
   /* A complete byte swap should make the symbolic number to start with
      the largest digit in the highest order byte. Unchanged symbolic
      number indicates a read with same endianness as target architecture.  */
@@ -847,7 +862,6 @@ find_bswap_or_nop (gimple *stmt, struct
   if (!n->base_addr && n->n == cmpnop && n->n_ops == 1)
     return NULL;
 
-  n->range *= BITS_PER_UNIT;
   return ins_stmt;
 }
 
@@ -882,68 +896,89 @@ public:
 }; // class pass_optimize_bswap
 
 /* Perform the bswap optimization: replace the expression computed in the rhs
-   of CUR_STMT by an equivalent bswap, load or load + bswap expression.
+   of gsi_stmt (GSI) (or if NULL add instead of replace) by an equivalent
+   bswap, load or load + bswap expression.
    Which of these alternatives replace the rhs is given by N->base_addr (non
    null if a load is needed) and BSWAP.  The type, VUSE and set-alias of the
    load to perform are also given in N while the builtin bswap invoke is given
-   in FNDEL.  Finally, if a load is involved, SRC_STMT refers to one of the
-   load statements involved to construct the rhs in CUR_STMT and N->range gives
-   the size of the rhs expression for maintaining some statistics.
-
-   Note that if the replacement involve a load, CUR_STMT is moved just after
-   SRC_STMT to do the load with the same VUSE which can lead to CUR_STMT
-   changing of basic block.  */
+   in FNDEL.  Finally, if a load is involved, INS_STMT refers to one of the
+   load statements involved to construct the rhs in gsi_stmt (GSI) and
+   N->range gives the size of the rhs expression for maintaining some
+   statistics.
+
+   Note that if the replacement involve a load and if gsi_stmt (GSI) is
+   non-NULL, that stmt is moved just after INS_STMT to do the load with the
+   same VUSE which can lead to gsi_stmt (GSI) changing of basic block.  */
 
-bool
-bswap_replace (gimple *cur_stmt, gimple *ins_stmt, tree fndecl,
+tree
+bswap_replace (gimple_stmt_iterator gsi, gimple *ins_stmt, tree fndecl,
 	       tree bswap_type, tree load_type, struct symbolic_number *n,
 	       bool bswap)
 {
-  gimple_stmt_iterator gsi;
-  tree src, tmp, tgt;
+  tree src, tmp, tgt = NULL_TREE;
   gimple *bswap_stmt;
 
-  gsi = gsi_for_stmt (cur_stmt);
+  gimple *cur_stmt = gsi_stmt (gsi);
   src = n->src;
-  tgt = gimple_assign_lhs (cur_stmt);
+  if (cur_stmt)
+    tgt = gimple_assign_lhs (cur_stmt);
 
   /* Need to load the value from memory first.  */
   if (n->base_addr)
     {
-      gimple_stmt_iterator gsi_ins = gsi_for_stmt (ins_stmt);
+      gimple_stmt_iterator gsi_ins = gsi;
+      if (ins_stmt)
+	gsi_ins = gsi_for_stmt (ins_stmt);
       tree addr_expr, addr_tmp, val_expr, val_tmp;
       tree load_offset_ptr, aligned_load_type;
-      gimple *addr_stmt, *load_stmt;
-      unsigned align;
+      gimple *load_stmt;
+      unsigned align = get_object_alignment (src);
       HOST_WIDE_INT load_offset = 0;
-      basic_block ins_bb, cur_bb;
 
-      ins_bb = gimple_bb (ins_stmt);
-      cur_bb = gimple_bb (cur_stmt);
-      if (!dominated_by_p (CDI_DOMINATORS, cur_bb, ins_bb))
-	return false;
-
-      align = get_object_alignment (src);
-
-      /* Move cur_stmt just before  one of the load of the original
-	 to ensure it has the same VUSE.  See PR61517 for what could
-	 go wrong.  */
-      if (gimple_bb (cur_stmt) != gimple_bb (ins_stmt))
-	reset_flow_sensitive_info (gimple_assign_lhs (cur_stmt));
-      gsi_move_before (&gsi, &gsi_ins);
-      gsi = gsi_for_stmt (cur_stmt);
+      if (cur_stmt)
+	{
+	  basic_block ins_bb = gimple_bb (ins_stmt);
+	  basic_block cur_bb = gimple_bb (cur_stmt);
+	  if (!dominated_by_p (CDI_DOMINATORS, cur_bb, ins_bb))
+	    return NULL_TREE;
+
+	  /* Move cur_stmt just before one of the load of the original
+	     to ensure it has the same VUSE.  See PR61517 for what could
+	     go wrong.  */
+	  if (gimple_bb (cur_stmt) != gimple_bb (ins_stmt))
+	    reset_flow_sensitive_info (gimple_assign_lhs (cur_stmt));
+	  gsi_move_before (&gsi, &gsi_ins);
+	  gsi = gsi_for_stmt (cur_stmt);
+	}
+      else
+	gsi = gsi_ins;
 
       /* Compute address to load from and cast according to the size
 	 of the load.  */
-      addr_expr = build_fold_addr_expr (unshare_expr (src));
+      addr_expr = build_fold_addr_expr (src);
       if (is_gimple_mem_ref_addr (addr_expr))
-	addr_tmp = addr_expr;
+	addr_tmp = unshare_expr (addr_expr);
       else
 	{
-	  addr_tmp = make_temp_ssa_name (TREE_TYPE (addr_expr), NULL,
-					 "load_src");
-	  addr_stmt = gimple_build_assign (addr_tmp, addr_expr);
-	  gsi_insert_before (&gsi, addr_stmt, GSI_SAME_STMT);
+	  addr_tmp = unshare_expr (n->base_addr);
+	  if (!is_gimple_mem_ref_addr (addr_tmp))
+	    addr_tmp = force_gimple_operand_gsi_1 (&gsi, addr_tmp,
+						   is_gimple_mem_ref_addr,
+						   NULL_TREE, true,
+						   GSI_SAME_STMT);
+	  load_offset = n->bytepos;
+	  if (n->offset)
+	    {
+	      tree off
+		= force_gimple_operand_gsi (&gsi, unshare_expr (n->offset),
+					    true, NULL_TREE, true,
+					    GSI_SAME_STMT);
+	      gimple *stmt
+		= gimple_build_assign (make_ssa_name (TREE_TYPE (addr_tmp)),
+				       POINTER_PLUS_EXPR, addr_tmp, off);
+	      gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
+	      addr_tmp = gimple_assign_lhs (stmt);
+	    }
 	}
 
       /* Perform the load.  */
@@ -967,7 +1002,7 @@ bswap_replace (gimple *cur_stmt, gimple
 	    }
 
 	  /* Convert the result of load if necessary.  */
-	  if (!useless_type_conversion_p (TREE_TYPE (tgt), load_type))
+	  if (tgt && !useless_type_conversion_p (TREE_TYPE (tgt), load_type))
 	    {
 	      val_tmp = make_temp_ssa_name (aligned_load_type, NULL,
 					    "load_dst");
@@ -975,13 +1010,21 @@ bswap_replace (gimple *cur_stmt, gimple
 	      gimple_set_vuse (load_stmt, n->vuse);
 	      gsi_insert_before (&gsi, load_stmt, GSI_SAME_STMT);
 	      gimple_assign_set_rhs_with_ops (&gsi, NOP_EXPR, val_tmp);
+	      update_stmt (cur_stmt);
 	    }
-	  else
+	  else if (cur_stmt)
 	    {
 	      gimple_assign_set_rhs_with_ops (&gsi, MEM_REF, val_expr);
 	      gimple_set_vuse (cur_stmt, n->vuse);
+	      update_stmt (cur_stmt);
+	    }
+	  else
+	    {
+	      tgt = make_ssa_name (load_type);
+	      cur_stmt = gimple_build_assign (tgt, MEM_REF, val_expr);
+	      gimple_set_vuse (cur_stmt, n->vuse);
+	      gsi_insert_before (&gsi, cur_stmt, GSI_SAME_STMT);
 	    }
-	  update_stmt (cur_stmt);
 
 	  if (dump_file)
 	    {
@@ -990,7 +1033,7 @@ bswap_replace (gimple *cur_stmt, gimple
 		       (int) n->range);
 	      print_gimple_stmt (dump_file, cur_stmt, 0);
 	    }
-	  return true;
+	  return tgt;
 	}
       else
 	{
@@ -1003,15 +1046,17 @@ bswap_replace (gimple *cur_stmt, gimple
     }
   else if (!bswap)
     {
-      gimple *g;
-      if (!useless_type_conversion_p (TREE_TYPE (tgt), TREE_TYPE (src)))
+      gimple *g = NULL;
+      if (tgt && !useless_type_conversion_p (TREE_TYPE (tgt), TREE_TYPE (src)))
 	{
 	  if (!is_gimple_val (src))
-	    return false;
+	    return NULL_TREE;
 	  g = gimple_build_assign (tgt, NOP_EXPR, src);
 	}
-      else
+      else if (cur_stmt)
 	g = gimple_build_assign (tgt, src);
+      else
+	tgt = src;
       if (n->range == 16)
 	nop_stats.found_16bit++;
       else if (n->range == 32)
@@ -1026,10 +1071,17 @@ bswap_replace (gimple *cur_stmt, gimple
 	  fprintf (dump_file,
 		   "%d bit reshuffle in target endianness found at: ",
 		   (int) n->range);
-	  print_gimple_stmt (dump_file, cur_stmt, 0);
+	  if (cur_stmt)
+	    print_gimple_stmt (dump_file, cur_stmt, 0);
+	  else
+	    {
+	      print_generic_expr (dump_file, tgt, 0);
+	      fprintf (dump_file, "\n");
+	    }
 	}
-      gsi_replace (&gsi, g, true);
-      return true;
+      if (cur_stmt)
+	gsi_replace (&gsi, g, true);
+      return tgt;
     }
   else if (TREE_CODE (src) == BIT_FIELD_REF)
     src = TREE_OPERAND (src, 0);
@@ -1069,6 +1121,8 @@ bswap_replace (gimple *cur_stmt, gimple
   else
     bswap_stmt = gimple_build_call (fndecl, 1, tmp);
 
+  if (tgt == NULL_TREE)
+    tgt = make_ssa_name (bswap_type);
   tmp = tgt;
 
   /* Convert the result if necessary.  */
@@ -1087,12 +1141,23 @@ bswap_replace (gimple *cur_stmt, gimple
     {
       fprintf (dump_file, "%d bit bswap implementation found at: ",
 	       (int) n->range);
-      print_gimple_stmt (dump_file, cur_stmt, 0);
+      if (cur_stmt)
+	print_gimple_stmt (dump_file, cur_stmt, 0);
+      else
+	{
+	  print_generic_expr (dump_file, tgt, 0);
+	  fprintf (dump_file, "\n");
+	}
     }
 
-  gsi_insert_after (&gsi, bswap_stmt, GSI_SAME_STMT);
-  gsi_remove (&gsi, true);
-  return true;
+  if (cur_stmt)
+    {
+      gsi_insert_after (&gsi, bswap_stmt, GSI_SAME_STMT);
+      gsi_remove (&gsi, true);
+    }
+  else
+    gsi_insert_before (&gsi, bswap_stmt, GSI_SAME_STMT);
+  return tgt;
 }
 
 /* Find manual byte swap implementations as well as load in a given
@@ -1141,7 +1206,7 @@ pass_optimize_bswap::execute (function *
 	 inserted smaller bswap replacements as sub-patterns, the wider
 	 variant wouldn't be detected.  */
       for (gsi = gsi_last_bb (bb); !gsi_end_p (gsi);)
-        {
+	{
 	  gimple *ins_stmt, *cur_stmt = gsi_stmt (gsi);
 	  tree fndecl = NULL_TREE, bswap_type = NULL_TREE, load_type;
 	  enum tree_code code;
@@ -1211,8 +1276,8 @@ pass_optimize_bswap::execute (function *
 	  if (bswap && !fndecl && n.range != 16)
 	    continue;
 
-	  if (bswap_replace (cur_stmt, ins_stmt, fndecl, bswap_type, load_type,
-			     &n, bswap))
+	  if (bswap_replace (gsi_for_stmt (cur_stmt), ins_stmt, fndecl,
+			     bswap_type, load_type, &n, bswap))
 	    changed = true;
 	}
     }
@@ -1281,8 +1346,15 @@ struct store_immediate_info
   gimple *stmt;
   unsigned int order;
   /* INTEGER_CST for constant stores, MEM_REF for memory copy or
-     BIT_*_EXPR for logical bitwise operation.  */
+     BIT_*_EXPR for logical bitwise operation.
+     LROTATE_EXPR if it can be only bswap optimized and
+     ops are not really meaningful.
+     NOP_EXPR if bswap optimization detected identity, ops
+     are not meaningful.  */
   enum tree_code rhs_code;
+  /* Two fields for bswap optimization purposes.  */
+  struct symbolic_number n;
+  gimple *ins_stmt;
   /* True if BIT_{AND,IOR,XOR}_EXPR result is inverted before storing.  */
   bool bit_not_p;
   /* True if ops have been swapped and thus ops[1] represents
@@ -1293,7 +1365,8 @@ struct store_immediate_info
   store_operand_info ops[2];
   store_immediate_info (unsigned HOST_WIDE_INT, unsigned HOST_WIDE_INT,
 			unsigned HOST_WIDE_INT, unsigned HOST_WIDE_INT,
-			gimple *, unsigned int, enum tree_code, bool,
+			gimple *, unsigned int, enum tree_code,
+			struct symbolic_number &, gimple *, bool,
 			const store_operand_info &,
 			const store_operand_info &);
 };
@@ -1305,12 +1378,14 @@ store_immediate_info::store_immediate_in
 					    gimple *st,
 					    unsigned int ord,
 					    enum tree_code rhscode,
+					    struct symbolic_number &nr,
+					    gimple *ins_stmtp,
 					    bool bitnotp,
 					    const store_operand_info &op0r,
 					    const store_operand_info &op1r)
   : bitsize (bs), bitpos (bp), bitregion_start (brs), bitregion_end (bre),
-    stmt (st), order (ord), rhs_code (rhscode), bit_not_p (bitnotp),
-    ops_swapped_p (false)
+    stmt (st), order (ord), rhs_code (rhscode), n (nr),
+    ins_stmt (ins_stmtp), bit_not_p (bitnotp), ops_swapped_p (false)
 #if __cplusplus >= 201103L
     , ops { op0r, op1r }
 {
@@ -1884,13 +1959,13 @@ merged_store_group::apply_stores ()
 	      fprintf (dump_file, "After writing ");
 	      print_generic_expr (dump_file, cst, 0);
 	      fprintf (dump_file, " of size " HOST_WIDE_INT_PRINT_DEC
-			" at position %d the merged region contains:\n",
-			info->bitsize, pos_in_buffer);
+		       " at position %d the merged region contains:\n",
+		       info->bitsize, pos_in_buffer);
 	      dump_char_array (dump_file, val, buf_size);
 	    }
 	  else
 	    fprintf (dump_file, "Failed to merge stores\n");
-        }
+	}
       if (!ret)
 	return false;
       unsigned char *m = mask + (pos_in_buffer / BITS_PER_UNIT);
@@ -1901,6 +1976,7 @@ merged_store_group::apply_stores ()
       else
 	clear_bit_region (m, pos_in_buffer % BITS_PER_UNIT, info->bitsize);
     }
+  stores.qsort (sort_by_bitpos);
   return true;
 }
 
@@ -1937,6 +2013,7 @@ struct imm_store_chain_info
       }
   }
   bool terminate_and_process_chain ();
+  bool try_coalesce_bswap (merged_store_group *, unsigned int, unsigned int);
   bool coalesce_immediate_stores ();
   bool output_merged_store (merged_store_group *);
   bool output_merged_stores ();
@@ -2075,7 +2152,8 @@ pass_store_merging::terminate_and_releas
 
 /* Return true if stmts in between FIRST (inclusive) and LAST (exclusive)
    may clobber REF.  FIRST and LAST must be in the same basic block and
-   have non-NULL vdef.  */
+   have non-NULL vdef.  We want to be able to sink load of REF across
+   stores between FIRST and LAST, up to right before LAST.  */
 
 bool
 stmts_may_clobber_ref_p (gimple *first, gimple *last, tree ref)
@@ -2092,6 +2170,9 @@ stmts_may_clobber_ref_p (gimple *first,
       stmt = SSA_NAME_DEF_STMT (vop);
       if (stmt_may_clobber_ref_p_1 (stmt, &r))
 	return true;
+      if (gimple_store_p (stmt)
+	  && refs_anti_dependent_p (ref, gimple_get_lhs (stmt)))
+	return true;
       /* Avoid quadratic compile time by bounding the number of checks
 	 we perform.  */
       if (++count > MAX_STORE_ALIAS_CHECKS)
@@ -2192,6 +2273,252 @@ compatible_load_p (merged_store_group *m
   return true;
 }
 
+/* Add all refs loaded to compute VAL to REFS vector.  */
+
+void
+gather_bswap_load_refs (vec<tree> *refs, tree val)
+{
+  if (TREE_CODE (val) != SSA_NAME)
+    return;
+
+  gimple *stmt = SSA_NAME_DEF_STMT (val);
+  if (!is_gimple_assign (stmt))
+    return;
+
+  if (gimple_assign_load_p (stmt))
+    {
+      refs->safe_push (gimple_assign_rhs1 (stmt));
+      return;
+    }
+
+  switch (gimple_assign_rhs_class (stmt))
+    {
+    case GIMPLE_BINARY_RHS:
+      gather_bswap_load_refs (refs, gimple_assign_rhs2 (stmt));
+      /* FALLTHRU */
+    case GIMPLE_UNARY_RHS:
+      gather_bswap_load_refs (refs, gimple_assign_rhs1 (stmt));
+      break;
+    default:
+      gcc_unreachable ();
+    }
+}
+
+/* Return true if m_store_info[first] and at least one following store
+   form a group which store try_size bitsize value which is byte swapped
+   from a memory load or some value, or identity from some value.
+   This uses the bswap pass APIs.  */
+
+bool
+imm_store_chain_info::try_coalesce_bswap (merged_store_group *merged_store,
+					  unsigned int first,
+					  unsigned int try_size)
+{
+  unsigned int len = m_store_info.length (), last = first;
+  unsigned HOST_WIDE_INT width = m_store_info[first]->bitsize;
+  if (width >= try_size)
+    return false;
+  for (unsigned int i = first + 1; i < len; ++i)
+    {
+      if (m_store_info[i]->bitpos != m_store_info[first]->bitpos + width
+	  || m_store_info[i]->ins_stmt == NULL)
+	return false;
+      width += m_store_info[i]->bitsize;
+      if (width >= try_size)
+	{
+	  last = i;
+	  break;
+	}
+    }
+  if (width != try_size)
+    return false;
+
+  bool allow_unaligned
+    = !STRICT_ALIGNMENT && PARAM_VALUE (PARAM_STORE_MERGING_ALLOW_UNALIGNED);
+  /* Punt if the combined store would not be aligned and we need alignment.  */
+  if (!allow_unaligned)
+    {
+      unsigned int align = merged_store->align;
+      unsigned HOST_WIDE_INT align_base = merged_store->align_base;
+      for (unsigned int i = first + 1; i <= last; ++i)
+	{
+	  unsigned int this_align;
+	  unsigned HOST_WIDE_INT align_bitpos = 0;
+	  get_object_alignment_1 (gimple_assign_lhs (m_store_info[i]->stmt),
+				  &this_align, &align_bitpos);
+	  if (this_align > align)
+	    {
+	      align = this_align;
+	      align_base = m_store_info[i]->bitpos - align_bitpos;
+	    }
+	}
+      unsigned HOST_WIDE_INT align_bitpos
+	= (m_store_info[first]->bitpos - align_base) & (align - 1);
+      if (align_bitpos)
+	align = least_bit_hwi (align_bitpos);
+      if (align < try_size)
+	return false;
+    }
+
+  tree type;
+  switch (try_size)
+    {
+    case 16: type = uint16_type_node; break;
+    case 32: type = uint32_type_node; break;
+    case 64: type = uint64_type_node; break;
+    default: gcc_unreachable ();
+    }
+  struct symbolic_number n;
+  gimple *ins_stmt = NULL;
+  int vuse_store = -1;
+  unsigned int first_order = merged_store->first_order;
+  unsigned int last_order = merged_store->last_order;
+  gimple *first_stmt = merged_store->first_stmt;
+  gimple *last_stmt = merged_store->last_stmt;
+  store_immediate_info *infof = m_store_info[first];
+
+  for (unsigned int i = first; i <= last; ++i)
+    {
+      store_immediate_info *info = m_store_info[i];
+      struct symbolic_number this_n = info->n;
+      this_n.type = type;
+      if (!this_n.base_addr)
+	this_n.range = try_size / BITS_PER_UNIT;
+      unsigned int bitpos = info->bitpos - infof->bitpos;
+      if (!do_shift_rotate (LSHIFT_EXPR, &this_n,
+			    BYTES_BIG_ENDIAN
+			    ? try_size - info->bitsize - bitpos
+			    : bitpos))
+	return false;
+      if (n.base_addr && vuse_store)
+	{
+	  unsigned int j;
+	  for (j = first; j <= last; ++j)
+	    if (this_n.vuse == gimple_vuse (m_store_info[j]->stmt))
+	      break;
+	  if (j > last)
+	    {
+	      if (vuse_store == 1)
+		return false;
+	      vuse_store = 0;
+	    }
+	}
+      if (i == first)
+	{
+	  n = this_n;
+	  ins_stmt = info->ins_stmt;
+	}
+      else
+	{
+	  if (n.base_addr)
+	    {
+	      if (n.vuse != this_n.vuse)
+		{
+		  if (vuse_store == 0)
+		    return false;
+		  vuse_store = 1;
+		}
+	      if (info->order > last_order)
+		{
+		  last_order = info->order;
+		  last_stmt = info->stmt;
+		}
+	      else if (info->order < first_order)
+		{
+		  first_order = info->order;
+		  first_stmt = info->stmt;
+		}
+	    }
+
+	  ins_stmt = perform_symbolic_merge (ins_stmt, &n, info->ins_stmt,
+					     &this_n, &n);
+	  if (ins_stmt == NULL)
+	    return false;
+	}
+    }
+
+  uint64_t cmpxchg, cmpnop;
+  find_bswap_or_nop_finalize (&n, &cmpxchg, &cmpnop);
+
+  /* A complete byte swap should make the symbolic number to start with
+     the largest digit in the highest order byte.  Unchanged symbolic
+     number indicates a read with same endianness as target architecture.  */
+  if (n.n != cmpnop && n.n != cmpxchg)
+    return false;
+
+  if (n.base_addr == NULL_TREE && !is_gimple_val (n.src))
+    return false;
+
+  /* Don't handle memory copy this way if normal non-bswap processing
+     would handle it too.  */
+  if (n.n == cmpnop && (unsigned) n.n_ops == last - first + 1)
+    {
+      unsigned int i;
+      for (i = first; i <= last; ++i)
+	if (m_store_info[i]->rhs_code != MEM_REF)
+	  break;
+      if (i == last + 1)
+	return false;
+    }
+
+  if (n.n == cmpxchg)
+    switch (try_size)
+      {
+      case 16:
+	/* Will emit LROTATE_EXPR.  */
+	break;
+      case 32:
+	if (builtin_decl_explicit_p (BUILT_IN_BSWAP32)
+	    && optab_handler (bswap_optab, SImode) != CODE_FOR_nothing)
+	  break;
+	return false;
+      case 64:
+	if (builtin_decl_explicit_p (BUILT_IN_BSWAP64)
+	    && optab_handler (bswap_optab, DImode) != CODE_FOR_nothing)
+	  break;
+	return false;
+      default:
+	gcc_unreachable ();
+      }
+
+  if (!allow_unaligned && n.base_addr)
+    {
+      unsigned int align = get_object_alignment (n.src);
+      if (align < try_size)
+	return false;
+    }
+
+  /* If each load has vuse of the corresponding store, need to verify
+     the loads can be sunk right before the last store.  */
+  if (vuse_store == 1)
+    {
+      auto_vec<tree, 64> refs;
+      for (unsigned int i = first; i <= last; ++i)
+	gather_bswap_load_refs (&refs,
+				gimple_assign_rhs1 (m_store_info[i]->stmt));
+
+      unsigned int i;
+      tree ref;
+      FOR_EACH_VEC_ELT (refs, i, ref)
+	if (stmts_may_clobber_ref_p (first_stmt, last_stmt, ref))
+	  return false;
+      n.vuse = NULL_TREE;
+    }
+
+  infof->n = n;
+  infof->ins_stmt = ins_stmt;
+  for (unsigned int i = first; i <= last; ++i)
+    {
+      m_store_info[i]->rhs_code = n.n == cmpxchg ? LROTATE_EXPR : NOP_EXPR;
+      m_store_info[i]->ops[0].base_addr = NULL_TREE;
+      m_store_info[i]->ops[1].base_addr = NULL_TREE;
+      if (i != first)
+	merged_store->merge_into (m_store_info[i]);
+    }
+
+  return true;
+}
+
 /* Go through the candidate stores recorded in m_store_info and merge them
    into merged_store_group objects recorded into m_merged_store_groups
    representing the widened stores.  Return true if coalescing was successful
@@ -2210,7 +2537,7 @@ imm_store_chain_info::coalesce_immediate
 	     m_store_info.length ());
 
   store_immediate_info *info;
-  unsigned int i;
+  unsigned int i, ignore = 0;
 
   /* Order the stores by the bitposition they write to.  */
   m_store_info.qsort (sort_by_bitpos);
@@ -2229,14 +2556,41 @@ imm_store_chain_info::coalesce_immediate
 	  fprintf (dump_file, "\n------------\n");
 	}
 
-      if (i == 0)
+      if (i <= ignore)
 	continue;
 
+      /* First try to handle group of stores like:
+	 p[0] = data >> 24;
+	 p[1] = data >> 16;
+	 p[2] = data >> 8;
+	 p[3] = data;
+	 using the bswap framework.  */
+      if (info->bitpos == merged_store->start + merged_store->width
+	  && merged_store->stores.length () == 1
+	  && merged_store->stores[0]->ins_stmt != NULL
+	  && info->ins_stmt != NULL)
+	{
+	  unsigned int try_size;
+	  for (try_size = 64; try_size >= 16; try_size >>= 1)
+	    if (try_coalesce_bswap (merged_store, i - 1, try_size))
+	      break;
+
+	  if (try_size >= 16)
+	    {
+	      ignore = i + merged_store->stores.length () - 1;
+	      m_merged_store_groups.safe_push (merged_store);
+	      if (ignore < m_store_info.length ())
+		merged_store = new merged_store_group (m_store_info[ignore]);
+	      else
+		merged_store = NULL;
+	      continue;
+	    }
+	}
+
       /* |---store 1---|
 	       |---store 2---|
-       Overlapping stores.  */
-      unsigned HOST_WIDE_INT start = info->bitpos;
-      if (IN_RANGE (start, merged_store->start,
+	 Overlapping stores.  */
+      if (IN_RANGE (info->bitpos, merged_store->start,
 		    merged_store->start + merged_store->width - 1))
 	{
 	  /* Only allow overlapping stores of constants.  */
@@ -2251,7 +2605,8 @@ imm_store_chain_info::coalesce_immediate
 	 This store is consecutive to the previous one.
 	 Merge it into the current store group.  There can be gaps in between
 	 the stores, but there can't be gaps in between bitregions.  */
-      else if (info->bitregion_start <= merged_store->bitregion_end
+      else if (info->rhs_code != LROTATE_EXPR
+	       && info->bitregion_start <= merged_store->bitregion_end
 	       && info->rhs_code == merged_store->stores[0]->rhs_code)
 	{
 	  store_immediate_info *infof = merged_store->stores[0];
@@ -2297,10 +2652,13 @@ imm_store_chain_info::coalesce_immediate
     }
 
   /* Record or discard the last store group.  */
-  if (!merged_store->apply_stores ())
-    delete merged_store;
-  else
-    m_merged_store_groups.safe_push (merged_store);
+  if (merged_store)
+    {
+      if (!merged_store->apply_stores ())
+	delete merged_store;
+      else
+	m_merged_store_groups.safe_push (merged_store);
+    }
 
   gcc_assert (m_merged_store_groups.length () <= m_store_info.length ());
   bool success
@@ -2560,9 +2918,41 @@ split_group (merged_store_group *group,
 
   gcc_assert ((size % BITS_PER_UNIT == 0) && (pos % BITS_PER_UNIT == 0));
 
+  if (group->stores[0]->rhs_code == LROTATE_EXPR
+      || group->stores[0]->rhs_code == NOP_EXPR)
+    {
+      /* For bswap framework using sets of stores, all the checking
+	 has been done earlier in try_coalesce_bswap and needs to be
+	 emitted as a single store.  */
+      if (total_orig)
+	{
+	  /* Avoid the old/new stmt count heuristics.  It should be
+	     always beneficial.  */
+	  total_new[0] = 1;
+	  total_orig[0] = 2;
+	}
+
+      if (split_stores)
+	{
+	  unsigned HOST_WIDE_INT align_bitpos
+	    = (group->start - align_base) & (group_align - 1);
+	  unsigned HOST_WIDE_INT align = group_align;
+	  if (align_bitpos)
+	    align = least_bit_hwi (align_bitpos);
+	  bytepos = group->start / BITS_PER_UNIT;
+	  struct split_store *store
+	    = new split_store (bytepos, group->width, align);
+	  unsigned int first = 0;
+	  find_constituent_stores (group, &store->orig_stores,
+				   &first, group->start, group->width);
+	  split_stores->safe_push (store);
+	}
+
+      return 1;
+    }
+
   unsigned int ret = 0, first = 0;
   unsigned HOST_WIDE_INT try_pos = bytepos;
-  group->stores.qsort (sort_by_bitpos);
 
   if (total_orig)
     {
@@ -2904,6 +3294,7 @@ imm_store_chain_info::output_merged_stor
 			    " not larger than estimated number of new"
 			    " stmts (%u).\n",
 		 total_orig, total_new);
+      return false;
     }
 
   gimple_stmt_iterator last_gsi = gsi_for_stmt (group->last_stmt);
@@ -2911,13 +3302,63 @@ imm_store_chain_info::output_merged_stor
   tree last_vdef, new_vuse;
   last_vdef = gimple_vdef (group->last_stmt);
   new_vuse = gimple_vuse (group->last_stmt);
+  tree bswap_res = NULL_TREE;
+
+  if (group->stores[0]->rhs_code == LROTATE_EXPR
+      || group->stores[0]->rhs_code == NOP_EXPR)
+    {
+      tree fndecl = NULL_TREE, bswap_type = NULL_TREE, load_type;
+      gimple *ins_stmt = group->stores[0]->ins_stmt;
+      struct symbolic_number *n = &group->stores[0]->n;
+      bool bswap = group->stores[0]->rhs_code == LROTATE_EXPR;
+
+      switch (n->range)
+	{
+	case 16:
+	  load_type = bswap_type = uint16_type_node;
+	  break;
+	case 32:
+	  load_type = uint32_type_node;
+	  if (bswap)
+	    {
+	      fndecl = builtin_decl_explicit (BUILT_IN_BSWAP32);
+	      bswap_type = TREE_VALUE (TYPE_ARG_TYPES (TREE_TYPE (fndecl)));
+	    }
+	  break;
+	case 64:
+	  load_type = uint64_type_node;
+	  if (bswap)
+	    {
+	      fndecl = builtin_decl_explicit (BUILT_IN_BSWAP64);
+	      bswap_type = TREE_VALUE (TYPE_ARG_TYPES (TREE_TYPE (fndecl)));
+	    }
+	  break;
+	default:
+	  gcc_unreachable ();
+	}
+
+      /* If the loads have each vuse of the corresponding store,
+	 we've checked the aliasing already in try_coalesce_bswap and
+	 we want to sink the need load into seq.  So need to use new_vuse
+	 on the load.  */
+      if (n->base_addr && n->vuse == NULL)
+	{
+	  n->vuse = new_vuse;
+	  ins_stmt = NULL;
+	}
+      bswap_res = bswap_replace (gsi_start (seq), ins_stmt, fndecl,
+				 bswap_type, load_type, n, bswap);
+      gcc_assert (bswap_res);
+    }
 
   gimple *stmt = NULL;
   split_store *split_store;
   unsigned int i;
   auto_vec<gimple *, 32> orig_stmts;
-  tree addr = force_gimple_operand_1 (unshare_expr (base_addr), &seq,
+  gimple_seq this_seq;
+  tree addr = force_gimple_operand_1 (unshare_expr (base_addr), &this_seq,
 				      is_gimple_mem_ref_addr, NULL_TREE);
+  gimple_seq_add_seq_without_update (&seq, this_seq);
 
   tree load_addr[2] = { NULL_TREE, NULL_TREE };
   gimple_seq load_seq[2] = { NULL, NULL };
@@ -2941,7 +3382,6 @@ imm_store_chain_info::output_merged_stor
 	load_addr[j] = addr;
       else
 	{
-	  gimple_seq this_seq;
 	  load_addr[j]
 	    = force_gimple_operand_1 (unshare_expr (op.base_addr),
 				      &this_seq, is_gimple_mem_ref_addr,
@@ -2988,10 +3428,12 @@ imm_store_chain_info::output_merged_stor
 	      MR_DEPENDENCE_BASE (dest) = base;
 	    }
 
-	  tree mask
-	    = native_interpret_expr (int_type,
-				     group->mask + try_pos - start_byte_pos,
-				     group->buf_size);
+	  tree mask = integer_zero_node;
+	  if (!bswap_res)
+	    mask = native_interpret_expr (int_type,
+					  group->mask + try_pos
+					  - start_byte_pos,
+					  group->buf_size);
 
 	  tree ops[2];
 	  for (int j = 0;
@@ -2999,7 +3441,9 @@ imm_store_chain_info::output_merged_stor
 	       ++j)
 	    {
 	      store_operand_info &op = split_store->orig_stores[0]->ops[j];
-	      if (op.base_addr)
+	      if (bswap_res)
+		ops[j] = bswap_res;
+	      else if (op.base_addr)
 		{
 		  FOR_EACH_VEC_ELT (split_store->orig_stores, k, info)
 		    orig_stmts.safe_push (info->ops[j].stmt);
@@ -3124,6 +3568,24 @@ imm_store_chain_info::output_merged_stor
 		  src = gimple_assign_lhs (stmt);
 		}
 	      break;
+	    case LROTATE_EXPR:
+	    case NOP_EXPR:
+	      src = ops[0];
+	      if (!is_gimple_val (src))
+		{
+		  stmt = gimple_build_assign (make_ssa_name (TREE_TYPE (src)),
+					      src);
+		  gimple_seq_add_stmt_without_update (&seq, stmt);
+		  src = gimple_assign_lhs (stmt);
+		}
+	      if (!useless_type_conversion_p (int_type, TREE_TYPE (src)))
+		{
+		  stmt = gimple_build_assign (make_ssa_name (int_type),
+					      NOP_EXPR, src);
+		  gimple_seq_add_stmt_without_update (&seq, stmt);
+		  src = gimple_assign_lhs (stmt);
+		}
+	      break;
 	    default:
 	      src = ops[0];
 	      break;
@@ -3494,6 +3956,8 @@ pass_store_merging::process_store (gimpl
 		       && (TREE_CODE (rhs) != INTEGER_CST)));
   enum tree_code rhs_code = ERROR_MARK;
   bool bit_not_p = false;
+  struct symbolic_number n;
+  gimple *ins_stmt = NULL;
   store_operand_info ops[2];
   if (invalid)
     ;
@@ -3558,6 +4022,35 @@ pass_store_merging::process_store (gimpl
 	    invalid = true;
 	    break;
 	  }
+      if ((bitsize % BITS_PER_UNIT) == 0
+	  && (bitpos % BITS_PER_UNIT) == 0
+	  && bitsize <= 64
+	  && BYTES_BIG_ENDIAN == WORDS_BIG_ENDIAN)
+	{
+	  ins_stmt = find_bswap_or_nop_1 (def_stmt, &n, 12);
+	  if (ins_stmt)
+	    {
+	      uint64_t nn = n.n;
+	      for (unsigned HOST_WIDE_INT i = 0;
+		   i < bitsize; i += BITS_PER_UNIT, nn >>= BITS_PER_MARKER)
+		if ((nn & MARKER_MASK) == 0
+		    || (nn & MARKER_MASK) == MARKER_BYTE_UNKNOWN)
+		  {
+		    ins_stmt = NULL;
+		    break;
+		  }
+	      if (ins_stmt)
+		{
+		  if (invalid)
+		    {
+		      rhs_code = LROTATE_EXPR;
+		      ops[0].base_addr = NULL_TREE;
+		      ops[1].base_addr = NULL_TREE;
+		    }
+		  invalid = false;
+		}
+	    }
+	}
     }
 
   if (invalid)
@@ -3566,6 +4059,9 @@ pass_store_merging::process_store (gimpl
       return;
     }
 
+  if (!ins_stmt)
+    memset (&n, 0, sizeof (n));
+
   struct imm_store_chain_info **chain_info = NULL;
   if (base_addr)
     chain_info = m_stores.get (base_addr);
@@ -3576,6 +4072,7 @@ pass_store_merging::process_store (gimpl
       unsigned int ord = (*chain_info)->m_store_info.length ();
       info = new store_immediate_info (bitsize, bitpos, bitregion_start,
 				       bitregion_end, stmt, ord, rhs_code,
+				       n, ins_stmt,
 				       bit_not_p, ops[0], ops[1]);
       if (dump_file && (dump_flags & TDF_DETAILS))
 	{
@@ -3604,6 +4101,7 @@ pass_store_merging::process_store (gimpl
     = new imm_store_chain_info (m_stores_head, base_addr);
   info = new store_immediate_info (bitsize, bitpos, bitregion_start,
 				   bitregion_end, stmt, 0, rhs_code,
+				   n, ins_stmt,
 				   bit_not_p, ops[0], ops[1]);
   new_chain->m_store_info.safe_push (info);
   m_stores.put (base_addr, new_chain);
@@ -3628,6 +4126,8 @@ pass_store_merging::execute (function *f
   basic_block bb;
   hash_set<gimple *> orig_stmts;
 
+  calculate_dominance_info (CDI_DOMINATORS);
+
   FOR_EACH_BB_FN (bb, fun)
     {
       gimple_stmt_iterator gsi;
--- gcc/testsuite/gcc.dg/store_merging_16.c.jj	2017-11-16 13:36:06.550536836 +0100
+++ gcc/testsuite/gcc.dg/store_merging_16.c	2017-11-16 15:12:55.231898459 +0100
@@ -0,0 +1,157 @@
+/* Only test on some 64-bit targets which do have bswap{si,di}2 patterns and
+   are either big or little endian (not pdp endian).  */
+/* { dg-do compile { target { lp64 && { i?86-*-* x86_64-*-* powerpc*-*-* aarch64*-*-* } } } } */
+/* { dg-require-effective-target store_merge } */
+/* { dg-options "-O2 -fdump-tree-store-merging" } */
+
+__attribute__((noipa)) void
+f1 (unsigned char *p, unsigned long long q)
+{
+  p[0] = q;
+  p[1] = q >> 8;
+  p[2] = q >> 16;
+  p[3] = q >> 24;
+  p[4] = q >> 32;
+  p[5] = q >> 40;
+  p[6] = q >> 48;
+  p[7] = q >> 56;
+}
+
+__attribute__((noipa)) void
+f2 (unsigned char *p, unsigned long long q)
+{
+  p[0] = q >> 56;
+  p[1] = q >> 48;
+  p[2] = q >> 40;
+  p[3] = q >> 32;
+  p[4] = q >> 24;
+  p[5] = q >> 16;
+  p[6] = q >> 8;
+  p[7] = q;
+}
+
+__attribute__((noipa)) void
+f3 (unsigned char *__restrict p, unsigned char *__restrict q)
+{
+  unsigned char q3 = q[3];
+  unsigned char q2 = q[2];
+  unsigned char q1 = q[1];
+  unsigned char q0 = q[0];
+  p[0] = q3;
+  p[1] = q2;
+  p[2] = q1;
+  p[3] = q0;
+}
+
+__attribute__((noipa)) void
+f4 (unsigned char *__restrict p, unsigned char *__restrict q)
+{
+  p[0] = q[3];
+  p[1] = q[2];
+  p[2] = q[1];
+  p[3] = q[0];
+}
+
+struct S { unsigned char a, b; unsigned short c; };
+
+__attribute__((noipa)) void
+f5 (struct S *__restrict p, struct S *__restrict q)
+{
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+  unsigned char pa = q->c >> 8;
+  unsigned char pb = q->c;
+  unsigned short pc = (q->a << 8) | q->b;
+#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+  unsigned char pa = q->c;
+  unsigned char pb = q->c >> 8;
+  unsigned short pc = q->a | (q->b << 8);
+#endif
+  p->a = pa;
+  p->b = pb;
+  p->c = pc;
+}
+
+__attribute__((noipa)) void
+f6 (struct S *__restrict p, struct S *__restrict q)
+{
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+  p->a = q->c >> 8;
+  p->b = q->c;
+  p->c = (q->a << 8) | q->b;
+#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+  p->a = q->c;
+  p->b = q->c >> 8;
+  p->c = q->a | (q->b << 8);
+#endif
+}
+
+struct T { unsigned long long a : 8, b : 8, c : 8, d : 8, e : 8, f : 8, g : 8, h : 8; };
+
+__attribute__((noipa)) void
+f7 (struct T *__restrict p, struct T *__restrict q)
+{
+  p->a = q->h;
+  p->b = q->g;
+  p->c = q->f;
+  p->d = q->e;
+  p->e = q->d;
+  p->f = q->c;
+  p->g = q->b;
+  p->h = q->a;
+}
+
+struct S b = { 0x11, 0x12,
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+	       0x1413
+#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+	       0x1314
+#endif
+	     };
+struct T e = { 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28 };
+
+int
+main ()
+{
+  unsigned char a[8];
+  int i;
+  struct S b, c, d;
+  f1 (a, 0x0102030405060708ULL);
+  for (i = 0; i < 8; ++i)
+    if (a[i] != 8 - i)
+      __builtin_abort ();
+  f2 (a, 0x0102030405060708ULL);
+  for (i = 0; i < 8; ++i)
+    if (a[i] != 1 + i)
+      __builtin_abort ();
+  f3 (a, a + 4);
+  for (i = 0; i < 8; ++i)
+    if (a[i] != (i < 4 ? 8 - i : 1 + i))
+      __builtin_abort ();
+  f2 (a, 0x090a0b0c0d0e0f10ULL);
+  f4 (a + 4, a);
+  for (i = 0; i < 8; ++i)
+    if (a[i] != (i < 4 ? 9 + i : 16 - i))
+      __builtin_abort ();
+  f5 (&c, &b);
+  if (c.a != 0x14 || c.b != 0x13
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+      || c.c != 0x1112
+#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+      || c.c != 0x1211
+#endif
+      )
+    __builtin_abort ();
+  f6 (&d, &c);
+  if (d.a != 0x11 || d.b != 0x12 || d.c != b.c)
+    __builtin_abort ();
+  struct T f;
+  f7 (&f, &e);
+  if (f.a != 0x28 || f.b != 0x27 || f.c != 0x26 || f.d != 0x25
+      || f.e != 0x24 || f.f != 0x23 || f.g != 0x22 || f.h != 0x21)
+    __builtin_abort ();
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "Merging successful" 7 "store-merging" } } */
+/* { dg-final { scan-tree-dump-times "__builtin_bswap64" 2 "store-merging" } } */
+/* { dg-final { scan-tree-dump-times "__builtin_bswap32" 4 "store-merging" } } */

	Jakub

