This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

Re: Thoughts on memcmp expansion (PR43052)


On 05/13/2016 10:40 PM, Joseph Myers wrote:
On Fri, 13 May 2016, Bernd Schmidt wrote:

Thanks. So, this would seem to suggest that BITS_PER_UNIT in memcmp/memcpy
expansion is more accurate than a plain 8, although pedantically we might want
to use CHAR_TYPE_SIZE? Should I adjust my patch to use the latter or leave
these parts as-is?

I'd say use CHAR_TYPE_SIZE.

Here's the current patch, retested as before. Since I had to make some adjustments since the approval to cope with changes such as the removal of memcmp_libfunc, I'll leave it for a few days for comments before checking it in.


Bernd

Index: gcc/builtins.c
===================================================================
--- gcc/builtins.c	(revision 236909)
+++ gcc/builtins.c	(working copy)
@@ -3671,53 +3671,24 @@ expand_cmpstr (insn_code icode, rtx targ
   return NULL_RTX;
 }
 
-/* Try to expand cmpstrn or cmpmem operation ICODE with the given operands.
-   ARG3_TYPE is the type of ARG3_RTX.  Return the result rtx on success,
-   otherwise return null.  */
-
-static rtx
-expand_cmpstrn_or_cmpmem (insn_code icode, rtx target, rtx arg1_rtx,
-			  rtx arg2_rtx, tree arg3_type, rtx arg3_rtx,
-			  HOST_WIDE_INT align)
-{
-  machine_mode insn_mode = insn_data[icode].operand[0].mode;
-
-  if (target && (!REG_P (target) || HARD_REGISTER_P (target)))
-    target = NULL_RTX;
-
-  struct expand_operand ops[5];
-  create_output_operand (&ops[0], target, insn_mode);
-  create_fixed_operand (&ops[1], arg1_rtx);
-  create_fixed_operand (&ops[2], arg2_rtx);
-  create_convert_operand_from (&ops[3], arg3_rtx, TYPE_MODE (arg3_type),
-			       TYPE_UNSIGNED (arg3_type));
-  create_integer_operand (&ops[4], align);
-  if (maybe_expand_insn (icode, 5, ops))
-    return ops[0].value;
-  return NULL_RTX;
-}
-
 /* Expand expression EXP, which is a call to the memcmp built-in function.
    Return NULL_RTX if we failed and the caller should emit a normal call,
-   otherwise try to get the result in TARGET, if convenient.  */
+   otherwise try to get the result in TARGET, if convenient.
+   RESULT_EQ is true if we can relax the returned value to be either zero
+   or nonzero, without caring about the sign.  */
 
 static rtx
-expand_builtin_memcmp (tree exp, rtx target)
+expand_builtin_memcmp (tree exp, rtx target, bool result_eq)
 {
   if (!validate_arglist (exp,
  			 POINTER_TYPE, POINTER_TYPE, INTEGER_TYPE, VOID_TYPE))
     return NULL_RTX;
 
-  /* Note: The cmpstrnsi pattern, if it exists, is not suitable for
-     implementing memcmp because it will stop if it encounters two
-     zero bytes.  */
-  insn_code icode = direct_optab_handler (cmpmem_optab, SImode);
-  if (icode == CODE_FOR_nothing)
-    return NULL_RTX;
-
   tree arg1 = CALL_EXPR_ARG (exp, 0);
   tree arg2 = CALL_EXPR_ARG (exp, 1);
   tree len = CALL_EXPR_ARG (exp, 2);
+  machine_mode mode = TYPE_MODE (TREE_TYPE (exp));
+  location_t loc = EXPR_LOCATION (exp);
 
   unsigned int arg1_align = get_pointer_alignment (arg1) / BITS_PER_UNIT;
   unsigned int arg2_align = get_pointer_alignment (arg2) / BITS_PER_UNIT;
@@ -3726,22 +3697,38 @@ expand_builtin_memcmp (tree exp, rtx tar
   if (arg1_align == 0 || arg2_align == 0)
     return NULL_RTX;
 
-  machine_mode mode = TYPE_MODE (TREE_TYPE (exp));
-  location_t loc = EXPR_LOCATION (exp);
   rtx arg1_rtx = get_memory_rtx (arg1, len);
   rtx arg2_rtx = get_memory_rtx (arg2, len);
-  rtx arg3_rtx = expand_normal (fold_convert_loc (loc, sizetype, len));
+  rtx len_rtx = expand_normal (fold_convert_loc (loc, sizetype, len));
 
   /* Set MEM_SIZE as appropriate.  */
-  if (CONST_INT_P (arg3_rtx))
+  if (CONST_INT_P (len_rtx))
     {
-      set_mem_size (arg1_rtx, INTVAL (arg3_rtx));
-      set_mem_size (arg2_rtx, INTVAL (arg3_rtx));
+      set_mem_size (arg1_rtx, INTVAL (len_rtx));
+      set_mem_size (arg2_rtx, INTVAL (len_rtx));
     }
 
-  rtx result = expand_cmpstrn_or_cmpmem (icode, target, arg1_rtx, arg2_rtx,
-					 TREE_TYPE (len), arg3_rtx,
-					 MIN (arg1_align, arg2_align));
+  by_pieces_constfn constfn = NULL;
+
+  const char *src_str = c_getstr (arg1);
+  if (src_str == NULL)
+    src_str = c_getstr (arg2);
+  else
+    std::swap (arg1_rtx, arg2_rtx);
+
+  /* If SRC is a string constant and block move would be done
+     by pieces, we can avoid loading the string from memory
+     and only stored the computed constants.  */
+  if (src_str
+      && CONST_INT_P (len_rtx)
+      && (unsigned HOST_WIDE_INT) INTVAL (len_rtx) <= strlen (src_str) + 1)
+    constfn = builtin_memcpy_read_str;
+
+  rtx result = emit_block_cmp_hints (arg1_rtx, arg2_rtx, len_rtx,
+				     TREE_TYPE (len), target,
+				     result_eq, constfn,
+				     CONST_CAST (char *, src_str));
+
   if (result)
     {
       /* Return the value in the proper mode for this function.  */
@@ -6073,9 +6060,15 @@ expand_builtin (tree exp, rtx target, rt
 
     case BUILT_IN_BCMP:
     case BUILT_IN_MEMCMP:
-      target = expand_builtin_memcmp (exp, target);
+    case BUILT_IN_MEMCMP_EQ:
+      target = expand_builtin_memcmp (exp, target, fcode == BUILT_IN_MEMCMP_EQ);
       if (target)
 	return target;
+      if (fcode == BUILT_IN_MEMCMP_EQ)
+	{
+	  tree newdecl = builtin_decl_explicit (BUILT_IN_MEMCMP);
+	  TREE_OPERAND (exp, 1) = build_fold_addr_expr (newdecl);
+	}
       break;
 
     case BUILT_IN_SETJMP:
Index: gcc/builtins.def
===================================================================
--- gcc/builtins.def	(revision 236909)
+++ gcc/builtins.def	(working copy)
@@ -864,6 +864,10 @@ DEF_BUILTIN_STUB (BUILT_IN_STACK_SAVE, "
 DEF_BUILTIN_STUB (BUILT_IN_STACK_RESTORE, "__builtin_stack_restore")
 DEF_BUILTIN_STUB (BUILT_IN_ALLOCA_WITH_ALIGN, "__builtin_alloca_with_align")
 
+/* An internal version of memcmp, used when the result is only tested for
+   equality with zero.  */
+DEF_BUILTIN_STUB (BUILT_IN_MEMCMP_EQ, "__builtin_memcmp_eq")
+
 /* Object size checking builtins.  */
 DEF_GCC_BUILTIN	       (BUILT_IN_OBJECT_SIZE, "object_size", BT_FN_SIZE_CONST_PTR_INT, ATTR_PURE_NOTHROW_LEAF_LIST)
 DEF_EXT_LIB_BUILTIN_CHKP (BUILT_IN_MEMCPY_CHK, "__memcpy_chk", BT_FN_PTR_PTR_CONST_PTR_SIZE_SIZE, ATTR_RET1_NOTHROW_NONNULL_LEAF)
Index: gcc/defaults.h
===================================================================
--- gcc/defaults.h	(revision 236909)
+++ gcc/defaults.h	(working copy)
@@ -1039,6 +1039,11 @@ see the files COPYING3 and COPYING.RUNTI
 #define STORE_MAX_PIECES  MIN (MOVE_MAX_PIECES, 2 * sizeof (HOST_WIDE_INT))
 #endif
 
+/* Likewise for block comparisons.  */
+#ifndef COMPARE_MAX_PIECES
+#define COMPARE_MAX_PIECES  MOVE_MAX_PIECES
+#endif
+
 #ifndef MAX_MOVE_MAX
 #define MAX_MOVE_MAX MOVE_MAX
 #endif
Index: gcc/doc/tm.texi
===================================================================
--- gcc/doc/tm.texi	(revision 236909)
+++ gcc/doc/tm.texi	(working copy)
@@ -6311,8 +6311,9 @@ Both @var{size} and @var{alignment} are
 units.
 
 The parameter @var{op} is one of: @code{CLEAR_BY_PIECES},
-@code{MOVE_BY_PIECES}, @code{SET_BY_PIECES}, @code{STORE_BY_PIECES}.
-These describe the type of memory operation under consideration.
+@code{MOVE_BY_PIECES}, @code{SET_BY_PIECES}, @code{STORE_BY_PIECES} or
+@code{COMPARE_BY_PIECES}.  These describe the type of memory operation
+under consideration.
 
 The parameter @var{speed_p} is true if the code is currently being
 optimized for speed rather than size.
@@ -6329,11 +6330,33 @@ in code size, for example where the numb
 move would be greater than that of a library call.
 @end deftypefn
 
+@deftypefn {Target Hook} int TARGET_COMPARE_BY_PIECES_BRANCH_RATIO (machine_mode @var{mode})
+When expanding a block comparison in MODE, gcc can try to reduce the
+number of branches at the expense of more memory operations.  This hook
+allows the target to override the default choice.  It should return the
+factor by which branches should be reduced over the plain expansion with
+one comparison per @var{mode}-sized piece.  A port can also prevent a
+particular mode from being used for block comparisons by returning a
+negative number from this hook.
+@end deftypefn
+
 @defmac MOVE_MAX_PIECES
 A C expression used by @code{move_by_pieces} to determine the largest unit
 a load or store used to copy memory is.  Defaults to @code{MOVE_MAX}.
 @end defmac
 
+@defmac STORE_MAX_PIECES
+A C expression used by @code{store_by_pieces} to determine the largest unit
+a store used to memory is.  Defaults to @code{MOVE_MAX_PIECES}, or two times
+the size of @code{HOST_WIDE_INT}, whichever is smaller.
+@end defmac
+
+@defmac COMPARE_MAX_PIECES
+A C expression used by @code{compare_by_pieces} to determine the largest unit
+a load or store used to compare memory is.  Defaults to
+@code{MOVE_MAX_PIECES}.
+@end defmac
+
 @defmac CLEAR_RATIO (@var{speed})
 The threshold of number of scalar move insns, @emph{below} which a sequence
 of insns should be generated to clear memory instead of a string clear insn
Index: gcc/doc/tm.texi.in
===================================================================
--- gcc/doc/tm.texi.in	(revision 236909)
+++ gcc/doc/tm.texi.in	(working copy)
@@ -4653,11 +4653,25 @@ If you don't define this, a reasonable d
 
 @hook TARGET_USE_BY_PIECES_INFRASTRUCTURE_P
 
+@hook TARGET_COMPARE_BY_PIECES_BRANCH_RATIO
+
 @defmac MOVE_MAX_PIECES
 A C expression used by @code{move_by_pieces} to determine the largest unit
 a load or store used to copy memory is.  Defaults to @code{MOVE_MAX}.
 @end defmac
 
+@defmac STORE_MAX_PIECES
+A C expression used by @code{store_by_pieces} to determine the largest unit
+a store used to memory is.  Defaults to @code{MOVE_MAX_PIECES}, or two times
+the size of @code{HOST_WIDE_INT}, whichever is smaller.
+@end defmac
+
+@defmac COMPARE_MAX_PIECES
+A C expression used by @code{compare_by_pieces} to determine the largest unit
+a load or store used to compare memory is.  Defaults to
+@code{MOVE_MAX_PIECES}.
+@end defmac
+
 @defmac CLEAR_RATIO (@var{speed})
 The threshold of number of scalar move insns, @emph{below} which a sequence
 of insns should be generated to clear memory instead of a string clear insn
Index: gcc/expr.c
===================================================================
--- gcc/expr.c	(revision 236909)
+++ gcc/expr.c	(working copy)
@@ -70,51 +70,12 @@ along with GCC; see the file COPYING3.
    the same indirect address eventually.  */
 int cse_not_expected;
 
-/* This structure is used by move_by_pieces to describe the move to
-   be performed.  */
-struct move_by_pieces_d
-{
-  rtx to;
-  rtx to_addr;
-  int autinc_to;
-  int explicit_inc_to;
-  rtx from;
-  rtx from_addr;
-  int autinc_from;
-  int explicit_inc_from;
-  unsigned HOST_WIDE_INT len;
-  HOST_WIDE_INT offset;
-  int reverse;
-};
-
-/* This structure is used by store_by_pieces to describe the clear to
-   be performed.  */
-
-struct store_by_pieces_d
-{
-  rtx to;
-  rtx to_addr;
-  int autinc_to;
-  int explicit_inc_to;
-  unsigned HOST_WIDE_INT len;
-  HOST_WIDE_INT offset;
-  rtx (*constfun) (void *, HOST_WIDE_INT, machine_mode);
-  void *constfundata;
-  int reverse;
-};
-
-static void move_by_pieces_1 (insn_gen_fn, machine_mode,
-			      struct move_by_pieces_d *);
 static bool block_move_libcall_safe_for_call_parm (void);
 static bool emit_block_move_via_movmem (rtx, rtx, rtx, unsigned, unsigned, HOST_WIDE_INT,
 					unsigned HOST_WIDE_INT, unsigned HOST_WIDE_INT,
 					unsigned HOST_WIDE_INT);
 static void emit_block_move_via_loop (rtx, rtx, rtx, unsigned);
-static rtx clear_by_pieces_1 (void *, HOST_WIDE_INT, machine_mode);
 static void clear_by_pieces (rtx, unsigned HOST_WIDE_INT, unsigned int);
-static void store_by_pieces_1 (struct store_by_pieces_d *, unsigned int);
-static void store_by_pieces_2 (insn_gen_fn, machine_mode,
-			       struct store_by_pieces_d *);
 static rtx_insn *compress_float_constant (rtx, rtx);
 static rtx get_subtarget (rtx);
 static void store_constructor_field (rtx, unsigned HOST_WIDE_INT,
@@ -767,276 +728,799 @@ widest_int_mode_for_size (unsigned int s
   return mode;
 }
 
+/* Determine whether an operation OP on LEN bytes with alignment ALIGN can
+   and should be performed piecewise.  */
+
+static bool
+can_do_by_pieces (unsigned HOST_WIDE_INT len, unsigned int align,
+		  enum by_pieces_operation op)
+{
+  return targetm.use_by_pieces_infrastructure_p (len, align, op,
+						 optimize_insn_for_speed_p ());
+}
+
 /* Determine whether the LEN bytes can be moved by using several move
    instructions.  Return nonzero if a call to move_by_pieces should
    succeed.  */
 
-int
-can_move_by_pieces (unsigned HOST_WIDE_INT len,
-		    unsigned int align)
+bool
+can_move_by_pieces (unsigned HOST_WIDE_INT len, unsigned int align)
 {
-  return targetm.use_by_pieces_infrastructure_p (len, align, MOVE_BY_PIECES,
-						 optimize_insn_for_speed_p ());
+  return can_do_by_pieces (len, align, MOVE_BY_PIECES);
 }
 
-/* Generate several move instructions to copy LEN bytes from block FROM to
-   block TO.  (These are MEM rtx's with BLKmode).
+/* Return number of insns required to perform operation OP by pieces
+   for L bytes.  ALIGN (in bits) is maximum alignment we can assume.  */
 
-   If PUSH_ROUNDING is defined and TO is NULL, emit_single_push_insn is
-   used to push FROM to the stack.
+unsigned HOST_WIDE_INT
+by_pieces_ninsns (unsigned HOST_WIDE_INT l, unsigned int align,
+		  unsigned int max_size, by_pieces_operation op)
+{
+  unsigned HOST_WIDE_INT n_insns = 0;
 
-   ALIGN is maximum stack alignment we can assume.
+  align = alignment_for_piecewise_move (MOVE_MAX_PIECES, align);
 
-   If ENDP is 0 return to, if ENDP is 1 return memory at the end ala
-   mempcpy, and if ENDP is 2 return memory the end minus one byte ala
-   stpcpy.  */
+  while (max_size > 1 && l > 0)
+    {
+      machine_mode mode;
+      enum insn_code icode;
 
-rtx
-move_by_pieces (rtx to, rtx from, unsigned HOST_WIDE_INT len,
-		unsigned int align, int endp)
+      mode = widest_int_mode_for_size (max_size);
+
+      if (mode == VOIDmode)
+	break;
+      unsigned int modesize = GET_MODE_SIZE (mode);
+
+      icode = optab_handler (mov_optab, mode);
+      if (icode != CODE_FOR_nothing && align >= GET_MODE_ALIGNMENT (mode))
+	{
+	  unsigned HOST_WIDE_INT n_pieces = l / modesize;
+	  l %= modesize;
+	  switch (op)
+	    {
+	    default:
+	      n_insns += n_pieces;
+	      break;
+
+	    case COMPARE_BY_PIECES:
+	      int batch = targetm.compare_by_pieces_branch_ratio (mode);
+	      int batch_ops = 4 * batch - 1;
+	      int full = n_pieces / batch;
+	      n_insns += full * batch_ops;
+	      if (n_pieces % batch != 0)
+		n_insns++;
+	      break;
+
+	    }
+	}
+      max_size = modesize;
+    }
+
+  gcc_assert (!l);
+  return n_insns;
+}
+
+/* Used when performing piecewise block operations, holds information
+   about one of the memory objects involved.  The member functions
+   can be used to generate code for loading from the object and
+   updating the address when iterating.  */
+
+class pieces_addr
 {
-  struct move_by_pieces_d data;
-  machine_mode to_addr_mode;
-  machine_mode from_addr_mode = get_address_mode (from);
-  rtx to_addr, from_addr = XEXP (from, 0);
-  unsigned int max_size = MOVE_MAX_PIECES + 1;
-  enum insn_code icode;
+  /* The object being referenced, a MEM.  Can be NULL_RTX to indicate
+     stack pushes.  */
+  rtx m_obj;
+  /* The address of the object.  Can differ from that seen in the
+     MEM rtx if we copied the address to a register.  */
+  rtx m_addr;
+  /* Nonzero if the address on the object has an autoincrement already,
+     signifies whether that was an increment or decrement.  */
+  signed char m_addr_inc;
+  /* Nonzero if we intend to use autoinc without the address already
+     having autoinc form.  We will insert add insns around each memory
+     reference, expecting later passes to form autoinc addressing modes.
+     The only supported options are predecrement and postincrement.  */
+  signed char m_explicit_inc;
+  /* True if we have either of the two possible cases of using
+     autoincrement.  */
+  bool m_auto;
+  /* True if this is an address to be used for load operations rather
+     than stores.  */
+  bool m_is_load;
 
-  align = MIN (to ? MEM_ALIGN (to) : align, MEM_ALIGN (from));
+  /* Optionally, a function to obtain constants for any given offset into
+     the objects, and data associated with it.  */
+  by_pieces_constfn m_constfn;
+  void *m_cfndata;
+public:
+  pieces_addr (rtx, bool, by_pieces_constfn, void *);
+  rtx adjust (machine_mode, HOST_WIDE_INT);
+  void increment_address (HOST_WIDE_INT);
+  void maybe_predec (HOST_WIDE_INT);
+  void maybe_postinc (HOST_WIDE_INT);
+  void decide_autoinc (machine_mode, bool, HOST_WIDE_INT);
+  int get_addr_inc ()
+  {
+    return m_addr_inc;
+  }
+};
 
-  data.offset = 0;
-  data.from_addr = from_addr;
-  if (to)
+/* Initialize a pieces_addr structure from an object OBJ.  IS_LOAD is
+   true if the operation to be performed on this object is a load
+   rather than a store.  For stores, OBJ can be NULL, in which case we
+   assume the operation is a stack push.  For loads, the optional
+   CONSTFN and its associated CFNDATA can be used in place of the
+   memory load.  */
+
+pieces_addr::pieces_addr (rtx obj, bool is_load, by_pieces_constfn constfn,
+			  void *cfndata)
+  : m_obj (obj), m_is_load (is_load), m_constfn (constfn), m_cfndata (cfndata)
+{
+  m_addr_inc = 0;
+  m_auto = false;
+  if (obj)
     {
-      to_addr_mode = get_address_mode (to);
-      to_addr = XEXP (to, 0);
-      data.to = to;
-      data.autinc_to
-	= (GET_CODE (to_addr) == PRE_INC || GET_CODE (to_addr) == PRE_DEC
-	   || GET_CODE (to_addr) == POST_INC || GET_CODE (to_addr) == POST_DEC);
-      data.reverse
-	= (GET_CODE (to_addr) == PRE_DEC || GET_CODE (to_addr) == POST_DEC);
+      rtx addr = XEXP (obj, 0);
+      rtx_code code = GET_CODE (addr);
+      m_addr = addr;
+      bool dec = code == PRE_DEC || code == POST_DEC;
+      bool inc = code == PRE_INC || code == POST_INC;
+      m_auto = inc || dec;
+      if (m_auto)
+	m_addr_inc = dec ? -1 : 1;
+
+      /* While we have always looked for these codes here, the code
+	 implementing the memory operation has never handled them.
+	 Support could be added later if necessary or beneficial.  */
+      gcc_assert (code != PRE_INC && code != POST_DEC);
     }
   else
     {
-      to_addr_mode = VOIDmode;
-      to_addr = NULL_RTX;
-      data.to = NULL_RTX;
-      data.autinc_to = 1;
-      if (STACK_GROWS_DOWNWARD)
-	data.reverse = 1;
+      m_addr = NULL_RTX;
+      if (!is_load)
+	{
+	  m_auto = true;
+	  if (STACK_GROWS_DOWNWARD)
+	    m_addr_inc = -1;
+	  else
+	    m_addr_inc = 1;
+	}
       else
-	data.reverse = 0;
+	gcc_assert (constfn != NULL);
     }
-  data.to_addr = to_addr;
-  data.from = from;
-  data.autinc_from
-    = (GET_CODE (from_addr) == PRE_INC || GET_CODE (from_addr) == PRE_DEC
-       || GET_CODE (from_addr) == POST_INC
-       || GET_CODE (from_addr) == POST_DEC);
+  m_explicit_inc = 0;
+  if (constfn)
+    gcc_assert (is_load);
+}
 
-  data.explicit_inc_from = 0;
-  data.explicit_inc_to = 0;
-  if (data.reverse) data.offset = len;
-  data.len = len;
+/* Decide whether to use autoinc for an address involved in a memory op.
+   MODE is the mode of the accesses, REVERSE is true if we've decided to
+   perform the operation starting from the end, and LEN is the length of
+   the operation.  Don't override an earlier decision to set m_auto.  */
+
+void
+pieces_addr::decide_autoinc (machine_mode ARG_UNUSED (mode), bool reverse,
+			     HOST_WIDE_INT len)
+{
+  if (m_auto || m_obj == NULL_RTX)
+    return;
+
+  bool use_predec = (m_is_load
+		     ? USE_LOAD_PRE_DECREMENT (mode)
+		     : USE_STORE_PRE_DECREMENT (mode));
+  bool use_postinc = (m_is_load
+		      ? USE_LOAD_POST_INCREMENT (mode)
+		      : USE_STORE_POST_INCREMENT (mode));
+  machine_mode addr_mode = get_address_mode (m_obj);
+
+  if (use_predec && reverse)
+    {
+      m_addr = copy_to_mode_reg (addr_mode,
+				 plus_constant (addr_mode,
+						m_addr, len));
+      m_auto = true;
+      m_explicit_inc = -1;
+    }
+  else if (use_postinc && !reverse)
+    {
+      m_addr = copy_to_mode_reg (addr_mode, m_addr);
+      m_auto = true;
+      m_explicit_inc = 1;
+    }
+  else if (CONSTANT_P (m_addr))
+    m_addr = copy_to_mode_reg (addr_mode, m_addr);
+}
+
+/* Adjust the address to refer to the data at OFFSET in MODE.  If we
+   are using autoincrement for this address, we don't add the offset,
+   but we still modify the MEM's properties.  */
+
+rtx
+pieces_addr::adjust (machine_mode mode, HOST_WIDE_INT offset)
+{
+  if (m_constfn)
+    return m_constfn (m_cfndata, offset, mode);
+  if (m_obj == NULL_RTX)
+    return NULL_RTX;
+  if (m_auto)
+    return adjust_automodify_address (m_obj, mode, m_addr, offset);
+  else
+    return adjust_address (m_obj, mode, offset);
+}
+
+/* Emit an add instruction to increment the address by SIZE.  */
+
+void
+pieces_addr::increment_address (HOST_WIDE_INT size)
+{
+  rtx amount = gen_int_mode (size, GET_MODE (m_addr));
+  emit_insn (gen_add2_insn (m_addr, amount));
+}
+
+/* If we are supposed to decrement the address after each access, emit code
+   to do so now.  Increment by SIZE (which has should have the correct sign
+   already).  */
+
+void
+pieces_addr::maybe_predec (HOST_WIDE_INT size)
+{
+  if (m_explicit_inc >= 0)
+    return;
+  gcc_assert (HAVE_PRE_DECREMENT);
+  increment_address (size);
+}
+
+/* If we are supposed to decrement the address after each access, emit code
+   to do so now.  Increment by SIZE.  */
+
+void
+pieces_addr::maybe_postinc (HOST_WIDE_INT size)
+{
+  if (m_explicit_inc <= 0)
+    return;
+  gcc_assert (HAVE_POST_INCREMENT);
+  increment_address (size);
+}
+
+/* This structure is used by do_op_by_pieces to describe the operation
+   to be performed.  */
+
+class op_by_pieces_d
+{
+ protected:
+  pieces_addr m_to, m_from;
+  unsigned HOST_WIDE_INT m_len;
+  HOST_WIDE_INT m_offset;
+  unsigned int m_align;
+  unsigned int m_max_size;
+  bool m_reverse;
+
+  /* Virtual functions, overriden by derived classes for the specific
+     operation.  */
+  virtual void generate (rtx, rtx, machine_mode) = 0;
+  virtual bool prepare_mode (machine_mode, unsigned int) = 0;
+  virtual void finish_mode (machine_mode)
+  {
+  }
+
+ public:
+  op_by_pieces_d (rtx, bool, rtx, bool, by_pieces_constfn, void *,
+		  unsigned HOST_WIDE_INT, unsigned int);
+  void run ();
+};
+
+/* The constructor for an op_by_pieces_d structure.  We require two
+   objects named TO and FROM, which are identified as loads or stores
+   by TO_LOAD and FROM_LOAD.  If FROM is a load, the optional FROM_CFN
+   and its associated FROM_CFN_DATA can be used to replace loads with
+   constant values.  LEN describes the length of the operation.  */
+
+op_by_pieces_d::op_by_pieces_d (rtx to, bool to_load,
+				rtx from, bool from_load,
+				by_pieces_constfn from_cfn,
+				void *from_cfn_data,
+				unsigned HOST_WIDE_INT len,
+				unsigned int align)
+  : m_to (to, to_load, NULL, NULL),
+    m_from (from, from_load, from_cfn, from_cfn_data),
+    m_len (len), m_max_size (MOVE_MAX_PIECES + 1)
+{
+  int toi = m_to.get_addr_inc ();
+  int fromi = m_from.get_addr_inc ();
+  if (toi >= 0 && fromi >= 0)
+    m_reverse = false;
+  else if (toi <= 0 && fromi <= 0)
+    m_reverse = true;
+  else
+    gcc_unreachable ();
+
+  m_offset = m_reverse ? len : 0;
+  align = MIN (to ? MEM_ALIGN (to) : align,
+	       from ? MEM_ALIGN (from) : align);
 
   /* If copying requires more than two move insns,
      copy addresses to registers (to make displacements shorter)
      and use post-increment if available.  */
-  if (!(data.autinc_from && data.autinc_to)
-      && move_by_pieces_ninsns (len, align, max_size) > 2)
+  if (by_pieces_ninsns (len, align, m_max_size, MOVE_BY_PIECES) > 2)
     {
-      /* Find the mode of the largest move...
-	 MODE might not be used depending on the definitions of the
-	 USE_* macros below.  */
-      machine_mode mode ATTRIBUTE_UNUSED
-	= widest_int_mode_for_size (max_size);
+      /* Find the mode of the largest comparison.  */
+      machine_mode mode = widest_int_mode_for_size (m_max_size);
 
-      if (USE_LOAD_PRE_DECREMENT (mode) && data.reverse && ! data.autinc_from)
-	{
-	  data.from_addr = copy_to_mode_reg (from_addr_mode,
-					     plus_constant (from_addr_mode,
-							    from_addr, len));
-	  data.autinc_from = 1;
-	  data.explicit_inc_from = -1;
-	}
-      if (USE_LOAD_POST_INCREMENT (mode) && ! data.autinc_from)
-	{
-	  data.from_addr = copy_to_mode_reg (from_addr_mode, from_addr);
-	  data.autinc_from = 1;
-	  data.explicit_inc_from = 1;
-	}
-      if (!data.autinc_from && CONSTANT_P (from_addr))
-	data.from_addr = copy_to_mode_reg (from_addr_mode, from_addr);
-      if (USE_STORE_PRE_DECREMENT (mode) && data.reverse && ! data.autinc_to)
-	{
-	  data.to_addr = copy_to_mode_reg (to_addr_mode,
-					   plus_constant (to_addr_mode,
-							  to_addr, len));
-	  data.autinc_to = 1;
-	  data.explicit_inc_to = -1;
-	}
-      if (USE_STORE_POST_INCREMENT (mode) && ! data.reverse && ! data.autinc_to)
-	{
-	  data.to_addr = copy_to_mode_reg (to_addr_mode, to_addr);
-	  data.autinc_to = 1;
-	  data.explicit_inc_to = 1;
-	}
-      if (!data.autinc_to && CONSTANT_P (to_addr))
-	data.to_addr = copy_to_mode_reg (to_addr_mode, to_addr);
+      m_from.decide_autoinc (mode, m_reverse, len);
+      m_to.decide_autoinc (mode, m_reverse, len);
     }
 
   align = alignment_for_piecewise_move (MOVE_MAX_PIECES, align);
+  m_align = align;
+}
 
-  /* First move what we can in the largest integer mode, then go to
-     successively smaller modes.  */
+/* This function contains the main loop used for expanding a block
+   operation.  First move what we can in the largest integer mode,
+   then go to successively smaller modes.  For every access, call
+   GENFUN with the two operands and the EXTRA_DATA.  */
 
-  while (max_size > 1 && data.len > 0)
+void
+op_by_pieces_d::run ()
+{
+  while (m_max_size > 1 && m_len > 0)
     {
-      machine_mode mode = widest_int_mode_for_size (max_size);
+      machine_mode mode = widest_int_mode_for_size (m_max_size);
 
       if (mode == VOIDmode)
 	break;
 
-      icode = optab_handler (mov_optab, mode);
-      if (icode != CODE_FOR_nothing && align >= GET_MODE_ALIGNMENT (mode))
-	move_by_pieces_1 (GEN_FCN (icode), mode, &data);
+      if (prepare_mode (mode, m_align))
+	{
+	  unsigned int size = GET_MODE_SIZE (mode);
+	  rtx to1 = NULL_RTX, from1;
 
-      max_size = GET_MODE_SIZE (mode);
+	  while (m_len >= size)
+	    {
+	      if (m_reverse)
+		m_offset -= size;
+
+	      to1 = m_to.adjust (mode, m_offset);
+	      from1 = m_from.adjust (mode, m_offset);
+
+	      m_to.maybe_predec (-(HOST_WIDE_INT)size);
+	      m_from.maybe_predec (-(HOST_WIDE_INT)size);
+
+	      generate (to1, from1, mode);
+
+	      m_to.maybe_postinc (size);
+	      m_from.maybe_postinc (size);
+
+	      if (!m_reverse)
+		m_offset += size;
+
+	      m_len -= size;
+	    }
+
+	  finish_mode (mode);
+	}
+
+      m_max_size = GET_MODE_SIZE (mode);
     }
 
   /* The code above should have handled everything.  */
-  gcc_assert (!data.len);
+  gcc_assert (!m_len);
+}
+
+/* Derived class from op_by_pieces_d, providing support for block move
+   operations.  */
+
+class move_by_pieces_d : public op_by_pieces_d
+{
+  insn_gen_fn m_gen_fun;
+  void generate (rtx, rtx, machine_mode);
+  bool prepare_mode (machine_mode, unsigned int);
+
+ public:
+  move_by_pieces_d (rtx to, rtx from, unsigned HOST_WIDE_INT len,
+		    unsigned int align)
+    : op_by_pieces_d (to, false, from, true, NULL, NULL, len, align)
+  {
+  }
+  rtx finish_endp (int);
+};
+
+/* Return true if MODE can be used for a set of copies, given an
+   alignment ALIGN.  Prepare whatever data is necessary for later
+   calls to generate.  */
+
+bool
+move_by_pieces_d::prepare_mode (machine_mode mode, unsigned int align)
+{
+  insn_code icode = optab_handler (mov_optab, mode);
+  m_gen_fun = GEN_FCN (icode);
+  return icode != CODE_FOR_nothing && align >= GET_MODE_ALIGNMENT (mode);
+}
+
+/* A callback used when iterating for a compare_by_pieces_operation.
+   OP0 and OP1 are the values that have been loaded and should be
+   compared in MODE.  If OP0 is NULL, this means we should generate a
+   push; otherwise EXTRA_DATA holds a pointer to a pointer to the insn
+   gen function that should be used to generate the mode.  */
+
+void
+move_by_pieces_d::generate (rtx op0, rtx op1, machine_mode mode)
+{
+#ifdef PUSH_ROUNDING
+  if (op0 == NULL_RTX)
+    {
+      emit_single_push_insn (mode, op1, NULL);
+      return;
+    }
+#endif
+  emit_insn (m_gen_fun (op0, op1));
+}
+
+/* Perform the final adjustment at the end of a string to obtain the
+   correct return value for the block operation.  If ENDP is 1 return
+   memory at the end ala mempcpy, and if ENDP is 2 return memory the
+   end minus one byte ala stpcpy.  */
+
+rtx
+move_by_pieces_d::finish_endp (int endp)
+{
+  gcc_assert (!m_reverse);
+  if (endp == 2)
+    {
+      m_to.maybe_postinc (-1);
+      --m_offset;
+    }
+  return m_to.adjust (QImode, m_offset);
+}
+
+/* Generate several move instructions to copy LEN bytes from block FROM to
+   block TO.  (These are MEM rtx's with BLKmode).
+
+   If PUSH_ROUNDING is defined and TO is NULL, emit_single_push_insn is
+   used to push FROM to the stack.
+
+   ALIGN is maximum stack alignment we can assume.
+
+   If ENDP is 0 return to, if ENDP is 1 return memory at the end ala
+   mempcpy, and if ENDP is 2 return memory the end minus one byte ala
+   stpcpy.  */
+
+rtx
+move_by_pieces (rtx to, rtx from, unsigned HOST_WIDE_INT len,
+		unsigned int align, int endp)
+{
+#ifndef PUSH_ROUNDING
+  if (to == NULL)
+    gcc_unreachable ();
+#endif
+
+  move_by_pieces_d data (to, from, len, align);
+
+  data.run ();
 
   if (endp)
+    return data.finish_endp (endp);
+  else
+    return to;
+}
+
+/* Derived class from op_by_pieces_d, providing support for block move
+   operations.  */
+
+class store_by_pieces_d : public op_by_pieces_d
+{
+  insn_gen_fn m_gen_fun;
+  void generate (rtx, rtx, machine_mode);
+  bool prepare_mode (machine_mode, unsigned int);
+
+ public:
+  store_by_pieces_d (rtx to, by_pieces_constfn cfn, void *cfn_data,
+		     unsigned HOST_WIDE_INT len, unsigned int align)
+    : op_by_pieces_d (to, false, NULL_RTX, true, cfn, cfn_data, len, align)
+  {
+  }
+  rtx finish_endp (int);
+};
+
+/* Return true if MODE can be used for a set of stores, given an
+   alignment ALIGN.  Prepare whatever data is necessary for later
+   calls to generate.  */
+
+bool
+store_by_pieces_d::prepare_mode (machine_mode mode, unsigned int align)
+{
+  insn_code icode = optab_handler (mov_optab, mode);
+  m_gen_fun = GEN_FCN (icode);
+  return icode != CODE_FOR_nothing && align >= GET_MODE_ALIGNMENT (mode);
+}
+
+/* A callback used when iterating for a store_by_pieces_operation.
+   OP0 and OP1 are the values that have been loaded and should be
+   compared in MODE.  If OP0 is NULL, this means we should generate a
+   push; otherwise EXTRA_DATA holds a pointer to a pointer to the insn
+   gen function that should be used to generate the mode.  */
+
+void
+store_by_pieces_d::generate (rtx op0, rtx op1, machine_mode)
+{
+  emit_insn (m_gen_fun (op0, op1));
+}
+
+/* Perform the final adjustment at the end of a string to obtain the
+   correct return value for the block operation.  If ENDP is 1 return
+   memory at the end ala mempcpy, and if ENDP is 2 return memory the
+   end minus one byte ala stpcpy.  */
+
+rtx
+store_by_pieces_d::finish_endp (int endp)
+{
+  gcc_assert (!m_reverse);
+  if (endp == 2)
     {
-      rtx to1;
+      m_to.maybe_postinc (-1);
+      --m_offset;
+    }
+  return m_to.adjust (QImode, m_offset);
+}
 
-      gcc_assert (!data.reverse);
-      if (data.autinc_to)
+/* Determine whether the LEN bytes generated by CONSTFUN can be
+   stored to memory using several move instructions.  CONSTFUNDATA is
+   a pointer which will be passed as argument in every CONSTFUN call.
+   ALIGN is maximum alignment we can assume.  MEMSETP is true if this is
+   a memset operation and false if it's a copy of a constant string.
+   Return nonzero if a call to store_by_pieces should succeed.  */
+
+int
+can_store_by_pieces (unsigned HOST_WIDE_INT len,
+		     rtx (*constfun) (void *, HOST_WIDE_INT, machine_mode),
+		     void *constfundata, unsigned int align, bool memsetp)
+{
+  unsigned HOST_WIDE_INT l;
+  unsigned int max_size;
+  HOST_WIDE_INT offset = 0;
+  machine_mode mode;
+  enum insn_code icode;
+  int reverse;
+  /* cst is set but not used if LEGITIMATE_CONSTANT doesn't use it.  */
+  rtx cst ATTRIBUTE_UNUSED;
+
+  if (len == 0)
+    return 1;
+
+  if (!targetm.use_by_pieces_infrastructure_p (len, align,
+					       memsetp
+						 ? SET_BY_PIECES
+						 : STORE_BY_PIECES,
+					       optimize_insn_for_speed_p ()))
+    return 0;
+
+  align = alignment_for_piecewise_move (STORE_MAX_PIECES, align);
+
+  /* We would first store what we can in the largest integer mode, then go to
+     successively smaller modes.  */
+
+  for (reverse = 0;
+       reverse <= (HAVE_PRE_DECREMENT || HAVE_POST_DECREMENT);
+       reverse++)
+    {
+      l = len;
+      max_size = STORE_MAX_PIECES + 1;
+      while (max_size > 1 && l > 0)
 	{
-	  if (endp == 2)
+	  mode = widest_int_mode_for_size (max_size);
+
+	  if (mode == VOIDmode)
+	    break;
+
+	  icode = optab_handler (mov_optab, mode);
+	  if (icode != CODE_FOR_nothing
+	      && align >= GET_MODE_ALIGNMENT (mode))
 	    {
-	      if (HAVE_POST_INCREMENT && data.explicit_inc_to > 0)
-		emit_insn (gen_add2_insn (data.to_addr, constm1_rtx));
-	      else
-		data.to_addr = copy_to_mode_reg (to_addr_mode,
-						 plus_constant (to_addr_mode,
-								data.to_addr,
-								-1));
+	      unsigned int size = GET_MODE_SIZE (mode);
+
+	      while (l >= size)
+		{
+		  if (reverse)
+		    offset -= size;
+
+		  cst = (*constfun) (constfundata, offset, mode);
+		  if (!targetm.legitimate_constant_p (mode, cst))
+		    return 0;
+
+		  if (!reverse)
+		    offset += size;
+
+		  l -= size;
+		}
 	    }
-	  to1 = adjust_automodify_address (data.to, QImode, data.to_addr,
-					   data.offset);
-	}
-      else
-	{
-	  if (endp == 2)
-	    --data.offset;
-	  to1 = adjust_address (data.to, QImode, data.offset);
+
+	  max_size = GET_MODE_SIZE (mode);
 	}
-      return to1;
+
+      /* The code above should have handled everything.  */
+      gcc_assert (!l);
+    }
+
+  return 1;
+}
+
+/* Generate several move instructions to store LEN bytes generated by
+   CONSTFUN to block TO.  (A MEM rtx with BLKmode).  CONSTFUNDATA is a
+   pointer which will be passed as argument in every CONSTFUN call.
+   ALIGN is maximum alignment we can assume.  MEMSETP is true if this is
+   a memset operation and false if it's a copy of a constant string.
+   If ENDP is 0 return to, if ENDP is 1 return memory at the end ala
+   mempcpy, and if ENDP is 2 return memory the end minus one byte ala
+   stpcpy.  */
+
+rtx
+store_by_pieces (rtx to, unsigned HOST_WIDE_INT len,
+		 rtx (*constfun) (void *, HOST_WIDE_INT, machine_mode),
+		 void *constfundata, unsigned int align, bool memsetp, int endp)
+{
+  if (len == 0)
+    {
+      gcc_assert (endp != 2);
+      return to;
     }
+
+  gcc_assert (targetm.use_by_pieces_infrastructure_p
+		(len, align,
+		 memsetp ? SET_BY_PIECES : STORE_BY_PIECES,
+		 optimize_insn_for_speed_p ()));
+
+  store_by_pieces_d data (to, constfun, constfundata, len, align);
+  data.run ();
+
+  if (endp)
+    return data.finish_endp (endp);
   else
-    return data.to;
+    return to;
 }
 
-/* Return number of insns required to move L bytes by pieces.
-   ALIGN (in bits) is maximum alignment we can assume.  */
+/* Callback routine for clear_by_pieces.
+   Return const0_rtx unconditionally.  */
 
-unsigned HOST_WIDE_INT
-move_by_pieces_ninsns (unsigned HOST_WIDE_INT l, unsigned int align,
-		       unsigned int max_size)
+static rtx
+clear_by_pieces_1 (void *, HOST_WIDE_INT, machine_mode)
 {
-  unsigned HOST_WIDE_INT n_insns = 0;
+  return const0_rtx;
+}
 
-  align = alignment_for_piecewise_move (MOVE_MAX_PIECES, align);
+/* Generate several move instructions to clear LEN bytes of block TO.  (A MEM
+   rtx with BLKmode).  ALIGN is maximum alignment we can assume.  */
 
-  while (max_size > 1 && l > 0)
-    {
-      machine_mode mode;
-      enum insn_code icode;
+static void
+clear_by_pieces (rtx to, unsigned HOST_WIDE_INT len, unsigned int align)
+{
+  if (len == 0)
+    return;
 
-      mode = widest_int_mode_for_size (max_size);
+  store_by_pieces_d data (to, clear_by_pieces_1, NULL, len, align);
+  data.run ();
+}
 
-      if (mode == VOIDmode)
-	break;
+/* Context used by compare_by_pieces_genfn.  It stores the fail label
+   to jump to in case of miscomparison, and for branch ratios greater than 1,
+   it stores an accumulator and the current and maximum counts before
+   emitting another branch.  */
 
-      icode = optab_handler (mov_optab, mode);
-      if (icode != CODE_FOR_nothing && align >= GET_MODE_ALIGNMENT (mode))
-	n_insns += l / GET_MODE_SIZE (mode), l %= GET_MODE_SIZE (mode);
+class compare_by_pieces_d : public op_by_pieces_d
+{
+  rtx_code_label *m_fail_label;
+  rtx m_accumulator;
+  int m_count, m_batch;
 
-      max_size = GET_MODE_SIZE (mode);
+  void generate (rtx, rtx, machine_mode);
+  bool prepare_mode (machine_mode, unsigned int);
+  void finish_mode (machine_mode);
+ public:
+  compare_by_pieces_d (rtx op0, rtx op1, by_pieces_constfn op1_cfn,
+		       void *op1_cfn_data, HOST_WIDE_INT len, int align,
+		       rtx_code_label *fail_label)
+    : op_by_pieces_d (op0, true, op1, true, op1_cfn, op1_cfn_data, len, align)
+  {
+    m_fail_label = fail_label;
+  }
+};
+
+/* A callback used when iterating for a compare_by_pieces_operation.
+   OP0 and OP1 are the values that have been loaded and should be
+   compared in MODE.  DATA holds a pointer to the compare_by_pieces_data
+   context structure.  */
+
+void
+compare_by_pieces_d::generate (rtx op0, rtx op1, machine_mode mode)
+{
+  if (m_batch > 1)
+    {
+      rtx temp = expand_binop (mode, sub_optab, op0, op1, NULL_RTX,
+			       true, OPTAB_LIB_WIDEN);
+      if (m_count != 0)
+	temp = expand_binop (mode, ior_optab, m_accumulator, temp, temp,
+			     true, OPTAB_LIB_WIDEN);
+      m_accumulator = temp;
+
+      if (++m_count < m_batch)
+	return;
+
+      m_count = 0;
+      op0 = m_accumulator;
+      op1 = const0_rtx;
+      m_accumulator = NULL_RTX;
     }
+  do_compare_rtx_and_jump (op0, op1, NE, true, mode, NULL_RTX, NULL,
+			   m_fail_label, -1);
+}
 
-  gcc_assert (!l);
-  return n_insns;
+/* Return true if MODE can be used for a set of moves and comparisons,
+   given an alignment ALIGN.  Prepare whatever data is necessary for
+   later calls to generate.  */
+
+bool
+compare_by_pieces_d::prepare_mode (machine_mode mode, unsigned int align)
+{
+  insn_code icode = optab_handler (mov_optab, mode);
+  if (icode == CODE_FOR_nothing
+      || align < GET_MODE_ALIGNMENT (mode)
+      || !can_compare_p (EQ, mode, ccp_jump))
+    return false;
+  m_batch = targetm.compare_by_pieces_branch_ratio (mode);
+  if (m_batch < 0)
+    return false;
+  m_accumulator = NULL_RTX;
+  m_count = 0;
+  return true;
 }
 
-/* Subroutine of move_by_pieces.  Move as many bytes as appropriate
-   with move instructions for mode MODE.  GENFUN is the gen_... function
-   to make a move insn for that mode.  DATA has all the other info.  */
+/* Called after expanding a series of comparisons in MODE.  If we have
+   accumulated results for which we haven't emitted a branch yet, do
+   so now.  */
 
-static void
-move_by_pieces_1 (insn_gen_fn genfun, machine_mode mode,
-		  struct move_by_pieces_d *data)
+void
+compare_by_pieces_d::finish_mode (machine_mode mode)
 {
-  unsigned int size = GET_MODE_SIZE (mode);
-  rtx to1 = NULL_RTX, from1;
+  if (m_accumulator != NULL_RTX)
+    do_compare_rtx_and_jump (m_accumulator, const0_rtx, NE, true, mode,
+			     NULL_RTX, NULL, m_fail_label, -1);
+}
 
-  while (data->len >= size)
-    {
-      if (data->reverse)
-	data->offset -= size;
+/* Generate several move instructions to compare LEN bytes from blocks
+   ARG0 and ARG1.  (These are MEM rtx's with BLKmode).
 
-      if (data->to)
-	{
-	  if (data->autinc_to)
-	    to1 = adjust_automodify_address (data->to, mode, data->to_addr,
-					     data->offset);
-	  else
-	    to1 = adjust_address (data->to, mode, data->offset);
-	}
+   If PUSH_ROUNDING is defined and TO is NULL, emit_single_push_insn is
+   used to push FROM to the stack.
 
-      if (data->autinc_from)
-	from1 = adjust_automodify_address (data->from, mode, data->from_addr,
-					   data->offset);
-      else
-	from1 = adjust_address (data->from, mode, data->offset);
+   ALIGN is maximum stack alignment we can assume.
 
-      if (HAVE_PRE_DECREMENT && data->explicit_inc_to < 0)
-	emit_insn (gen_add2_insn (data->to_addr,
-				  gen_int_mode (-(HOST_WIDE_INT) size,
-						GET_MODE (data->to_addr))));
-      if (HAVE_PRE_DECREMENT && data->explicit_inc_from < 0)
-	emit_insn (gen_add2_insn (data->from_addr,
-				  gen_int_mode (-(HOST_WIDE_INT) size,
-						GET_MODE (data->from_addr))));
+   Optionally, the caller can pass a constfn and associated data in A1_CFN
+   and A1_CFN_DATA. describing that the second operand being compared is a
+   known constant and how to obtain its data.  */
 
-      if (data->to)
-	emit_insn ((*genfun) (to1, from1));
-      else
-	{
-#ifdef PUSH_ROUNDING
-	  emit_single_push_insn (mode, from1, NULL);
-#else
-	  gcc_unreachable ();
-#endif
-	}
+static rtx
+compare_by_pieces (rtx arg0, rtx arg1, unsigned HOST_WIDE_INT len,
+		   rtx target, unsigned int align,
+		   by_pieces_constfn a1_cfn, void *a1_cfn_data)
+{
+  rtx_code_label *fail_label = gen_label_rtx ();
+  rtx_code_label *end_label = gen_label_rtx ();
 
-      if (HAVE_POST_INCREMENT && data->explicit_inc_to > 0)
-	emit_insn (gen_add2_insn (data->to_addr,
-				  gen_int_mode (size,
-						GET_MODE (data->to_addr))));
-      if (HAVE_POST_INCREMENT && data->explicit_inc_from > 0)
-	emit_insn (gen_add2_insn (data->from_addr,
-				  gen_int_mode (size,
-						GET_MODE (data->from_addr))));
+  if (target == NULL_RTX
+      || !REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
+    target = gen_reg_rtx (TYPE_MODE (integer_type_node));
 
-      if (! data->reverse)
-	data->offset += size;
+  compare_by_pieces_d data (arg0, arg1, a1_cfn, a1_cfn_data, len, align,
+			    fail_label);
 
-      data->len -= size;
-    }
+  data.run ();
+
+  emit_move_insn (target, const0_rtx);
+  emit_jump (end_label);
+  emit_barrier ();
+  emit_label (fail_label);
+  emit_move_insn (target, const1_rtx);
+  emit_label (end_label);
+
+  return target;
 }
 
 /* Emit code to move a block Y to a block X.  This may be done with
@@ -1066,8 +1550,7 @@ emit_block_move_hints (rtx x, rtx y, rtx
   unsigned int align;
 
   gcc_assert (size);
-  if (CONST_INT_P (size)
-      && INTVAL (size) == 0)
+  if (CONST_INT_P (size) && INTVAL (size) == 0)
     return 0;
 
   switch (method)
@@ -1394,6 +1877,99 @@ emit_block_op_via_libcall (enum built_in
 
   return expand_call (call_expr, NULL_RTX, false);
 }
+
+/* Try to expand cmpstrn or cmpmem operation ICODE with the given operands.
+   ARG3_TYPE is the type of ARG3_RTX.  Return the result rtx on success,
+   otherwise return null.  */
+
+rtx
+expand_cmpstrn_or_cmpmem (insn_code icode, rtx target, rtx arg1_rtx,
+			  rtx arg2_rtx, tree arg3_type, rtx arg3_rtx,
+			  HOST_WIDE_INT align)
+{
+  machine_mode insn_mode = insn_data[icode].operand[0].mode;
+
+  if (target && (!REG_P (target) || HARD_REGISTER_P (target)))
+    target = NULL_RTX;
+
+  struct expand_operand ops[5];
+  create_output_operand (&ops[0], target, insn_mode);
+  create_fixed_operand (&ops[1], arg1_rtx);
+  create_fixed_operand (&ops[2], arg2_rtx);
+  create_convert_operand_from (&ops[3], arg3_rtx, TYPE_MODE (arg3_type),
+			       TYPE_UNSIGNED (arg3_type));
+  create_integer_operand (&ops[4], align);
+  if (maybe_expand_insn (icode, 5, ops))
+    return ops[0].value;
+  return NULL_RTX;
+}
+
+/* Expand a block compare between X and Y with length LEN using the
+   cmpmem optab, placing the result in TARGET.  LEN_TYPE is the type
+   of the expression that was used to calculate the length.  ALIGN
+   gives the known minimum common alignment.  */
+
+static rtx
+emit_block_cmp_via_cmpmem (rtx x, rtx y, rtx len, tree len_type, rtx target,
+			   unsigned align)
+{
+  /* Note: The cmpstrnsi pattern, if it exists, is not suitable for
+     implementing memcmp because it will stop if it encounters two
+     zero bytes.  */
+  insn_code icode = direct_optab_handler (cmpmem_optab, SImode);
+
+  if (icode == CODE_FOR_nothing)
+    return NULL_RTX;
+
+  return expand_cmpstrn_or_cmpmem (icode, target, x, y, len_type, len, align);
+}
+
+/* Emit code to compare a block Y to a block X.  This may be done with
+   string-compare instructions, with multiple scalar instructions,
+   or with a library call.
+
+   Both X and Y must be MEM rtx's.  LEN is an rtx that says how long
+   they are.  LEN_TYPE is the type of the expression that was used to
+   calculate it.
+
+   If EQUALITY_ONLY is true, it means we don't have to return the tri-state
+   value of a normal memcmp call, instead we can just compare for equality.
+   If FORCE_LIBCALL is true, we should emit a call to memcmp rather than
+   returning NULL_RTX.
+
+   Optionally, the caller can pass a constfn and associated data in Y_CFN
+   and Y_CFN_DATA. describing that the second operand being compared is a
+   known constant and how to obtain its data.
+   Return the result of the comparison, or NULL_RTX if we failed to
+   perform the operation.  */
+
+rtx
+emit_block_cmp_hints (rtx x, rtx y, rtx len, tree len_type, rtx target,
+		      bool equality_only, by_pieces_constfn y_cfn,
+		      void *y_cfndata)
+{
+  rtx result = 0;
+
+  if (CONST_INT_P (len) && INTVAL (len) == 0)
+    return const0_rtx;
+
+  gcc_assert (MEM_P (x) && MEM_P (y));
+  unsigned int align = MIN (MEM_ALIGN (x), MEM_ALIGN (y));
+  gcc_assert (align >= BITS_PER_UNIT);
+
+  x = adjust_address (x, BLKmode, 0);
+  y = adjust_address (y, BLKmode, 0);
+
+  if (equality_only
+      && CONST_INT_P (len)
+      && can_do_by_pieces (INTVAL (len), align, COMPARE_BY_PIECES))
+    result = compare_by_pieces (x, y, INTVAL (len), target, align,
+				y_cfn, y_cfndata);
+  else
+    result = emit_block_cmp_via_cmpmem (x, y, len, len_type, target, align);
+
+  return result;
+}
 
 /* Copy all or part of a value X into registers starting at REGNO.
    The number of registers to be filled is NREGS.  */
@@ -2330,308 +2906,6 @@ get_def_for_expr_class (tree name, enum
   return def_stmt;
 }
 
-
-/* Determine whether the LEN bytes generated by CONSTFUN can be
-   stored to memory using several move instructions.  CONSTFUNDATA is
-   a pointer which will be passed as argument in every CONSTFUN call.
-   ALIGN is maximum alignment we can assume.  MEMSETP is true if this is
-   a memset operation and false if it's a copy of a constant string.
-   Return nonzero if a call to store_by_pieces should succeed.  */
-
-int
-can_store_by_pieces (unsigned HOST_WIDE_INT len,
-		     rtx (*constfun) (void *, HOST_WIDE_INT, machine_mode),
-		     void *constfundata, unsigned int align, bool memsetp)
-{
-  unsigned HOST_WIDE_INT l;
-  unsigned int max_size;
-  HOST_WIDE_INT offset = 0;
-  machine_mode mode;
-  enum insn_code icode;
-  int reverse;
-  /* cst is set but not used if LEGITIMATE_CONSTANT doesn't use it.  */
-  rtx cst ATTRIBUTE_UNUSED;
-
-  if (len == 0)
-    return 1;
-
-  if (!targetm.use_by_pieces_infrastructure_p (len, align,
-					       memsetp
-						 ? SET_BY_PIECES
-						 : STORE_BY_PIECES,
-					       optimize_insn_for_speed_p ()))
-    return 0;
-
-  align = alignment_for_piecewise_move (STORE_MAX_PIECES, align);
-
-  /* We would first store what we can in the largest integer mode, then go to
-     successively smaller modes.  */
-
-  for (reverse = 0;
-       reverse <= (HAVE_PRE_DECREMENT || HAVE_POST_DECREMENT);
-       reverse++)
-    {
-      l = len;
-      max_size = STORE_MAX_PIECES + 1;
-      while (max_size > 1 && l > 0)
-	{
-	  mode = widest_int_mode_for_size (max_size);
-
-	  if (mode == VOIDmode)
-	    break;
-
-	  icode = optab_handler (mov_optab, mode);
-	  if (icode != CODE_FOR_nothing
-	      && align >= GET_MODE_ALIGNMENT (mode))
-	    {
-	      unsigned int size = GET_MODE_SIZE (mode);
-
-	      while (l >= size)
-		{
-		  if (reverse)
-		    offset -= size;
-
-		  cst = (*constfun) (constfundata, offset, mode);
-		  if (!targetm.legitimate_constant_p (mode, cst))
-		    return 0;
-
-		  if (!reverse)
-		    offset += size;
-
-		  l -= size;
-		}
-	    }
-
-	  max_size = GET_MODE_SIZE (mode);
-	}
-
-      /* The code above should have handled everything.  */
-      gcc_assert (!l);
-    }
-
-  return 1;
-}
-
-/* Generate several move instructions to store LEN bytes generated by
-   CONSTFUN to block TO.  (A MEM rtx with BLKmode).  CONSTFUNDATA is a
-   pointer which will be passed as argument in every CONSTFUN call.
-   ALIGN is maximum alignment we can assume.  MEMSETP is true if this is
-   a memset operation and false if it's a copy of a constant string.
-   If ENDP is 0 return to, if ENDP is 1 return memory at the end ala
-   mempcpy, and if ENDP is 2 return memory the end minus one byte ala
-   stpcpy.  */
-
-rtx
-store_by_pieces (rtx to, unsigned HOST_WIDE_INT len,
-		 rtx (*constfun) (void *, HOST_WIDE_INT, machine_mode),
-		 void *constfundata, unsigned int align, bool memsetp, int endp)
-{
-  machine_mode to_addr_mode = get_address_mode (to);
-  struct store_by_pieces_d data;
-
-  if (len == 0)
-    {
-      gcc_assert (endp != 2);
-      return to;
-    }
-
-  gcc_assert (targetm.use_by_pieces_infrastructure_p
-		(len, align,
-		 memsetp
-		   ? SET_BY_PIECES
-		   : STORE_BY_PIECES,
-		 optimize_insn_for_speed_p ()));
-
-  data.constfun = constfun;
-  data.constfundata = constfundata;
-  data.len = len;
-  data.to = to;
-  store_by_pieces_1 (&data, align);
-  if (endp)
-    {
-      rtx to1;
-
-      gcc_assert (!data.reverse);
-      if (data.autinc_to)
-	{
-	  if (endp == 2)
-	    {
-	      if (HAVE_POST_INCREMENT && data.explicit_inc_to > 0)
-		emit_insn (gen_add2_insn (data.to_addr, constm1_rtx));
-	      else
-		data.to_addr = copy_to_mode_reg (to_addr_mode,
-						 plus_constant (to_addr_mode,
-								data.to_addr,
-								-1));
-	    }
-	  to1 = adjust_automodify_address (data.to, QImode, data.to_addr,
-					   data.offset);
-	}
-      else
-	{
-	  if (endp == 2)
-	    --data.offset;
-	  to1 = adjust_address (data.to, QImode, data.offset);
-	}
-      return to1;
-    }
-  else
-    return data.to;
-}
-
-/* Generate several move instructions to clear LEN bytes of block TO.  (A MEM
-   rtx with BLKmode).  ALIGN is maximum alignment we can assume.  */
-
-static void
-clear_by_pieces (rtx to, unsigned HOST_WIDE_INT len, unsigned int align)
-{
-  struct store_by_pieces_d data;
-
-  if (len == 0)
-    return;
-
-  data.constfun = clear_by_pieces_1;
-  data.constfundata = NULL;
-  data.len = len;
-  data.to = to;
-  store_by_pieces_1 (&data, align);
-}
-
-/* Callback routine for clear_by_pieces.
-   Return const0_rtx unconditionally.  */
-
-static rtx
-clear_by_pieces_1 (void *data ATTRIBUTE_UNUSED,
-		   HOST_WIDE_INT offset ATTRIBUTE_UNUSED,
-		   machine_mode mode ATTRIBUTE_UNUSED)
-{
-  return const0_rtx;
-}
-
-/* Subroutine of clear_by_pieces and store_by_pieces.
-   Generate several move instructions to store LEN bytes of block TO.  (A MEM
-   rtx with BLKmode).  ALIGN is maximum alignment we can assume.  */
-
-static void
-store_by_pieces_1 (struct store_by_pieces_d *data ATTRIBUTE_UNUSED,
-		   unsigned int align ATTRIBUTE_UNUSED)
-{
-  machine_mode to_addr_mode = get_address_mode (data->to);
-  rtx to_addr = XEXP (data->to, 0);
-  unsigned int max_size = STORE_MAX_PIECES + 1;
-  enum insn_code icode;
-
-  data->offset = 0;
-  data->to_addr = to_addr;
-  data->autinc_to
-    = (GET_CODE (to_addr) == PRE_INC || GET_CODE (to_addr) == PRE_DEC
-       || GET_CODE (to_addr) == POST_INC || GET_CODE (to_addr) == POST_DEC);
-
-  data->explicit_inc_to = 0;
-  data->reverse
-    = (GET_CODE (to_addr) == PRE_DEC || GET_CODE (to_addr) == POST_DEC);
-  if (data->reverse)
-    data->offset = data->len;
-
-  /* If storing requires more than two move insns,
-     copy addresses to registers (to make displacements shorter)
-     and use post-increment if available.  */
-  if (!data->autinc_to
-      && move_by_pieces_ninsns (data->len, align, max_size) > 2)
-    {
-      /* Determine the main mode we'll be using.
-	 MODE might not be used depending on the definitions of the
-	 USE_* macros below.  */
-      machine_mode mode ATTRIBUTE_UNUSED
-	= widest_int_mode_for_size (max_size);
-
-      if (USE_STORE_PRE_DECREMENT (mode) && data->reverse && ! data->autinc_to)
-	{
-	  data->to_addr = copy_to_mode_reg (to_addr_mode,
-					    plus_constant (to_addr_mode,
-							   to_addr,
-							   data->len));
-	  data->autinc_to = 1;
-	  data->explicit_inc_to = -1;
-	}
-
-      if (USE_STORE_POST_INCREMENT (mode) && ! data->reverse
-	  && ! data->autinc_to)
-	{
-	  data->to_addr = copy_to_mode_reg (to_addr_mode, to_addr);
-	  data->autinc_to = 1;
-	  data->explicit_inc_to = 1;
-	}
-
-      if ( !data->autinc_to && CONSTANT_P (to_addr))
-	data->to_addr = copy_to_mode_reg (to_addr_mode, to_addr);
-    }
-
-  align = alignment_for_piecewise_move (STORE_MAX_PIECES, align);
-
-  /* First store what we can in the largest integer mode, then go to
-     successively smaller modes.  */
-
-  while (max_size > 1 && data->len > 0)
-    {
-      machine_mode mode = widest_int_mode_for_size (max_size);
-
-      if (mode == VOIDmode)
-	break;
-
-      icode = optab_handler (mov_optab, mode);
-      if (icode != CODE_FOR_nothing && align >= GET_MODE_ALIGNMENT (mode))
-	store_by_pieces_2 (GEN_FCN (icode), mode, data);
-
-      max_size = GET_MODE_SIZE (mode);
-    }
-
-  /* The code above should have handled everything.  */
-  gcc_assert (!data->len);
-}
-
-/* Subroutine of store_by_pieces_1.  Store as many bytes as appropriate
-   with move instructions for mode MODE.  GENFUN is the gen_... function
-   to make a move insn for that mode.  DATA has all the other info.  */
-
-static void
-store_by_pieces_2 (insn_gen_fn genfun, machine_mode mode,
-		   struct store_by_pieces_d *data)
-{
-  unsigned int size = GET_MODE_SIZE (mode);
-  rtx to1, cst;
-
-  while (data->len >= size)
-    {
-      if (data->reverse)
-	data->offset -= size;
-
-      if (data->autinc_to)
-	to1 = adjust_automodify_address (data->to, mode, data->to_addr,
-					 data->offset);
-      else
-	to1 = adjust_address (data->to, mode, data->offset);
-
-      if (HAVE_PRE_DECREMENT && data->explicit_inc_to < 0)
-	emit_insn (gen_add2_insn (data->to_addr,
-				  gen_int_mode (-(HOST_WIDE_INT) size,
-						GET_MODE (data->to_addr))));
-
-      cst = (*data->constfun) (data->constfundata, data->offset, mode);
-      emit_insn ((*genfun) (to1, cst));
-
-      if (HAVE_POST_INCREMENT && data->explicit_inc_to > 0)
-	emit_insn (gen_add2_insn (data->to_addr,
-				  gen_int_mode (size,
-						GET_MODE (data->to_addr))));
-
-      if (! data->reverse)
-	data->offset += size;
-
-      data->len -= size;
-    }
-}
-
 /* Write zeros through the storage of OBJECT.  If OBJECT has BLKmode, SIZE is
    its length in bytes.  */
 
Index: gcc/expr.h
===================================================================
--- gcc/expr.h	(revision 236909)
+++ gcc/expr.h	(working copy)
@@ -103,12 +103,16 @@ enum block_op_methods
   BLOCK_OP_TAILCALL
 };
 
+typedef rtx (*by_pieces_constfn) (void *, HOST_WIDE_INT, machine_mode);
+
 extern rtx emit_block_move (rtx, rtx, rtx, enum block_op_methods);
 extern rtx emit_block_move_hints (rtx, rtx, rtx, enum block_op_methods,
 			          unsigned int, HOST_WIDE_INT,
 				  unsigned HOST_WIDE_INT,
 				  unsigned HOST_WIDE_INT,
 				  unsigned HOST_WIDE_INT);
+extern rtx emit_block_cmp_hints (rtx, rtx, rtx, tree, rtx, bool,
+				 by_pieces_constfn, void *);
 extern bool emit_storent_insn (rtx to, rtx from);
 
 /* Copy all or part of a value X into registers starting at REGNO.
@@ -173,6 +177,11 @@ extern void use_regs (rtx *, int, int);
 /* Mark a PARALLEL as holding a parameter for the next CALL_INSN.  */
 extern void use_group_regs (rtx *, rtx);
 
+#ifdef GCC_INSN_CODES_H
+extern rtx expand_cmpstrn_or_cmpmem (insn_code, rtx, rtx, rtx, tree, rtx,
+				     HOST_WIDE_INT);
+#endif
+
 /* Write zeros through the storage of OBJECT.
    If OBJECT has BLKmode, SIZE is its length in bytes.  */
 extern rtx clear_storage (rtx, rtx, enum block_op_methods);
@@ -191,10 +200,6 @@ extern bool set_storage_via_setmem (rtx,
 				    unsigned HOST_WIDE_INT,
 				    unsigned HOST_WIDE_INT);
 
-extern unsigned HOST_WIDE_INT move_by_pieces_ninsns (unsigned HOST_WIDE_INT,
-						     unsigned int,
-						     unsigned int);
-
 /* Return nonzero if it is desirable to store LEN bytes generated by
    CONSTFUN with several move instructions by store_by_pieces
    function.  CONSTFUNDATA is a pointer which will be passed as argument
@@ -203,8 +208,7 @@ extern unsigned HOST_WIDE_INT move_by_pi
    MEMSETP is true if this is a real memset/bzero, not a copy
    of a const string.  */
 extern int can_store_by_pieces (unsigned HOST_WIDE_INT,
-				rtx (*) (void *, HOST_WIDE_INT,
-					 machine_mode),
+				by_pieces_constfn,
 				void *, unsigned int, bool);
 
 /* Generate several move instructions to store LEN bytes generated by
@@ -213,8 +217,7 @@ extern int can_store_by_pieces (unsigned
    ALIGN is maximum alignment we can assume.
    MEMSETP is true if this is a real memset/bzero, not a copy.
    Returns TO + LEN.  */
-extern rtx store_by_pieces (rtx, unsigned HOST_WIDE_INT,
-			    rtx (*) (void *, HOST_WIDE_INT, machine_mode),
+extern rtx store_by_pieces (rtx, unsigned HOST_WIDE_INT, by_pieces_constfn,
 			    void *, unsigned int, bool, int);
 
 /* Emit insns to set X from Y.  */
@@ -295,7 +298,7 @@ rtx get_personality_function (tree);
 /* Determine whether the LEN bytes can be moved by using several move
    instructions.  Return nonzero if a call to move_by_pieces should
    succeed.  */
-extern int can_move_by_pieces (unsigned HOST_WIDE_INT, unsigned int);
+extern bool can_move_by_pieces (unsigned HOST_WIDE_INT, unsigned int);
 
 extern unsigned HOST_WIDE_INT highest_pow2_factor (const_tree);
 
Index: gcc/target.def
===================================================================
--- gcc/target.def	(revision 236909)
+++ gcc/target.def	(working copy)
@@ -3397,8 +3397,9 @@ Both @var{size} and @var{alignment} are
 units.\n\
 \n\
 The parameter @var{op} is one of: @code{CLEAR_BY_PIECES},\n\
-@code{MOVE_BY_PIECES}, @code{SET_BY_PIECES}, @code{STORE_BY_PIECES}.\n\
-These describe the type of memory operation under consideration.\n\
+@code{MOVE_BY_PIECES}, @code{SET_BY_PIECES}, @code{STORE_BY_PIECES} or\n\
+@code{COMPARE_BY_PIECES}.  These describe the type of memory operation\n\
+under consideration.\n\
 \n\
 The parameter @var{speed_p} is true if the code is currently being\n\
 optimized for speed rather than size.\n\
@@ -3418,6 +3419,18 @@ move would be greater than that of a lib
  default_use_by_pieces_infrastructure_p)
 
 DEFHOOK
+(compare_by_pieces_branch_ratio,
+ "When expanding a block comparison in MODE, gcc can try to reduce the\n\
+number of branches at the expense of more memory operations.  This hook\n\
+allows the target to override the default choice.  It should return the\n\
+factor by which branches should be reduced over the plain expansion with\n\
+one comparison per @var{mode}-sized piece.  A port can also prevent a\n\
+particular mode from being used for block comparisons by returning a\n\
+negative number from this hook.",
+ int, (machine_mode mode),
+ default_compare_by_pieces_branch_ratio)
+
+DEFHOOK
 (optab_supported_p,
  "Return true if the optimizers should use optab @var{op} with\n\
 modes @var{mode1} and @var{mode2} for optimization type @var{opt_type}.\n\
Index: gcc/target.h
===================================================================
--- gcc/target.h	(revision 236909)
+++ gcc/target.h	(working copy)
@@ -79,16 +79,23 @@ enum print_switch_type
 };
 
 /* Types of memory operation understood by the "by_pieces" infrastructure.
-   Used by the TARGET_USE_BY_PIECES_INFRASTRUCTURE_P target hook.  */
+   Used by the TARGET_USE_BY_PIECES_INFRASTRUCTURE_P target hook and
+   internally by the functions in expr.c.  */
 
 enum by_pieces_operation
 {
   CLEAR_BY_PIECES,
   MOVE_BY_PIECES,
   SET_BY_PIECES,
-  STORE_BY_PIECES
+  STORE_BY_PIECES,
+  COMPARE_BY_PIECES
 };
 
+extern unsigned HOST_WIDE_INT by_pieces_ninsns (unsigned HOST_WIDE_INT,
+						unsigned int,
+						unsigned int,
+						by_pieces_operation);
+
 typedef int (* print_switch_fn_type) (print_switch_type, const char *);
 
 /* An example implementation for ELF targets.  Defined in varasm.c  */
Index: gcc/targhooks.c
===================================================================
--- gcc/targhooks.c	(revision 236909)
+++ gcc/targhooks.c	(working copy)
@@ -1482,25 +1482,40 @@ default_use_by_pieces_infrastructure_p (
 
   switch (op)
     {
-      case CLEAR_BY_PIECES:
-	max_size = STORE_MAX_PIECES;
-	ratio = CLEAR_RATIO (speed_p);
-	break;
-      case MOVE_BY_PIECES:
-	max_size = MOVE_MAX_PIECES;
-	ratio = get_move_ratio (speed_p);
-	break;
-      case SET_BY_PIECES:
-	max_size = STORE_MAX_PIECES;
-	ratio = SET_RATIO (speed_p);
-	break;
-      case STORE_BY_PIECES:
-	max_size = STORE_MAX_PIECES;
-	ratio = get_move_ratio (speed_p);
-	break;
+    case CLEAR_BY_PIECES:
+      max_size = STORE_MAX_PIECES;
+      ratio = CLEAR_RATIO (speed_p);
+      break;
+    case MOVE_BY_PIECES:
+      max_size = MOVE_MAX_PIECES;
+      ratio = get_move_ratio (speed_p);
+      break;
+    case SET_BY_PIECES:
+      max_size = STORE_MAX_PIECES;
+      ratio = SET_RATIO (speed_p);
+      break;
+    case STORE_BY_PIECES:
+      max_size = STORE_MAX_PIECES;
+      ratio = get_move_ratio (speed_p);
+      break;
+    case COMPARE_BY_PIECES:
+      max_size = COMPARE_MAX_PIECES;
+      /* Pick a likely default, just as in get_move_ratio.  */
+      ratio = speed_p ? 15 : 3;
+      break;
     }
 
-  return move_by_pieces_ninsns (size, alignment, max_size + 1) < ratio;
+  return by_pieces_ninsns (size, alignment, max_size + 1, op) < ratio;
+}
+
+/* This hook controls code generation for expanding a memcmp operation by
+   pieces.  Return 1 for the normal pattern of compare/jump after each pair
+   of loads, or a higher number to reduce the number of branches.  */
+
+int
+default_compare_by_pieces_branch_ratio (machine_mode)
+{
+  return 1;
 }
 
 bool
Index: gcc/targhooks.h
===================================================================
--- gcc/targhooks.h	(revision 236909)
+++ gcc/targhooks.h	(working copy)
@@ -199,6 +199,7 @@ extern bool default_use_by_pieces_infras
 						    unsigned int,
 						    enum by_pieces_operation,
 						    bool);
+extern int default_compare_by_pieces_branch_ratio (machine_mode);
 
 extern bool default_profile_before_prologue (void);
 extern reg_class_t default_preferred_reload_class (rtx, reg_class_t);
Index: gcc/testsuite/gcc.dg/pr52171.c
===================================================================
--- gcc/testsuite/gcc.dg/pr52171.c	(revision 0)
+++ gcc/testsuite/gcc.dg/pr52171.c	(working copy)
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+/* { dg-final { scan-assembler-not "memcmp" } } */
+#include <string.h>
+struct A { int x; } a, b;
+
+extern char s[], t[];
+
+int foo ()
+{
+  return memcmp (&a, &b, sizeof (struct A)) == 0;
+}
Index: gcc/testsuite/gcc.target/i386/pr52171.c
===================================================================
--- gcc/testsuite/gcc.target/i386/pr52171.c	(revision 0)
+++ gcc/testsuite/gcc.target/i386/pr52171.c	(working copy)
@@ -0,0 +1,23 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+/* { dg-final { scan-assembler-not "memcmp" } } */
+/* { dg-final { scan-assembler "1752394086" } } */
+
+/* This should turn into four compare/jump pairs with -m32, within the
+   limit of what the tuning considers acceptable for -O2.  */
+int cmp (char *p, char *q)
+{
+  char *pa = __builtin_assume_aligned (p, 4);
+  char *qa = __builtin_assume_aligned (q, 4);
+  if (__builtin_memcmp (pa, qa, 16) != 0)
+    return 1;
+  return 0;
+}
+/* Since we have fast unaligned access, we should make a single
+   constant comparison.  The constant becomes 1752394086.  */
+int cmp2 (char *p)
+{
+  if (__builtin_memcmp (p, "fish", 4) != 0)
+    return 1;
+  return 0;
+}
Index: gcc/tree-ssa-strlen.c
===================================================================
--- gcc/tree-ssa-strlen.c	(revision 236909)
+++ gcc/tree-ssa-strlen.c	(working copy)
@@ -44,6 +44,7 @@ along with GCC; see the file COPYING3.
 #include "params.h"
 #include "ipa-chkp.h"
 #include "tree-hash-traits.h"
+#include "builtins.h"
 
 /* A vector indexed by SSA_NAME_VERSION.  0 means unknown, positive value
    is an index into strinfo vector, negative value stands for
@@ -1843,6 +1844,88 @@ handle_builtin_memset (gimple_stmt_itera
   return false;
 }
 
+/* Handle a call to memcmp.  We try to handle small comparisons by
+   converting them to load and compare, and replacing the call to memcmp
+   with a __builtin_memcmp_eq call where possible.  */
+
+static bool
+handle_builtin_memcmp (gimple_stmt_iterator *gsi)
+{
+  gcall *stmt2 = as_a <gcall *> (gsi_stmt (*gsi));
+  tree res = gimple_call_lhs (stmt2);
+  tree arg1 = gimple_call_arg (stmt2, 0);
+  tree arg2 = gimple_call_arg (stmt2, 1);
+  tree len = gimple_call_arg (stmt2, 2);
+  unsigned HOST_WIDE_INT leni;
+  use_operand_p use_p;
+  imm_use_iterator iter;
+
+  if (!res)
+    return true;
+
+  FOR_EACH_IMM_USE_FAST (use_p, iter, res)
+    {
+      gimple *ustmt = USE_STMT (use_p);
+
+      if (gimple_code (ustmt) == GIMPLE_ASSIGN)
+	{
+	  gassign *asgn = as_a <gassign *> (ustmt);
+	  tree_code code = gimple_assign_rhs_code (asgn);
+	  if ((code != EQ_EXPR && code != NE_EXPR)
+	      || !integer_zerop (gimple_assign_rhs2 (asgn)))
+	    return true;
+	}
+      else if (gimple_code (ustmt) == GIMPLE_COND)
+	{
+	  tree_code code = gimple_cond_code (ustmt);
+	  if ((code != EQ_EXPR && code != NE_EXPR)
+	      || !integer_zerop (gimple_cond_rhs (ustmt)))
+	    return true;
+	}
+      else
+	return true;
+    }
+
+  if (tree_fits_uhwi_p (len)
+      && (leni = tree_to_uhwi (len)) <= GET_MODE_SIZE (word_mode)
+      && exact_log2 (leni) != -1)
+    {
+      leni *= CHAR_TYPE_SIZE;
+      unsigned align1 = get_pointer_alignment (arg1);
+      unsigned align2 = get_pointer_alignment (arg2);
+      unsigned align = MIN (align1, align2);
+      machine_mode mode = mode_for_size (leni, MODE_INT, 1);
+      if (mode != BLKmode
+	  && (align >= leni || !SLOW_UNALIGNED_ACCESS (mode, align)))
+	{
+	  location_t loc = gimple_location (stmt2);
+	  tree type, off;
+	  type = build_nonstandard_integer_type (leni, 1);
+	  gcc_assert (GET_MODE_BITSIZE (TYPE_MODE (type)) == leni);
+	  tree ptrtype = build_pointer_type_for_mode (char_type_node,
+						      ptr_mode, true);
+	  off = build_int_cst (ptrtype, 0);
+	  arg1 = build2_loc (loc, MEM_REF, type, arg1, off);
+	  arg2 = build2_loc (loc, MEM_REF, type, arg2, off);
+	  tree tem1 = fold_const_aggregate_ref (arg1);
+	  if (tem1)
+	    arg1 = tem1;
+	  tree tem2 = fold_const_aggregate_ref (arg2);
+	  if (tem2)
+	    arg2 = tem2;
+	  res = fold_convert_loc (loc, TREE_TYPE (res),
+				  fold_build2_loc (loc, NE_EXPR,
+						   boolean_type_node,
+						   arg1, arg2));
+	  gimplify_and_update_call_from_tree (gsi, res);
+	  return false;
+	}
+    }
+
+  gimple_call_set_fndecl (stmt2, builtin_decl_explicit (BUILT_IN_MEMCMP_EQ));
+  return false;
+}
+
 /* Handle a POINTER_PLUS_EXPR statement.
    For p = "abcd" + 2; compute associated length, or if
    p = q + off is pointing to a '\0' character of a string, call
@@ -2100,6 +2183,10 @@ strlen_optimize_stmt (gimple_stmt_iterat
 	    if (!handle_builtin_memset (gsi))
 	      return false;
 	    break;
+	  case BUILT_IN_MEMCMP:
+	    if (!handle_builtin_memcmp (gsi))
+	      return false;
+	    break;
 	  default:
 	    break;
 	  }
Index: gcc/tree.c
===================================================================
--- gcc/tree.c	(revision 236909)
+++ gcc/tree.c	(working copy)
@@ -10603,6 +10603,13 @@ build_common_builtin_nodes (void)
 			BUILT_IN_STACK_RESTORE,
 			"__builtin_stack_restore", ECF_NOTHROW | ECF_LEAF);
 
+  ftype = build_function_type_list (integer_type_node, const_ptr_type_node,
+				    const_ptr_type_node, size_type_node,
+				    NULL_TREE);
+  local_define_builtin ("__builtin_memcmp_eq", ftype, BUILT_IN_MEMCMP_EQ,
+			"__builtin_memcmp_eq",
+			ECF_PURE | ECF_NOTHROW | ECF_LEAF);
+
   /* If there's a possibility that we might use the ARM EABI, build the
     alternate __cxa_end_cleanup node used to resume from C++ and Java.  */
   if (targetm.arm_eabi_unwinder)

Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]