Re: [PATCH, rs6000] generate loop code for memcmp inline expansion


On Tue, 2017-12-12 at 10:13 -0600, Segher Boessenkool wrote:
> Please fix those trivialities, and it's okay for trunk (after the
> rtlanal patch is approved too).  Thanks!

Here's the final version of this, which is committed as 256351.


2018-01-08  Aaron Sawdey  <acsawdey@linux.vnet.ibm.com>

	* config/rs6000/rs6000-string.c (do_load_for_compare_from_addr): New
	function.
	(do_ifelse): New function.
	(do_isel): New function.
	(do_sub3): New function.
	(do_add3): New function.
	(do_load_mask_compare): New function.
	(do_overlap_load_compare): New function.
	(expand_compare_loop): New function.
	(expand_block_compare): Call expand_compare_loop() when appropriate.
	* config/rs6000/rs6000.opt (-mblock-compare-inline-limit): Change
	option description.
	(-mblock-compare-inline-loop-limit): New option.


-- 
Aaron Sawdey, Ph.D.  acsawdey@linux.vnet.ibm.com
050-2/C113  (507) 253-7520 home: 507/263-0782
IBM Linux Technology Center - PPC Toolchain
Index: gcc/config/rs6000/rs6000-string.c
===================================================================
--- gcc/config/rs6000/rs6000-string.c	(revision 256350)
+++ gcc/config/rs6000/rs6000-string.c	(working copy)
@@ -303,6 +303,959 @@
   return MIN (base_align, offset & -offset);
 }
 
+/* Prepare address and then do a load.
+
+   MODE is the mode to use for the load.
+   DEST is the destination register for the data.
+   ADDR is the address to load from.
+   ORIG_ADDR is the original source MEM, used to copy memory attributes.  */
+static void
+do_load_for_compare_from_addr (machine_mode mode, rtx dest, rtx addr,
+			       rtx orig_addr)
+{
+  rtx mem = gen_rtx_MEM (mode, addr);
+  MEM_COPY_ATTRIBUTES (mem, orig_addr);
+  set_mem_size (mem, GET_MODE_SIZE (mode));
+  do_load_for_compare (dest, mem, mode);
+}
+
+/* Do a branch for an if/else decision.
+
+   CMPMODE is the mode to use for the comparison.
+   COMPARISON is the rtx code for the compare needed.
+   A is the first thing to be compared.
+   B is the second thing to be compared.
+   CR is the condition code reg input, or NULL_RTX.
+   TRUE_LABEL is the label to branch to if the condition is true.
+
+   If CR is null_rtx, then a new register of CMPMODE is generated
+   and used for the comparison.
+   If A and B are both null_rtx, then CR must not be null, and the
+   compare is not generated so you can use this with a dot form insn.  */
+
+static void
+do_ifelse (machine_mode cmpmode, rtx_code comparison,
+	   rtx a, rtx b, rtx cr, rtx true_label)
+{
+  gcc_assert ((a == NULL_RTX && b == NULL_RTX && cr != NULL_RTX)
+	      || (a != NULL_RTX && b != NULL_RTX));
+
+  if (cr != NULL_RTX)
+    gcc_assert (GET_MODE (cr) == cmpmode);
+  else
+    cr = gen_reg_rtx (cmpmode);
+
+  rtx label_ref = gen_rtx_LABEL_REF (VOIDmode, true_label);
+
+  if (a != NULL_RTX)
+    emit_move_insn (cr, gen_rtx_COMPARE (cmpmode, a, b));
+
+  rtx cmp_rtx = gen_rtx_fmt_ee (comparison, VOIDmode, cr, const0_rtx);
+
+  rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, cmp_rtx, label_ref, pc_rtx);
+  rtx j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
+  JUMP_LABEL (j) = true_label;
+  LABEL_NUSES (true_label) += 1;
+}
+
+/* Emit an isel of the proper mode for DEST.
+
+   DEST is the isel destination register.
+   CMP is the comparison rtx giving the isel condition.
+   SRC_T is the isel source if the condition is true.
+   SRC_F is the isel source if the condition is false.
+   CR is the condition code reg used by the comparison.  */
+static void
+do_isel (rtx dest, rtx cmp, rtx src_t, rtx src_f, rtx cr)
+{
+  if (GET_MODE (dest) == DImode)
+    emit_insn (gen_isel_signed_di (dest, cmp, src_t, src_f, cr));
+  else
+    emit_insn (gen_isel_signed_si (dest, cmp, src_t, src_f, cr));
+}
+
+/* Emit a subtract of the proper mode for DEST.
+
+   DEST is the destination register for the subtract.
+   SRC1 is the first subtract input.
+   SRC2 is the second subtract input.
+
+   Computes DEST = SRC1-SRC2.  */
+static void
+do_sub3 (rtx dest, rtx src1, rtx src2)
+{
+  if (GET_MODE (dest) == DImode)
+    emit_insn (gen_subdi3 (dest, src1, src2));
+  else
+    emit_insn (gen_subsi3 (dest, src1, src2));
+}
+
+/* Emit an add of the proper mode for DEST.
+
+   DEST is the destination register for the add.
+   SRC1 is the first add input.
+   SRC2 is the second add input.
+
+   Computes DEST = SRC1+SRC2.  */
+static void
+do_add3 (rtx dest, rtx src1, rtx src2)
+{
+  if (GET_MODE (dest) == DImode)
+    emit_insn (gen_adddi3 (dest, src1, src2));
+  else
+    emit_insn (gen_addsi3 (dest, src1, src2));
+}
+
+/* Generate rtl for a load, shift, and compare of less than a full word.
+
+   LOAD_MODE is the machine mode for the loads.
+   DIFF is the reg for the difference.
+   CMP_REM is the reg containing the remaining bytes to compare.
+   DCOND is the CCUNS reg for the compare if we are doing P9 code with setb.
+   SRC1_ADDR is the first source address.
+   SRC2_ADDR is the second source address.
+   ORIG_SRC1 is the original first source block's address rtx.
+   ORIG_SRC2 is the original second source block's address rtx.  */
+static void
+do_load_mask_compare (const machine_mode load_mode, rtx diff, rtx cmp_rem,
+		      rtx dcond, rtx src1_addr, rtx src2_addr, rtx orig_src1,
+		      rtx orig_src2)
+{
+  HOST_WIDE_INT load_mode_size = GET_MODE_SIZE (load_mode);
+  rtx shift_amount = gen_reg_rtx (word_mode);
+  rtx d1 = gen_reg_rtx (word_mode);
+  rtx d2 = gen_reg_rtx (word_mode);
+
+  do_load_for_compare_from_addr (load_mode, d1, src1_addr, orig_src1);
+  do_load_for_compare_from_addr (load_mode, d2, src2_addr, orig_src2);
+  do_sub3 (shift_amount, GEN_INT (load_mode_size), cmp_rem);
+
+  if (word_mode == DImode)
+    {
+      emit_insn (gen_ashldi3 (shift_amount, shift_amount,
+			      GEN_INT (LOG2_BITS_PER_UNIT)));
+      emit_insn (gen_lshrdi3 (d1, d1,
+			      gen_lowpart (SImode, shift_amount)));
+      emit_insn (gen_lshrdi3 (d2, d2,
+			      gen_lowpart (SImode, shift_amount)));
+    }
+  else
+    {
+      emit_insn (gen_ashlsi3 (shift_amount, shift_amount,
+			      GEN_INT (LOG2_BITS_PER_UNIT)));
+      emit_insn (gen_lshrsi3 (d1, d1, shift_amount));
+      emit_insn (gen_lshrsi3 (d2, d2, shift_amount));
+    }
+
+  if (TARGET_P9_MISC)
+    {
+      /* Generate a compare, and convert with a setb later.  */
+      rtx cmp = gen_rtx_COMPARE (CCUNSmode, d1, d2);
+      emit_insn (gen_rtx_SET (dcond, cmp));
+    }
+  else
+    {
+      if (word_mode == DImode)
+	emit_insn (gen_subfdi3_carry (diff, d2, d1));
+      else
+	emit_insn (gen_subfsi3_carry (diff, d2, d1));
+    }
+}
+
+/* Generate rtl for an overlapping load and compare of less than a
+   full load_mode.  This assumes that the previous word is part of the
+   block being compared, so it is OK to back up part of a word and
+   compare the last unaligned full word that ends at the end of the block.
+
+   LOAD_MODE is the machine mode for the loads.
+   ISCONST tells whether the remaining length is a constant or in a register.
+   BYTES_REM is the remaining length if ISCONST is true.
+   DIFF is the reg for the difference.
+   CMP_REM is the reg containing the remaining bytes to compare if !ISCONST.
+   DCOND is the CCUNS reg for the compare if we are doing P9 code with setb.
+   SRC1_ADDR is the first source address.
+   SRC2_ADDR is the second source address.
+   ORIG_SRC1 is the original first source block's address rtx.
+   ORIG_SRC2 is the original second source block's address rtx.  */
+static void
+do_overlap_load_compare (machine_mode load_mode, bool isConst,
+			HOST_WIDE_INT bytes_rem, rtx diff,
+			rtx cmp_rem, rtx dcond, rtx src1_addr, rtx src2_addr,
+			rtx orig_src1, rtx orig_src2)
+{
+  HOST_WIDE_INT load_mode_size = GET_MODE_SIZE (load_mode);
+  HOST_WIDE_INT addr_adj = load_mode_size - bytes_rem;
+  rtx d1 = gen_reg_rtx (word_mode);
+  rtx d2 = gen_reg_rtx (word_mode);
+
+  rtx addr1, addr2;
+  if (!isConst || addr_adj)
+    {
+      rtx adj_reg = gen_reg_rtx (word_mode);
+      if (isConst)
+	emit_move_insn (adj_reg, GEN_INT (-addr_adj));
+      else
+	{
+	  rtx reg_lms = gen_reg_rtx (word_mode);
+	  emit_move_insn (reg_lms, GEN_INT (load_mode_size));
+	  do_sub3 (adj_reg, cmp_rem, reg_lms);
+	}
+
+      addr1 = gen_rtx_PLUS (word_mode, src1_addr, adj_reg);
+      addr2 = gen_rtx_PLUS (word_mode, src2_addr, adj_reg);
+    }
+  else
+    {
+      addr1 = src1_addr;
+      addr2 = src2_addr;
+    }
+
+  do_load_for_compare_from_addr (load_mode, d1, addr1, orig_src1);
+  do_load_for_compare_from_addr (load_mode, d2, addr2, orig_src2);
+
+  if (TARGET_P9_MISC)
+    {
+      /* Generate a compare, and convert with a setb later.  */
+      rtx cmp = gen_rtx_COMPARE (CCUNSmode, d1, d2);
+      emit_insn (gen_rtx_SET (dcond, cmp));
+    }
+  else
+    {
+      if (word_mode == DImode)
+	emit_insn (gen_subfdi3_carry (diff, d2, d1));
+      else
+	emit_insn (gen_subfsi3_carry (diff, d2, d1));
+    }
+}
+
+/* Expand a block compare operation using loop code, and return true
+   if successful.  Return false if we should let the compiler generate
+   normal code, probably a memcmp call.
+
+   OPERANDS[0] is the target (result).
+   OPERANDS[1] is the first source.
+   OPERANDS[2] is the second source.
+   OPERANDS[3] is the length.
+   OPERANDS[4] is the alignment.  */
+bool
+expand_compare_loop (rtx operands[])
+{
+  rtx target = operands[0];
+  rtx orig_src1 = operands[1];
+  rtx orig_src2 = operands[2];
+  rtx bytes_rtx = operands[3];
+  rtx align_rtx = operands[4];
+
+  /* This case is complicated to handle because the subtract
+     with carry instructions do not generate the 64-bit
+     carry and so we must emit code to calculate it ourselves.
+     We choose not to implement this yet.  */
+  if (TARGET_32BIT && TARGET_POWERPC64)
+    return false;
+
+  /* Allow non-const length.  */
+  int bytes_is_const = CONST_INT_P (bytes_rtx);
+
+  /* This must be a fixed size alignment.  */
+  if (!CONST_INT_P (align_rtx))
+    return false;
+
+  HOST_WIDE_INT align1 = MEM_ALIGN (orig_src1) / BITS_PER_UNIT;
+  HOST_WIDE_INT align2 = MEM_ALIGN (orig_src2) / BITS_PER_UNIT;
+  HOST_WIDE_INT minalign = MIN (align1, align2);
+
+  bool isP7 = (rs6000_tune == PROCESSOR_POWER7);
+
+  gcc_assert (GET_MODE (target) == SImode);
+
+  /* Anything to compare?  */
+  HOST_WIDE_INT bytes = 0;
+  if (bytes_is_const)
+    bytes = INTVAL (bytes_rtx);
+
+  if (bytes_is_const && bytes == 0)
+    return true;
+
+  /* Limit the amount we compare, if known statically.  */
+  HOST_WIDE_INT max_bytes;
+  switch (rs6000_tune)
+    {
+    case PROCESSOR_POWER7:
+      if (!bytes_is_const)
+	if (minalign < 8)
+	  max_bytes = 0;
+	else
+	  max_bytes = 128;
+      else
+	if (minalign < 8)
+	  max_bytes = 32;
+	else
+	  max_bytes = 128;
+      break;
+    case PROCESSOR_POWER8:
+      if (!bytes_is_const)
+	max_bytes = 0;
+      else
+	if (minalign < 8)
+	  max_bytes = 128;
+	else
+	  max_bytes = 64;
+      break;
+    case PROCESSOR_POWER9:
+      if (bytes_is_const)
+	max_bytes = 191;
+      else
+	max_bytes = 0;
+      break;
+    default:
+      max_bytes = 128;
+    }
+
+  /* Allow the option to override the default.  */
+  if (rs6000_block_compare_inline_loop_limit >= 0)
+    max_bytes = (unsigned HOST_WIDE_INT) rs6000_block_compare_inline_loop_limit;
+
+  if (max_bytes == 0)
+    return false;
+
+  rtx cmp_rem = gen_reg_rtx (word_mode);  /* Remainder for library call.  */
+  rtx loop_cmp = gen_reg_rtx (word_mode); /* Actual amount compared by loop.  */
+  HOST_WIDE_INT niter;
+  rtx iter = gen_reg_rtx (word_mode);
+  rtx iv1 = gen_reg_rtx (word_mode);
+  rtx iv2 = gen_reg_rtx (word_mode);
+  rtx d1_1 = gen_reg_rtx (word_mode);  /* Addr expression src1+iv1 */
+  rtx d1_2 = gen_reg_rtx (word_mode);  /* Addr expression src1+iv2 */
+  rtx d2_1 = gen_reg_rtx (word_mode);  /* Addr expression src2+iv1 */
+  rtx d2_2 = gen_reg_rtx (word_mode);  /* Addr expression src2+iv2 */
+
+  /* Strip unneeded subreg from length if there is one.  */
+  if (SUBREG_P (bytes_rtx) && subreg_lowpart_p (bytes_rtx))
+    bytes_rtx = SUBREG_REG (bytes_rtx);
+  /* Extend bytes_rtx to word_mode if needed.  But we only expect
+     to have to deal with the case where bytes_rtx is SImode and
+     word_mode is DImode.  */
+  if (!bytes_is_const)
+    {
+      if (GET_MODE_SIZE (GET_MODE (bytes_rtx)) > GET_MODE_SIZE (word_mode))
+	/* Do not expect length longer than word_mode.  */
+	return false;
+      else if (GET_MODE_SIZE (GET_MODE (bytes_rtx)) < GET_MODE_SIZE (word_mode))
+	{
+	  bytes_rtx = force_reg (GET_MODE (bytes_rtx), bytes_rtx);
+	  bytes_rtx = force_reg (word_mode,
+				 gen_rtx_fmt_e (ZERO_EXTEND, word_mode,
+						bytes_rtx));
+	}
+      else
+	/* Make sure it's in a register before we get started.  */
+	bytes_rtx = force_reg (GET_MODE (bytes_rtx), bytes_rtx);
+    }
+
+  machine_mode load_mode = word_mode;
+  HOST_WIDE_INT load_mode_size = GET_MODE_SIZE (load_mode);
+
+  /* Number of bytes per iteration of the unrolled loop.  */
+  HOST_WIDE_INT loop_bytes = 2 * load_mode_size;
+  /* Max iters and bytes compared in the loop.  */
+  HOST_WIDE_INT max_loop_iter = max_bytes / loop_bytes;
+  HOST_WIDE_INT max_loop_bytes = max_loop_iter * loop_bytes;
+  int l2lb = floor_log2 (loop_bytes);
+
+  if (bytes_is_const && (max_bytes < load_mode_size
+			 || !IN_RANGE (bytes, load_mode_size, max_bytes)))
+    return false;
+
+  bool no_remainder_code = false;
+  rtx final_label = gen_label_rtx ();
+  rtx final_ref = gen_rtx_LABEL_REF (VOIDmode, final_label);
+  rtx diff_label = gen_label_rtx ();
+  rtx library_call_label = NULL;
+  rtx cleanup_label = gen_label_rtx ();
+
+  rtx cr;
+
+  rtx src1_addr = copy_addr_to_reg (XEXP (orig_src1, 0));
+  rtx src2_addr = copy_addr_to_reg (XEXP (orig_src2, 0));
+
+  /* Difference found is stored here before jump to diff_label.  */
+  rtx diff = gen_reg_rtx (word_mode);
+  rtx j;
+
+  /* Example of generated code for a 35-byte compare with 1-byte alignment.
+
+	     mtctr 8
+	     li 6,0
+	     li 5,8
+     .L13:
+	     ldbrx 7,3,6
+	     ldbrx 9,10,6
+	     ldbrx 0,3,5
+	     ldbrx 4,10,5
+	     addi 6,6,16
+	     addi 5,5,16
+	     subfc. 9,9,7
+	     bne 0,.L10
+	     subfc. 9,4,0
+	     bdnzt 2,.L13
+	     bne 0,.L10
+	     add 3,3,6
+	     add 10,10,6
+	     addi 9,3,-5
+	     ldbrx 7,0,9
+	     addi 9,10,-5
+	     ldbrx 9,0,9
+	     subfc 9,9,7
+	     .p2align 4,,15
+     .L10:
+	     popcntd 9,9
+	     subfe 10,10,10
+	     or 9,9,10
+     
+     Compiled with -fno-reorder-blocks for clarity.  */
+
+  /* Structure of what we're going to do:
+     Two separate lengths: what we will compare before bailing to library
+	call (max_bytes), and the total length to be checked.
+     if length < 16, branch to linear cleanup code starting with
+	remainder length check (length not known at compile time)
+     set up 2 iv's and load count reg, compute remainder length
+     unrollx2 compare loop
+     if loop exit due to a difference, branch to difference handling code
+     if remainder length < 8, branch to final cleanup compare
+     load and compare 8B
+     final cleanup comparison (depends on alignment and length)
+	load 8B, shift off bytes past length, compare
+	load 8B ending at last byte and compare
+	load/compare 1 byte at a time (short block abutting 4k boundary)
+     difference handling, 64->32 conversion
+     final result
+     branch around memcmp call
+     memcmp library call
+  */
+
+  /* If bytes is not const, compare length and branch directly
+     to the cleanup code that can handle 0-16 bytes if length
+     is < 16.  Stash away bytes-max_bytes for the library call.  */
+  if (bytes_is_const)
+    {
+      /* These need to be set for some of the places we may jump to.  */
+      if (bytes > max_bytes)
+	{
+	  no_remainder_code = true;
+	  niter = max_loop_iter;
+	  library_call_label = gen_label_rtx ();
+	}
+      else
+	{
+	  niter = bytes / loop_bytes;
+	}
+      emit_move_insn (iter, GEN_INT (niter));
+      emit_move_insn (loop_cmp, GEN_INT (niter * loop_bytes));
+      emit_move_insn (cmp_rem, GEN_INT (bytes - niter * loop_bytes));
+    }
+  else
+    {
+      library_call_label = gen_label_rtx ();
+
+      /* If we go to the cleanup code, it expects length to be in cmp_rem.  */
+      emit_move_insn (cmp_rem, bytes_rtx);
+
+      /* Check for > max_bytes bytes.  We want to bail out as quickly as
+	 possible if we have to go over to memcmp.  */
+      do_ifelse (CCmode, GT, bytes_rtx, GEN_INT (max_bytes),
+		 NULL_RTX, library_call_label);
+
+      /* Check for < loop_bytes bytes.  */
+      do_ifelse (CCmode, LT, bytes_rtx, GEN_INT (loop_bytes),
+		 NULL_RTX, cleanup_label);
+
+      /* Loop compare bytes and iterations if bytes>max_bytes.  */
+      rtx mb_reg = gen_reg_rtx (word_mode);
+      emit_move_insn (mb_reg, GEN_INT (max_loop_bytes));
+      rtx mi_reg = gen_reg_rtx (word_mode);
+      emit_move_insn (mi_reg, GEN_INT (max_loop_iter));
+
+      /* Compute number of loop iterations if bytes <= max_bytes.  */
+      if (word_mode == DImode)
+	emit_insn (gen_lshrdi3 (iter, bytes_rtx, GEN_INT (l2lb)));
+      else
+	emit_insn (gen_lshrsi3 (iter, bytes_rtx, GEN_INT (l2lb)));
+
+      /* Compute bytes to compare in loop if bytes <= max_bytes.  */
+      rtx mask = GEN_INT (HOST_WIDE_INT_M1U << l2lb);
+      if (word_mode == DImode)
+	emit_insn (gen_anddi3 (loop_cmp, bytes_rtx, mask));
+      else
+	emit_insn (gen_andsi3 (loop_cmp, bytes_rtx, mask));
+
+      /* Check for bytes <= max_bytes.  */
+      if (TARGET_ISEL)
+	{
+	  /* P9 has fast isel so we use one compare and two isel.  */
+	  cr = gen_reg_rtx (CCmode);
+	  rtx compare_rtx = gen_rtx_COMPARE (CCmode, bytes_rtx,
+					     GEN_INT (max_bytes));
+	  emit_move_insn (cr, compare_rtx);
+	  rtx cmp_rtx = gen_rtx_LE (VOIDmode, cr, const0_rtx);
+	  do_isel (loop_cmp, cmp_rtx, loop_cmp, mb_reg, cr);
+	  do_isel (iter, cmp_rtx, iter, mi_reg, cr);
+	}
+      else
+	{
+	  rtx lab_after = gen_label_rtx ();
+	  do_ifelse (CCmode, LE, bytes_rtx, GEN_INT (max_bytes),
+		     NULL_RTX, lab_after);
+	  emit_move_insn (loop_cmp, mb_reg);
+	  emit_move_insn (iter, mi_reg);
+	  emit_label (lab_after);
+	}
+
+      /* Now compute the remainder bytes, which aren't used until
+	 after the loop.  */
+      do_sub3 (cmp_rem, bytes_rtx, loop_cmp);
+    }
+
+  rtx dcond = NULL_RTX; /* Used for when we jump to diff_label.  */
+  /* For p9 we need to have just one of these as multiple places define
+     it and it gets used by the setb at the end.  */
+  if (TARGET_P9_MISC)
+    dcond = gen_reg_rtx (CCUNSmode);
+
+  if (!bytes_is_const || bytes >= loop_bytes)
+    {
+      /* It should not be possible to come here if remaining bytes are
+	 < 16 in the runtime case either.  Compute number of loop
+	 iterations.  We compare 2*word_mode per iteration so 16B for
+	 64-bit code and 8B for 32-bit.  Set up two induction
+	 variables and load count register.  */
+
+      /* HACK ALERT: create hard reg for CTR here.  If we just use a
+	 pseudo, cse will get rid of it and then the allocator will
+	 see it used in the lshr above and won't give us ctr.  */
+      rtx ctr = gen_rtx_REG (Pmode, CTR_REGNO);
+      emit_move_insn (ctr, iter);
+      emit_move_insn (diff, GEN_INT (0));
+      emit_move_insn (iv1, GEN_INT (0));
+      emit_move_insn (iv2, GEN_INT (load_mode_size));
+
+      /* Inner loop to compare 2*word_mode.  */
+      rtx loop_top_label = gen_label_rtx ();
+      emit_label (loop_top_label);
+
+      rtx src1_ix1 = gen_rtx_PLUS (word_mode, src1_addr, iv1);
+      rtx src2_ix1 = gen_rtx_PLUS (word_mode, src2_addr, iv1);
+
+      do_load_for_compare_from_addr (load_mode, d1_1,
+				     src1_ix1, orig_src1);
+      do_load_for_compare_from_addr (load_mode, d2_1,
+				     src2_ix1, orig_src2);
+      do_add3 (iv1, iv1, GEN_INT (loop_bytes));
+
+      rtx src1_ix2 = gen_rtx_PLUS (word_mode, src1_addr, iv2);
+      rtx src2_ix2 = gen_rtx_PLUS (word_mode, src2_addr, iv2);
+
+      do_load_for_compare_from_addr (load_mode, d1_2,
+				     src1_ix2, orig_src1);
+      do_load_for_compare_from_addr (load_mode, d2_2,
+				     src2_ix2, orig_src2);
+      do_add3 (iv2, iv2, GEN_INT (loop_bytes));
+
+      if (TARGET_P9_MISC)
+	{
+	  /* Generate a compare, and convert with a setb later.  */
+	  rtx cmp = gen_rtx_COMPARE (CCUNSmode, d1_1, d2_1);
+	  emit_insn (gen_rtx_SET (dcond, cmp));
+	}
+      else
+	{
+	  dcond = gen_reg_rtx (CCmode);
+	  if (word_mode == DImode)
+	    emit_insn (gen_subfdi3_carry_dot2 (diff, d2_1, d1_1, dcond));
+	  else
+	    emit_insn (gen_subfsi3_carry_dot2 (diff, d2_1, d1_1, dcond));
+	}
+
+      do_ifelse (GET_MODE (dcond), NE, NULL_RTX, NULL_RTX,
+		 dcond, diff_label);
+
+      if (TARGET_P9_MISC)
+	{
+	  /* Generate a compare, and convert with a setb later.  */
+	  rtx cmp = gen_rtx_COMPARE (CCUNSmode, d1_2, d2_2);
+	  emit_insn (gen_rtx_SET (dcond, cmp));
+	}
+      else
+	{
+	  dcond = gen_reg_rtx (CCmode);
+	  if (word_mode == DImode)
+	    emit_insn (gen_subfdi3_carry_dot2 (diff, d2_2, d1_2, dcond));
+	  else
+	    emit_insn (gen_subfsi3_carry_dot2 (diff, d2_2, d1_2, dcond));
+	}
+
+      rtx eqrtx = gen_rtx_EQ (VOIDmode, d1_2, d2_2);
+      if (TARGET_64BIT)
+	j = emit_jump_insn (gen_bdnztf_di (loop_top_label, ctr, ctr,
+					   eqrtx, dcond));
+      else
+	j = emit_jump_insn (gen_bdnztf_si (loop_top_label, ctr, ctr,
+					   eqrtx, dcond));
+      JUMP_LABEL (j) = loop_top_label;
+      LABEL_NUSES (loop_top_label) += 1;
+    }
+
+  HOST_WIDE_INT bytes_remaining = 0;
+  if (bytes_is_const)
+    bytes_remaining = (bytes % loop_bytes);
+
+  /* If diff is nonzero, branch to difference handling
+     code.  If we exit here with a nonzero diff, it is
+     because the second word differed.  */
+  if (TARGET_P9_MISC)
+    do_ifelse (CCUNSmode, NE, NULL_RTX, NULL_RTX, dcond, diff_label);
+  else
+    do_ifelse (CCmode, NE, diff, const0_rtx, NULL_RTX, diff_label);
+
+  if (library_call_label != NULL && bytes_is_const && bytes > max_bytes)
+    {
+      /* If the length is known at compile time, then we will always
+	 have a remainder to go to the library call with.  */
+      rtx library_call_ref = gen_rtx_LABEL_REF (VOIDmode, library_call_label);
+      j = emit_jump_insn (gen_rtx_SET (pc_rtx, library_call_ref));
+      JUMP_LABEL (j) = library_call_label;
+      LABEL_NUSES (library_call_label) += 1;
+      emit_barrier ();
+    }
+
+  if (bytes_is_const && bytes_remaining == 0)
+    {
+      /* No remainder, and if we are here then diff is 0, so just
+	 return 0.  */
+      if (TARGET_64BIT)
+	emit_insn (gen_movsi (target, gen_lowpart (SImode, diff)));
+      else
+	emit_move_insn (target, diff);
+      j = emit_jump_insn (gen_rtx_SET (pc_rtx, final_ref));
+      JUMP_LABEL (j) = final_label;
+      LABEL_NUSES (final_label) += 1;
+      emit_barrier ();
+    }
+  else if (!no_remainder_code)
+    {
+      /* Update addresses to point to the next word to examine.  */
+      do_add3 (src1_addr, src1_addr, iv1);
+      do_add3 (src2_addr, src2_addr, iv1);
+
+      emit_label (cleanup_label);
+
+      if (!bytes_is_const)
+	{
+	  /* If we're dealing with runtime length, we have to check if
+	     it's zero after the loop. When length is known at compile
+	     time the no-remainder condition is dealt with above.  By
+	     doing this after cleanup_label, we also deal with the
+	     case where length is 0 at the start and we bypass the
+	     loop with a branch to cleanup_label.  */
+	  emit_move_insn (target, const0_rtx);
+	  do_ifelse (CCmode, EQ, cmp_rem, const0_rtx,
+		     NULL_RTX, final_label);
+	}
+
+      rtx final_cleanup = gen_label_rtx ();
+      rtx cmp_rem_before = gen_reg_rtx (word_mode);
+      /* Compare one more word_mode chunk if needed.  */
+      if (!bytes_is_const
+	  || (bytes_is_const && bytes_remaining >= load_mode_size))
+	{
+	  /* If remainder length < word length, branch to final
+	     cleanup compare.  */
+	  if (!bytes_is_const)
+	    do_ifelse (CCmode, LT, cmp_rem, GEN_INT (load_mode_size),
+		       NULL_RTX, final_cleanup);
+
+	  /* Load and compare 8B.  */
+	  do_load_for_compare_from_addr (load_mode, d1_1,
+					 src1_addr, orig_src1);
+	  do_load_for_compare_from_addr (load_mode, d2_1,
+					 src2_addr, orig_src2);
+
+	  /* Compare the word, see if we need to do the last partial.  */
+	  if (TARGET_P9_MISC)
+	    {
+	      /* Generate a compare, and convert with a setb later.  */
+	      rtx cmp = gen_rtx_COMPARE (CCUNSmode, d1_1, d2_1);
+	      emit_insn (gen_rtx_SET (dcond, cmp));
+	    }
+	  else
+	    {
+	      dcond = gen_reg_rtx (CCmode);
+	      if (word_mode == DImode)
+		emit_insn (gen_subfdi3_carry_dot2 (diff, d2_1, d1_1, dcond));
+	      else
+		emit_insn (gen_subfsi3_carry_dot2 (diff, d2_1, d1_1, dcond));
+	    }
+
+	  do_ifelse (GET_MODE (dcond), NE, NULL_RTX, NULL_RTX,
+		     dcond, diff_label);
+
+	  do_add3 (src1_addr, src1_addr, GEN_INT (load_mode_size));
+	  do_add3 (src2_addr, src2_addr, GEN_INT (load_mode_size));
+	  emit_move_insn (cmp_rem_before, cmp_rem);
+	  do_add3 (cmp_rem, cmp_rem, GEN_INT (-load_mode_size));
+	  if (bytes_is_const)
+	    bytes_remaining -= load_mode_size;
+	  else
+	    /* See if remaining length is now zero.  We previously set
+	       target to 0 so we can just jump to the end.  */
+	    do_ifelse (CCmode, EQ, cmp_rem, const0_rtx,
+		       NULL_RTX, final_label);
+
+	}
+
+      /* Cases:
+	 bytes_is_const
+	   We can always shift back to do an overlapping compare
+	   of the last chunk because we know length >= 8.
+
+	 !bytes_is_const
+	   align>=load_mode_size
+	     Read word_mode and mask
+	   align<load_mode_size
+	     avoid stepping past end
+
+	 Three strategies:
+	 * decrement address and do overlapping compare
+	 * read word_mode and mask
+	 * carefully avoid crossing 4k boundary.  */
+
+      if ((!bytes_is_const || (bytes_is_const && bytes_remaining && isP7))
+	  && align1 >= load_mode_size && align2 >= load_mode_size)
+	{
+	  /* Alignment is larger than word_mode so we do not need to be
+	     concerned with extra page crossings.  But, we do not know
+	     that the length is larger than load_mode_size so we might
+	     end up comparing against data before the block if we try
+	     an overlapping compare.  Also we use this on P7 for fixed length
+	     remainder because P7 doesn't like overlapping unaligned.
+	     Strategy: load 8B, shift off bytes past length, and compare.  */
+	  emit_label (final_cleanup);
+	  do_load_mask_compare (load_mode, diff, cmp_rem, dcond,
+				src1_addr, src2_addr, orig_src1, orig_src2);
+	}
+      else if (bytes_remaining && bytes_is_const)
+	{
+	  /* We do not do loop expand if length < 32 so we know at the
+	     end we can do an overlapping compare.
+	     Strategy: shift address back and do word_mode load that
+	     ends at the end of the block.  */
+	  emit_label (final_cleanup);
+	  do_overlap_load_compare (load_mode, true, bytes_remaining, diff,
+				   cmp_rem, dcond, src1_addr, src2_addr,
+				   orig_src1, orig_src2);
+	}
+      else if (!bytes_is_const)
+	{
+	  rtx handle4k_label = gen_label_rtx ();
+	  rtx nonconst_overlap = gen_label_rtx ();
+	  emit_label (nonconst_overlap);
+
+	  /* Here we have to handle the case where we have runtime
+	     length which may be too short for overlap compare, and
+	     alignment is not at least load_mode_size so we have to
+	     tread carefully to avoid stepping across 4k boundaries.  */
+
+	  /* If the length after the loop was larger than word_mode
+	     size, we can just do an overlapping compare and we're
+	     done.  We fall through to this code from the word_mode
+	     compare that precedes this.  */
+	  do_overlap_load_compare (load_mode, false, 0, diff,
+				   cmp_rem, dcond, src1_addr, src2_addr,
+				   orig_src1, orig_src2);
+
+	  rtx diff_ref = gen_rtx_LABEL_REF (VOIDmode, diff_label);
+	  j = emit_jump_insn (gen_rtx_SET (pc_rtx, diff_ref));
+	  JUMP_LABEL (j) = diff_label;
+	  LABEL_NUSES (diff_label) += 1;
+	  emit_barrier ();
+
+	  /* If we couldn't do the overlap compare we have to be more
+	     careful of the 4k boundary.  Test to see if either
+	     address is less than word_mode_size away from a 4k
+	     boundary.  If not, then we can do a load/shift/compare
+	     and we are done.  We come to this code if length was less
+	     than word_mode_size.  */
+
+	  emit_label (final_cleanup);
+
+	  /* We can still avoid the slow case if the length was larger
+	     than one loop iteration, in which case go do the overlap
+	     load compare path.  */
+	  do_ifelse (CCmode, GT, bytes_rtx, GEN_INT (loop_bytes),
+		     NULL_RTX, nonconst_overlap);
+
+	  rtx rem4k = gen_reg_rtx (word_mode);
+	  rtx dist1 = gen_reg_rtx (word_mode);
+	  rtx dist2 = gen_reg_rtx (word_mode);
+	  do_sub3 (rem4k, GEN_INT (4096), cmp_rem);
+	  if (word_mode == SImode)
+	    emit_insn (gen_andsi3 (dist1, src1_addr, GEN_INT (0xfff)));
+	  else
+	    emit_insn (gen_anddi3 (dist1, src1_addr, GEN_INT (0xfff)));
+	  do_ifelse (CCmode, LE, dist1, rem4k, NULL_RTX, handle4k_label);
+	  if (word_mode == SImode)
+	    emit_insn (gen_andsi3 (dist2, src2_addr, GEN_INT (0xfff)));
+	  else
+	    emit_insn (gen_anddi3 (dist2, src2_addr, GEN_INT (0xfff)));
+	  do_ifelse (CCmode, LE, dist2, rem4k, NULL_RTX, handle4k_label);
+
+	  /* We don't have a 4k boundary to deal with, so do
+	     a load/shift/compare and jump to diff.  */
+
+	  do_load_mask_compare (load_mode, diff, cmp_rem, dcond,
+				src1_addr, src2_addr, orig_src1, orig_src2);
+
+	  j = emit_jump_insn (gen_rtx_SET (pc_rtx, diff_ref));
+	  JUMP_LABEL (j) = diff_label;
+	  LABEL_NUSES (diff_label) += 1;
+	  emit_barrier ();
+
+	  /* Finally in the unlikely case we are inching up to a
+	     4k boundary we use a compact lbzx/compare loop to do
+	     it a byte at a time.  */
+
+	  emit_label (handle4k_label);
+
+	  rtx ctr = gen_rtx_REG (Pmode, CTR_REGNO);
+	  emit_move_insn (ctr, cmp_rem);
+	  rtx ixreg = gen_reg_rtx (Pmode);
+	  emit_move_insn (ixreg, const0_rtx);
+
+	  rtx src1_ix = gen_rtx_PLUS (word_mode, src1_addr, ixreg);
+	  rtx src2_ix = gen_rtx_PLUS (word_mode, src2_addr, ixreg);
+	  rtx d1 = gen_reg_rtx (word_mode);
+	  rtx d2 = gen_reg_rtx (word_mode);
+
+	  rtx fc_loop = gen_label_rtx ();
+	  emit_label (fc_loop);
+
+	  do_load_for_compare_from_addr (QImode, d1, src1_ix, orig_src1);
+	  do_load_for_compare_from_addr (QImode, d2, src2_ix, orig_src2);
+
+	  do_add3 (ixreg, ixreg, const1_rtx);
+
+	  rtx cond = gen_reg_rtx (CCmode);
+	  rtx subexpr = gen_rtx_MINUS (word_mode, d1, d2);
+	  rs6000_emit_dot_insn (diff, subexpr, 2, cond);
+
+	  rtx eqrtx = gen_rtx_EQ (VOIDmode, d1, d2);
+	  if (TARGET_64BIT)
+	    j = emit_jump_insn (gen_bdnztf_di (fc_loop, ctr, ctr,
+					       eqrtx, cond));
+	  else
+	    j = emit_jump_insn (gen_bdnztf_si (fc_loop, ctr, ctr,
+					       eqrtx, cond));
+	  JUMP_LABEL (j) = fc_loop;
+	  LABEL_NUSES (fc_loop) += 1;
+
+	  if (TARGET_64BIT)
+	    emit_insn (gen_movsi (target, gen_lowpart (SImode, diff)));
+	  else
+	    emit_move_insn (target, diff);
+
+	  /* Since we are comparing bytes, the difference can be used
+	     as the final result and we are done here.  */
+	  j = emit_jump_insn (gen_rtx_SET (pc_rtx, final_ref));
+	  JUMP_LABEL (j) = final_label;
+	  LABEL_NUSES (final_label) += 1;
+	  emit_barrier ();
+	}
+    }
+
+  emit_label (diff_label);
+  /* Difference handling, 64->32 conversion.  */
+
+  /* We need to produce DI result from sub, then convert to target SI
+     while maintaining <0 / ==0 / >0 properties.  This sequence works:
+     subfc L,A,B
+     subfe H,H,H
+     popcntd L,L
+     rldimi L,H,6,0
+
+     This is an alternate one Segher cooked up if somebody
+     wants to expand this for something that doesn't have popcntd:
+     subfc L,a,b
+     subfe H,x,x
+     addic t,L,-1
+     subfe v,t,L
+     or z,v,H
+
+     And finally, p9 can just do this:
+     cmpld A,B
+     setb r */
+
+  if (TARGET_P9_MISC)
+    emit_insn (gen_setb_unsigned (target, dcond));
+  else
+    {
+      if (TARGET_64BIT)
+	{
+	  rtx tmp_reg_ca = gen_reg_rtx (DImode);
+	  emit_insn (gen_subfdi3_carry_in_xx (tmp_reg_ca));
+	  emit_insn (gen_popcntddi2 (diff, diff));
+	  emit_insn (gen_iordi3 (diff, diff, tmp_reg_ca));
+	  emit_insn (gen_movsi (target, gen_lowpart (SImode, diff)));
+	}
+      else
+	{
+	  rtx tmp_reg_ca = gen_reg_rtx (SImode);
+	  emit_insn (gen_subfsi3_carry_in_xx (tmp_reg_ca));
+	  emit_insn (gen_popcntdsi2 (diff, diff));
+	  emit_insn (gen_iorsi3 (target, diff, tmp_reg_ca));
+	}
+    }
+
+  if (library_call_label != NULL)
+    {
+      /* Branch around memcmp call.  */
+      j = emit_jump_insn (gen_rtx_SET (pc_rtx, final_ref));
+      JUMP_LABEL (j) = final_label;
+      LABEL_NUSES (final_label) += 1;
+      emit_barrier ();
+
+      /* Make memcmp library call.  cmp_rem is the number of remaining
+	 bytes that were not compared, and is the expected amount to be
+	 compared by memcmp.  If we don't find a difference in the loop
+	 compare, do the library call directly instead of doing a small
+	 compare just to get to an arbitrary boundary before calling it
+	 anyway.  Also, update addresses to point to the next word to
+	 examine.  */
+      emit_label (library_call_label);
+
+      rtx len_rtx = gen_reg_rtx (word_mode);
+      if (bytes_is_const)
+	{
+	  emit_move_insn (len_rtx, cmp_rem);
+	  do_add3 (src1_addr, src1_addr, iv1);
+	  do_add3 (src2_addr, src2_addr, iv1);
+	}
+      else
+	emit_move_insn (len_rtx, bytes_rtx);
+
+      tree fun = builtin_decl_explicit (BUILT_IN_MEMCMP);
+      emit_library_call_value (XEXP (DECL_RTL (fun), 0),
+			       target, LCT_NORMAL, GET_MODE (target),
+			       src1_addr, Pmode,
+			       src2_addr, Pmode,
+			       len_rtx, GET_MODE (len_rtx));
+    }
+
+  /* Emit final_label.  */
+  emit_label (final_label);
+  return true;
+}
+
 /* Expand a block compare operation, and return true if successful.
    Return false if we should let the compiler generate normal code,
    probably a memcmp call.
@@ -331,10 +1284,30 @@
   if (TARGET_32BIT && TARGET_POWERPC64)
     return false;
 
-  /* If this is not a fixed size compare, just call memcmp.  */
-  if (!CONST_INT_P (bytes_rtx))
+  bool isP7 = (rs6000_tune == PROCESSOR_POWER7);
+
+  /* Allow this param to shut off all expansion.  */
+  if (rs6000_block_compare_inline_limit == 0)
     return false;
 
+  /* Don't do the expansion if unaligned access is slow.  However,
+     targetm.slow_unaligned_access returns true on P7 even though the
+     performance of this code is good there.  */
+  if (!isP7
+      && (targetm.slow_unaligned_access (word_mode, MEM_ALIGN (orig_src1))
+	  || targetm.slow_unaligned_access (word_mode, MEM_ALIGN (orig_src2))))
+    return false;
+
+  /* Unaligned l*brx traps on P7 so don't do this.  However this should
+     not affect much because LE isn't really supported on P7 anyway.  */
+  if (isP7 && !BYTES_BIG_ENDIAN)
+    return false;
+
+  /* If this is not a fixed size compare, try generating loop code and
+     if that fails just call memcmp.  */
+  if (!CONST_INT_P (bytes_rtx))
+    return expand_compare_loop (operands);
+
   /* This must be a fixed size alignment.  */
   if (!CONST_INT_P (align_rtx))
     return false;
@@ -341,11 +1314,6 @@
 
   unsigned int base_align = UINTVAL (align_rtx) / BITS_PER_UNIT;
 
-  /* targetm.slow_unaligned_access -- don't do unaligned stuff.  */
-  if (targetm.slow_unaligned_access (word_mode, MEM_ALIGN (orig_src1))
-      || targetm.slow_unaligned_access (word_mode, MEM_ALIGN (orig_src2)))
-    return false;
-
   gcc_assert (GET_MODE (target) == SImode);
 
   /* Anything to move?  */
@@ -353,14 +1321,6 @@
   if (bytes == 0)
     return true;
 
-  /* The code generated for p7 and older is not faster than glibc
-     memcmp if alignment is small and length is not short, so bail
-     out to avoid those conditions.  */
-  if (!TARGET_EFFICIENT_OVERLAPPING_UNALIGNED
-      && ((base_align == 1 && bytes > 16)
-	  || (base_align == 2 && bytes > 32)))
-    return false;
-
   rtx tmp_reg_src1 = gen_reg_rtx (word_mode);
   rtx tmp_reg_src2 = gen_reg_rtx (word_mode);
   /* P7/P8 code uses cond for subfc. but P9 uses
@@ -383,10 +1343,18 @@
     select_block_compare_mode (offset, bytes, base_align, word_mode_ok);
   unsigned int load_mode_size = GET_MODE_SIZE (load_mode);
 
-  /* We don't want to generate too much code.  */
-  unsigned HOST_WIDE_INT max_bytes =
-    load_mode_size * (unsigned HOST_WIDE_INT) rs6000_block_compare_inline_limit;
+  /* We don't want to generate too much code.  The loop code can take
+     over for lengths greater than 31 bytes.  */
+  unsigned HOST_WIDE_INT max_bytes = rs6000_block_compare_inline_limit;
   if (!IN_RANGE (bytes, 1, max_bytes))
+    return expand_compare_loop (operands);
+
+  /* The code generated for p7 and older is not faster than glibc
+     memcmp if alignment is small and length is not short, so bail
+     out to avoid those conditions.  */
+  if (!TARGET_EFFICIENT_OVERLAPPING_UNALIGNED
+      && ((base_align == 1 && bytes > 16)
+	  || (base_align == 2 && bytes > 32)))
     return false;
 
   bool generate_6432_conversion = false;
@@ -461,7 +1429,7 @@
 	  rtx src1_reg = copy_addr_to_reg (XEXP (src1, 0));
 	  src1 = replace_equiv_address (src1, src1_reg);
 	}
-      set_mem_size (src1, load_mode_size);
+      set_mem_size (src1, cmp_bytes);
 
       if (!REG_P (XEXP (src2, 0)))
 	{
@@ -468,7 +1436,7 @@
 	  rtx src2_reg = copy_addr_to_reg (XEXP (src2, 0));
 	  src2 = replace_equiv_address (src2, src2_reg);
 	}
-      set_mem_size (src2, load_mode_size);
+      set_mem_size (src2, cmp_bytes);
 
       do_load_for_compare (tmp_reg_src1, src1, load_mode);
       do_load_for_compare (tmp_reg_src2, src2, load_mode);
@@ -536,7 +1504,7 @@
 		{
 		  rtx fin_ref = gen_rtx_LABEL_REF (VOIDmode, final_label);
 		  rtx j = emit_jump_insn (gen_rtx_SET (pc_rtx, fin_ref));
-		  JUMP_LABEL(j) = final_label;
+		  JUMP_LABEL (j) = final_label;
 		  LABEL_NUSES (final_label) += 1;
 		  emit_barrier ();
 		}
@@ -576,7 +1544,7 @@
 	      rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, ne_rtx,
 						 cvt_ref, pc_rtx);
 	      rtx j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
-	      JUMP_LABEL(j) = convert_label;
+	      JUMP_LABEL (j) = convert_label;
 	      LABEL_NUSES (convert_label) += 1;
 	    }
 	  else
@@ -791,9 +1759,9 @@
       rtx jmp;
 
       /* Strncmp for power8 in glibc does this:
-	 rldicl	r8,r3,0,52
-	 cmpldi	cr7,r8,4096-16
-	 bgt	cr7,L(pagecross) */
+	 rldicl r8,r3,0,52
+	 cmpldi cr7,r8,4096-16
+	 bgt    cr7,L(pagecross) */
 
       /* Make sure that the length we use for the alignment test and
          the subsequent code generation are in agreement so we do not
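
To illustrate the subfc/subfe/popcntd sequence used at diff_label
above, here is a rough C model showing why the result keeps the
<0 / ==0 / >0 sense of memcmp.  This is a sketch for illustration
only (the patch emits RTL, not C, and the names here are made up):

	#include <stdint.h>

	/* A and B are the two loaded words, byte-reversed so that an
	   unsigned word compare matches memcmp's byte order.  */
	static int
	model_diff_result (uint64_t a, uint64_t b)
	{
	  uint64_t l = a - b;                /* subfc: low word of diff */
	  uint64_t h = (a < b) ? ~0ULL : 0;  /* subfe h,h,h: -1 on borrow */
	  l = __builtin_popcountll (l);      /* popcntd: 0 iff a == b */
	  return (int) (l | h);              /* or: -1, 0, or 1..64 */
	}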
Index: gcc/config/rs6000/rs6000.opt
===================================================================
--- gcc/config/rs6000/rs6000.opt	(revision 256350)
+++ gcc/config/rs6000/rs6000.opt	(working copy)
@@ -331,9 +331,13 @@
 Specify how many bytes should be moved inline before calling out to memcpy/memmove.
 
 mblock-compare-inline-limit=
-Target Report Var(rs6000_block_compare_inline_limit) Init(5) RejectNegative Joined UInteger Save
-Specify the maximum number pairs of load instructions that should be generated inline for the compare.  If the number needed exceeds the limit, a call to memcmp will be generated instead.
+Target Report Var(rs6000_block_compare_inline_limit) Init(31) RejectNegative Joined UInteger Save
+Specify the maximum number of bytes to compare inline with non-looping code.  If this is set to 0, all inline expansion (non-loop and loop) of memcmp is disabled.
 
+mblock-compare-inline-loop-limit=
+Target Report Var(rs6000_block_compare_inline_loop_limit) Init(-1) RejectNegative Joined UInteger Save
+Specify the maximum number of bytes to compare inline with loop code generation.  If the length is not known at compile time, memcmp will be called after this many bytes are compared.  By default, a length will be picked depending on the tuning target.
+
 mstring-compare-inline-limit=
 Target Report Var(rs6000_string_compare_inline_limit) Init(8) RejectNegative Joined UInteger Save
 Specify the maximum number pairs of load instructions that should be generated inline for the compare.  If the number needed exceeds the limit, a call to strncmp will be generated instead.
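
As a usage note (hypothetical invocation; -mcpu=power9 is just an
example, and 31 is the Init() default above):

	gcc -O2 -mcpu=power9 -mblock-compare-inline-limit=31 \
	    -mblock-compare-inline-loop-limit=128 foo.c

This compares up to 31 bytes with straight-line code, uses the loop
code for lengths up to 128 bytes, and falls back to the memcmp
library call beyond that.  Setting -mblock-compare-inline-limit=0
disables both forms of inline expansion.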
Index: gcc/testsuite/gcc.dg/memcmp-1.c
===================================================================
--- gcc/testsuite/gcc.dg/memcmp-1.c	(revision 256350)
+++ gcc/testsuite/gcc.dg/memcmp-1.c	(working copy)
@@ -14,11 +14,80 @@
 #ifndef NRAND
 #define NRAND 10000
 #endif
-#define MAX_SZ 200
+#define MAX_SZ 600
 
+#define DEF_RS(ALIGN)                                                      \
+static void test_memcmp_runtime_size_ ## ALIGN (const char *str1, 	   \
+						const char *str2,	   \
+						size_t sz, int expect)	   \
+{									   \
+  char three[8192] __attribute__ ((aligned (4096)));			   \
+  char four[8192] __attribute__ ((aligned (4096)));			   \
+  char *a, *b;								   \
+  int i,j,a1,a2,r;							   \
+  for (j = 0; j < 2; j++)						   \
+    {									   \
+      for (i = 0; i < 2; i++)						   \
+	{								   \
+	  a = three+i*ALIGN+j*(4096-2*i*ALIGN);				   \
+	  b = four+i*ALIGN+j*(4096-2*i*ALIGN);				   \
+	  memcpy(a,str1,sz);						   \
+	  memcpy(b,str2,sz);						   \
+	  asm(" ");							   \
+	  r = memcmp(a,b,sz);						   \
+	  asm(" ");							   \
+	  if ( r < 0 && !(expect < 0) ) abort();			   \
+	  if ( r > 0 && !(expect > 0) )	abort();			   \
+	  if ( r == 0 && !(expect == 0) ) abort();			   \
+	}								   \
+    }									   \
+}
+
+DEF_RS(1)
+DEF_RS(2)
+DEF_RS(4)
+DEF_RS(8)
+DEF_RS(16)
+
+static void test_memcmp_runtime_size (const char *str1, const char *str2,
+				      size_t sz, int expect)
+{
+  char three[8192] __attribute__ ((aligned (4096)));
+  char four[8192] __attribute__ ((aligned (4096)));
+  char *a, *b;
+  int i,j,a1,a2,r;
+  test_memcmp_runtime_size_1 (str1,str2,sz,expect);
+  test_memcmp_runtime_size_2 (str1,str2,sz,expect);
+  test_memcmp_runtime_size_4 (str1,str2,sz,expect);
+  test_memcmp_runtime_size_8 (str1,str2,sz,expect);
+  test_memcmp_runtime_size_16 (str1,str2,sz,expect);
+  for (j = 0; j < 2; j++)
+    {
+      for (i = 0; i < 2; i++)
+	{
+	  for (a1=0; a1 < 2*sizeof(void *); a1++)
+	    {
+	      for (a2=0; a2 < 2*sizeof(void *); a2++)
+		{
+		  a = three+i*a1+j*(4096-2*i*a1);
+		  b = four+i*a2+j*(4096-2*i*a2);
+		  memcpy(a,str1,sz);
+		  memcpy(b,str2,sz);
+		  asm(" ");
+		  r = memcmp(a,b,sz);
+		  asm(" ");
+		  if ( r < 0 && !(expect < 0) ) abort();
+		  if ( r > 0 && !(expect > 0) )	abort();
+		  if ( r == 0 && !(expect == 0) ) abort();
+		}
+	    }
+	}
+    }
+}
+
 static void test_driver_memcmp (void (test_memcmp)(const char *, const char *, int),
 				void (test_strncmp)(const char *, const char *, int),
-				size_t sz, int align)
+  size_t sz, int align)
 {
   char buf1[MAX_SZ*2+10],buf2[MAX_SZ*2+10];
   size_t test_sz = (sz<MAX_SZ)?sz:MAX_SZ;
@@ -35,11 +104,12 @@
 	buf1[j] = rand() & 0xff;
 	buf2[j] = rand() & 0xff;
       }
+      e = lib_memcmp(buf1,buf2,sz);
+      (*test_memcmp)(buf1,buf2,e);
+      test_memcmp_runtime_size (buf1, buf2, sz, e);
+      e = lib_strncmp(buf1,buf2,sz);
+      (*test_strncmp)(buf1,buf2,e);
     }
-    e = lib_memcmp(buf1,buf2,sz);
-    (*test_memcmp)(buf1,buf2,e);
-    e = lib_strncmp(buf1,buf2,sz);
-    (*test_strncmp)(buf1,buf2,e);
   }
   for(diff_pos = ((test_sz>10)?(test_sz-10):0); diff_pos < test_sz+10; diff_pos++)
     for(zero_pos = ((test_sz>10)?(test_sz-10):0); zero_pos < test_sz+10; zero_pos++)
@@ -53,6 +123,9 @@
 	(*test_memcmp)(buf1,buf2,e);
 	(*test_memcmp)(buf2,buf1,-e);
 	(*test_memcmp)(buf2,buf2,0);
+	test_memcmp_runtime_size (buf1, buf2, sz, e);
+	test_memcmp_runtime_size (buf2, buf1, sz, -e);
+	test_memcmp_runtime_size (buf2, buf2, sz, 0);
 	e = lib_strncmp(buf1,buf2,sz);
 	(*test_strncmp)(buf1,buf2,e);
 	(*test_strncmp)(buf2,buf1,-e);
@@ -61,6 +134,7 @@
 	buf2[diff_pos] = 0;
 	e = lib_memcmp(buf1,buf2,sz);
 	(*test_memcmp)(buf1,buf2,e);
+	test_memcmp_runtime_size (buf1, buf2, sz, e);
 	e = lib_strncmp(buf1,buf2,sz);
 	(*test_strncmp)(buf1,buf2,e);
 	memset(buf2+diff_pos,'B',sizeof(buf2)-diff_pos);
@@ -68,6 +142,8 @@
 	e = lib_memcmp(buf1,buf2,sz);
 	(*test_memcmp)(buf1,buf2,e);
 	(*test_memcmp)(buf2,buf1,-e);
+	test_memcmp_runtime_size (buf1, buf2, sz, e);
+	test_memcmp_runtime_size (buf2, buf1, sz, -e);
 	e = lib_strncmp(buf1,buf2,sz);
 	(*test_strncmp)(buf1,buf2,e);
 	(*test_strncmp)(buf2,buf1,-e);
@@ -371,7 +447,14 @@
 DEF_TEST(100,4)
 DEF_TEST(100,8)
 DEF_TEST(100,16)
+DEF_TEST(191,1)
+DEF_TEST(192,1)
+DEF_TEST(193,1)
+DEF_TEST(200,1)
+DEF_TEST(400,1)
 #else
+DEF_TEST(1,1)
+DEF_TEST(2,1)
 DEF_TEST(3,1)
 DEF_TEST(4,1)
 DEF_TEST(5,1)
@@ -389,6 +472,8 @@
 DEF_TEST(32,1)
 DEF_TEST(100,1)
 DEF_TEST(100,8)
+DEF_TEST(180,1)
+DEF_TEST(180,8)
 #endif
 
 int
@@ -395,7 +480,7 @@
 main(int argc, char **argv)
 {
 #ifdef TEST_ALL
-  RUN_TEST(1,1)
+    RUN_TEST(1,1)
     RUN_TEST(1,2)
     RUN_TEST(1,4)
     RUN_TEST(1,8)
@@ -645,7 +730,14 @@
     RUN_TEST(100,4)
     RUN_TEST(100,8)
     RUN_TEST(100,16)
+    RUN_TEST(191,1)
+    RUN_TEST(192,1)
+    RUN_TEST(193,1)
+    RUN_TEST(200,1)
+    RUN_TEST(400,1)
 #else
+    RUN_TEST(1,1)
+    RUN_TEST(2,1)
     RUN_TEST(3,1)
     RUN_TEST(4,1)
     RUN_TEST(5,1)
@@ -663,5 +755,7 @@
     RUN_TEST(32,1)
     RUN_TEST(100,1)
     RUN_TEST(100,8)
+    RUN_TEST(180,1)
+    RUN_TEST(180,8)
 #endif
 }
Index: gcc/testsuite/gcc.dg/strncmp-2.c
===================================================================
--- gcc/testsuite/gcc.dg/strncmp-2.c	(revision 256350)
+++ gcc/testsuite/gcc.dg/strncmp-2.c	(working copy)
@@ -81,6 +81,15 @@
 DEF_TEST(14)
 DEF_TEST(15)
 DEF_TEST(16)
+DEF_TEST(32)
+DEF_TEST(64)
+DEF_TEST(65)
+DEF_TEST(66)
+DEF_TEST(67)
+DEF_TEST(68)
+DEF_TEST(69)
+DEF_TEST(70)
+DEF_TEST(71)
 
 int
 main(int argc, char **argv)
@@ -101,5 +110,14 @@
   RUN_TEST(14);
   RUN_TEST(15);
   RUN_TEST(16);
+  RUN_TEST(32);
+  RUN_TEST(64);
+  RUN_TEST(65);
+  RUN_TEST(66);
+  RUN_TEST(67);
+  RUN_TEST(68);
+  RUN_TEST(69);
+  RUN_TEST(70);
+  RUN_TEST(71);
   return 0;
 }
