[PATCH] reorganize block/string move/compare expansions out of rs6000.c


This patch moves about 1400 lines of code for various block and string
compare/move/zero expansions out of rs6000.c into a new file 
rs6000-string.c. Segher had asked me to do this before I go adding new
code here.

Bootstrap passes on ppc64le; regtest is in progress. OK for trunk if that
passes?
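
For reference, the functions being moved implement the inline expansion
of fixed-size memcmp/strcmp/strncmp and of block move/clear, driven by
the corresponding expanders (cmpmemsi, cmpstrsi/cmpstrnsi, movmemsi,
setmemsi) in rs6000.md. As an illustration (not part of the patch),
a call like the following is the kind of thing expand_block_compare
turns into a short sequence of loads, subtracts, and branches:

    #include <string.h>

    /* Illustrative only: with a constant length within the inline
       limit, the comparison is expanded inline instead of calling
       memcmp.  */
    int
    cmp18 (const char *a, const char *b)
    {
      return memcmp (a, b, 18);
    }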



2017-06-22  Aaron Sawdey  <acsawdey@linux.vnet.ibm.com>

	* config/rs6000/rs6000-string.c (expand_block_clear,
	do_load_for_compare, select_block_compare_mode,
	compute_current_alignment, expand_block_compare,
	expand_strncmp_align_check, expand_strn_compare,
	expand_block_move, rs6000_output_load_multiple): Move
	functions related to string/block move/compare to a separate
	file.
	* config/rs6000/rs6000.c: Move above functions to
	rs6000-string.c.
	* config/rs6000/rs6000-protos.h (rs6000_emit_dot_insn): Add
	prototype for this function, which is now used in two files.
	* config/rs6000/t-rs6000: Add rule to compile rs6000-string.o.
	* config.gcc: Add rs6000-string.o to extra_objs for targets
	powerpc*-*-* and rs6000*-*-*.

-- 
Aaron Sawdey, Ph.D.  acsawdey@linux.vnet.ibm.com
050-2/C113  (507) 253-7520 home: 507/263-0782
IBM Linux Technology Center - PPC Toolchain
Index: config.gcc
===================================================================
--- config.gcc	(revision 249565)
+++ config.gcc	(working copy)
@@ -454,6 +454,7 @@
 	;;
 powerpc*-*-*)
 	cpu_type=rs6000
+	extra_objs="rs6000-string.o"
 	extra_headers="ppc-asm.h altivec.h htmintrin.h htmxlintrin.h"
 	extra_headers="${extra_headers} bmi2intrin.h bmiintrin.h x86intrin.h"
 	extra_headers="${extra_headers} ppu_intrinsics.h spu2vmx.h vec_types.h si2vmx.h"
@@ -471,6 +472,7 @@
 	;;
 rs6000*-*-*)
 	extra_options="${extra_options} g.opt fused-madd.opt rs6000/rs6000-tables.opt"
+	extra_objs="rs6000-string.o"
 	;;
 sparc*-*-*)
 	cpu_type=sparc
Index: config/rs6000/rs6000-protos.h
===================================================================
--- config/rs6000/rs6000-protos.h	(revision 249565)
+++ config/rs6000/rs6000-protos.h	(working copy)
@@ -134,6 +134,7 @@
 extern void rs6000_emit_cbranch (machine_mode, rtx[]);
 extern char * output_cbranch (rtx, const char *, int, rtx_insn *);
 extern const char * output_probe_stack_range (rtx, rtx);
+extern void rs6000_emit_dot_insn (rtx dst, rtx src, int dot, rtx ccreg);
 extern bool rs6000_emit_set_const (rtx, rtx);
 extern int rs6000_emit_cmove (rtx, rtx, rtx, rtx);
 extern int rs6000_emit_vector_cond_expr (rtx, rtx, rtx, rtx, rtx, rtx);
Index: config/rs6000/rs6000-string.c
===================================================================
--- config/rs6000/rs6000-string.c	(revision 0)
+++ config/rs6000/rs6000-string.c	(working copy)
@@ -0,0 +1,1468 @@
+/* Subroutines used for code generation on IBM RS/6000.
+   Copyright (C) 1991-2017 Free Software Foundation, Inc.
+   Contributed by Richard Kenner (kenner@vlsi1.ultra.nyu.edu)
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published
+   by the Free Software Foundation; either version 3, or (at your
+   option) any later version.
+
+   GCC is distributed in the hope that it will be useful, but WITHOUT
+   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
+   License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with GCC; see the file COPYING3.  If not see
+   <http://www.gnu.org/licenses/>.  */
+
+#include "config.h"
+#include "system.h"
+#include "coretypes.h"
+#include "backend.h"
+#include "rtl.h"
+#include "tree.h"
+#include "memmodel.h"
+#include "tm_p.h"
+#include "ira.h"
+#include "print-tree.h"
+#include "varasm.h"
+#include "explow.h"
+#include "expr.h"
+#include "output.h"
+
+#define min(A,B)	((A) < (B) ? (A) : (B))
+#define max(A,B)	((A) > (B) ? (A) : (B))
+
+/* Expand a block clear operation, and return 1 if successful.  Return 0
+   if we should let the compiler generate normal code.
+
+   operands[0] is the destination
+   operands[1] is the length
+   operands[3] is the alignment */
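+
+/* For example: clearing 22 bytes known to be 4-byte aligned on a
+   64-bit strict-alignment target (when not optimizing for size) emits
+   five SImode stores followed by one HImode store.  */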
+
+int
+expand_block_clear (rtx operands[])
+{
+  rtx orig_dest = operands[0];
+  rtx bytes_rtx	= operands[1];
+  rtx align_rtx = operands[3];
+  bool constp	= (GET_CODE (bytes_rtx) == CONST_INT);
+  HOST_WIDE_INT align;
+  HOST_WIDE_INT bytes;
+  int offset;
+  int clear_bytes;
+  int clear_step;
+
+  /* If this is not a fixed size clear, just call memset.  */
+  if (! constp)
+    return 0;
+
+  /* This must be a fixed size alignment  */
+  gcc_assert (GET_CODE (align_rtx) == CONST_INT);
+  align = INTVAL (align_rtx) * BITS_PER_UNIT;
+
+  /* Anything to clear? */
+  bytes = INTVAL (bytes_rtx);
+  if (bytes <= 0)
+    return 1;
+
+  /* Use the builtin memset after a point, to avoid huge code bloat.
+     When optimize_size, avoid any significant code bloat; calling
+     memset is about 4 instructions, so allow for one instruction to
+     load zero and three to do clearing.  */
+  if (TARGET_ALTIVEC && align >= 128)
+    clear_step = 16;
+  else if (TARGET_POWERPC64 && (align >= 64 || !STRICT_ALIGNMENT))
+    clear_step = 8;
+  else
+    clear_step = 4;
+
+  if (optimize_size && bytes > 3 * clear_step)
+    return 0;
+  if (! optimize_size && bytes > 8 * clear_step)
+    return 0;
+
+  for (offset = 0; bytes > 0; offset += clear_bytes, bytes -= clear_bytes)
+    {
+      machine_mode mode = BLKmode;
+      rtx dest;
+
+      if (bytes >= 16 && TARGET_ALTIVEC && align >= 128)
+	{
+	  clear_bytes = 16;
+	  mode = V4SImode;
+	}
+      else if (bytes >= 8 && TARGET_POWERPC64
+	       && (align >= 64 || !STRICT_ALIGNMENT))
+	{
+	  clear_bytes = 8;
+	  mode = DImode;
+	  if (offset == 0 && align < 64)
+	    {
+	      rtx addr;
+
+	      /* If the address form is reg+offset with offset not a
+		 multiple of four, reload into reg indirect form here
+		 rather than waiting for reload.  This way we get one
+		 reload, not one per store.  */
+	      addr = XEXP (orig_dest, 0);
+	      if ((GET_CODE (addr) == PLUS || GET_CODE (addr) == LO_SUM)
+		  && GET_CODE (XEXP (addr, 1)) == CONST_INT
+		  && (INTVAL (XEXP (addr, 1)) & 3) != 0)
+		{
+		  addr = copy_addr_to_reg (addr);
+		  orig_dest = replace_equiv_address (orig_dest, addr);
+		}
+	    }
+	}
+      else if (bytes >= 4 && (align >= 32 || !STRICT_ALIGNMENT))
+	{			/* move 4 bytes */
+	  clear_bytes = 4;
+	  mode = SImode;
+	}
+      else if (bytes >= 2 && (align >= 16 || !STRICT_ALIGNMENT))
+	{			/* move 2 bytes */
+	  clear_bytes = 2;
+	  mode = HImode;
+	}
+      else /* move 1 byte at a time */
+	{
+	  clear_bytes = 1;
+	  mode = QImode;
+	}
+
+      dest = adjust_address (orig_dest, mode, offset);
+
+      emit_move_insn (dest, CONST0_RTX (mode));
+    }
+
+  return 1;
+}
+
+/* Figure out the correct instructions to generate to load data for
+   block compare.  MODE is used for the read from memory, and
+   data is zero extended if REG is wider than MODE.  If LE code
+   is being generated, bswap loads are used.
+
+   REG is the destination register to move the data into.
+   MEM is the memory block being read.
+   MODE is the mode of memory to use for the read.  */
+static void
+do_load_for_compare (rtx reg, rtx mem, machine_mode mode)
+{
+  switch (GET_MODE (reg))
+    {
+    case DImode:
+      switch (mode)
+	{
+	case QImode:
+	  emit_insn (gen_zero_extendqidi2 (reg, mem));
+	  break;
+	case HImode:
+	  {
+	    rtx src = mem;
+	    if (!BYTES_BIG_ENDIAN)
+	      {
+		src = gen_reg_rtx (HImode);
+		emit_insn (gen_bswaphi2 (src, mem));
+	      }
+	    emit_insn (gen_zero_extendhidi2 (reg, src));
+	    break;
+	  }
+	case SImode:
+	  {
+	    rtx src = mem;
+	    if (!BYTES_BIG_ENDIAN)
+	      {
+		src = gen_reg_rtx (SImode);
+		emit_insn (gen_bswapsi2 (src, mem));
+	      }
+	    emit_insn (gen_zero_extendsidi2 (reg, src));
+	  }
+	  break;
+	case DImode:
+	  if (!BYTES_BIG_ENDIAN)
+	    emit_insn (gen_bswapdi2 (reg, mem));
+	  else
+	    emit_insn (gen_movdi (reg, mem));
+	  break;
+	default:
+	  gcc_unreachable ();
+	}
+      break;
+
+    case SImode:
+      switch (mode)
+	{
+	case QImode:
+	  emit_insn (gen_zero_extendqisi2 (reg, mem));
+	  break;
+	case HImode:
+	  {
+	    rtx src = mem;
+	    if (!BYTES_BIG_ENDIAN)
+	      {
+		src = gen_reg_rtx (HImode);
+		emit_insn (gen_bswaphi2 (src, mem));
+	      }
+	    emit_insn (gen_zero_extendhisi2 (reg, src));
+	    break;
+	  }
+	case SImode:
+	  if (!BYTES_BIG_ENDIAN)
+	    emit_insn (gen_bswapsi2 (reg, mem));
+	  else
+	    emit_insn (gen_movsi (reg, mem));
+	  break;
+	case DImode:
+	  /* DImode is larger than the destination reg so is not expected.  */
+	  gcc_unreachable ();
+	  break;
+	default:
+	  gcc_unreachable ();
+	}
+      break;
+    default:
+      gcc_unreachable ();
+      break;
+    }
+}
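+
+/* For example, on a little-endian target a halfword chunk loaded for a
+   DImode comparison becomes a byte-reversed load (lhbrx) followed by a
+   zero extension, so chunks always compare as big-endian unsigned
+   values.  */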
+
+/* Select the mode to be used for reading the next chunk of bytes
+   in the compare.
+
+   OFFSET is the current read offset from the beginning of the block.
+   BYTES is the number of bytes remaining to be read.
+   ALIGN is the minimum alignment of the memory blocks being compared in bytes.
+   WORD_MODE_OK indicates using WORD_MODE is allowed, else SImode is
+   the largest allowable mode.  */
+static machine_mode
+select_block_compare_mode (unsigned HOST_WIDE_INT offset,
+			   unsigned HOST_WIDE_INT bytes,
+			   unsigned HOST_WIDE_INT align, bool word_mode_ok)
+{
+  /* First see if we can do a whole load unit
+     as that will be more efficient than a larger load + shift.  */
+
+  /* If big, use biggest chunk.
+     If exactly chunk size, use that size.
+     If remainder can be done in one piece with shifting, do that.
+     Do largest chunk possible without violating alignment rules.  */
+
+  /* The most we can read without potential page crossing.  */
+  unsigned HOST_WIDE_INT maxread = ROUND_UP (bytes, align);
+
+  if (word_mode_ok && bytes >= UNITS_PER_WORD)
+    return word_mode;
+  else if (bytes == GET_MODE_SIZE (SImode))
+    return SImode;
+  else if (bytes == GET_MODE_SIZE (HImode))
+    return HImode;
+  else if (bytes == GET_MODE_SIZE (QImode))
+    return QImode;
+  else if (bytes < GET_MODE_SIZE (SImode)
+	   && offset >= GET_MODE_SIZE (SImode) - bytes)
+    /* This matches the case where we have SImode and 3 bytes
+       and offset >= 1 and permits us to move back one and overlap
+       with the previous read, thus avoiding having to shift
+       unwanted bytes off of the input.  */
+    return SImode;
+  else if (word_mode_ok && bytes < UNITS_PER_WORD
+	   && offset >= UNITS_PER_WORD-bytes)
+    /* Similarly, if we can use DImode it will get matched here and
+       can do an overlapping read that ends at the end of the block.  */
+    return word_mode;
+  else if (word_mode_ok && maxread >= UNITS_PER_WORD)
+    /* It is safe to do all remaining in one load of largest size,
+       possibly with a shift to get rid of unwanted bytes.  */
+    return word_mode;
+  else if (maxread >= GET_MODE_SIZE (SImode))
+    /* It is safe to do all remaining in one SImode load,
+       possibly with a shift to get rid of unwanted bytes.  */
+    return SImode;
+  else if (bytes > GET_MODE_SIZE (SImode))
+    return SImode;
+  else if (bytes > GET_MODE_SIZE (HImode))
+    return HImode;
+
+  /* The final fallback is to do one byte at a time.  */
+  return QImode;
+}
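+
+/* For example, an 18-byte compare with word_mode DImode allowed
+   selects DImode for the first two 8-byte chunks and HImode for the
+   final 2 bytes, matching the generated-code example shown in
+   expand_block_compare below.  */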
+
+/* Compute the alignment of pointer+OFFSET where the original alignment
+   of pointer was BASE_ALIGN.  */
+static unsigned HOST_WIDE_INT
+compute_current_alignment (unsigned HOST_WIDE_INT base_align,
+			   unsigned HOST_WIDE_INT offset)
+{
+  if (offset == 0)
+    return base_align;
+  return min (base_align, offset & -offset);
+}
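+
+/* For example, with BASE_ALIGN of 8 and OFFSET of 12 this returns
+   min (8, 12 & -12) = 4: pointer+12 is only guaranteed to be 4-byte
+   aligned.  */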
+
+/* Expand a block compare operation, and return true if successful.
+   Return false if we should let the compiler generate normal code,
+   probably a memcmp call.
+
+   OPERANDS[0] is the target (result).
+   OPERANDS[1] is the first source.
+   OPERANDS[2] is the second source.
+   OPERANDS[3] is the length.
+   OPERANDS[4] is the alignment.  */
+bool
+expand_block_compare (rtx operands[])
+{
+  rtx target = operands[0];
+  rtx orig_src1 = operands[1];
+  rtx orig_src2 = operands[2];
+  rtx bytes_rtx = operands[3];
+  rtx align_rtx = operands[4];
+  HOST_WIDE_INT cmp_bytes = 0;
+  rtx src1 = orig_src1;
+  rtx src2 = orig_src2;
+
+  /* This case is complicated to handle because the subtract
+     with carry instructions do not generate the 64-bit
+     carry and so we must emit code to calculate it ourselves.
+     We choose not to implement this yet.  */
+  if (TARGET_32BIT && TARGET_POWERPC64)
+    return false;
+
+  /* If this is not a fixed size compare, just call memcmp.  */
+  if (!CONST_INT_P (bytes_rtx))
+    return false;
+
+  /* This must be a fixed size alignment.  */
+  if (!CONST_INT_P (align_rtx))
+    return false;
+
+  unsigned int base_align = UINTVAL (align_rtx) / BITS_PER_UNIT;
+
+  /* SLOW_UNALIGNED_ACCESS -- don't do unaligned stuff.  */
+  if (SLOW_UNALIGNED_ACCESS (word_mode, MEM_ALIGN (orig_src1))
+      || SLOW_UNALIGNED_ACCESS (word_mode, MEM_ALIGN (orig_src2)))
+    return false;
+
+  gcc_assert (GET_MODE (target) == SImode);
+
+  /* Anything to move?  */
+  unsigned HOST_WIDE_INT bytes = UINTVAL (bytes_rtx);
+  if (bytes == 0)
+    return true;
+
+  /* The code generated for p7 and older is not faster than glibc
+     memcmp if alignment is small and length is not short, so bail
+     out to avoid those conditions.  */
+  if (!TARGET_EFFICIENT_OVERLAPPING_UNALIGNED
+      && ((base_align == 1 && bytes > 16)
+	  || (base_align == 2 && bytes > 32)))
+    return false;
+
+  rtx tmp_reg_src1 = gen_reg_rtx (word_mode);
+  rtx tmp_reg_src2 = gen_reg_rtx (word_mode);
+  /* P7/P8 code uses cond for the subfc. instruction, but P9 uses it
+     for cmpld, which needs CCUNSmode.  */
+  rtx cond;
+  if (TARGET_P9_MISC)
+    cond = gen_reg_rtx (CCUNSmode);
+  else
+    cond = gen_reg_rtx (CCmode);
+
+  /* If we have an LE target without ldbrx and word_mode is DImode,
+     then we must avoid using word_mode.  */
+  int word_mode_ok = !(!BYTES_BIG_ENDIAN && !TARGET_LDBRX
+		       && word_mode == DImode);
+
+  /* Strategy phase.  How many ops will this take and should we expand it?  */
+
+  unsigned HOST_WIDE_INT offset = 0;
+  machine_mode load_mode =
+    select_block_compare_mode (offset, bytes, base_align, word_mode_ok);
+  unsigned int load_mode_size = GET_MODE_SIZE (load_mode);
+
+  /* We don't want to generate too much code.  */
+  unsigned HOST_WIDE_INT max_bytes =
+    load_mode_size * (unsigned HOST_WIDE_INT) rs6000_block_compare_inline_limit;
+  if (!IN_RANGE (bytes, 1, max_bytes))
+    return false;
+
+  bool generate_6432_conversion = false;
+  rtx convert_label = NULL;
+  rtx final_label = NULL;
+
+  /* Example of generated code for an 18-byte compare with 1-byte alignment.
+     Compiled with -fno-reorder-blocks for clarity.
+             ldbrx 10,31,8
+             ldbrx 9,7,8
+             subfc. 9,9,10
+             bne 0,.L6487
+             addi 9,12,8
+             addi 5,11,8
+             ldbrx 10,0,9
+             ldbrx 9,0,5
+             subfc. 9,9,10
+             bne 0,.L6487
+             addi 9,12,16
+             lhbrx 10,0,9
+             addi 9,11,16
+             lhbrx 9,0,9
+             subf 9,9,10
+             b .L6488
+             .p2align 4,,15
+     .L6487: #convert_label
+             popcntd 9,9
+             subfe 10,10,10
+             or 9,9,10
+     .L6488: #final_label
+             extsw 10,9
+
+     We start off with DImode for two blocks that jump to the DI->SI conversion
+     if the difference is found there, then a final block of HImode that skips
+     the DI->SI conversion.  */
+
+  while (bytes > 0)
+    {
+      unsigned int align = compute_current_alignment (base_align, offset);
+      if (TARGET_EFFICIENT_OVERLAPPING_UNALIGNED)
+	load_mode = select_block_compare_mode (offset, bytes, align,
+					       word_mode_ok);
+      else
+	load_mode = select_block_compare_mode (0, bytes, align, word_mode_ok);
+      load_mode_size = GET_MODE_SIZE (load_mode);
+      if (bytes >= load_mode_size)
+	cmp_bytes = load_mode_size;
+      else if (TARGET_EFFICIENT_OVERLAPPING_UNALIGNED)
+	{
+	  /* Move this load back so it doesn't go past the end.
+	     P8/P9 can do this efficiently.  */
+	  unsigned int extra_bytes = load_mode_size - bytes;
+	  cmp_bytes = bytes;
+	  if (extra_bytes < offset)
+	    {
+	      offset -= extra_bytes;
+	      cmp_bytes = load_mode_size;
+	      bytes = cmp_bytes;
+	    }
+	}
+      else
+	/* P7 and earlier can't do the overlapping load trick fast,
+	   so this forces a non-overlapping load and a shift to get
+	   rid of the extra bytes.  */
+	cmp_bytes = bytes;
+
+      src1 = adjust_address (orig_src1, load_mode, offset);
+      src2 = adjust_address (orig_src2, load_mode, offset);
+
+      if (!REG_P (XEXP (src1, 0)))
+	{
+	  rtx src1_reg = copy_addr_to_reg (XEXP (src1, 0));
+	  src1 = replace_equiv_address (src1, src1_reg);
+	}
+      set_mem_size (src1, cmp_bytes);
+
+      if (!REG_P (XEXP (src2, 0)))
+	{
+	  rtx src2_reg = copy_addr_to_reg (XEXP (src2, 0));
+	  src2 = replace_equiv_address (src2, src2_reg);
+	}
+      set_mem_size (src2, cmp_bytes);
+
+      do_load_for_compare (tmp_reg_src1, src1, load_mode);
+      do_load_for_compare (tmp_reg_src2, src2, load_mode);
+
+      if (cmp_bytes < load_mode_size)
+	{
+	  /* Shift unneeded bytes off.  */
+	  rtx sh = GEN_INT (BITS_PER_UNIT * (load_mode_size - cmp_bytes));
+	  if (word_mode == DImode)
+	    {
+	      emit_insn (gen_lshrdi3 (tmp_reg_src1, tmp_reg_src1, sh));
+	      emit_insn (gen_lshrdi3 (tmp_reg_src2, tmp_reg_src2, sh));
+	    }
+	  else
+	    {
+	      emit_insn (gen_lshrsi3 (tmp_reg_src1, tmp_reg_src1, sh));
+	      emit_insn (gen_lshrsi3 (tmp_reg_src2, tmp_reg_src2, sh));
+	    }
+	}
+
+      int remain = bytes - cmp_bytes;
+      if (GET_MODE_SIZE (GET_MODE (target)) > GET_MODE_SIZE (load_mode))
+	{
+	  /* Target is larger than load size so we don't need to
+	     reduce result size.  */
+
+	  /* We previously did a block that needed 64->32 conversion but
+	     the current block does not, so a label is needed to jump
+	     to the end.  */
+	  if (generate_6432_conversion && !final_label)
+	    final_label = gen_label_rtx ();
+
+	  if (remain > 0)
+	    {
+	      /* This is not the last block, branch to the end if the result
+		 of this subtract is not zero.  */
+	      if (!final_label)
+		final_label = gen_label_rtx ();
+	      rtx fin_ref = gen_rtx_LABEL_REF (VOIDmode, final_label);
+	      rtx tmp = gen_rtx_MINUS (word_mode, tmp_reg_src1, tmp_reg_src2);
+	      rtx cr = gen_reg_rtx (CCmode);
+	      rs6000_emit_dot_insn (tmp_reg_src2, tmp, 2, cr);
+	      emit_insn (gen_movsi (target,
+				    gen_lowpart (SImode, tmp_reg_src2)));
+	      rtx ne_rtx = gen_rtx_NE (VOIDmode, cr, const0_rtx);
+	      rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, ne_rtx,
+						 fin_ref, pc_rtx);
+	      rtx j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
+	      JUMP_LABEL (j) = final_label;
+	      LABEL_NUSES (final_label) += 1;
+	    }
+	  else
+	    {
+	      if (word_mode == DImode)
+		{
+		  emit_insn (gen_subdi3 (tmp_reg_src2, tmp_reg_src1,
+					 tmp_reg_src2));
+		  emit_insn (gen_movsi (target,
+					gen_lowpart (SImode, tmp_reg_src2)));
+		}
+	      else
+		emit_insn (gen_subsi3 (target, tmp_reg_src1, tmp_reg_src2));
+
+	      if (final_label)
+		{
+		  rtx fin_ref = gen_rtx_LABEL_REF (VOIDmode, final_label);
+		  rtx j = emit_jump_insn (gen_rtx_SET (pc_rtx, fin_ref));
+		  JUMP_LABEL(j) = final_label;
+		  LABEL_NUSES (final_label) += 1;
+		  emit_barrier ();
+		}
+	    }
+	}
+      else
+	{
+	  /* Do we need a 64->32 conversion block? We need the 64->32
+	     conversion even if target size == load_mode size because
+	     the subtract generates one extra bit.  */
+	  generate_6432_conversion = true;
+
+	  if (remain > 0)
+	    {
+	      if (!convert_label)
+		convert_label = gen_label_rtx ();
+
+	      /* Compare to zero and branch to convert_label if not zero.  */
+	      rtx cvt_ref = gen_rtx_LABEL_REF (VOIDmode, convert_label);
+	      if (TARGET_P9_MISC)
+		{
+		/* Generate a compare, and convert with a setb later.  */
+		  rtx cmp = gen_rtx_COMPARE (CCUNSmode, tmp_reg_src1,
+					     tmp_reg_src2);
+		  emit_insn (gen_rtx_SET (cond, cmp));
+		}
+	      else
+		/* Generate a subfc. and use the longer
+		   sequence for conversion.  */
+		if (TARGET_64BIT)
+		  emit_insn (gen_subfdi3_carry_dot2 (tmp_reg_src2, tmp_reg_src2,
+						     tmp_reg_src1, cond));
+		else
+		  emit_insn (gen_subfsi3_carry_dot2 (tmp_reg_src2, tmp_reg_src2,
+						     tmp_reg_src1, cond));
+	      rtx ne_rtx = gen_rtx_NE (VOIDmode, cond, const0_rtx);
+	      rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, ne_rtx,
+						 cvt_ref, pc_rtx);
+	      rtx j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
+	      JUMP_LABEL(j) = convert_label;
+	      LABEL_NUSES (convert_label) += 1;
+	    }
+	  else
+	    {
+	      /* Just do the subtract/compare.  Since this is the last block
+		 the convert code will be generated immediately following.  */
+	      if (TARGET_P9_MISC)
+		{
+		  rtx cmp = gen_rtx_COMPARE (CCUNSmode, tmp_reg_src1,
+					     tmp_reg_src2);
+		  emit_insn (gen_rtx_SET (cond, cmp));
+		}
+	      else
+		if (TARGET_64BIT)
+		  emit_insn (gen_subfdi3_carry (tmp_reg_src2, tmp_reg_src2,
+						tmp_reg_src1));
+		else
+		  emit_insn (gen_subfsi3_carry (tmp_reg_src2, tmp_reg_src2,
+						tmp_reg_src1));
+	    }
+	}
+
+      offset += cmp_bytes;
+      bytes -= cmp_bytes;
+    }
+
+  if (generate_6432_conversion)
+    {
+      if (convert_label)
+	emit_label (convert_label);
+
+      /* We need to produce DI result from sub, then convert to target SI
+	 while maintaining <0 / ==0 / >0 properties. This sequence works:
+	 subfc L,A,B
+	 subfe H,H,H
+	 popcntd L,L
+	 rldimi L,H,6,0
+
+	 This is an alternate one Segher cooked up if somebody
+	 wants to expand this for something that doesn't have popcntd:
+	 subfc L,a,b
+	 subfe H,x,x
+	 addic t,L,-1
+	 subfe v,t,L
+	 or z,v,H
+
+	 And finally, p9 can just do this:
+	 cmpld A,B
+	 setb r */
+
+      if (TARGET_P9_MISC)
+	{
+	  emit_insn (gen_setb_unsigned (target, cond));
+	}
+      else
+	{
+	  if (TARGET_64BIT)
+	    {
+	      rtx tmp_reg_ca = gen_reg_rtx (DImode);
+	      emit_insn (gen_subfdi3_carry_in_xx (tmp_reg_ca));
+	      emit_insn (gen_popcntddi2 (tmp_reg_src2, tmp_reg_src2));
+	      emit_insn (gen_iordi3 (tmp_reg_src2, tmp_reg_src2, tmp_reg_ca));
+	      emit_insn (gen_movsi (target, gen_lowpart (SImode, tmp_reg_src2)));
+	    }
+	  else
+	    {
+	      rtx tmp_reg_ca = gen_reg_rtx (SImode);
+	      emit_insn (gen_subfsi3_carry_in_xx (tmp_reg_ca));
+	      emit_insn (gen_popcntdsi2 (tmp_reg_src2, tmp_reg_src2));
+	      emit_insn (gen_iorsi3 (target, tmp_reg_src2, tmp_reg_ca));
+	    }
+	}
+    }
+
+  if (final_label)
+    emit_label (final_label);
+
+  gcc_assert (bytes == 0);
+  return true;
+}
+
+/* Generate alignment check and branch code to set up for
+   strncmp when we don't have DI alignment.
+   STRNCMP_LABEL is the label to branch if there is a page crossing.
+   SRC is the string pointer to be examined.
+   BYTES is the max number of bytes to compare.  */
+static void
+expand_strncmp_align_check (rtx strncmp_label, rtx src, HOST_WIDE_INT bytes)
+{
+  rtx lab_ref = gen_rtx_LABEL_REF (VOIDmode, strncmp_label);
+  rtx src_check = copy_addr_to_reg (XEXP (src, 0));
+  if (GET_MODE (src_check) == SImode)
+    emit_insn (gen_andsi3 (src_check, src_check, GEN_INT (0xfff)));
+  else
+    emit_insn (gen_anddi3 (src_check, src_check, GEN_INT (0xfff)));
+  rtx cond = gen_reg_rtx (CCmode);
+  emit_move_insn (cond, gen_rtx_COMPARE (CCmode, src_check,
+					 GEN_INT (4096 - bytes)));
+
+  rtx cmp_rtx = gen_rtx_LT (VOIDmode, cond, const0_rtx);
+
+  rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, cmp_rtx,
+				     pc_rtx, lab_ref);
+  rtx j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
+  JUMP_LABEL (j) = strncmp_label;
+  LABEL_NUSES (strncmp_label) += 1;
+}
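+
+/* In other words, the code above branches to STRNCMP_LABEL whenever
+   (addr & 0xfff) >= 4096 - BYTES, i.e. whenever reading BYTES bytes
+   from SRC might cross a 4 KiB page boundary.  */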
+
+/* Expand a string compare operation with length, and return
+   true if successful. Return false if we should let the
+   compiler generate normal code, probably a strncmp call.
+
+   OPERANDS[0] is the target (result).
+   OPERANDS[1] is the first source.
+   OPERANDS[2] is the second source.
+   If NO_LENGTH is zero, then:
+   OPERANDS[3] is the length.
+   OPERANDS[4] is the alignment in bytes.
+   If NO_LENGTH is nonzero, then:
+   OPERANDS[3] is the alignment in bytes.  */
+bool
+expand_strn_compare (rtx operands[], int no_length)
+{
+  rtx target = operands[0];
+  rtx orig_src1 = operands[1];
+  rtx orig_src2 = operands[2];
+  rtx bytes_rtx, align_rtx;
+  if (no_length)
+    {
+      bytes_rtx = NULL;
+      align_rtx = operands[3];
+    }
+  else
+    {
+      bytes_rtx = operands[3];
+      align_rtx = operands[4];
+    }
+  unsigned HOST_WIDE_INT cmp_bytes = 0;
+  rtx src1 = orig_src1;
+  rtx src2 = orig_src2;
+
+  /* If we have a length, it must be constant.  This simplifies things
+     a bit as we don't have to generate code to check if we've exceeded
+     the length.  Later this could be extended to handle the
+     non-constant case.  */
+  if (!no_length && !CONST_INT_P (bytes_rtx))
+    return false;
+
+  /* This must be a fixed size alignment.  */
+  if (!CONST_INT_P (align_rtx))
+    return false;
+
+  unsigned int base_align = UINTVAL (align_rtx);
+  int align1 = MEM_ALIGN (orig_src1) / BITS_PER_UNIT;
+  int align2 = MEM_ALIGN (orig_src2) / BITS_PER_UNIT;
+
+  /* SLOW_UNALIGNED_ACCESS -- don't do unaligned stuff.  */
+  if (SLOW_UNALIGNED_ACCESS (word_mode, align1)
+      || SLOW_UNALIGNED_ACCESS (word_mode, align2))
+    return false;
+
+  gcc_assert (GET_MODE (target) == SImode);
+
+  /* If we have an LE target without ldbrx and word_mode is DImode,
+     then we must avoid using word_mode.  */
+  int word_mode_ok = !(!BYTES_BIG_ENDIAN && !TARGET_LDBRX
+		       && word_mode == DImode);
+
+  unsigned int word_mode_size = GET_MODE_SIZE (word_mode);
+
+  unsigned HOST_WIDE_INT offset = 0;
+  unsigned HOST_WIDE_INT bytes; /* N from the strncmp args if available.  */
+  unsigned HOST_WIDE_INT compare_length; /* How much to compare inline.  */
+  if (no_length)
+    /* Use this as a stand-in to determine the mode to use.  */
+    bytes = rs6000_string_compare_inline_limit * word_mode_size;
+  else
+    bytes = UINTVAL (bytes_rtx);
+
+  machine_mode load_mode =
+    select_block_compare_mode (offset, bytes, base_align, word_mode_ok);
+  unsigned int load_mode_size = GET_MODE_SIZE (load_mode);
+  compare_length = rs6000_string_compare_inline_limit * load_mode_size;
+
+  /* If we have equality at the end of the last compare and we have not
+     found the end of the string, we need to call strcmp/strncmp to
+     compare the remainder.  */
+  bool equality_compare_rest = false;
+
+  if (no_length)
+    {
+      bytes = compare_length;
+      equality_compare_rest = true;
+    }
+  else
+    {
+      if (bytes <= compare_length)
+	compare_length = bytes;
+      else
+	equality_compare_rest = true;
+    }
+
+  rtx result_reg = gen_reg_rtx (word_mode);
+  rtx final_move_label = gen_label_rtx ();
+  rtx final_label = gen_label_rtx ();
+  rtx begin_compare_label = NULL;
+
+  if (base_align < 8)
+    {
+      /* Generate code that checks distance to 4k boundary for this case.  */
+      begin_compare_label = gen_label_rtx ();
+      rtx strncmp_label = gen_label_rtx ();
+      rtx jmp;
+
+      /* Strncmp for power8 in glibc does this:
+	 rldicl	r8,r3,0,52
+	 cmpldi	cr7,r8,4096-16
+	 bgt	cr7,L(pagecross) */
+
+      /* Make sure that the length we use for the alignment test and
+         the subsequent code generation are in agreement so we do not
+         go past the length we tested for a 4k boundary crossing.  */
+      unsigned HOST_WIDE_INT align_test = compare_length;
+      if (align_test < 8)
+        {
+          align_test = HOST_WIDE_INT_1U << ceil_log2 (align_test);
+          base_align = align_test;
+        }
+      else
+        {
+          align_test = ROUND_UP (align_test, 8);
+          base_align = 8;
+        }
+
+      if (align1 < 8)
+        expand_strncmp_align_check (strncmp_label, src1, align_test);
+      if (align2 < 8)
+        expand_strncmp_align_check (strncmp_label, src2, align_test);
+
+      /* Now generate the following sequence:
+	 - branch to begin_compare
+	 - strncmp_label
+	 - call to strncmp
+	 - branch to final_label
+	 - begin_compare_label */
+
+      rtx cmp_ref = gen_rtx_LABEL_REF (VOIDmode, begin_compare_label);
+      jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, cmp_ref));
+      JUMP_LABEL (jmp) = begin_compare_label;
+      LABEL_NUSES (begin_compare_label) += 1;
+      emit_barrier ();
+
+      emit_label (strncmp_label);
+
+      if (!REG_P (XEXP (src1, 0)))
+	{
+	  rtx src1_reg = copy_addr_to_reg (XEXP (src1, 0));
+	  src1 = replace_equiv_address (src1, src1_reg);
+	}
+
+      if (!REG_P (XEXP (src2, 0)))
+	{
+	  rtx src2_reg = copy_addr_to_reg (XEXP (src2, 0));
+	  src2 = replace_equiv_address (src2, src2_reg);
+	}
+
+      if (no_length)
+	{
+	  tree fun = builtin_decl_explicit (BUILT_IN_STRCMP);
+	  emit_library_call_value (XEXP (DECL_RTL (fun), 0),
+				   target, LCT_NORMAL, GET_MODE (target), 2,
+				   force_reg (Pmode, XEXP (src1, 0)), Pmode,
+				   force_reg (Pmode, XEXP (src2, 0)), Pmode);
+	}
+      else
+	{
+	  /* -m32 -mpowerpc64 results in word_mode being DImode even
+	     though otherwise it is 32-bit. The length arg to strncmp
+	     is a size_t which will be the same size as pointers.  */
+	  rtx len_rtx;
+	  if (TARGET_64BIT)
+	    len_rtx = gen_reg_rtx (DImode);
+	  else
+	    len_rtx = gen_reg_rtx (SImode);
+
+	  emit_move_insn (len_rtx, bytes_rtx);
+
+	  tree fun = builtin_decl_explicit (BUILT_IN_STRNCMP);
+	  emit_library_call_value (XEXP (DECL_RTL (fun), 0),
+				   target, LCT_NORMAL, GET_MODE (target), 3,
+				   force_reg (Pmode, XEXP (src1, 0)), Pmode,
+				   force_reg (Pmode, XEXP (src2, 0)), Pmode,
+				   len_rtx, GET_MODE (len_rtx));
+	}
+
+      rtx fin_ref = gen_rtx_LABEL_REF (VOIDmode, final_label);
+      jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, fin_ref));
+      JUMP_LABEL (jmp) = final_label;
+      LABEL_NUSES (final_label) += 1;
+      emit_barrier ();
+      emit_label (begin_compare_label);
+    }
+
+  rtx cleanup_label = NULL;
+  rtx tmp_reg_src1 = gen_reg_rtx (word_mode);
+  rtx tmp_reg_src2 = gen_reg_rtx (word_mode);
+
+  /* Generate sequence of ld/ldbrx, cmpb to compare out
+     to the length specified.  */
+  unsigned HOST_WIDE_INT bytes_to_compare = compare_length;
+  while (bytes_to_compare > 0)
+    {
+      /* Compare sequence:
+         check each 8B with: ld/ld cmpd bne
+	 If equal, use rldicr/cmpb to check for zero byte.
+         cleanup code at end:
+         cmpb          get byte that differs
+         cmpb          look for zero byte
+         orc           combine
+         cntlzd        get bit of first zero/diff byte
+         subfic        convert for rldcl use
+         rldcl rldcl   extract diff/zero byte
+         subf          subtract for final result
+
+         The last compare can branch around the cleanup code if the
+         result is zero because the strings are exactly equal.  */
+      unsigned int align = compute_current_alignment (base_align, offset);
+      if (TARGET_EFFICIENT_OVERLAPPING_UNALIGNED)
+	load_mode = select_block_compare_mode (offset, bytes_to_compare, align,
+					       word_mode_ok);
+      else
+	load_mode = select_block_compare_mode (0, bytes_to_compare, align,
+					       word_mode_ok);
+      load_mode_size = GET_MODE_SIZE (load_mode);
+      if (bytes_to_compare >= load_mode_size)
+	cmp_bytes = load_mode_size;
+      else if (TARGET_EFFICIENT_OVERLAPPING_UNALIGNED)
+	{
+	  /* Move this load back so it doesn't go past the end.
+	     P8/P9 can do this efficiently.  */
+	  unsigned int extra_bytes = load_mode_size - bytes_to_compare;
+	  cmp_bytes = bytes_to_compare;
+	  if (extra_bytes < offset)
+	    {
+	      offset -= extra_bytes;
+	      cmp_bytes = load_mode_size;
+	      bytes_to_compare = cmp_bytes;
+	    }
+	}
+      else
+	/* P7 and earlier can't do the overlapping load trick fast,
+	   so this forces a non-overlapping load and a shift to get
+	   rid of the extra bytes.  */
+	cmp_bytes = bytes_to_compare;
+
+      src1 = adjust_address (orig_src1, load_mode, offset);
+      src2 = adjust_address (orig_src2, load_mode, offset);
+
+      if (!REG_P (XEXP (src1, 0)))
+	{
+	  rtx src1_reg = copy_addr_to_reg (XEXP (src1, 0));
+	  src1 = replace_equiv_address (src1, src1_reg);
+	}
+      set_mem_size (src1, cmp_bytes);
+
+      if (!REG_P (XEXP (src2, 0)))
+	{
+	  rtx src2_reg = copy_addr_to_reg (XEXP (src2, 0));
+	  src2 = replace_equiv_address (src2, src2_reg);
+	}
+      set_mem_size (src2, cmp_bytes);
+
+      do_load_for_compare (tmp_reg_src1, src1, load_mode);
+      do_load_for_compare (tmp_reg_src2, src2, load_mode);
+
+      /* We must always left-align the data we read, and
+	 clear any bytes to the right that are beyond the string.
+	 Otherwise the cmpb sequence won't produce the correct
+	 results.  The beginning of the compare will be done
+	 with word_mode so will not have any extra shifts or
+	 clear rights.  */
+
+      if (load_mode_size < word_mode_size)
+	{
+	  /* Rotate left first. */
+	  rtx sh = GEN_INT (BITS_PER_UNIT * (word_mode_size - load_mode_size));
+	  if (word_mode == DImode)
+	    {
+	      emit_insn (gen_rotldi3 (tmp_reg_src1, tmp_reg_src1, sh));
+	      emit_insn (gen_rotldi3 (tmp_reg_src2, tmp_reg_src2, sh));
+	    }
+	  else
+	    {
+	      emit_insn (gen_rotlsi3 (tmp_reg_src1, tmp_reg_src1, sh));
+	      emit_insn (gen_rotlsi3 (tmp_reg_src2, tmp_reg_src2, sh));
+	    }
+	}
+
+      if (cmp_bytes < word_mode_size)
+	{
+	  /* Now clear right.  This plus the rotate can be
+	     turned into a rldicr instruction. */
+	  HOST_WIDE_INT mb = BITS_PER_UNIT * (word_mode_size - cmp_bytes);
+	  rtx mask = GEN_INT (HOST_WIDE_INT_M1U << mb);
+	  if (word_mode == DImode)
+	    {
+	      emit_insn (gen_anddi3_mask (tmp_reg_src1, tmp_reg_src1, mask));
+	      emit_insn (gen_anddi3_mask (tmp_reg_src2, tmp_reg_src2, mask));
+	    }
+	  else
+	    {
+	      emit_insn (gen_andsi3_mask (tmp_reg_src1, tmp_reg_src1, mask));
+	      emit_insn (gen_andsi3_mask (tmp_reg_src2, tmp_reg_src2, mask));
+	    }
+	}
+
+      /* Cases to handle.  A and B are chunks of the two strings.
+	 1: Not end of comparison:
+	 A != B: branch to cleanup code to compute result.
+	 A == B: check for 0 byte, next block if not found.
+	 2: End of the inline comparison:
+	 A != B: branch to cleanup code to compute result.
+	 A == B: check for 0 byte, call strcmp/strncmp
+	 3: Compared the requested N bytes:
+	 A == B: branch to result 0.
+	 A != B: cleanup code to compute result.  */
+
+      unsigned HOST_WIDE_INT remain = bytes_to_compare - cmp_bytes;
+
+      rtx dst_label;
+      if (remain > 0 || equality_compare_rest)
+	{
+	  /* Branch to cleanup code, otherwise fall through to do
+	     more compares.  */
+	  if (!cleanup_label)
+	    cleanup_label = gen_label_rtx ();
+	  dst_label = cleanup_label;
+	}
+      else
+	/* Branch to end and produce result of 0.  */
+	dst_label = final_move_label;
+
+      rtx lab_ref = gen_rtx_LABEL_REF (VOIDmode, dst_label);
+      rtx cond = gen_reg_rtx (CCmode);
+
+      /* Always produce the 0 result, it is needed if
+	 cmpb finds a 0 byte in this chunk.  */
+      rtx tmp = gen_rtx_MINUS (word_mode, tmp_reg_src1, tmp_reg_src2);
+      rs6000_emit_dot_insn (result_reg, tmp, 1, cond);
+
+      rtx cmp_rtx;
+      if (remain == 0 && !equality_compare_rest)
+	cmp_rtx = gen_rtx_EQ (VOIDmode, cond, const0_rtx);
+      else
+	cmp_rtx = gen_rtx_NE (VOIDmode, cond, const0_rtx);
+
+      rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, cmp_rtx,
+					 lab_ref, pc_rtx);
+      rtx j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
+      JUMP_LABEL (j) = dst_label;
+      LABEL_NUSES (dst_label) += 1;
+
+      if (remain > 0 || equality_compare_rest)
+	{
+	  /* Generate a cmpb to test for a 0 byte and branch
+	     to final result if found.  */
+	  rtx cmpb_zero = gen_reg_rtx (word_mode);
+	  rtx lab_ref_fin = gen_rtx_LABEL_REF (VOIDmode, final_move_label);
+	  rtx condz = gen_reg_rtx (CCmode);
+	  rtx zero_reg = gen_reg_rtx (word_mode);
+	  if (word_mode == SImode)
+	    {
+	      emit_insn (gen_movsi (zero_reg, GEN_INT (0)));
+	      emit_insn (gen_cmpbsi3 (cmpb_zero, tmp_reg_src1, zero_reg));
+	      if (cmp_bytes < word_mode_size)
+		{
+		  /* Don't want to look at zero bytes past end.  */
+		  HOST_WIDE_INT mb =
+		    BITS_PER_UNIT * (word_mode_size - cmp_bytes);
+		  rtx mask = GEN_INT (HOST_WIDE_INT_M1U << mb);
+		  emit_insn (gen_andsi3_mask (cmpb_zero, cmpb_zero, mask));
+		}
+	    }
+	  else
+	    {
+	      emit_insn (gen_movdi (zero_reg, GEN_INT (0)));
+	      emit_insn (gen_cmpbdi3 (cmpb_zero, tmp_reg_src1, zero_reg));
+	      if (cmp_bytes < word_mode_size)
+		{
+		  /* Don't want to look at zero bytes past end.  */
+		  HOST_WIDE_INT mb =
+		    BITS_PER_UNIT * (word_mode_size - cmp_bytes);
+		  rtx mask = GEN_INT (HOST_WIDE_INT_M1U << mb);
+		  emit_insn (gen_anddi3_mask (cmpb_zero, cmpb_zero, mask));
+		}
+	    }
+
+	  emit_move_insn (condz, gen_rtx_COMPARE (CCmode, cmpb_zero, zero_reg));
+	  rtx cmpnz_rtx = gen_rtx_NE (VOIDmode, condz, const0_rtx);
+	  rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, cmpnz_rtx,
+					     lab_ref_fin, pc_rtx);
+	  rtx j2 = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
+	  JUMP_LABEL (j2) = final_move_label;
+	  LABEL_NUSES (final_move_label) += 1;
+
+	}
+
+      offset += cmp_bytes;
+      bytes_to_compare -= cmp_bytes;
+    }
+
+  if (equality_compare_rest)
+    {
+      /* Update pointers past what has been compared already.  */
+      src1 = adjust_address (orig_src1, load_mode, offset);
+      src2 = adjust_address (orig_src2, load_mode, offset);
+
+      if (!REG_P (XEXP (src1, 0)))
+	{
+	  rtx src1_reg = copy_addr_to_reg (XEXP (src1, 0));
+	  src1 = replace_equiv_address (src1, src1_reg);
+	}
+      set_mem_size (src1, cmp_bytes);
+
+      if (!REG_P (XEXP (src2, 0)))
+	{
+	  rtx src2_reg = copy_addr_to_reg (XEXP (src2, 0));
+	  src2 = replace_equiv_address (src2, src2_reg);
+	}
+      set_mem_size (src2, cmp_bytes);
+
+      /* Construct call to strcmp/strncmp to compare the rest of the string.  */
+      if (no_length)
+	{
+	  tree fun = builtin_decl_explicit (BUILT_IN_STRCMP);
+	  emit_library_call_value (XEXP (DECL_RTL (fun), 0),
+				   target, LCT_NORMAL, GET_MODE (target), 2,
+				   force_reg (Pmode, XEXP (src1, 0)), Pmode,
+				   force_reg (Pmode, XEXP (src2, 0)), Pmode);
+	}
+      else
+	{
+	  rtx len_rtx;
+	  if (TARGET_64BIT)
+	    len_rtx = gen_reg_rtx (DImode);
+	  else
+	    len_rtx = gen_reg_rtx (SImode);
+
+	  emit_move_insn (len_rtx, GEN_INT (bytes - compare_length));
+	  tree fun = builtin_decl_explicit (BUILT_IN_STRNCMP);
+	  emit_library_call_value (XEXP (DECL_RTL (fun), 0),
+				   target, LCT_NORMAL, GET_MODE (target), 3,
+				   force_reg (Pmode, XEXP (src1, 0)), Pmode,
+				   force_reg (Pmode, XEXP (src2, 0)), Pmode,
+				   len_rtx, GET_MODE (len_rtx));
+	}
+
+      rtx fin_ref = gen_rtx_LABEL_REF (VOIDmode, final_label);
+      rtx jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, fin_ref));
+      JUMP_LABEL (jmp) = final_label;
+      LABEL_NUSES (final_label) += 1;
+      emit_barrier ();
+    }
+
+  if (cleanup_label)
+    emit_label (cleanup_label);
+
+  /* Generate the final sequence that identifies the differing
+     byte and generates the final result, taking into account
+     zero bytes:
+
+     cmpb              cmpb_result1, src1, src2
+     cmpb              cmpb_result2, src1, zero
+     orc               cmpb_result1, cmpb_result1, cmpb_result2
+     cntlzd            get bit of first zero/diff byte
+     addi              convert for rldcl use
+     rldcl rldcl       extract diff/zero byte
+     subf              subtract for final result
+  */
+
+  rtx cmpb_diff = gen_reg_rtx (word_mode);
+  rtx cmpb_zero = gen_reg_rtx (word_mode);
+  rtx rot_amt = gen_reg_rtx (word_mode);
+  rtx zero_reg = gen_reg_rtx (word_mode);
+
+  rtx rot1_1 = gen_reg_rtx (word_mode);
+  rtx rot1_2 = gen_reg_rtx (word_mode);
+  rtx rot2_1 = gen_reg_rtx (word_mode);
+  rtx rot2_2 = gen_reg_rtx (word_mode);
+
+  if (word_mode == SImode)
+    {
+      emit_insn (gen_cmpbsi3 (cmpb_diff, tmp_reg_src1, tmp_reg_src2));
+      emit_insn (gen_movsi (zero_reg, GEN_INT (0)));
+      emit_insn (gen_cmpbsi3 (cmpb_zero, tmp_reg_src1, zero_reg));
+      emit_insn (gen_one_cmplsi2 (cmpb_diff, cmpb_diff));
+      emit_insn (gen_iorsi3 (cmpb_diff, cmpb_diff, cmpb_zero));
+      emit_insn (gen_clzsi2 (rot_amt, cmpb_diff));
+      emit_insn (gen_addsi3 (rot_amt, rot_amt, GEN_INT (8)));
+      emit_insn (gen_rotlsi3 (rot1_1, tmp_reg_src1,
+			      gen_lowpart (SImode, rot_amt)));
+      emit_insn (gen_andsi3_mask (rot1_2, rot1_1, GEN_INT (0xff)));
+      emit_insn (gen_rotlsi3 (rot2_1, tmp_reg_src2,
+			      gen_lowpart (SImode, rot_amt)));
+      emit_insn (gen_andsi3_mask (rot2_2, rot2_1, GEN_INT (0xff)));
+      emit_insn (gen_subsi3 (result_reg, rot1_2, rot2_2));
+    }
+  else
+    {
+      emit_insn (gen_cmpbdi3 (cmpb_diff, tmp_reg_src1, tmp_reg_src2));
+      emit_insn (gen_movdi (zero_reg, GEN_INT (0)));
+      emit_insn (gen_cmpbdi3 (cmpb_zero, tmp_reg_src1, zero_reg));
+      emit_insn (gen_one_cmpldi2 (cmpb_diff, cmpb_diff));
+      emit_insn (gen_iordi3 (cmpb_diff, cmpb_diff, cmpb_zero));
+      emit_insn (gen_clzdi2 (rot_amt, cmpb_diff));
+      emit_insn (gen_adddi3 (rot_amt, rot_amt, GEN_INT (8)));
+      emit_insn (gen_rotldi3 (rot1_1, tmp_reg_src1,
+			      gen_lowpart (SImode, rot_amt)));
+      emit_insn (gen_anddi3_mask (rot1_2, rot1_1, GEN_INT (0xff)));
+      emit_insn (gen_rotldi3 (rot2_1, tmp_reg_src2,
+			      gen_lowpart (SImode, rot_amt)));
+      emit_insn (gen_anddi3_mask (rot2_2, rot2_1, GEN_INT (0xff)));
+      emit_insn (gen_subdi3 (result_reg, rot1_2, rot2_2));
+    }
+
+  emit_label (final_move_label);
+  emit_insn (gen_movsi (target,
+			gen_lowpart (SImode, result_reg)));
+  emit_label (final_label);
+  return true;
+}
+
+/* Expand a block move operation, and return 1 if successful.  Return 0
+   if we should let the compiler generate normal code.
+
+   operands[0] is the destination
+   operands[1] is the source
+   operands[2] is the length
+   operands[3] is the alignment */
+
+#define MAX_MOVE_REG 4
+
+int
+expand_block_move (rtx operands[])
+{
+  rtx orig_dest = operands[0];
+  rtx orig_src	= operands[1];
+  rtx bytes_rtx	= operands[2];
+  rtx align_rtx = operands[3];
+  int constp	= (GET_CODE (bytes_rtx) == CONST_INT);
+  int align;
+  int bytes;
+  int offset;
+  int move_bytes;
+  rtx stores[MAX_MOVE_REG];
+  int num_reg = 0;
+
+  /* If this is not a fixed size move, just call memcpy */
+  if (! constp)
+    return 0;
+
+  /* This must be a fixed size alignment */
+  gcc_assert (GET_CODE (align_rtx) == CONST_INT);
+  align = INTVAL (align_rtx) * BITS_PER_UNIT;
+
+  /* Anything to move? */
+  bytes = INTVAL (bytes_rtx);
+  if (bytes <= 0)
+    return 1;
+
+  if (bytes > rs6000_block_move_inline_limit)
+    return 0;
+
+  for (offset = 0; bytes > 0; offset += move_bytes, bytes -= move_bytes)
+    {
+      union {
+	rtx (*movmemsi) (rtx, rtx, rtx, rtx);
+	rtx (*mov) (rtx, rtx);
+      } gen_func;
+      machine_mode mode = BLKmode;
+      rtx src, dest;
+
+      /* Altivec first, since it will be faster than a string move
+	 when it applies, and usually not significantly larger.  */
+      if (TARGET_ALTIVEC && bytes >= 16 && align >= 128)
+	{
+	  move_bytes = 16;
+	  mode = V4SImode;
+	  gen_func.mov = gen_movv4si;
+	}
+      else if (TARGET_STRING
+	  && bytes > 24		/* move up to 32 bytes at a time */
+	  && ! fixed_regs[5]
+	  && ! fixed_regs[6]
+	  && ! fixed_regs[7]
+	  && ! fixed_regs[8]
+	  && ! fixed_regs[9]
+	  && ! fixed_regs[10]
+	  && ! fixed_regs[11]
+	  && ! fixed_regs[12])
+	{
+	  move_bytes = (bytes > 32) ? 32 : bytes;
+	  gen_func.movmemsi = gen_movmemsi_8reg;
+	}
+      else if (TARGET_STRING
+	       && bytes > 16	/* move up to 24 bytes at a time */
+	       && ! fixed_regs[5]
+	       && ! fixed_regs[6]
+	       && ! fixed_regs[7]
+	       && ! fixed_regs[8]
+	       && ! fixed_regs[9]
+	       && ! fixed_regs[10])
+	{
+	  move_bytes = (bytes > 24) ? 24 : bytes;
+	  gen_func.movmemsi = gen_movmemsi_6reg;
+	}
+      else if (TARGET_STRING
+	       && bytes > 8	/* move up to 16 bytes at a time */
+	       && ! fixed_regs[5]
+	       && ! fixed_regs[6]
+	       && ! fixed_regs[7]
+	       && ! fixed_regs[8])
+	{
+	  move_bytes = (bytes > 16) ? 16 : bytes;
+	  gen_func.movmemsi = gen_movmemsi_4reg;
+	}
+      else if (bytes >= 8 && TARGET_POWERPC64
+	       && (align >= 64 || !STRICT_ALIGNMENT))
+	{
+	  move_bytes = 8;
+	  mode = DImode;
+	  gen_func.mov = gen_movdi;
+	  if (offset == 0 && align < 64)
+	    {
+	      rtx addr;
+
+	      /* If the address form is reg+offset with offset not a
+		 multiple of four, reload into reg indirect form here
+		 rather than waiting for reload.  This way we get one
+		 reload, not one per load and/or store.  */
+	      addr = XEXP (orig_dest, 0);
+	      if ((GET_CODE (addr) == PLUS || GET_CODE (addr) == LO_SUM)
+		  && GET_CODE (XEXP (addr, 1)) == CONST_INT
+		  && (INTVAL (XEXP (addr, 1)) & 3) != 0)
+		{
+		  addr = copy_addr_to_reg (addr);
+		  orig_dest = replace_equiv_address (orig_dest, addr);
+		}
+	      addr = XEXP (orig_src, 0);
+	      if ((GET_CODE (addr) == PLUS || GET_CODE (addr) == LO_SUM)
+		  && GET_CODE (XEXP (addr, 1)) == CONST_INT
+		  && (INTVAL (XEXP (addr, 1)) & 3) != 0)
+		{
+		  addr = copy_addr_to_reg (addr);
+		  orig_src = replace_equiv_address (orig_src, addr);
+		}
+	    }
+	}
+      else if (TARGET_STRING && bytes > 4 && !TARGET_POWERPC64)
+	{			/* move up to 8 bytes at a time */
+	  move_bytes = (bytes > 8) ? 8 : bytes;
+	  gen_func.movmemsi = gen_movmemsi_2reg;
+	}
+      else if (bytes >= 4 && (align >= 32 || !STRICT_ALIGNMENT))
+	{			/* move 4 bytes */
+	  move_bytes = 4;
+	  mode = SImode;
+	  gen_func.mov = gen_movsi;
+	}
+      else if (bytes >= 2 && (align >= 16 || !STRICT_ALIGNMENT))
+	{			/* move 2 bytes */
+	  move_bytes = 2;
+	  mode = HImode;
+	  gen_func.mov = gen_movhi;
+	}
+      else if (TARGET_STRING && bytes > 1)
+	{			/* move up to 4 bytes at a time */
+	  move_bytes = (bytes > 4) ? 4 : bytes;
+	  gen_func.movmemsi = gen_movmemsi_1reg;
+	}
+      else /* move 1 byte at a time */
+	{
+	  move_bytes = 1;
+	  mode = QImode;
+	  gen_func.mov = gen_movqi;
+	}
+
+      src = adjust_address (orig_src, mode, offset);
+      dest = adjust_address (orig_dest, mode, offset);
+
+      if (mode != BLKmode)
+	{
+	  rtx tmp_reg = gen_reg_rtx (mode);
+
+	  emit_insn ((*gen_func.mov) (tmp_reg, src));
+	  stores[num_reg++] = (*gen_func.mov) (dest, tmp_reg);
+	}
+
+      if (mode == BLKmode || num_reg >= MAX_MOVE_REG || bytes == move_bytes)
+	{
+	  int i;
+	  for (i = 0; i < num_reg; i++)
+	    emit_insn (stores[i]);
+	  num_reg = 0;
+	}
+
+      if (mode == BLKmode)
+	{
+	  /* Move the address into scratch registers.  The movmemsi
+	     patterns require zero offset.  */
+	  if (!REG_P (XEXP (src, 0)))
+	    {
+	      rtx src_reg = copy_addr_to_reg (XEXP (src, 0));
+	      src = replace_equiv_address (src, src_reg);
+	    }
+	  set_mem_size (src, move_bytes);
+
+	  if (!REG_P (XEXP (dest, 0)))
+	    {
+	      rtx dest_reg = copy_addr_to_reg (XEXP (dest, 0));
+	      dest = replace_equiv_address (dest, dest_reg);
+	    }
+	  set_mem_size (dest, move_bytes);
+
+	  emit_insn ((*gen_func.movmemsi) (dest, src,
+					   GEN_INT (move_bytes & 31),
+					   align_rtx));
+	}
+    }
+
+  return 1;
+}
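+
+/* For example: a 10-byte copy with 8-byte alignment on a 64-bit target
+   (when the lswx/stswx string instructions are not used) becomes one
+   DImode load/store pair followed by one HImode pair.  */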
+
+
+/* Return a string to perform a load_multiple operation.
+   operands[0] is the vector.
+   operands[1] is the source address.
+   operands[2] is the first destination register.  */
+
+const char *
+rs6000_output_load_multiple (rtx operands[3])
+{
+  /* We have to handle the case where the pseudo used to contain the address
+     is assigned to one of the output registers.  */
+  int i, j;
+  int words = XVECLEN (operands[0], 0);
+  rtx xop[10];
+
+  if (XVECLEN (operands[0], 0) == 1)
+    return "lwz %2,0(%1)";
+
+  for (i = 0; i < words; i++)
+    if (refers_to_regno_p (REGNO (operands[2]) + i, operands[1]))
+      {
+	if (i == words-1)
+	  {
+	    xop[0] = GEN_INT (4 * (words-1));
+	    xop[1] = operands[1];
+	    xop[2] = operands[2];
+	    output_asm_insn ("lswi %2,%1,%0\n\tlwz %1,%0(%1)", xop);
+	    return "";
+	  }
+	else if (i == 0)
+	  {
+	    xop[0] = GEN_INT (4 * (words-1));
+	    xop[1] = operands[1];
+	    xop[2] = gen_rtx_REG (SImode, REGNO (operands[2]) + 1);
+	    output_asm_insn ("addi %1,%1,4\n\tlswi %2,%1,%0\n\tlwz %1,-4(%1)", xop);
+	    return "";
+	  }
+	else
+	  {
+	    for (j = 0; j < words; j++)
+	      if (j != i)
+		{
+		  xop[0] = GEN_INT (j * 4);
+		  xop[1] = operands[1];
+		  xop[2] = gen_rtx_REG (SImode, REGNO (operands[2]) + j);
+		  output_asm_insn ("lwz %2,%0(%1)", xop);
+		}
+	    xop[0] = GEN_INT (i * 4);
+	    xop[1] = operands[1];
+	    output_asm_insn ("lwz %1,%0(%1)", xop);
+	    return "";
+	  }
+      }
+
+  return "lswi %2,%1,%N0";
+}
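+
+/* For example, in a three-word load where the address register is also
+   the last destination register, the code above loads the first two
+   words with lswi and then uses a final lwz so the address register is
+   overwritten only after its last use.  */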
+
Index: config/rs6000/rs6000.c
===================================================================
--- config/rs6000/rs6000.c	(revision 249565)
+++ config/rs6000/rs6000.c	(working copy)
@@ -18585,113 +18585,6 @@
     }
 }
 
-
-/* Expand a block clear operation, and return 1 if successful.  Return 0
-   if we should let the compiler generate normal code.
-
-   operands[0] is the destination
-   operands[1] is the length
-   operands[3] is the alignment */
-
-int
-expand_block_clear (rtx operands[])
-{
-  rtx orig_dest = operands[0];
-  rtx bytes_rtx	= operands[1];
-  rtx align_rtx = operands[3];
-  bool constp	= (GET_CODE (bytes_rtx) == CONST_INT);
-  HOST_WIDE_INT align;
-  HOST_WIDE_INT bytes;
-  int offset;
-  int clear_bytes;
-  int clear_step;
-
-  /* If this is not a fixed size move, just call memcpy */
-  if (! constp)
-    return 0;
-
-  /* This must be a fixed size alignment  */
-  gcc_assert (GET_CODE (align_rtx) == CONST_INT);
-  align = INTVAL (align_rtx) * BITS_PER_UNIT;
-
-  /* Anything to clear? */
-  bytes = INTVAL (bytes_rtx);
-  if (bytes <= 0)
-    return 1;
-
-  /* Use the builtin memset after a point, to avoid huge code bloat.
-     When optimize_size, avoid any significant code bloat; calling
-     memset is about 4 instructions, so allow for one instruction to
-     load zero and three to do clearing.  */
-  if (TARGET_ALTIVEC && align >= 128)
-    clear_step = 16;
-  else if (TARGET_POWERPC64 && (align >= 64 || !STRICT_ALIGNMENT))
-    clear_step = 8;
-  else
-    clear_step = 4;
-
-  if (optimize_size && bytes > 3 * clear_step)
-    return 0;
-  if (! optimize_size && bytes > 8 * clear_step)
-    return 0;
-
-  for (offset = 0; bytes > 0; offset += clear_bytes, bytes -= clear_bytes)
-    {
-      machine_mode mode = BLKmode;
-      rtx dest;
-
-      if (bytes >= 16 && TARGET_ALTIVEC && align >= 128)
-	{
-	  clear_bytes = 16;
-	  mode = V4SImode;
-	}
-      else if (bytes >= 8 && TARGET_POWERPC64
-	       && (align >= 64 || !STRICT_ALIGNMENT))
-	{
-	  clear_bytes = 8;
-	  mode = DImode;
-	  if (offset == 0 && align < 64)
-	    {
-	      rtx addr;
-
-	      /* If the address form is reg+offset with offset not a
-		 multiple of four, reload into reg indirect form here
-		 rather than waiting for reload.  This way we get one
-		 reload, not one per store.  */
-	      addr = XEXP (orig_dest, 0);
-	      if ((GET_CODE (addr) == PLUS || GET_CODE (addr) == LO_SUM)
-		  && GET_CODE (XEXP (addr, 1)) == CONST_INT
-		  && (INTVAL (XEXP (addr, 1)) & 3) != 0)
-		{
-		  addr = copy_addr_to_reg (addr);
-		  orig_dest = replace_equiv_address (orig_dest, addr);
-		}
-	    }
-	}
-      else if (bytes >= 4 && (align >= 32 || !STRICT_ALIGNMENT))
-	{			/* move 4 bytes */
-	  clear_bytes = 4;
-	  mode = SImode;
-	}
-      else if (bytes >= 2 && (align >= 16 || !STRICT_ALIGNMENT))
-	{			/* move 2 bytes */
-	  clear_bytes = 2;
-	  mode = HImode;
-	}
-      else /* move 1 byte at a time */
-	{
-	  clear_bytes = 1;
-	  mode = QImode;
-	}
-
-      dest = adjust_address (orig_dest, mode, offset);
-
-      emit_move_insn (dest, CONST0_RTX (mode));
-    }
-
-  return 1;
-}
-
 /* Emit a potentially record-form instruction, setting DST from SRC.
    If DOT is 0, that is all; otherwise, set CCREG to the result of the
    signed comparison of DST with zero.  If DOT is 1, the generated RTL
@@ -18699,7 +18592,7 @@
    is CR0 do a single dot insn (as a PARALLEL); otherwise, do a SET and
    a separate COMPARE.  */
 
-static void
+void
 rs6000_emit_dot_insn (rtx dst, rtx src, int dot, rtx ccreg)
 {
   if (dot == 0)
@@ -18728,1331 +18621,7 @@
     }
 }
 
-/* Figure out the correct instructions to generate to load data for
-   block compare.  MODE is used for the read from memory, and
-   data is zero extended if REG is wider than MODE.  If LE code
-   is being generated, bswap loads are used.
-
-   REG is the destination register to move the data into.
-   MEM is the memory block being read.
-   MODE is the mode of memory to use for the read.  */
-static void
-do_load_for_compare (rtx reg, rtx mem, machine_mode mode)
-{
-  switch (GET_MODE (reg))
-    {
-    case DImode:
-      switch (mode)
-	{
-	case QImode:
-	  emit_insn (gen_zero_extendqidi2 (reg, mem));
-	  break;
-	case HImode:
-	  {
-	    rtx src = mem;
-	    if (!BYTES_BIG_ENDIAN)
-	      {
-		src = gen_reg_rtx (HImode);
-		emit_insn (gen_bswaphi2 (src, mem));
-	      }
-	    emit_insn (gen_zero_extendhidi2 (reg, src));
-	    break;
-	  }
-	case SImode:
-	  {
-	    rtx src = mem;
-	    if (!BYTES_BIG_ENDIAN)
-	      {
-		src = gen_reg_rtx (SImode);
-		emit_insn (gen_bswapsi2 (src, mem));
-	      }
-	    emit_insn (gen_zero_extendsidi2 (reg, src));
-	  }
-	  break;
-	case DImode:
-	  if (!BYTES_BIG_ENDIAN)
-	    emit_insn (gen_bswapdi2 (reg, mem));
-	  else
-	    emit_insn (gen_movdi (reg, mem));
-	  break;
-	default:
-	  gcc_unreachable ();
-	}
-      break;
-
-    case SImode:
-      switch (mode)
-	{
-	case QImode:
-	  emit_insn (gen_zero_extendqisi2 (reg, mem));
-	  break;
-	case HImode:
-	  {
-	    rtx src = mem;
-	    if (!BYTES_BIG_ENDIAN)
-	      {
-		src = gen_reg_rtx (HImode);
-		emit_insn (gen_bswaphi2 (src, mem));
-	      }
-	    emit_insn (gen_zero_extendhisi2 (reg, src));
-	    break;
-	  }
-	case SImode:
-	  if (!BYTES_BIG_ENDIAN)
-	    emit_insn (gen_bswapsi2 (reg, mem));
-	  else
-	    emit_insn (gen_movsi (reg, mem));
-	  break;
-	case DImode:
-	  /* DImode is larger than the destination reg so is not expected.  */
-	  gcc_unreachable ();
-	  break;
-	default:
-	  gcc_unreachable ();
-	}
-      break;
-    default:
-      gcc_unreachable ();
-      break;
-    }
-}
-
-/* Select the mode to be used for reading the next chunk of bytes
-   in the compare.
-
-   OFFSET is the current read offset from the beginning of the block.
-   BYTES is the number of bytes remaining to be read.
-   ALIGN is the minimum alignment of the memory blocks being compared in bytes.
-   WORD_MODE_OK indicates using WORD_MODE is allowed, else SImode is
-   the largest allowable mode.  */
-static machine_mode
-select_block_compare_mode (unsigned HOST_WIDE_INT offset,
-			   unsigned HOST_WIDE_INT bytes,
-			   unsigned HOST_WIDE_INT align, bool word_mode_ok)
-{
-  /* First see if we can do a whole load unit
-     as that will be more efficient than a larger load + shift.  */
-
-  /* If big, use biggest chunk.
-     If exactly chunk size, use that size.
-     If remainder can be done in one piece with shifting, do that.
-     Do largest chunk possible without violating alignment rules.  */
-
-  /* The most we can read without potential page crossing.  */
-  unsigned HOST_WIDE_INT maxread = ROUND_UP (bytes, align);
-
-  if (word_mode_ok && bytes >= UNITS_PER_WORD)
-    return word_mode;
-  else if (bytes == GET_MODE_SIZE (SImode))
-    return SImode;
-  else if (bytes == GET_MODE_SIZE (HImode))
-    return HImode;
-  else if (bytes == GET_MODE_SIZE (QImode))
-    return QImode;
-  else if (bytes < GET_MODE_SIZE (SImode)
-	   && offset >= GET_MODE_SIZE (SImode) - bytes)
-    /* This matches the case were we have SImode and 3 bytes
-       and offset >= 1 and permits us to move back one and overlap
-       with the previous read, thus avoiding having to shift
-       unwanted bytes off of the input.  */
-    return SImode;
-  else if (word_mode_ok && bytes < UNITS_PER_WORD
-	   && offset >= UNITS_PER_WORD-bytes)
-    /* Similarly, if we can use DImode it will get matched here and
-       can do an overlapping read that ends at the end of the block.  */
-    return word_mode;
-  else if (word_mode_ok && maxread >= UNITS_PER_WORD)
-    /* It is safe to do all remaining in one load of largest size,
-       possibly with a shift to get rid of unwanted bytes.  */
-    return word_mode;
-  else if (maxread >= GET_MODE_SIZE (SImode))
-    /* It is safe to do all remaining in one SImode load,
-       possibly with a shift to get rid of unwanted bytes.  */
-    return SImode;
-  else if (bytes > GET_MODE_SIZE (SImode))
-    return SImode;
-  else if (bytes > GET_MODE_SIZE (HImode))
-    return HImode;
-
-  /* The final fallback is to do one byte at a time.  */
-  return QImode;
-}
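
As a worked example (assumed values): on a 64-bit target comparing 18
bytes with 8-byte known alignment, the selection above proceeds

  offset =  0, bytes = 18  ->  word_mode (DImode), 8-byte chunk
  offset =  8, bytes = 10  ->  word_mode (DImode), 8-byte chunk
  offset = 16, bytes =  2  ->  HImode,             2-byte chunk

which matches the two ldbrx pairs plus the lhbrx pair in the 18-byte
code example shown later in expand_block_compare.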
-
-/* Compute the alignment of pointer+OFFSET where the original alignment
-   of pointer was BASE_ALIGN.  */
-static unsigned HOST_WIDE_INT
-compute_current_alignment (unsigned HOST_WIDE_INT base_align,
-			   unsigned HOST_WIDE_INT offset)
-{
-  if (offset == 0)
-    return base_align;
-  return MIN (base_align, offset & -offset);
-}
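
OFFSET & -OFFSET isolates the lowest set bit of the offset, which
bounds the alignment of pointer+OFFSET.  A minimal illustration,
assuming the function above is in scope (these checks are examples,
not part of the patch):

#include <assert.h>

static void
check_alignment_examples (void)
{
  assert (compute_current_alignment (8, 12) == 4);  /* 12 & -12 == 4 */
  assert (compute_current_alignment (8, 16) == 8);  /* capped by MIN */
}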
-
-/* Expand a block compare operation, and return true if successful.
-   Return false if we should let the compiler generate normal code,
-   probably a memcmp call.
-
-   OPERANDS[0] is the target (result).
-   OPERANDS[1] is the first source.
-   OPERANDS[2] is the second source.
-   OPERANDS[3] is the length.
-   OPERANDS[4] is the alignment.  */
-bool
-expand_block_compare (rtx operands[])
-{
-  rtx target = operands[0];
-  rtx orig_src1 = operands[1];
-  rtx orig_src2 = operands[2];
-  rtx bytes_rtx = operands[3];
-  rtx align_rtx = operands[4];
-  HOST_WIDE_INT cmp_bytes = 0;
-  rtx src1 = orig_src1;
-  rtx src2 = orig_src2;
-
-  /* This case is complicated to handle because the subtract
-     with carry instructions do not generate the 64-bit
-     carry and so we must emit code to calculate it ourselves.
-     We choose not to implement this yet.  */
-  if (TARGET_32BIT && TARGET_POWERPC64)
-    return false;
-
-  /* If this is not a fixed size compare, just call memcmp.  */
-  if (!CONST_INT_P (bytes_rtx))
-    return false;
-
-  /* This must be a fixed size alignment.  */
-  if (!CONST_INT_P (align_rtx))
-    return false;
-
-  unsigned int base_align = UINTVAL (align_rtx) / BITS_PER_UNIT;
-
-  /* If unaligned accesses are slow on this target, do not attempt an
-     inline expansion that relies on them.  */
-  if (SLOW_UNALIGNED_ACCESS (word_mode, MEM_ALIGN (orig_src1))
-      || SLOW_UNALIGNED_ACCESS (word_mode, MEM_ALIGN (orig_src2)))
-    return false;
-
-  gcc_assert (GET_MODE (target) == SImode);
-
-  /* Anything to compare?  */
-  unsigned HOST_WIDE_INT bytes = UINTVAL (bytes_rtx);
-  if (bytes == 0)
-    return true;
-
-  /* The code generated for p7 and older is not faster than glibc
-     memcmp if alignment is small and length is not short, so bail
-     out to avoid those conditions.  */
-  if (!TARGET_EFFICIENT_OVERLAPPING_UNALIGNED
-      && ((base_align == 1 && bytes > 16)
-	  || (base_align == 2 && bytes > 32)))
-    return false;
-
-  rtx tmp_reg_src1 = gen_reg_rtx (word_mode);
-  rtx tmp_reg_src2 = gen_reg_rtx (word_mode);
-  /* P7/P8 code uses cond for the subfc., but P9 uses
-     it for cmpld, which needs CCUNSmode.  */
-  rtx cond;
-  if (TARGET_P9_MISC)
-    cond = gen_reg_rtx (CCUNSmode);
-  else
-    cond = gen_reg_rtx (CCmode);
-
-  /* If we have an LE target without ldbrx and word_mode is DImode,
-     then we must avoid using word_mode.  */
-  int word_mode_ok = !(!BYTES_BIG_ENDIAN && !TARGET_LDBRX
-		       && word_mode == DImode);
-
-  /* Strategy phase.  How many ops will this take and should we expand it?  */
-
-  unsigned HOST_WIDE_INT offset = 0;
-  machine_mode load_mode =
-    select_block_compare_mode (offset, bytes, base_align, word_mode_ok);
-  unsigned int load_mode_size = GET_MODE_SIZE (load_mode);
-
-  /* We don't want to generate too much code.  */
-  unsigned HOST_WIDE_INT max_bytes =
-    load_mode_size * (unsigned HOST_WIDE_INT) rs6000_block_compare_inline_limit;
-  if (!IN_RANGE (bytes, 1, max_bytes))
-    return false;
-
-  bool generate_6432_conversion = false;
-  rtx convert_label = NULL;
-  rtx final_label = NULL;
-
-  /* Example of generated code for an 18-byte compare with 1-byte alignment.
-     Compiled with -fno-reorder-blocks for clarity.
-             ldbrx 10,31,8
-             ldbrx 9,7,8
-             subfc. 9,9,10
-             bne 0,.L6487
-             addi 9,12,8
-             addi 5,11,8
-             ldbrx 10,0,9
-             ldbrx 9,0,5
-             subfc. 9,9,10
-             bne 0,.L6487
-             addi 9,12,16
-             lhbrx 10,0,9
-             addi 9,11,16
-             lhbrx 9,0,9
-             subf 9,9,10
-             b .L6488
-             .p2align 4,,15
-     .L6487: #convert_label
-             popcntd 9,9
-             subfe 10,10,10
-             or 9,9,10
-     .L6488: #final_label
-             extsw 10,9
-
-     We start off with DImode for two blocks that jump to the DI->SI conversion
-     if the difference is found there, then a final block of HImode that skips
-     the DI->SI conversion.  */
-
-  while (bytes > 0)
-    {
-      unsigned int align = compute_current_alignment (base_align, offset);
-      if (TARGET_EFFICIENT_OVERLAPPING_UNALIGNED)
-	load_mode = select_block_compare_mode (offset, bytes, align,
-					       word_mode_ok);
-      else
-	load_mode = select_block_compare_mode (0, bytes, align, word_mode_ok);
-      load_mode_size = GET_MODE_SIZE (load_mode);
-      if (bytes >= load_mode_size)
-	cmp_bytes = load_mode_size;
-      else if (TARGET_EFFICIENT_OVERLAPPING_UNALIGNED)
-	{
-	  /* Move this load back so it doesn't go past the end.
-	     P8/P9 can do this efficiently.  */
-	  unsigned int extra_bytes = load_mode_size - bytes;
-	  cmp_bytes = bytes;
-	  if (extra_bytes < offset)
-	    {
-	      offset -= extra_bytes;
-	      cmp_bytes = load_mode_size;
-	      bytes = cmp_bytes;
-	    }
-	}
-      else
-	/* P7 and earlier can't do the overlapping load trick fast,
-	   so this forces a non-overlapping load and a shift to get
-	   rid of the extra bytes.  */
-	cmp_bytes = bytes;
-
-      src1 = adjust_address (orig_src1, load_mode, offset);
-      src2 = adjust_address (orig_src2, load_mode, offset);
-
-      if (!REG_P (XEXP (src1, 0)))
-	{
-	  rtx src1_reg = copy_addr_to_reg (XEXP (src1, 0));
-	  src1 = replace_equiv_address (src1, src1_reg);
-	}
-      set_mem_size (src1, cmp_bytes);
-
-      if (!REG_P (XEXP (src2, 0)))
-	{
-	  rtx src2_reg = copy_addr_to_reg (XEXP (src2, 0));
-	  src2 = replace_equiv_address (src2, src2_reg);
-	}
-      set_mem_size (src2, cmp_bytes);
-
-      do_load_for_compare (tmp_reg_src1, src1, load_mode);
-      do_load_for_compare (tmp_reg_src2, src2, load_mode);
-
-      if (cmp_bytes < load_mode_size)
-	{
-	  /* Shift unneeded bytes off.  */
-	  rtx sh = GEN_INT (BITS_PER_UNIT * (load_mode_size - cmp_bytes));
-	  if (word_mode == DImode)
-	    {
-	      emit_insn (gen_lshrdi3 (tmp_reg_src1, tmp_reg_src1, sh));
-	      emit_insn (gen_lshrdi3 (tmp_reg_src2, tmp_reg_src2, sh));
-	    }
-	  else
-	    {
-	      emit_insn (gen_lshrsi3 (tmp_reg_src1, tmp_reg_src1, sh));
-	      emit_insn (gen_lshrsi3 (tmp_reg_src2, tmp_reg_src2, sh));
-	    }
-	}
-
-      int remain = bytes - cmp_bytes;
-      if (GET_MODE_SIZE (GET_MODE (target)) > GET_MODE_SIZE (load_mode))
-	{
-	  /* Target is larger than load size so we don't need to
-	     reduce result size.  */
-
-	  /* We previously did a block that needed 64->32 conversion but
-	     the current block does not, so a label is needed to jump
-	     to the end.  */
-	  if (generate_6432_conversion && !final_label)
-	    final_label = gen_label_rtx ();
-
-	  if (remain > 0)
-	    {
-	      /* This is not the last block, branch to the end if the result
-		 of this subtract is not zero.  */
-	      if (!final_label)
-		final_label = gen_label_rtx ();
-	      rtx fin_ref = gen_rtx_LABEL_REF (VOIDmode, final_label);
-	      rtx tmp = gen_rtx_MINUS (word_mode, tmp_reg_src1, tmp_reg_src2);
-	      rtx cr = gen_reg_rtx (CCmode);
-	      rs6000_emit_dot_insn (tmp_reg_src2, tmp, 2, cr);
-	      emit_insn (gen_movsi (target,
-				    gen_lowpart (SImode, tmp_reg_src2)));
-	      rtx ne_rtx = gen_rtx_NE (VOIDmode, cr, const0_rtx);
-	      rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, ne_rtx,
-						 fin_ref, pc_rtx);
-	      rtx j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
-	      JUMP_LABEL (j) = final_label;
-	      LABEL_NUSES (final_label) += 1;
-	    }
-	  else
-	    {
-	      if (word_mode == DImode)
-		{
-		  emit_insn (gen_subdi3 (tmp_reg_src2, tmp_reg_src1,
-					 tmp_reg_src2));
-		  emit_insn (gen_movsi (target,
-					gen_lowpart (SImode, tmp_reg_src2)));
-		}
-	      else
-		emit_insn (gen_subsi3 (target, tmp_reg_src1, tmp_reg_src2));
-
-	      if (final_label)
-		{
-		  rtx fin_ref = gen_rtx_LABEL_REF (VOIDmode, final_label);
-		  rtx j = emit_jump_insn (gen_rtx_SET (pc_rtx, fin_ref));
-		  JUMP_LABEL (j) = final_label;
-		  LABEL_NUSES (final_label) += 1;
-		  emit_barrier ();
-		}
-	    }
-	}
-      else
-	{
-	  /* Do we need a 64->32 conversion block? We need the 64->32
-	     conversion even if target size == load_mode size because
-	     the subtract generates one extra bit.  */
-	  generate_6432_conversion = true;
-
-	  if (remain > 0)
-	    {
-	      if (!convert_label)
-		convert_label = gen_label_rtx ();
-
-	      /* Compare to zero and branch to convert_label if not zero.  */
-	      rtx cvt_ref = gen_rtx_LABEL_REF (VOIDmode, convert_label);
-	      if (TARGET_P9_MISC)
-		{
-		  /* Generate a compare, and convert with a setb later.  */
-		  rtx cmp = gen_rtx_COMPARE (CCUNSmode, tmp_reg_src1,
-					     tmp_reg_src2);
-		  emit_insn (gen_rtx_SET (cond, cmp));
-		}
-	      else
-		/* Generate a subfc. and use the longer
-		   sequence for conversion.  */
-		if (TARGET_64BIT)
-		  emit_insn (gen_subfdi3_carry_dot2 (tmp_reg_src2, tmp_reg_src2,
-						     tmp_reg_src1, cond));
-		else
-		  emit_insn (gen_subfsi3_carry_dot2 (tmp_reg_src2, tmp_reg_src2,
-						     tmp_reg_src1, cond));
-	      rtx ne_rtx = gen_rtx_NE (VOIDmode, cond, const0_rtx);
-	      rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, ne_rtx,
-						 cvt_ref, pc_rtx);
-	      rtx j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
-	      JUMP_LABEL (j) = convert_label;
-	      LABEL_NUSES (convert_label) += 1;
-	    }
-	  else
-	    {
-	      /* Just do the subtract/compare.  Since this is the last block
-		 the convert code will be generated immediately following.  */
-	      if (TARGET_P9_MISC)
-		{
-		  rtx cmp = gen_rtx_COMPARE (CCUNSmode, tmp_reg_src1,
-					     tmp_reg_src2);
-		  emit_insn (gen_rtx_SET (cond, cmp));
-		}
-	      else
-		if (TARGET_64BIT)
-		  emit_insn (gen_subfdi3_carry (tmp_reg_src2, tmp_reg_src2,
-						tmp_reg_src1));
-		else
-		  emit_insn (gen_subfsi3_carry (tmp_reg_src2, tmp_reg_src2,
-						tmp_reg_src1));
-	    }
-	}
-
-      offset += cmp_bytes;
-      bytes -= cmp_bytes;
-    }
-
-  if (generate_6432_conversion)
-    {
-      if (convert_label)
-	emit_label (convert_label);
-
-      /* We need to produce DI result from sub, then convert to target SI
-	 while maintaining <0 / ==0 / >0 properties. This sequence works:
-	 subfc L,A,B
-	 subfe H,H,H
-	 popcntd L,L
-	 rldimi L,H,6,0
-
-	 This is an alternate sequence Segher cooked up if somebody
-	 wants to expand this for something that doesn't have popcntd:
-	 subfc L,a,b
-	 subfe H,x,x
-	 addic t,L,-1
-	 subfe v,t,L
-	 or z,v,H
-
-	 And finally, p9 can just do this:
-	 cmpld A,B
-	 setb r */
-
-      if (TARGET_P9_MISC)
-	{
-	  emit_insn (gen_setb_unsigned (target, cond));
-	}
-      else
-	{
-	  if (TARGET_64BIT)
-	    {
-	      rtx tmp_reg_ca = gen_reg_rtx (DImode);
-	      emit_insn (gen_subfdi3_carry_in_xx (tmp_reg_ca));
-	      emit_insn (gen_popcntddi2 (tmp_reg_src2, tmp_reg_src2));
-	      emit_insn (gen_iordi3 (tmp_reg_src2, tmp_reg_src2, tmp_reg_ca));
-	      emit_insn (gen_movsi (target, gen_lowpart (SImode, tmp_reg_src2)));
-	    }
-	  else
-	    {
-	      rtx tmp_reg_ca = gen_reg_rtx (SImode);
-	      emit_insn (gen_subfsi3_carry_in_xx (tmp_reg_ca));
-	      emit_insn (gen_popcntdsi2 (tmp_reg_src2, tmp_reg_src2));
-	      emit_insn (gen_iorsi3 (target, tmp_reg_src2, tmp_reg_ca));
-	    }
-	}
-    }
-
-  if (final_label)
-    emit_label (final_label);
-
-  gcc_assert (bytes == 0);
-  return true;
-}
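
For reference, a sketch of user code whose constant-length memcmp this
expander handles inline (the function name is illustrative):

#include <string.h>

/* A memcmp with a constant length within the limit derived from
   rs6000_block_compare_inline_limit is expanded inline by
   expand_block_compare rather than calling libc.  */
int
cmp18 (const void *a, const void *b)
{
  return memcmp (a, b, 18);
}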
-
-/* Generate alignment check and branch code to set up for
-   strncmp when we don't have DI alignment.
-   STRNCMP_LABEL is the label to branch if there is a page crossing.
-   SRC is the string pointer to be examined.
-   BYTES is the max number of bytes to compare.  */
-static void
-expand_strncmp_align_check (rtx strncmp_label, rtx src, HOST_WIDE_INT bytes)
-{
-  rtx lab_ref = gen_rtx_LABEL_REF (VOIDmode, strncmp_label);
-  rtx src_check = copy_addr_to_reg (XEXP (src, 0));
-  if (GET_MODE (src_check) == SImode)
-    emit_insn (gen_andsi3 (src_check, src_check, GEN_INT (0xfff)));
-  else
-    emit_insn (gen_anddi3 (src_check, src_check, GEN_INT (0xfff)));
-  rtx cond = gen_reg_rtx (CCmode);
-  emit_move_insn (cond, gen_rtx_COMPARE (CCmode, src_check,
-					 GEN_INT (4096 - bytes)));
-
-  rtx cmp_rtx = gen_rtx_LT (VOIDmode, cond, const0_rtx);
-
-  rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, cmp_rtx,
-				     pc_rtx, lab_ref);
-  rtx j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
-  JUMP_LABEL (j) = strncmp_label;
-  LABEL_NUSES (strncmp_label) += 1;
-}
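
A host-side model of the emitted test (illustrative only): the branch
to STRNCMP_LABEL is taken when the low 12 bits of the address show
that a BYTES-byte read could run past a 4 KiB page boundary.

#include <stdbool.h>
#include <stdint.h>

/* Hypothetical model: true when the emitted code branches to the
   runtime strcmp/strncmp call instead of comparing inline.  */
static bool
might_cross_page (uintptr_t addr, unsigned int bytes)
{
  return (addr & 0xfff) >= 4096 - bytes;
}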
-
-/* Expand a string compare operation with length, and return
-   true if successful. Return false if we should let the
-   compiler generate normal code, probably a strncmp call.
-
-   OPERANDS[0] is the target (result).
-   OPERANDS[1] is the first source.
-   OPERANDS[2] is the second source.
-   If NO_LENGTH is zero, then:
-   OPERANDS[3] is the length.
-   OPERANDS[4] is the alignment in bytes.
-   If NO_LENGTH is nonzero, then:
-   OPERANDS[3] is the alignment in bytes.  */
-bool
-expand_strn_compare (rtx operands[], int no_length)
-{
-  rtx target = operands[0];
-  rtx orig_src1 = operands[1];
-  rtx orig_src2 = operands[2];
-  rtx bytes_rtx, align_rtx;
-  if (no_length)
-    {
-      bytes_rtx = NULL;
-      align_rtx = operands[3];
-    }
-  else
-    {
-      bytes_rtx = operands[3];
-      align_rtx = operands[4];
-    }
-  unsigned HOST_WIDE_INT cmp_bytes = 0;
-  rtx src1 = orig_src1;
-  rtx src2 = orig_src2;
-
-  /* If we have a length, it must be constant. This simplifies things
-     a bit as we don't have to generate code to check if we've exceeded
-     the length. Later this could be expanded to handle this case.  */
-  if (!no_length && !CONST_INT_P (bytes_rtx))
-    return false;
-
-  /* This must be a fixed size alignment.  */
-  if (!CONST_INT_P (align_rtx))
-    return false;
-
-  unsigned int base_align = UINTVAL (align_rtx);
-  int align1 = MEM_ALIGN (orig_src1) / BITS_PER_UNIT;
-  int align2 = MEM_ALIGN (orig_src2) / BITS_PER_UNIT;
-
-  /* SLOW_UNALIGNED_ACCESS -- don't do unaligned stuff.  */
-  if (SLOW_UNALIGNED_ACCESS (word_mode, align1)
-      || SLOW_UNALIGNED_ACCESS (word_mode, align2))
-    return false;
-
-  gcc_assert (GET_MODE (target) == SImode);
-
-  /* If we have an LE target without ldbrx and word_mode is DImode,
-     then we must avoid using word_mode.  */
-  int word_mode_ok = !(!BYTES_BIG_ENDIAN && !TARGET_LDBRX
-		       && word_mode == DImode);
-
-  unsigned int word_mode_size = GET_MODE_SIZE (word_mode);
-
-  unsigned HOST_WIDE_INT offset = 0;
-  unsigned HOST_WIDE_INT bytes; /* N from the strncmp args if available.  */
-  unsigned HOST_WIDE_INT compare_length; /* How much to compare inline.  */
-  if (no_length)
-    /* Use this as a stand-in to determine the mode to use.  */
-    bytes = rs6000_string_compare_inline_limit * word_mode_size;
-  else
-    bytes = UINTVAL (bytes_rtx);
-
-  machine_mode load_mode =
-    select_block_compare_mode (offset, bytes, base_align, word_mode_ok);
-  unsigned int load_mode_size = GET_MODE_SIZE (load_mode);
-  compare_length = rs6000_string_compare_inline_limit * load_mode_size;
-
-  /* If we have equality at the end of the last compare and we have not
-     found the end of the string, we need to call strcmp/strncmp to
-     compare the remainder.  */
-  bool equality_compare_rest = false;
-
-  if (no_length)
-    {
-      bytes = compare_length;
-      equality_compare_rest = true;
-    }
-  else
-    {
-      if (bytes <= compare_length)
-	compare_length = bytes;
-      else
-	equality_compare_rest = true;
-    }
-
-  rtx result_reg = gen_reg_rtx (word_mode);
-  rtx final_move_label = gen_label_rtx ();
-  rtx final_label = gen_label_rtx ();
-  rtx begin_compare_label = NULL;
-
-  if (base_align < 8)
-    {
-      /* Generate code that checks distance to 4k boundary for this case.  */
-      begin_compare_label = gen_label_rtx ();
-      rtx strncmp_label = gen_label_rtx ();
-      rtx jmp;
-
-      /* Strncmp for power8 in glibc does this:
-	 rldicl	r8,r3,0,52
-	 cmpldi	cr7,r8,4096-16
-	 bgt	cr7,L(pagecross) */
-
-      /* Make sure that the length we use for the alignment test and
-         the subsequent code generation are in agreement so we do not
-         go past the length we tested for a 4k boundary crossing.  */
-      unsigned HOST_WIDE_INT align_test = compare_length;
-      if (align_test < 8)
-        {
-          align_test = HOST_WIDE_INT_1U << ceil_log2 (align_test);
-          base_align = align_test;
-        }
-      else
-        {
-          align_test = ROUND_UP (align_test, 8);
-          base_align = 8;
-        }
-
-      if (align1 < 8)
-        expand_strncmp_align_check (strncmp_label, src1, align_test);
-      if (align2 < 8)
-        expand_strncmp_align_check (strncmp_label, src2, align_test);
-
-      /* Now generate the following sequence:
-	 - branch to begin_compare
-	 - strncmp_label
-	 - call to strncmp
-	 - branch to final_label
-	 - begin_compare_label */
-
-      rtx cmp_ref = gen_rtx_LABEL_REF (VOIDmode, begin_compare_label);
-      jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, cmp_ref));
-      JUMP_LABEL (jmp) = begin_compare_label;
-      LABEL_NUSES (begin_compare_label) += 1;
-      emit_barrier ();
-
-      emit_label (strncmp_label);
-
-      if (!REG_P (XEXP (src1, 0)))
-	{
-	  rtx src1_reg = copy_addr_to_reg (XEXP (src1, 0));
-	  src1 = replace_equiv_address (src1, src1_reg);
-	}
-
-      if (!REG_P (XEXP (src2, 0)))
-	{
-	  rtx src2_reg = copy_addr_to_reg (XEXP (src2, 0));
-	  src2 = replace_equiv_address (src2, src2_reg);
-	}
-
-      if (no_length)
-	{
-	  tree fun = builtin_decl_explicit (BUILT_IN_STRCMP);
-	  emit_library_call_value (XEXP (DECL_RTL (fun), 0),
-				   target, LCT_NORMAL, GET_MODE (target), 2,
-				   force_reg (Pmode, XEXP (src1, 0)), Pmode,
-				   force_reg (Pmode, XEXP (src2, 0)), Pmode);
-	}
-      else
-	{
-	  /* -m32 -mpowerpc64 results in word_mode being DImode even
-	     though otherwise it is 32-bit. The length arg to strncmp
-	     is a size_t which will be the same size as pointers.  */
-	  rtx len_rtx;
-	  if (TARGET_64BIT)
-	    len_rtx = gen_reg_rtx (DImode);
-	  else
-	    len_rtx = gen_reg_rtx (SImode);
-
-	  emit_move_insn (len_rtx, bytes_rtx);
-
-	  tree fun = builtin_decl_explicit (BUILT_IN_STRNCMP);
-	  emit_library_call_value (XEXP (DECL_RTL (fun), 0),
-				   target, LCT_NORMAL, GET_MODE (target), 3,
-				   force_reg (Pmode, XEXP (src1, 0)), Pmode,
-				   force_reg (Pmode, XEXP (src2, 0)), Pmode,
-				   len_rtx, GET_MODE (len_rtx));
-	}
-
-      rtx fin_ref = gen_rtx_LABEL_REF (VOIDmode, final_label);
-      jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, fin_ref));
-      JUMP_LABEL (jmp) = final_label;
-      LABEL_NUSES (final_label) += 1;
-      emit_barrier ();
-      emit_label (begin_compare_label);
-    }
-
-  rtx cleanup_label = NULL;
-  rtx tmp_reg_src1 = gen_reg_rtx (word_mode);
-  rtx tmp_reg_src2 = gen_reg_rtx (word_mode);
-
-  /* Generate sequence of ld/ldbrx, cmpb to compare out
-     to the length specified.  */
-  unsigned HOST_WIDE_INT bytes_to_compare = compare_length;
-  while (bytes_to_compare > 0)
-    {
-      /* Compare sequence:
-         check each 8B with: ld/ld cmpd bne
-	 If equal, use rldicr/cmpb to check for zero byte.
-         cleanup code at end:
-         cmpb          get byte that differs
-         cmpb          look for zero byte
-         orc           combine
-         cntlzd        get bit of first zero/diff byte
-         subfic        convert for rldcl use
-         rldcl rldcl   extract diff/zero byte
-         subf          subtract for final result
-
-         The last compare can branch around the cleanup code if the
-         result is zero because the strings are exactly equal.  */
-      unsigned int align = compute_current_alignment (base_align, offset);
-      if (TARGET_EFFICIENT_OVERLAPPING_UNALIGNED)
-	load_mode = select_block_compare_mode (offset, bytes_to_compare, align,
-					       word_mode_ok);
-      else
-	load_mode = select_block_compare_mode (0, bytes_to_compare, align,
-					       word_mode_ok);
-      load_mode_size = GET_MODE_SIZE (load_mode);
-      if (bytes_to_compare >= load_mode_size)
-	cmp_bytes = load_mode_size;
-      else if (TARGET_EFFICIENT_OVERLAPPING_UNALIGNED)
-	{
-	  /* Move this load back so it doesn't go past the end.
-	     P8/P9 can do this efficiently.  */
-	  unsigned int extra_bytes = load_mode_size - bytes_to_compare;
-	  cmp_bytes = bytes_to_compare;
-	  if (extra_bytes < offset)
-	    {
-	      offset -= extra_bytes;
-	      cmp_bytes = load_mode_size;
-	      bytes_to_compare = cmp_bytes;
-	    }
-	}
-      else
-	/* P7 and earlier can't do the overlapping load trick fast,
-	   so this forces a non-overlapping load and a shift to get
-	   rid of the extra bytes.  */
-	cmp_bytes = bytes_to_compare;
-
-      src1 = adjust_address (orig_src1, load_mode, offset);
-      src2 = adjust_address (orig_src2, load_mode, offset);
-
-      if (!REG_P (XEXP (src1, 0)))
-	{
-	  rtx src1_reg = copy_addr_to_reg (XEXP (src1, 0));
-	  src1 = replace_equiv_address (src1, src1_reg);
-	}
-      set_mem_size (src1, cmp_bytes);
-
-      if (!REG_P (XEXP (src2, 0)))
-	{
-	  rtx src2_reg = copy_addr_to_reg (XEXP (src2, 0));
-	  src2 = replace_equiv_address (src2, src2_reg);
-	}
-      set_mem_size (src2, cmp_bytes);
-
-      do_load_for_compare (tmp_reg_src1, src1, load_mode);
-      do_load_for_compare (tmp_reg_src2, src2, load_mode);
-
-      /* We must always left-align the data we read, and
-	 clear any bytes to the right that are beyond the string.
-	 Otherwise the cmpb sequence won't produce the correct
-	 results.  The beginning of the compare will be done
-	 with word_mode loads, so it needs no extra shifts or
-	 clear-right operations.  */
-
-      if (load_mode_size < word_mode_size)
-	{
-	  /* Rotate left first.  */
-	  rtx sh = GEN_INT (BITS_PER_UNIT * (word_mode_size - load_mode_size));
-	  if (word_mode == DImode)
-	    {
-	      emit_insn (gen_rotldi3 (tmp_reg_src1, tmp_reg_src1, sh));
-	      emit_insn (gen_rotldi3 (tmp_reg_src2, tmp_reg_src2, sh));
-	    }
-	  else
-	    {
-	      emit_insn (gen_rotlsi3 (tmp_reg_src1, tmp_reg_src1, sh));
-	      emit_insn (gen_rotlsi3 (tmp_reg_src2, tmp_reg_src2, sh));
-	    }
-	}
-
-      if (cmp_bytes < word_mode_size)
-	{
-	  /* Now clear right.  This plus the rotate can be
-	     turned into a rldicr instruction.  */
-	  HOST_WIDE_INT mb = BITS_PER_UNIT * (word_mode_size - cmp_bytes);
-	  rtx mask = GEN_INT (HOST_WIDE_INT_M1U << mb);
-	  if (word_mode == DImode)
-	    {
-	      emit_insn (gen_anddi3_mask (tmp_reg_src1, tmp_reg_src1, mask));
-	      emit_insn (gen_anddi3_mask (tmp_reg_src2, tmp_reg_src2, mask));
-	    }
-	  else
-	    {
-	      emit_insn (gen_andsi3_mask (tmp_reg_src1, tmp_reg_src1, mask));
-	      emit_insn (gen_andsi3_mask (tmp_reg_src2, tmp_reg_src2, mask));
-	    }
-	}
-
-      /* Cases to handle.  A and B are chunks of the two strings.
-	 1: Not end of comparison:
-	 A != B: branch to cleanup code to compute result.
-	 A == B: check for 0 byte, next block if not found.
-	 2: End of the inline comparison:
-	 A != B: branch to cleanup code to compute result.
-	 A == B: check for 0 byte, call strcmp/strncmp
-	 3: Compared the requested N bytes:
-	 A == B: branch to result 0.
-	 A != B: cleanup code to compute result.  */
-
-      unsigned HOST_WIDE_INT remain = bytes_to_compare - cmp_bytes;
-
-      rtx dst_label;
-      if (remain > 0 || equality_compare_rest)
-	{
-	  /* Branch to cleanup code, otherwise fall through to do
-	     more compares.  */
-	  if (!cleanup_label)
-	    cleanup_label = gen_label_rtx ();
-	  dst_label = cleanup_label;
-	}
-      else
-	/* Branch to end and produce result of 0.  */
-	dst_label = final_move_label;
-
-      rtx lab_ref = gen_rtx_LABEL_REF (VOIDmode, dst_label);
-      rtx cond = gen_reg_rtx (CCmode);
-
-      /* Always produce the 0 result; it is needed if
-	 cmpb finds a 0 byte in this chunk.  */
-      rtx tmp = gen_rtx_MINUS (word_mode, tmp_reg_src1, tmp_reg_src2);
-      rs6000_emit_dot_insn (result_reg, tmp, 1, cond);
-
-      rtx cmp_rtx;
-      if (remain == 0 && !equality_compare_rest)
-	cmp_rtx = gen_rtx_EQ (VOIDmode, cond, const0_rtx);
-      else
-	cmp_rtx = gen_rtx_NE (VOIDmode, cond, const0_rtx);
-
-      rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, cmp_rtx,
-					 lab_ref, pc_rtx);
-      rtx j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
-      JUMP_LABEL (j) = dst_label;
-      LABEL_NUSES (dst_label) += 1;
-
-      if (remain > 0 || equality_compare_rest)
-	{
-	  /* Generate a cmpb to test for a 0 byte and branch
-	     to final result if found.  */
-	  rtx cmpb_zero = gen_reg_rtx (word_mode);
-	  rtx lab_ref_fin = gen_rtx_LABEL_REF (VOIDmode, final_move_label);
-	  rtx condz = gen_reg_rtx (CCmode);
-	  rtx zero_reg = gen_reg_rtx (word_mode);
-	  if (word_mode == SImode)
-	    {
-	      emit_insn (gen_movsi (zero_reg, GEN_INT (0)));
-	      emit_insn (gen_cmpbsi3 (cmpb_zero, tmp_reg_src1, zero_reg));
-	      if (cmp_bytes < word_mode_size)
-		{
-		  /* Don't want to look at zero bytes past end.  */
-		  HOST_WIDE_INT mb =
-		    BITS_PER_UNIT * (word_mode_size - cmp_bytes);
-		  rtx mask = GEN_INT (HOST_WIDE_INT_M1U << mb);
-		  emit_insn (gen_andsi3_mask (cmpb_zero, cmpb_zero, mask));
-		}
-	    }
-	  else
-	    {
-	      emit_insn (gen_movdi (zero_reg, GEN_INT (0)));
-	      emit_insn (gen_cmpbdi3 (cmpb_zero, tmp_reg_src1, zero_reg));
-	      if (cmp_bytes < word_mode_size)
-		{
-		  /* Don't want to look at zero bytes past end.  */
-		  HOST_WIDE_INT mb =
-		    BITS_PER_UNIT * (word_mode_size - cmp_bytes);
-		  rtx mask = GEN_INT (HOST_WIDE_INT_M1U << mb);
-		  emit_insn (gen_anddi3_mask (cmpb_zero, cmpb_zero, mask));
-		}
-	    }
-
-	  emit_move_insn (condz, gen_rtx_COMPARE (CCmode, cmpb_zero, zero_reg));
-	  rtx cmpnz_rtx = gen_rtx_NE (VOIDmode, condz, const0_rtx);
-	  rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, cmpnz_rtx,
-					     lab_ref_fin, pc_rtx);
-	  rtx j2 = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
-	  JUMP_LABEL (j2) = final_move_label;
-	  LABEL_NUSES (final_move_label) += 1;
-	}
-
-      offset += cmp_bytes;
-      bytes_to_compare -= cmp_bytes;
-    }
-
-  if (equality_compare_rest)
-    {
-      /* Update pointers past what has been compared already.  */
-      src1 = adjust_address (orig_src1, load_mode, offset);
-      src2 = adjust_address (orig_src2, load_mode, offset);
-
-      if (!REG_P (XEXP (src1, 0)))
-	{
-	  rtx src1_reg = copy_addr_to_reg (XEXP (src1, 0));
-	  src1 = replace_equiv_address (src1, src1_reg);
-	}
-      set_mem_size (src1, cmp_bytes);
-
-      if (!REG_P (XEXP (src2, 0)))
-	{
-	  rtx src2_reg = copy_addr_to_reg (XEXP (src2, 0));
-	  src2 = replace_equiv_address (src2, src2_reg);
-	}
-      set_mem_size (src2, cmp_bytes);
-
-      /* Construct call to strcmp/strncmp to compare the rest of the string.  */
-      if (no_length)
-	{
-	  tree fun = builtin_decl_explicit (BUILT_IN_STRCMP);
-	  emit_library_call_value (XEXP (DECL_RTL (fun), 0),
-				   target, LCT_NORMAL, GET_MODE (target), 2,
-				   force_reg (Pmode, XEXP (src1, 0)), Pmode,
-				   force_reg (Pmode, XEXP (src2, 0)), Pmode);
-	}
-      else
-	{
-	  rtx len_rtx;
-	  if (TARGET_64BIT)
-	    len_rtx = gen_reg_rtx (DImode);
-	  else
-	    len_rtx = gen_reg_rtx (SImode);
-
-	  emit_move_insn (len_rtx, GEN_INT (bytes - compare_length));
-	  tree fun = builtin_decl_explicit (BUILT_IN_STRNCMP);
-	  emit_library_call_value (XEXP (DECL_RTL (fun), 0),
-				   target, LCT_NORMAL, GET_MODE (target), 3,
-				   force_reg (Pmode, XEXP (src1, 0)), Pmode,
-				   force_reg (Pmode, XEXP (src2, 0)), Pmode,
-				   len_rtx, GET_MODE (len_rtx));
-	}
-
-      rtx fin_ref = gen_rtx_LABEL_REF (VOIDmode, final_label);
-      rtx jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, fin_ref));
-      JUMP_LABEL (jmp) = final_label;
-      LABEL_NUSES (final_label) += 1;
-      emit_barrier ();
-    }
-
-  if (cleanup_label)
-    emit_label (cleanup_label);
-
-  /* Generate the final sequence that identifies the differing
-     byte and generates the final result, taking into account
-     zero bytes:
-
-     cmpb              cmpb_result1, src1, src2
-     cmpb              cmpb_result2, src1, zero
-     orc               cmpb_result1, cmpb_result1, cmpb_result2
-     cntlzd            get bit of first zero/diff byte
-     addi              convert for rldcl use
-     rldcl rldcl       extract diff/zero byte
-     subf              subtract for final result
-  */
-
-  rtx cmpb_diff = gen_reg_rtx (word_mode);
-  rtx cmpb_zero = gen_reg_rtx (word_mode);
-  rtx rot_amt = gen_reg_rtx (word_mode);
-  rtx zero_reg = gen_reg_rtx (word_mode);
-
-  rtx rot1_1 = gen_reg_rtx (word_mode);
-  rtx rot1_2 = gen_reg_rtx (word_mode);
-  rtx rot2_1 = gen_reg_rtx (word_mode);
-  rtx rot2_2 = gen_reg_rtx (word_mode);
-
-  if (word_mode == SImode)
-    {
-      emit_insn (gen_cmpbsi3 (cmpb_diff, tmp_reg_src1, tmp_reg_src2));
-      emit_insn (gen_movsi (zero_reg, GEN_INT (0)));
-      emit_insn (gen_cmpbsi3 (cmpb_zero, tmp_reg_src1, zero_reg));
-      emit_insn (gen_one_cmplsi2 (cmpb_diff, cmpb_diff));
-      emit_insn (gen_iorsi3 (cmpb_diff, cmpb_diff, cmpb_zero));
-      emit_insn (gen_clzsi2 (rot_amt, cmpb_diff));
-      emit_insn (gen_addsi3 (rot_amt, rot_amt, GEN_INT (8)));
-      emit_insn (gen_rotlsi3 (rot1_1, tmp_reg_src1,
-			      gen_lowpart (SImode, rot_amt)));
-      emit_insn (gen_andsi3_mask (rot1_2, rot1_1, GEN_INT (0xff)));
-      emit_insn (gen_rotlsi3 (rot2_1, tmp_reg_src2,
-			      gen_lowpart (SImode, rot_amt)));
-      emit_insn (gen_andsi3_mask (rot2_2, rot2_1, GEN_INT (0xff)));
-      emit_insn (gen_subsi3 (result_reg, rot1_2, rot2_2));
-    }
-  else
-    {
-      emit_insn (gen_cmpbdi3 (cmpb_diff, tmp_reg_src1, tmp_reg_src2));
-      emit_insn (gen_movdi (zero_reg, GEN_INT (0)));
-      emit_insn (gen_cmpbdi3 (cmpb_zero, tmp_reg_src1, zero_reg));
-      emit_insn (gen_one_cmpldi2 (cmpb_diff, cmpb_diff));
-      emit_insn (gen_iordi3 (cmpb_diff, cmpb_diff, cmpb_zero));
-      emit_insn (gen_clzdi2 (rot_amt, cmpb_diff));
-      emit_insn (gen_adddi3 (rot_amt, rot_amt, GEN_INT (8)));
-      emit_insn (gen_rotldi3 (rot1_1, tmp_reg_src1,
-			      gen_lowpart (SImode, rot_amt)));
-      emit_insn (gen_anddi3_mask (rot1_2, rot1_1, GEN_INT (0xff)));
-      emit_insn (gen_rotldi3 (rot2_1, tmp_reg_src2,
-			      gen_lowpart (SImode, rot_amt)));
-      emit_insn (gen_anddi3_mask (rot2_2, rot2_1, GEN_INT (0xff)));
-      emit_insn (gen_subdi3 (result_reg, rot1_2, rot2_2));
-    }
-
-  emit_label (final_move_label);
-  emit_insn (gen_movsi (target,
-			gen_lowpart (SImode, result_reg)));
-  emit_label (final_label);
-  return true;
-}
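
Again for reference, illustrative user code for the two entry points
(function names are examples): a constant-length strncmp takes the
no_length == 0 path, a plain strcmp the no_length != 0 path, and any
tail beyond the inline limit falls back to the library call emitted
above.

#include <string.h>

int
has_prefix (const char *s)
{
  return strncmp (s, "prefix-", 7) == 0;  /* constant length */
}

int
same_string (const char *a, const char *b)
{
  return strcmp (a, b) == 0;              /* no_length case */
}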
-
-/* Expand a block move operation, and return 1 if successful.  Return 0
-   if we should let the compiler generate normal code.
-
-   operands[0] is the destination
-   operands[1] is the source
-   operands[2] is the length
-   operands[3] is the alignment */
-
-#define MAX_MOVE_REG 4
-
-int
-expand_block_move (rtx operands[])
-{
-  rtx orig_dest = operands[0];
-  rtx orig_src	= operands[1];
-  rtx bytes_rtx	= operands[2];
-  rtx align_rtx = operands[3];
-  int constp	= (GET_CODE (bytes_rtx) == CONST_INT);
-  int align;
-  int bytes;
-  int offset;
-  int move_bytes;
-  rtx stores[MAX_MOVE_REG];
-  int num_reg = 0;
-
-  /* If this is not a fixed size move, just call memcpy.  */
-  if (! constp)
-    return 0;
-
-  /* This must be a fixed size alignment.  */
-  gcc_assert (GET_CODE (align_rtx) == CONST_INT);
-  align = INTVAL (align_rtx) * BITS_PER_UNIT;
-
-  /* Anything to move? */
-  bytes = INTVAL (bytes_rtx);
-  if (bytes <= 0)
-    return 1;
-
-  if (bytes > rs6000_block_move_inline_limit)
-    return 0;
-
-  for (offset = 0; bytes > 0; offset += move_bytes, bytes -= move_bytes)
-    {
-      union {
-	rtx (*movmemsi) (rtx, rtx, rtx, rtx);
-	rtx (*mov) (rtx, rtx);
-      } gen_func;
-      machine_mode mode = BLKmode;
-      rtx src, dest;
-
-      /* Altivec first, since it will be faster than a string move
-	 when it applies, and usually not significantly larger.  */
-      if (TARGET_ALTIVEC && bytes >= 16 && align >= 128)
-	{
-	  move_bytes = 16;
-	  mode = V4SImode;
-	  gen_func.mov = gen_movv4si;
-	}
-      else if (TARGET_STRING
-	  && bytes > 24		/* move up to 32 bytes at a time */
-	  && ! fixed_regs[5]
-	  && ! fixed_regs[6]
-	  && ! fixed_regs[7]
-	  && ! fixed_regs[8]
-	  && ! fixed_regs[9]
-	  && ! fixed_regs[10]
-	  && ! fixed_regs[11]
-	  && ! fixed_regs[12])
-	{
-	  move_bytes = (bytes > 32) ? 32 : bytes;
-	  gen_func.movmemsi = gen_movmemsi_8reg;
-	}
-      else if (TARGET_STRING
-	       && bytes > 16	/* move up to 24 bytes at a time */
-	       && ! fixed_regs[5]
-	       && ! fixed_regs[6]
-	       && ! fixed_regs[7]
-	       && ! fixed_regs[8]
-	       && ! fixed_regs[9]
-	       && ! fixed_regs[10])
-	{
-	  move_bytes = (bytes > 24) ? 24 : bytes;
-	  gen_func.movmemsi = gen_movmemsi_6reg;
-	}
-      else if (TARGET_STRING
-	       && bytes > 8	/* move up to 16 bytes at a time */
-	       && ! fixed_regs[5]
-	       && ! fixed_regs[6]
-	       && ! fixed_regs[7]
-	       && ! fixed_regs[8])
-	{
-	  move_bytes = (bytes > 16) ? 16 : bytes;
-	  gen_func.movmemsi = gen_movmemsi_4reg;
-	}
-      else if (bytes >= 8 && TARGET_POWERPC64
-	       && (align >= 64 || !STRICT_ALIGNMENT))
-	{
-	  move_bytes = 8;
-	  mode = DImode;
-	  gen_func.mov = gen_movdi;
-	  if (offset == 0 && align < 64)
-	    {
-	      rtx addr;
-
-	      /* If the address form is reg+offset with offset not a
-		 multiple of four, reload into reg indirect form here
-		 rather than waiting for reload.  This way we get one
-		 reload, not one per load and/or store.  */
-	      addr = XEXP (orig_dest, 0);
-	      if ((GET_CODE (addr) == PLUS || GET_CODE (addr) == LO_SUM)
-		  && GET_CODE (XEXP (addr, 1)) == CONST_INT
-		  && (INTVAL (XEXP (addr, 1)) & 3) != 0)
-		{
-		  addr = copy_addr_to_reg (addr);
-		  orig_dest = replace_equiv_address (orig_dest, addr);
-		}
-	      addr = XEXP (orig_src, 0);
-	      if ((GET_CODE (addr) == PLUS || GET_CODE (addr) == LO_SUM)
-		  && GET_CODE (XEXP (addr, 1)) == CONST_INT
-		  && (INTVAL (XEXP (addr, 1)) & 3) != 0)
-		{
-		  addr = copy_addr_to_reg (addr);
-		  orig_src = replace_equiv_address (orig_src, addr);
-		}
-	    }
-	}
-      else if (TARGET_STRING && bytes > 4 && !TARGET_POWERPC64)
-	{			/* move up to 8 bytes at a time */
-	  move_bytes = (bytes > 8) ? 8 : bytes;
-	  gen_func.movmemsi = gen_movmemsi_2reg;
-	}
-      else if (bytes >= 4 && (align >= 32 || !STRICT_ALIGNMENT))
-	{			/* move 4 bytes */
-	  move_bytes = 4;
-	  mode = SImode;
-	  gen_func.mov = gen_movsi;
-	}
-      else if (bytes >= 2 && (align >= 16 || !STRICT_ALIGNMENT))
-	{			/* move 2 bytes */
-	  move_bytes = 2;
-	  mode = HImode;
-	  gen_func.mov = gen_movhi;
-	}
-      else if (TARGET_STRING && bytes > 1)
-	{			/* move up to 4 bytes at a time */
-	  move_bytes = (bytes > 4) ? 4 : bytes;
-	  gen_func.movmemsi = gen_movmemsi_1reg;
-	}
-      else /* move 1 byte at a time */
-	{
-	  move_bytes = 1;
-	  mode = QImode;
-	  gen_func.mov = gen_movqi;
-	}
-
-      src = adjust_address (orig_src, mode, offset);
-      dest = adjust_address (orig_dest, mode, offset);
-
-      if (mode != BLKmode)
-	{
-	  rtx tmp_reg = gen_reg_rtx (mode);
-
-	  emit_insn ((*gen_func.mov) (tmp_reg, src));
-	  stores[num_reg++] = (*gen_func.mov) (dest, tmp_reg);
-	}
-
-      if (mode == BLKmode || num_reg >= MAX_MOVE_REG || bytes == move_bytes)
-	{
-	  int i;
-	  for (i = 0; i < num_reg; i++)
-	    emit_insn (stores[i]);
-	  num_reg = 0;
-	}
-
-      if (mode == BLKmode)
-	{
-	  /* Move the address into scratch registers.  The movmemsi
-	     patterns require zero offset.  */
-	  if (!REG_P (XEXP (src, 0)))
-	    {
-	      rtx src_reg = copy_addr_to_reg (XEXP (src, 0));
-	      src = replace_equiv_address (src, src_reg);
-	    }
-	  set_mem_size (src, move_bytes);
-
-	  if (!REG_P (XEXP (dest, 0)))
-	    {
-	      rtx dest_reg = copy_addr_to_reg (XEXP (dest, 0));
-	      dest = replace_equiv_address (dest, dest_reg);
-	    }
-	  set_mem_size (dest, move_bytes);
-
-	  emit_insn ((*gen_func.movmemsi) (dest, src,
-					   GEN_INT (move_bytes & 31),
-					   align_rtx));
-	}
-    }
-
-  return 1;
-}
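
Illustrative user code for this expander (function name hypothetical):
a constant-size memcpy within rs6000_block_move_inline_limit is
expanded inline through the movmemsi pattern.

#include <string.h>

void
copy32 (void *dst, const void *src)
{
  memcpy (dst, src, 32);  /* constant size, expanded inline */
}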
-
 
-/* Return a string to perform a load_multiple operation.
-   operands[0] is the vector.
-   operands[1] is the source address.
-   operands[2] is the first destination register.  */
-
-const char *
-rs6000_output_load_multiple (rtx operands[3])
-{
-  /* We have to handle the case where the pseudo used to contain the address
-     is assigned to one of the output registers.  */
-  int i, j;
-  int words = XVECLEN (operands[0], 0);
-  rtx xop[10];
-
-  if (XVECLEN (operands[0], 0) == 1)
-    return "lwz %2,0(%1)";
-
-  for (i = 0; i < words; i++)
-    if (refers_to_regno_p (REGNO (operands[2]) + i, operands[1]))
-      {
-	if (i == words-1)
-	  {
-	    xop[0] = GEN_INT (4 * (words-1));
-	    xop[1] = operands[1];
-	    xop[2] = operands[2];
-	    output_asm_insn ("lswi %2,%1,%0\n\tlwz %1,%0(%1)", xop);
-	    return "";
-	  }
-	else if (i == 0)
-	  {
-	    xop[0] = GEN_INT (4 * (words-1));
-	    xop[1] = operands[1];
-	    xop[2] = gen_rtx_REG (SImode, REGNO (operands[2]) + 1);
-	    output_asm_insn ("addi %1,%1,4\n\tlswi %2,%1,%0\n\tlwz %1,-4(%1)", xop);
-	    return "";
-	  }
-	else
-	  {
-	    for (j = 0; j < words; j++)
-	      if (j != i)
-		{
-		  xop[0] = GEN_INT (j * 4);
-		  xop[1] = operands[1];
-		  xop[2] = gen_rtx_REG (SImode, REGNO (operands[2]) + j);
-		  output_asm_insn ("lwz %2,%0(%1)", xop);
-		}
-	    xop[0] = GEN_INT (i * 4);
-	    xop[1] = operands[1];
-	    output_asm_insn ("lwz %1,%0(%1)", xop);
-	    return "";
-	  }
-      }
-
-  return "lswi %2,%1,%N0";
-}
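
As an illustration (register numbers assumed): a 4-word load_multiple
from (r9) into r6..r9, where the address register is also the last
destination register, takes the i == words-1 branch above and emits

	lswi 6,9,12
	lwz 9,12(9)

so the first three words land in r6..r8 and the address register is
clobbered only by the final load.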
-
-
 /* A validation routine: say whether CODE, a condition code, and MODE
    match.  The other alternatives either don't make sense or should
    never be generated.  */
Index: config/rs6000/t-rs6000
===================================================================
--- config/rs6000/t-rs6000	(revision 249565)
+++ config/rs6000/t-rs6000	(working copy)
@@ -26,6 +26,10 @@
 	$(COMPILE) $<
 	$(POSTCOMPILE)
 
+rs6000-string.o: $(srcdir)/config/rs6000/rs6000-string.c
+	$(COMPILE) $<
+	$(POSTCOMPILE)
+
 $(srcdir)/config/rs6000/rs6000-tables.opt: $(srcdir)/config/rs6000/genopt.sh \
   $(srcdir)/config/rs6000/rs6000-cpus.def
 	$(SHELL) $(srcdir)/config/rs6000/genopt.sh $(srcdir)/config/rs6000 > \
