x86_64 merger part 24 - string operations

Jan Hubicka hubicka@atrey.karlin.mff.cuni.cz
Sat Mar 24 16:48:00 GMT 2001


Hi
This patch adds the string patterns.  The code produced by these should be
comparable with the 32-bit code, except for the movstrlensi_unroll_1 case,
which still needs to be updated to work on 64-bit values but should be OK
for a first pass.

Ne bře 25 00:39:39 CET 2001  Jan Hubicka  <jh@suse.cz>

	* i386.md (movstrsi): Move offline.
	(movstrdi): New.
	(strmovdi_rex64): New.
	(strmov?i): Accept 64bit.
	(strmov?i_rex64): New.
	(strmov?i_rex_1): New.
	(strmov?i_1): Disable for 64bit.
	(rep_mov?i_rex64): New.
	(rep_mov?i): Disable for 64bit.
	(clrstrsi): Move offline.
	(strset?i_rex64): New.
	(strset?i): Accept 64bit.
	(rep_stos?i): Disable for 64bit.
	(rep_stos?i_rex64): New.
	(strset?i_rex_1): New.
	(strset?i_1): Disable for 64bit.
	(cmpstrsi): Accept 64bit.
	(cmpstrsi_nz_1): Rename to cmpstrqi_nz_1; Disable for 64bit.
	(cmpstrqi_nz_rex_1): New.
	(cmpstrsi_1): Rename to cmpstrqi_1; Disable for 64bit.
	(strlensi): Move offline.
	(strlendi): New.
	(strlenqi_1): Disable for 64bit; fix constraints.
	(strlenqi_rex_1): New.
	* i386.c (ix86_adjust_counter): New static function.
	(ix86_zero_extend_to_Pmode): Likewise.
	(ix86_expand_aligntest): Likewise.
	(ix86_expand_strlensi_unroll_1): Make static; update for 64bit.
	(ix86_expand_movstr): New global function.
	(ix86_expand_clrstr): New global function.
	(ix86_expand_strlen): New global function.
	* i386-protos.h (ix86_expand_movstr, ix86_expand_clrstr,
	ix86_expand_strlen): Declare.
	(ix86_expand_strlensi_unroll_1): Delete.

*** i386.md	Thu Mar 22 21:45:30 2001
--- /p1/new/x86-64/gcc/gcc/config/i386/i386.md	Sun Mar 25 00:35:28 2001
***************
*** 12021,12215 ****
     (use (match_operand:BLK 1 "memory_operand" ""))
     (use (match_operand:SI 2 "nonmemory_operand" ""))
     (use (match_operand:SI 3 "const_int_operand" ""))]
!   ""
    "
  {
!   rtx srcreg, destreg, countreg;
!   int align = 0;
!   int count = -1;
!   rtx insns;
! 
!   start_sequence ();
! 
!   if (GET_CODE (operands[3]) == CONST_INT)
!     align = INTVAL (operands[3]);
! 
!   /* This simple hack avoids all inlining code and simplifies code bellow.  */
!   if (!TARGET_ALIGN_STRINGOPS)
!     align = 32;
! 
!   if (GET_CODE (operands[2]) == CONST_INT)
!     count = INTVAL (operands[2]);
! 
!   destreg = copy_to_mode_reg (Pmode, XEXP (operands[0], 0));
!   srcreg = copy_to_mode_reg (Pmode, XEXP (operands[1], 0));
! 
!   emit_insn (gen_cld ());
! 
!   /* When optimizing for size emit simple rep ; movsb instruction for
!      counts not divisible by 4.  */
  
!   if ((!optimize || optimize_size) 
!       && (count < 0 || (count & 0x03)))
!     {
!       countreg = copy_to_mode_reg (SImode, operands[2]);
!       emit_insn (gen_rep_movqi (destreg, srcreg, countreg,
!       				destreg, srcreg, countreg));
!     }
  
!   /* For constant aligned (or small unaligned) copies use rep movsl
!      followed by code copying the rest.  For PentiumPro ensure 8 byte
!      alignment to allow rep movsl acceleration.  */
  
!   else if (count >= 0 
! 	   && (align >= 8
! 	       || (!TARGET_PENTIUMPRO && align >= 4)
! 	       || optimize_size || count < 64))
!     {
!       if (count & ~0x03)
! 	{
! 	  countreg = copy_to_mode_reg (SImode,
! 	  			       GEN_INT ((count >> 2)
! 						& 0x3fffffff));
! 	  emit_insn (gen_rep_movsi (destreg, srcreg, countreg,
! 				    destreg, srcreg, countreg));
! 	}
!       if (count & 0x02)
! 	emit_insn (gen_strmovhi (destreg, srcreg));
!       if (count & 0x01)
! 	emit_insn (gen_strmovqi (destreg, srcreg));
!     }
!   /* The generic code based on the glibc implementation:
!      - align destination to 4 bytes (8 byte alignment is used for PentiumPro
!        allowing accelerated copying there)
!      - copy the data using rep movsl
!      - copy the rest.  */
!   else
      {
!       rtx countreg2;
!       rtx label = NULL;
! 
!       /* In case we don't know anything about the alignment, default to
!          library version, since it is usually equally fast and result in
! 	 shorter code.  */
!       if (!TARGET_INLINE_ALL_STRINGOPS && align < 4)
! 	{
! 	  end_sequence ();
! 	  FAIL;
! 	}
! 
!       if (TARGET_SINGLE_STRINGOP)
! 	emit_insn (gen_cld ());
! 
!       countreg2 = gen_reg_rtx (SImode);
!       countreg = copy_to_mode_reg (SImode, operands[2]);
! 
!       /* We don't use loops to align destination and to copy parts smaller
! 	 than 4 bytes, because gcc is able to optimize such code better (in
! 	 the case the destination or the count really is aligned, gcc is often
! 	 able to predict the branches) and also it is friendlier to the
! 	 hardware branch prediction.  
! 
! 	 Using loops is benefical for generic case, because we can
! 	 handle small counts using the loops.  Many CPUs (such as Athlon)
! 	 have large REP prefix setup costs.
! 
! 	 This is quite costy.  Maybe we can revisit this decision later or
! 	 add some customizability to this code.  */
! 
!       if (count < 0
! 	  && align < (TARGET_PENTIUMPRO && (count < 0 || count >= 260) ? 8 : 4))
! 	{
! 	  label = gen_label_rtx ();
! 	  emit_cmp_and_jump_insns (countreg, GEN_INT (3),
! 				   LEU, 0, SImode, 1, 0, label);
! 	}
!       if (align <= 1)
! 	{
! 	  rtx label = gen_label_rtx ();
! 	  rtx tmpcount = gen_reg_rtx (SImode);
! 	  emit_insn (gen_andsi3 (tmpcount, destreg, GEN_INT (1)));
! 	  emit_cmp_and_jump_insns (tmpcount, GEN_INT (0), EQ, 0,
! 				   SImode, 1, 0, label);
! 	  emit_insn (gen_strmovqi (destreg, srcreg));
! 	  emit_insn (gen_addsi3 (countreg, countreg, constm1_rtx));
! 	  emit_label (label);
! 	  LABEL_NUSES (label) = 1;
! 	}
!       if (align <= 2)
! 	{
! 	  rtx label = gen_label_rtx ();
! 	  rtx tmpcount = gen_reg_rtx (SImode);
! 	  emit_insn (gen_andsi3 (tmpcount, destreg, GEN_INT (2)));
! 	  emit_cmp_and_jump_insns (tmpcount, GEN_INT (0), EQ, 0,
! 				   SImode, 1, 0, label);
! 	  emit_insn (gen_strmovhi (destreg, srcreg));
! 	  emit_insn (gen_addsi3 (countreg, countreg, GEN_INT (-2)));
! 	  emit_label (label);
! 	  LABEL_NUSES (label) = 1;
! 	}
!       if (align <= 4 && TARGET_PENTIUMPRO && (count < 1 || count >= 260))
! 	{
! 	  rtx label = gen_label_rtx ();
! 	  rtx tmpcount = gen_reg_rtx (SImode);
! 	  emit_insn (gen_andsi3 (tmpcount, destreg, GEN_INT (4)));
! 	  emit_cmp_and_jump_insns (tmpcount, GEN_INT (0), EQ, 0,
! 				   SImode, 1, 0, label);
! 	  emit_insn (gen_strmovsi (destreg, srcreg));
! 	  emit_insn (gen_addsi3 (countreg, countreg, GEN_INT (-4)));
! 	  emit_label (label);
! 	  LABEL_NUSES (label) = 1;
! 	}
! 
!       if (!TARGET_SINGLE_STRINGOP)
! 	emit_insn (gen_cld());
!       emit_insn (gen_lshrsi3 (countreg2, countreg, GEN_INT (2)));
!       emit_insn (gen_rep_movsi (destreg, srcreg, countreg2,
! 				destreg, srcreg, countreg2));
! 
!       if (label)
! 	{
! 	  emit_label (label);
! 	  LABEL_NUSES (label) = 1;
! 	}
!       if (align > 2 && count > 0 && (count & 2))
! 	emit_insn (gen_strmovhi (destreg, srcreg));
!       if (align <= 2 || count < 0)
! 	{
! 	  rtx label = gen_label_rtx ();
! 	  rtx tmpcount = gen_reg_rtx (SImode);
! 	  emit_insn (gen_andsi3 (tmpcount, countreg, GEN_INT (2)));
! 	  emit_cmp_and_jump_insns (tmpcount, GEN_INT (0), EQ, 0,
! 				   SImode, 1, 0, label);
! 	  emit_insn (gen_strmovhi (destreg, srcreg));
! 	  emit_label (label);
! 	  LABEL_NUSES (label) = 1;
! 	}
!       if (align > 1 && count > 0 && (count & 1))
! 	emit_insn (gen_strmovsi (destreg, srcreg));
!       if (align <= 1 || count < 0)
! 	{
! 	  rtx label = gen_label_rtx ();
! 	  rtx tmpcount = gen_reg_rtx (SImode);
! 	  emit_insn (gen_andsi3 (tmpcount, countreg, GEN_INT (1)));
! 	  emit_cmp_and_jump_insns (tmpcount, GEN_INT (0), EQ, 0,
! 				   SImode, 1, 0, label);
! 	  emit_insn (gen_strmovqi (destreg, srcreg));
! 	  emit_label (label);
! 	  LABEL_NUSES (label) = 1;
! 	}
      }
! 
!   insns = get_insns ();
!   end_sequence ();
! 
!   ix86_set_move_mem_attrs (insns, operands[0], operands[1], destreg, srcreg);
!   emit_insns (insns);
!   DONE;
  }")
  
- ;; Most CPUs don't like single string operations
- ;; Handle this case here to simplify previous expander.
  
  (define_expand "strmovsi"
    [(set (match_dup 2)
--- 14261,14314 ----
     (use (match_operand:BLK 1 "memory_operand" ""))
     (use (match_operand:SI 2 "nonmemory_operand" ""))
     (use (match_operand:SI 3 "const_int_operand" ""))]
!   ""  ; unconditional, as before the patch; only movstrdi is restricted to TARGET_64BIT
    "
  {
!  if (ix86_expand_movstr (operands[0], operands[1], operands[2], operands[3]))  /* nonzero -> DONE, zero -> FAIL */
!    DONE;
!  else
!    FAIL;
! }")
  
! (define_expand "movstrdi"  ; block move whose operands 2 and 3 are DImode; enabled only for TARGET_64BIT
!   [(use (match_operand:BLK 0 "memory_operand" ""))  ; operand roles presumably as in movstrsi: 0 = destination -- confirm in ix86_expand_movstr
!    (use (match_operand:BLK 1 "memory_operand" ""))  ; 1 = source (presumably)
!    (use (match_operand:DI 2 "nonmemory_operand" ""))  ; 2 = byte count (presumably)
!    (use (match_operand:DI 3 "const_int_operand" ""))]  ; 3 = alignment (presumably)
!   "TARGET_64BIT"
!   "
! {
!  if (ix86_expand_movstr (operands[0], operands[1], operands[2], operands[3]))  /* nonzero -> DONE, zero -> FAIL */
!    DONE;
!  else
!    FAIL;
! }")
  
! ;; Most CPUs don't like single string operations
! ;; Handle this case here to simplify previous expander.
  
! (define_expand "strmovdi_rex64"  ; copy one DImode word from (mem operand 1) to (mem operand 0), then add 8 to both pointers
!   [(set (match_dup 2)  ; operand 2: DImode temporary created in the preparation code below
!   	(mem:DI (match_operand:DI 1 "register_operand" "")))
!    (set (mem:DI (match_operand:DI 0 "register_operand" ""))
!         (match_dup 2))
!    (parallel [(set (match_dup 0) (plus:DI (match_dup 0) (const_int 8)))
! 	      (clobber (reg:CC 17))])  ; hard reg 17 in CC mode -- presumably the flags register, confirm
!    (parallel [(set (match_dup 1) (plus:DI (match_dup 1) (const_int 8)))
! 	      (clobber (reg:CC 17))])]
!   "TARGET_64BIT"
!   "
! {
!   if (TARGET_SINGLE_STRINGOP || optimize_size)  /* emit the one-insn strmovdi_rex_1 form instead of the template above */
      {
!       emit_insn (gen_strmovdi_rex_1 (operands[0], operands[1], operands[0],
! 				     operands[1]));
!       DONE;
      }
!   else 
!     operands[2] = gen_reg_rtx (DImode);  /* fresh pseudo for the load/store pair in the template */
  }")
  
  
  (define_expand "strmovsi"
    [(set (match_dup 2)
***************
*** 12220,12228 ****
  	      (clobber (reg:CC 17))])
     (parallel [(set (match_dup 1) (plus:SI (match_dup 1) (const_int 4)))
  	      (clobber (reg:CC 17))])]
    ""
    "
  {
    if (TARGET_SINGLE_STRINGOP || optimize_size)
      {
        emit_insn (gen_strmovsi_1 (operands[0], operands[1], operands[0],
--- 14319,14332 ----
  	      (clobber (reg:CC 17))])
     (parallel [(set (match_dup 1) (plus:SI (match_dup 1) (const_int 4)))
  	      (clobber (reg:CC 17))])]
    ""
    "
  {
+   if (TARGET_64BIT)
+     {
+       emit_insn (gen_strmovsi_rex64 (operands[0], operands[1]));
+       DONE;
+     }
    if (TARGET_SINGLE_STRINGOP || optimize_size)
      {
        emit_insn (gen_strmovsi_1 (operands[0], operands[1], operands[0],
***************
*** 12233,12238 ****
--- 14337,14364 ----
      operands[2] = gen_reg_rtx (SImode);
  }")
  
+ (define_expand "strmovsi_rex64"  ; copy one SImode word between DImode-addressed memory, then add 4 to both pointers
+   [(set (match_dup 2)  ; operand 2: SImode temporary created in the preparation code below
+   	(mem:SI (match_operand:DI 1 "register_operand" "")))
+    (set (mem:SI (match_operand:DI 0 "register_operand" ""))
+         (match_dup 2))
+    (parallel [(set (match_dup 0) (plus:DI (match_dup 0) (const_int 4)))
+ 	      (clobber (reg:CC 17))])  ; hard reg 17 in CC mode -- presumably the flags register, confirm
+    (parallel [(set (match_dup 1) (plus:DI (match_dup 1) (const_int 4)))
+ 	      (clobber (reg:CC 17))])]
+   "TARGET_64BIT"
+   "
+ {
+   if (TARGET_SINGLE_STRINGOP || optimize_size)  /* emit the one-insn strmovsi_rex_1 form instead of the template above */
+     {
+       emit_insn (gen_strmovsi_rex_1 (operands[0], operands[1], operands[0],
+ 				     operands[1]));
+       DONE;
+     }
+   else 
+     operands[2] = gen_reg_rtx (SImode);  /* fresh pseudo for the load/store pair in the template */
+ }")
+ 
  (define_expand "strmovhi"
    [(set (match_dup 2)
    	(mem:HI (match_operand:SI 1 "register_operand" "")))
***************
*** 12240,12254 ****
          (match_dup 2))
     (parallel [(set (match_dup 0) (plus:SI (match_dup 0) (const_int 2)))
  	      (clobber (reg:CC 17))])
!    (parallel [(set (match_dup 1) (plus:SI (match_dup 1) (const_int 2)))
  	      (clobber (reg:CC 17))])]
!   ""
    "
  {
    if (TARGET_SINGLE_STRINGOP || optimize_size)
      {
!       emit_insn (gen_strmovhi_1 (operands[0], operands[1], operands[0],
! 				operands[1]));
        DONE;
      }
    else 
--- 14366,14407 ----
          (match_dup 2))
     (parallel [(set (match_dup 0) (plus:SI (match_dup 0) (const_int 2)))
  	      (clobber (reg:CC 17))])
!    (parallel [(set (match_dup 1) (plus:SI (match_dup 1) (const_int 2)))
! 	      (clobber (reg:CC 17))])]
!   ""
!   "
! {
!   if (TARGET_64BIT)
!     {
!       emit_insn (gen_strmovhi_rex64 (operands[0], operands[1]));
!       DONE;
!     }
!   if (TARGET_SINGLE_STRINGOP || optimize_size)
!     {
!       emit_insn (gen_strmovhi_1 (operands[0], operands[1], operands[0],
! 				operands[1]));
!       DONE;
!     }
!   else 
!     operands[2] = gen_reg_rtx (HImode);
! }")
! 
! (define_expand "strmovhi_rex64"
!   [(set (match_dup 2)
!   	(mem:HI (match_operand:DI 1 "register_operand" "")))
!    (set (mem:HI (match_operand:DI 0 "register_operand" ""))
!         (match_dup 2))
!    (parallel [(set (match_dup 0) (plus:DI (match_dup 0) (const_int 2)))
! 	      (clobber (reg:CC 17))])
!    (parallel [(set (match_dup 1) (plus:DI (match_dup 1) (const_int 2)))
  	      (clobber (reg:CC 17))])]
!   "TARGET_64BIT"
    "
  {
    if (TARGET_SINGLE_STRINGOP || optimize_size)
      {
!       emit_insn (gen_strmovhi_rex_1 (operands[0], operands[1], operands[0],
! 				     operands[1]));
        DONE;
      }
    else 
***************
*** 12264,12272 ****
  	      (clobber (reg:CC 17))])
     (parallel [(set (match_dup 1) (plus:SI (match_dup 1) (const_int 1)))
  	      (clobber (reg:CC 17))])]
!   ""
    "
  {
    if (TARGET_SINGLE_STRINGOP || optimize_size)
      {
        emit_insn (gen_strmovqi_1 (operands[0], operands[1], operands[0],
--- 14417,14430 ----
  	      (clobber (reg:CC 17))])
     (parallel [(set (match_dup 1) (plus:SI (match_dup 1) (const_int 1)))
  	      (clobber (reg:CC 17))])]
!   ""
    "
  {
+   if (TARGET_64BIT)
+     {
+       emit_insn (gen_strmovqi_rex64 (operands[0], operands[1]));
+       DONE;
+     }
    if (TARGET_SINGLE_STRINGOP || optimize_size)
      {
        emit_insn (gen_strmovqi_1 (operands[0], operands[1], operands[0],
***************
*** 12277,12282 ****
--- 14435,14478 ----
      operands[2] = gen_reg_rtx (QImode);
  }")
  
+ (define_expand "strmovqi_rex64"  ; copy one byte between DImode-addressed memory, then add 1 to both pointers
+   [(set (match_dup 2)  ; operand 2: QImode temporary created in the preparation code below
+   	(mem:QI (match_operand:DI 1 "register_operand" "")))
+    (set (mem:QI (match_operand:DI 0 "register_operand" ""))
+         (match_dup 2))
+    (parallel [(set (match_dup 0) (plus:DI (match_dup 0) (const_int 1)))
+ 	      (clobber (reg:CC 17))])  ; hard reg 17 in CC mode -- presumably the flags register, confirm
+    (parallel [(set (match_dup 1) (plus:DI (match_dup 1) (const_int 1)))
+ 	      (clobber (reg:CC 17))])]
+   "TARGET_64BIT"  ; same condition as the other *_rex64 expanders; strmovqi calls this only when TARGET_64BIT
+   "
+ {
+   if (TARGET_SINGLE_STRINGOP || optimize_size)  /* emit the one-insn strmovqi_rex_1 form instead of the template above */
+     {
+       emit_insn (gen_strmovqi_rex_1 (operands[0], operands[1], operands[0],
+ 				     operands[1]));
+       DONE;
+     }
+   else 
+     operands[2] = gen_reg_rtx (QImode);  /* fresh pseudo for the load/store pair in the template */
+ }")
+ 
+ (define_insn "strmovdi_rex_1"  ; single movsq: DImode memory-to-memory copy that also advances both pointers by 8
+   [(set (mem:DI (match_operand:DI 2 "register_operand" "0"))  ; incoming destination pointer, tied to output operand 0
+ 	(mem:DI (match_operand:DI 3 "register_operand" "1")))  ; incoming source pointer, tied to output operand 1
+    (set (match_operand:DI 0 "register_operand" "=D")  ; updated destination pointer, constraint D
+ 	(plus:DI (match_dup 2)
+ 		 (const_int 8)))
+    (set (match_operand:DI 1 "register_operand" "=S")  ; updated source pointer, constraint S
+ 	(plus:DI (match_dup 3)
+ 		 (const_int 8)))
+    (use (reg:SI 19))]  ; hard reg 19 -- presumably the direction flag, confirm against the register definitions
+   "TARGET_64BIT && (TARGET_SINGLE_STRINGOP || optimize_size)"  ; same test the strmovdi_rex64 expander uses to pick this insn
+   "movsq"
+   [(set_attr "type" "str")
+    (set_attr "mode" "DI")
+    (set_attr "memory" "both")])
+ 
  (define_insn "strmovsi_1"
    [(set (mem:SI (match_operand:SI 2 "register_operand" "0"))
  	(mem:SI (match_operand:SI 3 "register_operand" "1")))
***************
*** 12287,12294 ****
  	(plus:SI (match_dup 3)
  		 (const_int 4)))
     (use (reg:SI 19))]
!   "TARGET_SINGLE_STRINGOP || optimize_size"
!   "movsl"
    [(set_attr "type" "str")
     (set_attr "mode" "SI")
     (set_attr "memory" "both")])
--- 14483,14506 ----
  	(plus:SI (match_dup 3)
  		 (const_int 4)))
     (use (reg:SI 19))]
!   "!TARGET_64BIT && (TARGET_SINGLE_STRINGOP || optimize_size)"
!   "movsl|movsd"
!   [(set_attr "type" "str")
!    (set_attr "mode" "SI")
!    (set_attr "memory" "both")])
! 
! (define_insn "strmovsi_rex_1"  ; single SImode string move with DImode pointers; both pointers advance by 4
!   [(set (mem:SI (match_operand:DI 2 "register_operand" "0"))  ; incoming destination pointer, tied to output operand 0
! 	(mem:SI (match_operand:DI 3 "register_operand" "1")))  ; incoming source pointer, tied to output operand 1
!    (set (match_operand:DI 0 "register_operand" "=D")  ; updated destination pointer, constraint D
! 	(plus:DI (match_dup 2)
! 		 (const_int 4)))
!    (set (match_operand:DI 1 "register_operand" "=S")  ; updated source pointer, constraint S
! 	(plus:DI (match_dup 3)
! 		 (const_int 4)))
!    (use (reg:SI 19))]  ; hard reg 19 -- presumably the direction flag, confirm against the register definitions
!   "TARGET_64BIT && (TARGET_SINGLE_STRINGOP || optimize_size)"  ; same test the strmovsi_rex64 expander uses to pick this insn
!   "movsl|movsd"  ; two spellings separated by | -- presumably AT&T and Intel syntax, confirm
    [(set_attr "type" "str")
     (set_attr "mode" "SI")
     (set_attr "memory" "both")])
***************
*** 12303,12309 ****
  	(plus:SI (match_dup 3)
  		 (const_int 2)))
     (use (reg:SI 19))]
!   "TARGET_SINGLE_STRINGOP || optimize_size"
    "movsw"
    [(set_attr "type" "str")
     (set_attr "memory" "both")
--- 14515,14537 ----
  	(plus:SI (match_dup 3)
  		 (const_int 2)))
     (use (reg:SI 19))]
!   "!TARGET_64BIT && (TARGET_SINGLE_STRINGOP || optimize_size)"
!   "movsw"
!   [(set_attr "type" "str")
!    (set_attr "memory" "both")
!    (set_attr "mode" "HI")])
! 
! (define_insn "strmovhi_rex_1"
!   [(set (mem:HI (match_operand:DI 2 "register_operand" "0"))
! 	(mem:HI (match_operand:DI 3 "register_operand" "1")))
!    (set (match_operand:DI 0 "register_operand" "=D")
! 	(plus:DI (match_dup 2)
! 		 (const_int 2)))
!    (set (match_operand:DI 1 "register_operand" "=S")
! 	(plus:DI (match_dup 3)
! 		 (const_int 2)))
!    (use (reg:SI 19))]
!   "TARGET_64BIT && (TARGET_SINGLE_STRINGOP || optimize_size)"
    "movsw"
    [(set_attr "type" "str")
     (set_attr "memory" "both")
***************
*** 12319,12330 ****
  	(plus:SI (match_dup 3)
  		 (const_int 1)))
     (use (reg:SI 19))]
!   "TARGET_SINGLE_STRINGOP || optimize_size"
    "movsb"
    [(set_attr "type" "str")
     (set_attr "memory" "both")
     (set_attr "mode" "QI")])
  
  (define_insn "rep_movsi"
    [(set (match_operand:SI 2 "register_operand" "=c") (const_int 0))
     (set (match_operand:SI 0 "register_operand" "=D") 
--- 14547,14594 ----
  	(plus:SI (match_dup 3)
  		 (const_int 1)))
     (use (reg:SI 19))]
!   "!TARGET_64BIT && (TARGET_SINGLE_STRINGOP || optimize_size)"
!   "movsb"
!   [(set_attr "type" "str")
!    (set_attr "memory" "both")
!    (set_attr "mode" "QI")])
! 
! (define_insn "strmovqi_rex_1"  ; single movsb with DImode pointers; both pointers advance by 1
!   [(set (mem:QI (match_operand:DI 2 "register_operand" "0"))  ; incoming destination pointer, tied to output operand 0
! 	(mem:QI (match_operand:DI 3 "register_operand" "1")))  ; incoming source pointer, tied to output operand 1
!    (set (match_operand:DI 0 "register_operand" "=D")  ; updated destination pointer, constraint D
! 	(plus:DI (match_dup 2)
! 		 (const_int 1)))
!    (set (match_operand:DI 1 "register_operand" "=S")  ; updated source pointer, constraint S
! 	(plus:DI (match_dup 3)
! 		 (const_int 1)))
!    (use (reg:SI 19))]  ; hard reg 19 -- presumably the direction flag, confirm against the register definitions
!   "TARGET_64BIT && (TARGET_SINGLE_STRINGOP || optimize_size)"  ; same test the strmovqi_rex64 expander uses to pick this insn
    "movsb"
    [(set_attr "type" "str")
     (set_attr "memory" "both")
     (set_attr "mode" "QI")])
  
+ (define_insn "rep_movdi_rex64"  ; rep movsq: block copy of (operand 5) DImode words with DImode pointers
+   [(set (match_operand:DI 2 "register_operand" "=c") (const_int 0))  ; count register (constraint c) is zero afterwards
+    (set (match_operand:DI 0 "register_operand" "=D")  ; destination pointer after the copy: operand 3 + (count << 3)
+         (plus:DI (ashift:DI (match_operand:DI 5 "register_operand" "2")  ; incoming count, tied to operand 2
+ 			    (const_int 3))
+ 		 (match_operand:DI 3 "register_operand" "0")))  ; incoming destination pointer, tied to operand 0
+    (set (match_operand:DI 1 "register_operand" "=S")  ; source pointer after the copy: operand 4 + (count << 3)
+         (plus:DI (ashift:DI (match_dup 5) (const_int 3))
+ 		 (match_operand:DI 4 "register_operand" "1")))  ; incoming source pointer, tied to operand 1
+    (set (mem:BLK (match_dup 3))  ; the copy itself, expressed as a BLKmode memory-to-memory set
+ 	(mem:BLK (match_dup 4)))
+    (use (match_dup 5))
+    (use (reg:SI 19))]  ; hard reg 19 -- presumably the direction flag, confirm against the register definitions
+   "TARGET_64BIT"
+   "rep\;movsq|rep movsq"  ; two spellings separated by | -- presumably AT&T and Intel syntax, confirm
+   [(set_attr "type" "str")
+    (set_attr "prefix_rep" "1")
+    (set_attr "memory" "both")
+    (set_attr "mode" "DI")])
+ 
  (define_insn "rep_movsi"
    [(set (match_operand:SI 2 "register_operand" "=c") (const_int 0))
     (set (match_operand:SI 0 "register_operand" "=D") 
***************
*** 12338,12344 ****
  	(mem:BLK (match_dup 4)))
     (use (match_dup 5))
     (use (reg:SI 19))]
!   ""
    "rep\;movsl|rep movsd"
    [(set_attr "type" "str")
     (set_attr "prefix_rep" "1")
--- 14602,14628 ----
  	(mem:BLK (match_dup 4)))
     (use (match_dup 5))
     (use (reg:SI 19))]
!   "!TARGET_64BIT"
!   "rep\;movsl|rep movsd"
!   [(set_attr "type" "str")
!    (set_attr "prefix_rep" "1")
!    (set_attr "memory" "both")
!    (set_attr "mode" "SI")])
! 
! (define_insn "rep_movsi_rex64"
!   [(set (match_operand:DI 2 "register_operand" "=c") (const_int 0))
!    (set (match_operand:DI 0 "register_operand" "=D") 
!         (plus:DI (ashift:DI (match_operand:DI 5 "register_operand" "2")
! 			    (const_int 2))
! 		 (match_operand:DI 3 "register_operand" "0")))
!    (set (match_operand:DI 1 "register_operand" "=S") 
!         (plus:DI (ashift:DI (match_dup 5) (const_int 2))
! 		 (match_operand:DI 4 "register_operand" "1")))
!    (set (mem:BLK (match_dup 3))
! 	(mem:BLK (match_dup 4)))
!    (use (match_dup 5))
!    (use (reg:SI 19))]
!   "TARGET_64BIT"
    "rep\;movsl|rep movsd"
    [(set_attr "type" "str")
     (set_attr "prefix_rep" "1")
***************
*** 12356,12362 ****
  	(mem:BLK (match_dup 4)))
     (use (match_dup 5))
     (use (reg:SI 19))]
!   ""
    "rep\;movsb|rep movsb"
    [(set_attr "type" "str")
     (set_attr "prefix_rep" "1")
--- 14640,14664 ----
  	(mem:BLK (match_dup 4)))
     (use (match_dup 5))
     (use (reg:SI 19))]
!   "!TARGET_64BIT"
!   "rep\;movsb|rep movsb"
!   [(set_attr "type" "str")
!    (set_attr "prefix_rep" "1")
!    (set_attr "memory" "both")
!    (set_attr "mode" "SI")])
! 
! (define_insn "rep_movqi_rex64"
!   [(set (match_operand:DI 2 "register_operand" "=c") (const_int 0))
!    (set (match_operand:DI 0 "register_operand" "=D") 
!         (plus:DI (match_operand:DI 3 "register_operand" "0")
! 		 (match_operand:DI 5 "register_operand" "2")))
!    (set (match_operand:DI 1 "register_operand" "=S") 
!         (plus:DI (match_operand:DI 4 "register_operand" "1") (match_dup 5)))
!    (set (mem:BLK (match_dup 3))
! 	(mem:BLK (match_dup 4)))
!    (use (match_dup 5))
!    (use (reg:SI 19))]
!   "TARGET_64BIT"
    "rep\;movsb|rep movsb"
    [(set_attr "type" "str")
     (set_attr "prefix_rep" "1")
***************
*** 12366,12597 ****
  (define_expand "clrstrsi"
     [(use (match_operand:BLK 0 "memory_operand" ""))
      (use (match_operand:SI 1 "nonmemory_operand" ""))
!     (use (match_operand:SI 2 "const_int_operand" ""))]
    ""
    "
  {
!   /* See comments in movstr expanders.  The code is mostly identical.  */
! 
!   rtx destreg, zeroreg, countreg;
!   int align = 0;
!   int count = -1;
!   rtx insns;
! 
!   start_sequence ();
! 
!   if (GET_CODE (operands[2]) == CONST_INT)
!     align = INTVAL (operands[2]);
! 
!   /* This simple hack avoids all inlining code and simplifies code bellow.  */
!   if (!TARGET_ALIGN_STRINGOPS)
!     align = 32;
! 
!   if (GET_CODE (operands[1]) == CONST_INT)
!     count = INTVAL (operands[1]);
! 
!   destreg = copy_to_mode_reg (Pmode, XEXP (operands[0], 0));
  
!   emit_insn (gen_cld ());
  
!   /* When optimizing for size emit simple rep ; movsb instruction for
!      counts not divisible by 4.  */
  
!   if ((!optimize || optimize_size) 
!       && (count < 0 || (count & 0x03)))
!     {
!       countreg = copy_to_mode_reg (SImode, operands[1]);
!       zeroreg = copy_to_mode_reg (QImode, const0_rtx);
!       emit_insn (gen_rep_stosqi (destreg, countreg, zeroreg,
! 				 destreg, countreg));
!     }
!   else if (count >= 0 
! 	   && (align >= 8
! 	       || (!TARGET_PENTIUMPRO && align >= 4)
! 	       || optimize_size || count < 64))
!     {
!       zeroreg = copy_to_mode_reg (SImode, const0_rtx);
!       if (INTVAL (operands[1]) & ~0x03)
! 	{
! 	  countreg = copy_to_mode_reg (SImode,
! 	  			       GEN_INT ((INTVAL (operands[1]) >> 2)
! 						& 0x3fffffff));
! 	  emit_insn (gen_rep_stossi (destreg, countreg, zeroreg,
! 				     destreg, countreg));
! 	}
!       if (INTVAL (operands[1]) & 0x02)
! 	emit_insn (gen_strsethi (destreg,
! 				 gen_rtx_SUBREG (HImode, zeroreg, 0)));
!       if (INTVAL (operands[1]) & 0x01)
! 	emit_insn (gen_strsetqi (destreg,
! 				 gen_rtx_SUBREG (QImode, zeroreg, 0)));
!     }
!   else
      {
!       rtx countreg2;
!       rtx label = NULL;
! 
!       /* In case we don't know anything about the alignment, default to
!          library version, since it is usually equally fast and result in
! 	 shorter code.  */
!       if (!TARGET_INLINE_ALL_STRINGOPS && align < 4)
! 	{
! 	  end_sequence ();
! 	  FAIL;
! 	}
! 
!       if (TARGET_SINGLE_STRINGOP)
! 	emit_insn (gen_cld ());
! 
!       countreg2 = gen_reg_rtx (SImode);
!       countreg = copy_to_mode_reg (SImode, operands[1]);
!       zeroreg = copy_to_mode_reg (SImode, const0_rtx);
! 
!       if (count < 0
! 	  && align < (TARGET_PENTIUMPRO && (count < 0 || count >= 260) ? 8 : 4))
! 	{
! 	  label = gen_label_rtx ();
! 	  emit_cmp_and_jump_insns (countreg, GEN_INT (3),
! 				   LEU, 0, SImode, 1, 0, label);
! 	}
!       if (align <= 1)
! 	{
! 	  rtx label = gen_label_rtx ();
! 	  rtx tmpcount = gen_reg_rtx (SImode);
! 	  emit_insn (gen_andsi3 (tmpcount, destreg, GEN_INT (1)));
! 	  emit_cmp_and_jump_insns (tmpcount, GEN_INT (0), EQ, 0,
! 				   SImode, 1, 0, label);
! 	  emit_insn (gen_strsetqi (destreg,
! 				   gen_rtx_SUBREG (QImode, zeroreg, 0)));
! 	  emit_insn (gen_addsi3 (countreg, countreg, constm1_rtx));
! 	  emit_label (label);
! 	  LABEL_NUSES (label) = 1;
! 	}
!       if (align <= 2)
! 	{
! 	  rtx label = gen_label_rtx ();
! 	  rtx tmpcount = gen_reg_rtx (SImode);
! 	  emit_insn (gen_andsi3 (tmpcount, destreg, GEN_INT (2)));
! 	  emit_cmp_and_jump_insns (tmpcount, GEN_INT (0), EQ, 0,
! 				   SImode, 1, 0, label);
! 	  emit_insn (gen_strsethi (destreg,
! 				   gen_rtx_SUBREG (HImode, zeroreg, 0)));
! 	  emit_insn (gen_addsi3 (countreg, countreg, GEN_INT (-2)));
! 	  emit_label (label);
! 	  LABEL_NUSES (label) = 1;
! 	}
!       if (align <= 4 && TARGET_PENTIUMPRO && (count < 1 || count >= 260))
! 	{
! 	  rtx label = gen_label_rtx ();
! 	  rtx tmpcount = gen_reg_rtx (SImode);
! 	  emit_insn (gen_andsi3 (tmpcount, destreg, GEN_INT (4)));
! 	  emit_cmp_and_jump_insns (tmpcount, GEN_INT (0), EQ, 0,
! 				   SImode, 1, 0, label);
! 	  emit_insn (gen_strsetsi (destreg, zeroreg));
! 	  emit_insn (gen_addsi3 (countreg, countreg, GEN_INT (-4)));
! 	  emit_label (label);
! 	  LABEL_NUSES (label) = 1;
! 	}
! 
!       if (!TARGET_SINGLE_STRINGOP)
! 	emit_insn (gen_cld());
!       emit_insn (gen_lshrsi3 (countreg2, countreg, GEN_INT (2)));
!       emit_insn (gen_rep_stossi (destreg, countreg2, zeroreg,
! 				 destreg, countreg2));
! 
!       if (label)
! 	{
! 	  emit_label (label);
! 	  LABEL_NUSES (label) = 1;
! 	}
!       if (align > 2 && count > 0 && (count & 2))
! 	emit_insn (gen_strsethi (destreg,
! 				 gen_rtx_SUBREG (HImode, zeroreg, 0)));
!       if (align <= 2 || count < 0)
! 	{
! 	  rtx label = gen_label_rtx ();
! 	  rtx tmpcount = gen_reg_rtx (SImode);
! 	  emit_insn (gen_andsi3 (tmpcount, countreg, GEN_INT (2)));
! 	  emit_cmp_and_jump_insns (tmpcount, GEN_INT (0), EQ, 0,
! 				   SImode, 1, 0, label);
! 	  emit_insn (gen_strsethi (destreg,
! 				   gen_rtx_SUBREG (HImode, zeroreg, 0)));
! 	  emit_label (label);
! 	  LABEL_NUSES (label) = 1;
! 	}
!       if (align > 1 && count > 0 && (count & 1))
! 	emit_insn (gen_strsetqi (destreg,
! 				 gen_rtx_SUBREG (QImode, zeroreg, 0)));
!       if (align <= 1 || count < 0)
! 	{
! 	  rtx label = gen_label_rtx ();
! 	  rtx tmpcount = gen_reg_rtx (SImode);
! 	  emit_insn (gen_andsi3 (tmpcount, countreg, GEN_INT (1)));
! 	  emit_cmp_and_jump_insns (tmpcount, GEN_INT (0), EQ, 0,
! 				   SImode, 1, 0, label);
! 	  emit_insn (gen_strsetqi (destreg,
! 				   gen_rtx_SUBREG (QImode, zeroreg, 0)));
! 	  emit_label (label);
! 	  LABEL_NUSES (label) = 1;
! 	}
      }
- 
-   insns = get_insns ();
-   end_sequence ();
- 
-   ix86_set_move_mem_attrs (insns, operands[0], operands[0], destreg, destreg);
-   emit_insns (insns);
- 
-   DONE;
  }")
  
- ;; Most CPUs don't like single string operations
- ;; Handle this case here to simplify previous expander.
- 
  (define_expand "strsetsi"
    [(set (mem:SI (match_operand:SI 0 "register_operand" ""))
  	(match_operand:SI 1 "register_operand" ""))
     (parallel [(set (match_dup 0) (plus:SI (match_dup 0) (const_int 4)))
  	      (clobber (reg:CC 17))])]
    ""
    "
  {
!   if (TARGET_SINGLE_STRINGOP || optimize_size)
      {
        emit_insn (gen_strsetsi_1 (operands[0], operands[0], operands[1]));
        DONE;
      }
  }")
  
  (define_expand "strsethi"
    [(set (mem:HI (match_operand:SI 0 "register_operand" ""))
  	(match_operand:HI 1 "register_operand" ""))
     (parallel [(set (match_dup 0) (plus:SI (match_dup 0) (const_int 2)))
  	      (clobber (reg:CC 17))])]
    ""
    "
  {
!   if (TARGET_SINGLE_STRINGOP || optimize_size)
      {
        emit_insn (gen_strsethi_1 (operands[0], operands[0], operands[1]));
        DONE;
      }
  }")
  
  (define_expand "strsetqi"
    [(set (mem:QI (match_operand:SI 0 "register_operand" ""))
  	(match_operand:QI 1 "register_operand" ""))
     (parallel [(set (match_dup 0) (plus:SI (match_dup 0) (const_int 1)))
  	      (clobber (reg:CC 17))])]
    ""
    "
  {
!   if (TARGET_SINGLE_STRINGOP || optimize_size)
      {
        emit_insn (gen_strsetqi_1 (operands[0], operands[0], operands[1]));
        DONE;
      }
  }")
  
  (define_insn "strsetsi_1"
    [(set (mem:SI (match_operand:SI 1 "register_operand" "0"))
  	(match_operand:SI 2 "register_operand" "a"))
--- 14668,14832 ----
  (define_expand "clrstrsi"
     [(use (match_operand:BLK 0 "memory_operand" ""))
      (use (match_operand:SI 1 "nonmemory_operand" ""))
!     (use (match_operand 2 "const_int_operand" ""))]  ; no mode on operand 2, unlike the old (match_operand:SI 2 ...)
    ""
    "
  {
!  if (ix86_expand_clrstr (operands[0], operands[1], operands[2]))  /* nonzero -> DONE, zero -> FAIL */
!    DONE;
!  else
!    FAIL;
! }")
  
! (define_expand "clrstrdi"  ; block clear whose operand 1 is DImode; enabled only for TARGET_64BIT
!    [(use (match_operand:BLK 0 "memory_operand" ""))  ; operand roles presumably as in clrstrsi: 0 = block to clear -- confirm in ix86_expand_clrstr
!     (use (match_operand:DI 1 "nonmemory_operand" ""))  ; 1 = byte count (presumably)
!     (use (match_operand 2 "const_int_operand" ""))]  ; 2 = alignment (presumably); no mode given
!   "TARGET_64BIT"
!   "
! {
!  if (ix86_expand_clrstr (operands[0], operands[1], operands[2]))  /* nonzero -> DONE, zero -> FAIL */
!    DONE;
!  else
!    FAIL;
! }")
  
! ;; Most CPUs don't like single string operations
! ;; Handle this case here to simplify previous expander.
  
! (define_expand "strsetdi_rex64"  ; store DImode operand 1 at (mem operand 0), then add 8 to the pointer
!   [(set (mem:DI (match_operand:DI 0 "register_operand" ""))
! 	(match_operand:DI 1 "register_operand" ""))
!    (parallel [(set (match_dup 0) (plus:DI (match_dup 0) (const_int 8)))
! 	      (clobber (reg:CC 17))])]  ; hard reg 17 in CC mode -- presumably the flags register, confirm
!   "TARGET_64BIT"
!   "
! {
!   if (TARGET_SINGLE_STRINGOP || optimize_size)  /* emit the one-insn strsetdi_rex_1 form instead of the template above */
      {
!       emit_insn (gen_strsetdi_rex_1 (operands[0], operands[0], operands[1]));
!       DONE;
      }
  }")
  
  (define_expand "strsetsi"
    [(set (mem:SI (match_operand:SI 0 "register_operand" ""))
  	(match_operand:SI 1 "register_operand" ""))
     (parallel [(set (match_dup 0) (plus:SI (match_dup 0) (const_int 4)))
  	      (clobber (reg:CC 17))])]
    ""
    "
  {
!   if (TARGET_64BIT)  /* hand off to strsetsi_rex64, whose pointer operand is DImode */
!     {
!       emit_insn (gen_strsetsi_rex64 (operands[0], operands[1]));
!       DONE;
!     }
!   else if (TARGET_SINGLE_STRINGOP || optimize_size)  /* 32-bit: emit the one-insn strsetsi_1 form */
      {
        emit_insn (gen_strsetsi_1 (operands[0], operands[0], operands[1]));
        DONE;
      }
  }")
  
+ (define_expand "strsetsi_rex64"  ; store SImode operand 1 at (mem operand 0) with a DImode pointer, then add 4 to the pointer
+   [(set (mem:SI (match_operand:DI 0 "register_operand" ""))
+ 	(match_operand:SI 1 "register_operand" ""))
+    (parallel [(set (match_dup 0) (plus:DI (match_dup 0) (const_int 4)))
+ 	      (clobber (reg:CC 17))])]  ; hard reg 17 in CC mode -- presumably the flags register, confirm
+   "TARGET_64BIT"
+   "
+ {
+   if (TARGET_SINGLE_STRINGOP || optimize_size)  /* emit the one-insn strsetsi_rex_1 form instead of the template above */
+     {
+       emit_insn (gen_strsetsi_rex_1 (operands[0], operands[0], operands[1]));
+       DONE;
+     }
+ }")
+ 
  (define_expand "strsethi"
    [(set (mem:HI (match_operand:SI 0 "register_operand" ""))
  	(match_operand:HI 1 "register_operand" ""))
     (parallel [(set (match_dup 0) (plus:SI (match_dup 0) (const_int 2)))
  	      (clobber (reg:CC 17))])]
    ""
    "
  {
!   if (TARGET_64BIT)  /* hand off to strsethi_rex64, whose pointer operand is DImode */
!     {
!       emit_insn (gen_strsethi_rex64 (operands[0], operands[1]));
!       DONE;
!     }
!   else if (TARGET_SINGLE_STRINGOP || optimize_size)  /* 32-bit: emit the one-insn strsethi_1 form */
      {
        emit_insn (gen_strsethi_1 (operands[0], operands[0], operands[1]));
        DONE;
      }
  }")
  
+ (define_expand "strsethi_rex64"  ; store HImode operand 1 through DImode pointer operand 0, then add 2 to the pointer
+   [(set (mem:HI (match_operand:DI 0 "register_operand" ""))
+ 	(match_operand:HI 1 "register_operand" ""))
+    (parallel [(set (match_dup 0) (plus:DI (match_dup 0) (const_int 2)))
+ 	      (clobber (reg:CC 17))])]
+   "TARGET_64BIT"
+   "
+ {
+   if (TARGET_SINGLE_STRINGOP || optimize_size)  /* emit strsethi_rex_1 instead of the two-insn template */
+     {
+       emit_insn (gen_strsethi_rex_1 (operands[0], operands[0], operands[1]));
+       DONE;
+     }
+ }")
+ 
  (define_expand "strsetqi"
    [(set (mem:QI (match_operand:SI 0 "register_operand" ""))
  	(match_operand:QI 1 "register_operand" ""))
     (parallel [(set (match_dup 0) (plus:SI (match_dup 0) (const_int 1)))
  	      (clobber (reg:CC 17))])]
    ""  ; no target restriction, like strsetsi/strsethi: the body itself dispatches on TARGET_64BIT
    "
  {
!   if (TARGET_64BIT)  /* hand off to strsetqi_rex64, whose address operand is DImode */
!     {
!       emit_insn (gen_strsetqi_rex64 (operands[0], operands[1]));
!       DONE;
!     }
!   else if (TARGET_SINGLE_STRINGOP || optimize_size)  /* emit strsetqi_1 instead of the two-insn template */
      {
        emit_insn (gen_strsetqi_1 (operands[0], operands[0], operands[1]));
        DONE;
      }
  }")
  
+ (define_expand "strsetqi_rex64"  ; store QImode operand 1 through DImode pointer operand 0, then add 1 to the pointer
+   [(set (mem:QI (match_operand:DI 0 "register_operand" ""))
+ 	(match_operand:QI 1 "register_operand" ""))
+    (parallel [(set (match_dup 0) (plus:DI (match_dup 0) (const_int 1)))
+ 	      (clobber (reg:CC 17))])]
+   "TARGET_64BIT"
+   "
+ {
+   if (TARGET_SINGLE_STRINGOP || optimize_size)  /* emit strsetqi_rex_1 instead of the two-insn template */
+     {
+       emit_insn (gen_strsetqi_rex_1 (operands[0], operands[0], operands[1]));
+       DONE;
+     }
+ }")
+ 
+ (define_insn "strsetdi_rex_1"  ; stosq: store DImode operand 2 at the address in operand 1; operand 0 = operand 1 + 8
+   [(set (mem:DI (match_operand:DI 1 "register_operand" "0"))  ; DImode store, matching the 8-byte stosq and strsetdi_rex64
+ 	(match_operand:DI 2 "register_operand" "a"))
+    (set (match_operand:DI 0 "register_operand" "=D")
+ 	(plus:DI (match_dup 1)
+ 		 (const_int 8)))
+    (use (reg:SI 19))]
+   "TARGET_64BIT && (TARGET_SINGLE_STRINGOP || optimize_size)"
+   "stosq"
+   [(set_attr "type" "str")
+    (set_attr "memory" "store")
+    (set_attr "mode" "DI")])
+ 
  (define_insn "strsetsi_1"
    [(set (mem:SI (match_operand:SI 1 "register_operand" "0"))
  	(match_operand:SI 2 "register_operand" "a"))
***************
*** 12599,12606 ****
  	(plus:SI (match_dup 1)
  		 (const_int 4)))
     (use (reg:SI 19))]
!   "TARGET_SINGLE_STRINGOP || optimize_size"
!   "stosl"
    [(set_attr "type" "str")
     (set_attr "memory" "store")
     (set_attr "mode" "SI")])
--- 14834,14854 ----
  	(plus:SI (match_dup 1)
  		 (const_int 4)))
     (use (reg:SI 19))]
!   "!TARGET_64BIT && (TARGET_SINGLE_STRINGOP || optimize_size)"
!   "stosl|stosd"
!   [(set_attr "type" "str")
!    (set_attr "memory" "store")
!    (set_attr "mode" "SI")])
! 
! (define_insn "strsetsi_rex_1"  ; store SImode operand 2 at the DImode address in operand 1; operand 0 = operand 1 + 4
!   [(set (mem:SI (match_operand:DI 1 "register_operand" "0"))
! 	(match_operand:SI 2 "register_operand" "a"))
!    (set (match_operand:DI 0 "register_operand" "=D")
! 	(plus:DI (match_dup 1)
! 		 (const_int 4)))
!    (use (reg:SI 19))]
!   "TARGET_64BIT && (TARGET_SINGLE_STRINGOP || optimize_size)"
!   "{stosl|stosd}"  ; braces are required for the per-dialect alternatives to be selected
    [(set_attr "type" "str")
     (set_attr "memory" "store")
     (set_attr "mode" "SI")])
***************
*** 12612,12618 ****
  	(plus:SI (match_dup 1)
  		 (const_int 2)))
     (use (reg:SI 19))]
!   "TARGET_SINGLE_STRINGOP || optimize_size"
    "stosw"
    [(set_attr "type" "str")
     (set_attr "memory" "store")
--- 14860,14879 ----
  	(plus:SI (match_dup 1)
  		 (const_int 2)))
     (use (reg:SI 19))]
!   "!TARGET_64BIT && (TARGET_SINGLE_STRINGOP || optimize_size)"
!   "stosw"
!   [(set_attr "type" "str")
!    (set_attr "memory" "store")
!    (set_attr "mode" "HI")])
! 
! (define_insn "strsethi_rex_1"
!   [(set (mem:HI (match_operand:DI 1 "register_operand" "0"))
! 	(match_operand:HI 2 "register_operand" "a"))
!    (set (match_operand:DI 0 "register_operand" "=D")
! 	(plus:DI (match_dup 1)
! 		 (const_int 2)))
!    (use (reg:SI 19))]
!   "TARGET_64BIT && (TARGET_SINGLE_STRINGOP || optimize_size)"
    "stosw"
    [(set_attr "type" "str")
     (set_attr "memory" "store")
***************
*** 12625,12636 ****
  	(plus:SI (match_dup 1)
  		 (const_int 1)))
     (use (reg:SI 19))]
!   "TARGET_SINGLE_STRINGOP || optimize_size"
    "stosb"
    [(set_attr "type" "str")
     (set_attr "memory" "store")
     (set_attr "mode" "QI")])
  
  (define_insn "rep_stossi"
    [(set (match_operand:SI 1 "register_operand" "=c") (const_int 0))
     (set (match_operand:SI 0 "register_operand" "=D") 
--- 14886,14928 ----
  	(plus:SI (match_dup 1)
  		 (const_int 1)))
     (use (reg:SI 19))]
!   "!TARGET_64BIT && (TARGET_SINGLE_STRINGOP || optimize_size)"
    "stosb"
    [(set_attr "type" "str")
     (set_attr "memory" "store")
     (set_attr "mode" "QI")])
  
+ (define_insn "strsetqi_rex_1"  ; stosb: store QImode operand 2 at the DImode address in operand 1; operand 0 = operand 1 + 1
+   [(set (mem:QI (match_operand:DI 1 "register_operand" "0"))
+ 	(match_operand:QI 2 "register_operand" "a"))
+    (set (match_operand:DI 0 "register_operand" "=D")
+ 	(plus:DI (match_dup 1)
+ 		 (const_int 1)))
+    (use (reg:SI 19))]
+   "TARGET_64BIT && (TARGET_SINGLE_STRINGOP || optimize_size)"
+   "stosb"
+   [(set_attr "type" "str")
+    (set_attr "memory" "store")
+    (set_attr "mode" "QI")])
+ 
+ (define_insn "rep_stosdi_rex64"  ; rep stosq: clears the block at operand 3; count (operand 4) ends as 0
+   [(set (match_operand:DI 1 "register_operand" "=c") (const_int 0))
+    (set (match_operand:DI 0 "register_operand" "=D") 
+         (plus:DI (ashift:DI (match_operand:DI 4 "register_operand" "1")  ; final pointer = operand 3 + operand 4 * 8
+ 			    (const_int 3))
+ 		 (match_operand:DI 3 "register_operand" "0")))
+    (set (mem:BLK (match_dup 3))
+ 	(const_int 0))
+    (use (match_operand:DI 2 "register_operand" "a"))
+    (use (match_dup 4))
+    (use (reg:SI 19))]
+   "TARGET_64BIT"
+   "{rep\;stosq|rep stosq}"  ; braces are required for the per-dialect alternatives to be selected
+   [(set_attr "type" "str")
+    (set_attr "prefix_rep" "1")
+    (set_attr "memory" "store")
+    (set_attr "mode" "DI")])
+ 
  (define_insn "rep_stossi"
    [(set (match_operand:SI 1 "register_operand" "=c") (const_int 0))
     (set (match_operand:SI 0 "register_operand" "=D") 
***************
*** 12642,12648 ****
     (use (match_operand:SI 2 "register_operand" "a"))
     (use (match_dup 4))
     (use (reg:SI 19))]
!   ""
    "rep\;stosl|rep stosd"
    [(set_attr "type" "str")
     (set_attr "prefix_rep" "1")
--- 14934,14958 ----
     (use (match_operand:SI 2 "register_operand" "a"))
     (use (match_dup 4))
     (use (reg:SI 19))]
!   "!TARGET_64BIT"
!   "rep\;stosl|rep stosd"
!   [(set_attr "type" "str")
!    (set_attr "prefix_rep" "1")
!    (set_attr "memory" "store")
!    (set_attr "mode" "SI")])
! 
! (define_insn "rep_stossi_rex64"
!   [(set (match_operand:DI 1 "register_operand" "=c") (const_int 0))
!    (set (match_operand:DI 0 "register_operand" "=D") 
!         (plus:DI (ashift:DI (match_operand:DI 4 "register_operand" "1")
! 			    (const_int 2))
! 		 (match_operand:DI 3 "register_operand" "0")))
!    (set (mem:BLK (match_dup 3))
! 	(const_int 0))
!    (use (match_operand:SI 2 "register_operand" "a"))
!    (use (match_dup 4))
!    (use (reg:SI 19))]
!   "TARGET_64BIT"
    "rep\;stosl|rep stosd"
    [(set_attr "type" "str")
     (set_attr "prefix_rep" "1")
***************
*** 12659,12665 ****
     (use (match_operand:QI 2 "register_operand" "a"))
     (use (match_dup 4))
     (use (reg:SI 19))]
!   ""
    "rep\;stosb|rep stosb"
    [(set_attr "type" "str")
     (set_attr "prefix_rep" "1")
--- 14969,14992 ----
     (use (match_operand:QI 2 "register_operand" "a"))
     (use (match_dup 4))
     (use (reg:SI 19))]
!   "!TARGET_64BIT"
!   "rep\;stosb|rep stosb"
!   [(set_attr "type" "str")
!    (set_attr "prefix_rep" "1")
!    (set_attr "memory" "store")
!    (set_attr "mode" "QI")])
! 
! (define_insn "rep_stosqi_rex64"  ; rep stosb with DImode pointer and count; count (operand 4) ends as 0
!   [(set (match_operand:DI 1 "register_operand" "=c") (const_int 0))
!    (set (match_operand:DI 0 "register_operand" "=D") 
!         (plus:DI (match_operand:DI 3 "register_operand" "0")
! 		 (match_operand:DI 4 "register_operand" "1")))
!    (set (mem:BLK (match_dup 3))
! 	(const_int 0))
!    (use (match_operand:QI 2 "register_operand" "a"))
!    (use (match_dup 4))
!    (use (reg:SI 19))]  ; SImode, as in every other string pattern that uses register 19
!   "TARGET_64BIT"
    "rep\;stosb|rep stosb"
    [(set_attr "type" "str")
     (set_attr "prefix_rep" "1")
***************
*** 12670,12677 ****
    [(set (match_operand:SI 0 "register_operand" "")
  	(compare:SI (match_operand:BLK 1 "general_operand" "")
  		    (match_operand:BLK 2 "general_operand" "")))
!    (use (match_operand:SI 3 "general_operand" ""))
!    (use (match_operand:SI 4 "immediate_operand" ""))]
    ""
    "
  {
--- 14997,15004 ----
    [(set (match_operand:SI 0 "register_operand" "")
  	(compare:SI (match_operand:BLK 1 "general_operand" "")
  		    (match_operand:BLK 2 "general_operand" "")))
!    (use (match_operand 3 "general_operand" ""))
!    (use (match_operand 4 "immediate_operand" ""))]
    ""
    "
  {
***************
*** 12685,12691 ****
    addr2 = copy_to_mode_reg (Pmode, XEXP (operands[2], 0));
    
    count = operands[3];
!   countreg = copy_to_mode_reg (SImode, count);
  
    /* %%% Iff we are testing strict equality, we can use known alignment
       to good advantage.  This may be possible with combine, particularly
--- 15012,15018 ----
    addr2 = copy_to_mode_reg (Pmode, XEXP (operands[2], 0));
    
    count = operands[3];
!   countreg = copy_to_mode_reg (Pmode, count);
  
    /* %%% Iff we are testing strict equality, we can use known alignment
       to good advantage.  This may be possible with combine, particularly
***************
*** 12700,12713 ****
  	  emit_move_insn (operands[0], const0_rtx);
  	  DONE;
  	}
!       emit_insn (gen_cmpstrsi_nz_1 (addr1, addr2, countreg, align,
! 				    addr1, addr2, countreg));
      }
    else
      {
!       emit_insn (gen_cmpsi_1 (countreg, countreg));
!       emit_insn (gen_cmpstrsi_1 (addr1, addr2, countreg, align,
! 				 addr1, addr2, countreg));
      }
  
    outlow = gen_lowpart (QImode, out);
--- 15027,15053 ----
  	  emit_move_insn (operands[0], const0_rtx);
  	  DONE;
  	}
!       if (TARGET_64BIT)
! 	emit_insn (gen_cmpstrqi_nz_rex_1 (addr1, addr2, countreg, align,
! 					  addr1, addr2, countreg));
!       else
! 	emit_insn (gen_cmpstrqi_nz_1 (addr1, addr2, countreg, align,
! 				      addr1, addr2, countreg));
      }
    else
      {
!       if (TARGET_64BIT)
! 	{
! 	  emit_insn (gen_cmpdi_1_rex64 (countreg, countreg));
! 	  emit_insn (gen_cmpstrqi_rex_1 (addr1, addr2, countreg, align,
! 					 addr1, addr2, countreg));
! 	}
!       else
! 	{
! 	  emit_insn (gen_cmpsi_1 (countreg, countreg));
! 	  emit_insn (gen_cmpstrqi_1 (addr1, addr2, countreg, align,
! 				     addr1, addr2, countreg));
! 	}
      }
  
    outlow = gen_lowpart (QImode, out);
***************
*** 12738,12744 ****
  ;; memcmp recognizers.  The `cmpsb' opcode does nothing if the count is
  ;; zero.  Emit extra code to make sure that a zero-length compare is EQ.
  
! (define_insn "cmpstrsi_nz_1"
    [(set (reg:CC 17)
  	(compare:CC (mem:BLK (match_operand:SI 4 "register_operand" "0"))
  		    (mem:BLK (match_operand:SI 5 "register_operand" "1"))))
--- 15078,15084 ----
  ;; memcmp recognizers.  The `cmpsb' opcode does nothing if the count is
  ;; zero.  Emit extra code to make sure that a zero-length compare is EQ.
  
! (define_insn "cmpstrqi_nz_1"
    [(set (reg:CC 17)
  	(compare:CC (mem:BLK (match_operand:SI 4 "register_operand" "0"))
  		    (mem:BLK (match_operand:SI 5 "register_operand" "1"))))
***************
*** 12748,12754 ****
     (clobber (match_operand:SI 0 "register_operand" "=S"))
     (clobber (match_operand:SI 1 "register_operand" "=D"))
     (clobber (match_operand:SI 2 "register_operand" "=c"))]
!   ""
    "repz{\;| }cmpsb"
    [(set_attr "type" "str")
     (set_attr "mode" "QI")
--- 15088,15110 ----
     (clobber (match_operand:SI 0 "register_operand" "=S"))
     (clobber (match_operand:SI 1 "register_operand" "=D"))
     (clobber (match_operand:SI 2 "register_operand" "=c"))]
!   "!TARGET_64BIT"
!   "repz{\;| }cmpsb"
!   [(set_attr "type" "str")
!    (set_attr "mode" "QI")
!    (set_attr "prefix_rep" "1")])
! 
! (define_insn "cmpstrqi_nz_rex_1"
!   [(set (reg:CC 17)
! 	(compare:CC (mem:BLK (match_operand:DI 4 "register_operand" "0"))
! 		    (mem:BLK (match_operand:DI 5 "register_operand" "1"))))
!    (use (match_operand:DI 6 "register_operand" "2"))
!    (use (match_operand:SI 3 "immediate_operand" "i"))
!    (use (reg:SI 19))
!    (clobber (match_operand:DI 0 "register_operand" "=S"))
!    (clobber (match_operand:DI 1 "register_operand" "=D"))
!    (clobber (match_operand:DI 2 "register_operand" "=c"))]
!   "TARGET_64BIT"
    "repz{\;| }cmpsb"
    [(set_attr "type" "str")
     (set_attr "mode" "QI")
***************
*** 12756,12762 ****
  
  ;; The same, but the count is not known to not be zero.
  
! (define_insn "cmpstrsi_1"
    [(set (reg:CC 17)
  	(if_then_else:CC (ne (match_operand:SI 6 "register_operand" "2")
  			     (const_int 0))
--- 15112,15118 ----
  
  ;; The same, but the count is not known to not be zero.
  
! (define_insn "cmpstrqi_1"
    [(set (reg:CC 17)
  	(if_then_else:CC (ne (match_operand:SI 6 "register_operand" "2")
  			     (const_int 0))
***************
*** 12769,12775 ****
     (clobber (match_operand:SI 0 "register_operand" "=S"))
     (clobber (match_operand:SI 1 "register_operand" "=D"))
     (clobber (match_operand:SI 2 "register_operand" "=c"))]
!   ""
    "repz{\;| }cmpsb"
    [(set_attr "type" "str")
     (set_attr "mode" "QI")
--- 15125,15150 ----
     (clobber (match_operand:SI 0 "register_operand" "=S"))
     (clobber (match_operand:SI 1 "register_operand" "=D"))
     (clobber (match_operand:SI 2 "register_operand" "=c"))]
!   "!TARGET_64BIT"
!   "repz{\;| }cmpsb"
!   [(set_attr "type" "str")
!    (set_attr "mode" "QI")
!    (set_attr "prefix_rep" "1")])
! 
! (define_insn "cmpstrqi_rex_1"
!   [(set (reg:CC 17)
! 	(if_then_else:CC (ne (match_operand:DI 6 "register_operand" "2")
! 			     (const_int 0))
! 	  (compare:CC (mem:BLK (match_operand:DI 4 "register_operand" "0"))
! 		      (mem:BLK (match_operand:DI 5 "register_operand" "1")))
! 	  (const_int 0)))
!    (use (match_operand:SI 3 "immediate_operand" "i"))
!    (use (reg:CC 17))
!    (use (reg:SI 19))
!    (clobber (match_operand:DI 0 "register_operand" "=S"))
!    (clobber (match_operand:DI 1 "register_operand" "=D"))
!    (clobber (match_operand:DI 2 "register_operand" "=c"))]
!   "TARGET_64BIT"
    "repz{\;| }cmpsb"
    [(set_attr "type" "str")
     (set_attr "mode" "QI")
***************
*** 12779,12854 ****
    [(set (match_operand:SI 0 "register_operand" "")
  	(unspec:SI [(match_operand:BLK 1 "general_operand" "")
  		    (match_operand:QI 2 "immediate_operand" "")
! 		    (match_operand:SI 3 "immediate_operand" "")] 0))]
    ""
    "
  {
!   rtx out, addr, scratch1, scratch2, scratch3;
!   rtx eoschar = operands[2];
!   rtx align = operands[3];
! 
!   /* The generic case of strlen expander is long.  Avoid it's
!      expanding unless TARGET_INLINE_ALL_STRINGOPS.  */
! 
!   if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
!       && !TARGET_INLINE_ALL_STRINGOPS
!       && !optimize_size
!       && (GET_CODE (align) != CONST_INT || INTVAL (align) < 4))
!     FAIL;
! 
!   out = operands[0];
!   addr = force_reg (Pmode, XEXP (operands[1], 0));
!   scratch1 = gen_reg_rtx (SImode);
! 
!   if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
!       && !optimize_size)
!     {
!       /* Well it seems that some optimizer does not combine a call like
! 	     foo(strlen(bar), strlen(bar));
! 	 when the move and the subtraction is done here.  It does calculate
! 	 the length just once when these instructions are done inside of
! 	 output_strlen_unroll().  But I think since &bar[strlen(bar)] is
! 	 often used and I use one fewer register for the lifetime of
! 	 output_strlen_unroll() this is better.  */
! 
!       if (GET_CODE (align) != CONST_INT || INTVAL (align) < 4)
! 	emit_move_insn (scratch1, addr);
! 
!       emit_move_insn (out, addr);
! 
!       ix86_expand_strlensi_unroll_1 (out, align, scratch1);
! 
!       /* strlensi_unroll_1 returns the address of the zero at the end of
! 	 the string, like memchr(), so compute the length by subtracting
! 	 the start address.  */
!       emit_insn (gen_subsi3 (out, out, addr));
!     }
!   else
!     {
!       scratch2 = gen_reg_rtx (SImode);
!       scratch3 = gen_reg_rtx (SImode);
! 
!       emit_move_insn (scratch3, addr);
  
!       emit_insn (gen_cld ());
!       emit_insn (gen_strlensi_1 (scratch1, scratch3, eoschar,
! 				 align, constm1_rtx, scratch3));
!       emit_insn (gen_one_cmplsi2 (scratch2, scratch1));
!       emit_insn (gen_addsi3 (out, scratch2, constm1_rtx));
!     }
!   DONE;
  }")
  
! (define_insn "strlensi_1"
    [(set (match_operand:SI 0 "register_operand" "=&c")
  	(unspec:SI [(mem:BLK (match_operand:SI 5 "register_operand" "1"))
! 		    (match_operand:QI 2 "general_operand" "a")
  		    (match_operand:SI 3 "immediate_operand" "i")
! 		    (match_operand:SI 4 "immediate_operand" "0")] 0))
     (use (reg:SI 19))
     (clobber (match_operand:SI 1 "register_operand" "=D"))
     (clobber (reg:CC 17))]
!   ""
    "repnz{\;| }scasb"
    [(set_attr "type" "str")
     (set_attr "mode" "QI")
--- 15154,15208 ----
    [(set (match_operand:SI 0 "register_operand" "")
  	(unspec:SI [(match_operand:BLK 1 "general_operand" "")
  		    (match_operand:QI 2 "immediate_operand" "")
! 		    (match_operand 3 "immediate_operand" "")] 0))]
    ""
    "
  {
!  if (ix86_expand_strlen (operands[0], operands[1], operands[2], operands[3]))
!    DONE;
!  else
!    FAIL;
! }")
  
! (define_expand "strlendi"  ; DImode-result counterpart of strlensi; all work is done in ix86_expand_strlen
!   [(set (match_operand:DI 0 "register_operand" "")
! 	(unspec:DI [(match_operand:BLK 1 "general_operand" "")
! 		    (match_operand:QI 2 "immediate_operand" "")
! 		    (match_operand 3 "immediate_operand" "")] 0))]
!   ""
!   "
! {
!  if (ix86_expand_strlen (operands[0], operands[1], operands[2], operands[3]))  /* nonzero selects DONE, zero selects FAIL */
!    DONE;
!  else
!    FAIL;
  }")
  
! (define_insn "strlenqi_1"  ; 32-bit repnz scasb; operand 4 is tied to the %ecx output, operand 5 to the %edi clobber
    [(set (match_operand:SI 0 "register_operand" "=&c")
  	(unspec:SI [(mem:BLK (match_operand:SI 5 "register_operand" "1"))
! 		    (match_operand:QI 2 "register_operand" "a")
  		    (match_operand:SI 3 "immediate_operand" "i")
! 		    (match_operand:SI 4 "register_operand" "0")] 0))
     (use (reg:SI 19))
     (clobber (match_operand:SI 1 "register_operand" "=D"))
     (clobber (reg:CC 17))]
!   "!TARGET_64BIT"  ; the DImode variant is strlenqi_rex_1
!   "repnz{\;| }scasb"
!   [(set_attr "type" "str")
!    (set_attr "mode" "QI")
!    (set_attr "prefix_rep" "1")])
! 
! (define_insn "strlenqi_rex_1"
!   [(set (match_operand:DI 0 "register_operand" "=&c")
! 	(unspec:DI [(mem:BLK (match_operand:DI 5 "register_operand" "1"))
! 		    (match_operand:QI 2 "register_operand" "a")
! 		    (match_operand:DI 3 "immediate_operand" "i")
! 		    (match_operand:DI 4 "register_operand" "0")] 0))
!    (use (reg:SI 19))
!    (clobber (match_operand:DI 1 "register_operand" "=D"))
!    (clobber (reg:CC 17))]
!   "TARGET_64BIT"
    "repnz{\;| }scasb"
    [(set_attr "type" "str")
     (set_attr "mode" "QI")
*** i386.c	Thu Mar 22 20:29:02 2001
--- /p1/new/x86-64/gcc/gcc/config/i386/i386.c	Sun Mar 25 00:37:22 2001
*************** static void ix86_set_move_mem_attrs_1 PA
*** 566,571 ****
--- 578,587 ----
  static void ix86_sched_reorder_pentium PARAMS((rtx *, rtx *));
  static void ix86_sched_reorder_ppro PARAMS((rtx *, rtx *));
  static HOST_WIDE_INT ix86_GOT_alias_set PARAMS ((void));
+ static void ix86_adjust_counter PARAMS ((rtx, HOST_WIDE_INT));
+ static rtx ix86_zero_extend_to_Pmode PARAMS ((rtx));
+ static rtx ix86_expand_aligntest PARAMS ((rtx, int));
+ static void ix86_expand_strlensi_unroll_1 PARAMS ((rtx, rtx));
  
  struct ix86_address
  {
*************** ix86_split_lshrdi (operands, scratch)
*** 6934,6984 ****
    rtx low[2], high[2];
    int count;
  
!   if (GET_CODE (operands[2]) == CONST_INT)
      {
!       split_di (operands, 2, low, high);
!       count = INTVAL (operands[2]) & 63;
  
!       if (count >= 32)
! 	{
! 	  emit_move_insn (low[0], high[1]);
! 	  emit_move_insn (high[0], const0_rtx);
  
! 	  if (count > 32)
! 	    emit_insn (gen_lshrsi3 (low[0], low[0], GEN_INT (count - 32)));
! 	}
        else
! 	{
! 	  if (!rtx_equal_p (operands[0], operands[1]))
! 	    emit_move_insn (operands[0], operands[1]);
! 	  emit_insn (gen_x86_shrd_1 (low[0], high[0], GEN_INT (count)));
! 	  emit_insn (gen_lshrsi3 (high[0], high[0], GEN_INT (count)));
! 	}
      }
    else
      {
!       if (!rtx_equal_p (operands[0], operands[1]))
! 	emit_move_insn (operands[0], operands[1]);
! 
!       split_di (operands, 1, low, high);
  
!       emit_insn (gen_x86_shrd_1 (low[0], high[0], operands[2]));
!       emit_insn (gen_lshrsi3 (high[0], high[0], operands[2]));
  
!       /* Heh.  By reversing the arguments, we can reuse this pattern.  */
!       if (TARGET_CMOVE && (! no_new_pseudos || scratch))
  	{
! 	  if (! no_new_pseudos)
! 	    scratch = force_reg (SImode, const0_rtx);
! 	  else
! 	    emit_move_insn (scratch, const0_rtx);
! 
! 	  emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
! 					  scratch));
  	}
        else
! 	emit_insn (gen_x86_shift_adj_2 (low[0], high[0], operands[2]));
      }
  }
  
  /* Expand the appropriate insns for doing strlen if not just doing
--- 8224,8810 ----
    rtx low[2], high[2];
    int count;
  
!   if (GET_CODE (operands[2]) == CONST_INT)
!     {
!       split_di (operands, 2, low, high);
!       count = INTVAL (operands[2]) & 63;
! 
!       if (count >= 32)
! 	{
! 	  emit_move_insn (low[0], high[1]);
! 	  emit_move_insn (high[0], const0_rtx);
! 
! 	  if (count > 32)
! 	    emit_insn (gen_lshrsi3 (low[0], low[0], GEN_INT (count - 32)));
! 	}
!       else
! 	{
! 	  if (!rtx_equal_p (operands[0], operands[1]))
! 	    emit_move_insn (operands[0], operands[1]);
! 	  emit_insn (gen_x86_shrd_1 (low[0], high[0], GEN_INT (count)));
! 	  emit_insn (gen_lshrsi3 (high[0], high[0], GEN_INT (count)));
! 	}
!     }
!   else
!     {
!       if (!rtx_equal_p (operands[0], operands[1]))
! 	emit_move_insn (operands[0], operands[1]);
! 
!       split_di (operands, 1, low, high);
! 
!       emit_insn (gen_x86_shrd_1 (low[0], high[0], operands[2]));
!       emit_insn (gen_lshrsi3 (high[0], high[0], operands[2]));
! 
!       /* Heh.  By reversing the arguments, we can reuse this pattern.  */
!       if (TARGET_CMOVE && (! no_new_pseudos || scratch))
! 	{
! 	  if (! no_new_pseudos)
! 	    scratch = force_reg (SImode, const0_rtx);
! 	  else
! 	    emit_move_insn (scratch, const0_rtx);
! 
! 	  emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
! 					  scratch));
! 	}
!       else
! 	emit_insn (gen_x86_shift_adj_2 (low[0], high[0], operands[2]));
!     }
! }
! 
! /* Helper for the string operations below.  Emit VARIABLE & VALUE and a jump
!    taken when the result is zero.  Return the label; the caller emits it.  */
! static rtx
! ix86_expand_aligntest (variable, value)
!      rtx variable;
!      int value;
! {
!   rtx label = gen_label_rtx ();
!   rtx tmpcount = gen_reg_rtx (GET_MODE (variable));
!   if (GET_MODE (variable) == DImode)
!     emit_insn (gen_anddi3 (tmpcount, variable, GEN_INT (value)));
!   else
!     emit_insn (gen_andsi3 (tmpcount, variable, GEN_INT (value)));
!   emit_cmp_and_jump_insns (tmpcount, const0_rtx, EQ, 0, GET_MODE (variable),
! 			   1, 0, label);
!   return label;
! }
! 
! /* Subtract VALUE from COUNTREG, using the DImode or else SImode add.  */
! static void
! ix86_adjust_counter (countreg, value)
!      rtx countreg;
!      HOST_WIDE_INT value;
! {
!   if (GET_MODE (countreg) == DImode)
!     emit_insn (gen_adddi3 (countreg, countreg, GEN_INT (-value)));
!   else
!     emit_insn (gen_addsi3 (countreg, countreg, GEN_INT (-value)));
! }
!
! /* Return EXP in a Pmode register, zero extending it from SImode if needed.  */
! static rtx
! ix86_zero_extend_to_Pmode (exp)
!    rtx exp;
! {
!   rtx r;
!   if (GET_MODE (exp) == VOIDmode)
!     return force_reg (Pmode, exp);
!   if (GET_MODE (exp) == Pmode)
!     return copy_to_mode_reg (Pmode, exp);
!   r = gen_reg_rtx (Pmode);
!   emit_insn (gen_zero_extendsidi2 (r, exp));
!   return r;
! }
! 
! /* Expand string move (memcpy) operation.  Use i386 string operations when
!    profitable.  ix86_expand_clrstr contains similar code.  */
! int
! ix86_expand_movstr (dst, src, count_exp, align_exp)
!      rtx dst, src, count_exp, align_exp;
! {
!   rtx srcreg, destreg, countreg;
!   enum machine_mode counter_mode;
!   HOST_WIDE_INT align = 0;
!   unsigned HOST_WIDE_INT count = 0;
!   rtx insns;
! 
!   start_sequence ();
! 
!   if (GET_CODE (align_exp) == CONST_INT)
!     align = INTVAL (align_exp);
! 
!   /* This simple hack avoids all inlining code and simplifies code below.  */
!   if (!TARGET_ALIGN_STRINGOPS)
!     align = 64;
! 
!   if (GET_CODE (count_exp) == CONST_INT)
!     count = INTVAL (count_exp);
! 
!   /* Figure out proper mode for counter.  For 32bits it is always SImode,
!      for 64bits use SImode when possible, otherwise DImode.
!      Set count to number of bytes copied when known at compile time.  */
!   if (!TARGET_64BIT || GET_MODE (count_exp) == SImode
!       || x86_64_zero_extended_value (count_exp))
!     counter_mode = SImode;
!   else
!     counter_mode = DImode;
! 
!   if (counter_mode != SImode && counter_mode != DImode)
!     abort ();
! 
!   destreg = copy_to_mode_reg (Pmode, XEXP (dst, 0));
!   srcreg = copy_to_mode_reg (Pmode, XEXP (src, 0));
! 
!   emit_insn (gen_cld ());
! 
!   /* When not optimizing or optimizing for size, emit a simple rep ; movsb
!      for unknown (or zero) counts and counts not divisible by 4.  */
! 
!   if ((!optimize || optimize_size) && (count == 0 || (count & 0x03)))
!     {
!       countreg = ix86_zero_extend_to_Pmode (count_exp);
!       if (TARGET_64BIT)
! 	emit_insn (gen_rep_movqi_rex64 (destreg, srcreg, countreg,
! 				        destreg, srcreg, countreg));
!       else
! 	emit_insn (gen_rep_movqi (destreg, srcreg, countreg,
! 				  destreg, srcreg, countreg));
!     }
! 
!   /* For constant aligned (or small unaligned) copies use rep movsl
!      followed by code copying the rest.  For PentiumPro ensure 8 byte
!      alignment to allow rep movsl acceleration.  */
! 
!   else if (count != 0
! 	   && (align >= 8
! 	       || (!TARGET_PENTIUMPRO && !TARGET_64BIT && align >= 4)
! 	       || optimize_size || count < (unsigned int)64))
!     {
!       int size = TARGET_64BIT && !optimize_size ? 8 : 4;
!       if (count & ~(size - 1))
! 	{
! 	  countreg = copy_to_mode_reg (counter_mode,
! 				       GEN_INT ((count >> (size == 4 ? 2 : 3))
! 						& (TARGET_64BIT ? -1 : 0x3fffffff)));
! 	  countreg = ix86_zero_extend_to_Pmode (countreg);
! 	  if (size == 4)
! 	    {
! 	      if (TARGET_64BIT)
! 		emit_insn (gen_rep_movsi_rex64 (destreg, srcreg, countreg,
! 					        destreg, srcreg, countreg));
! 	      else
! 		emit_insn (gen_rep_movsi (destreg, srcreg, countreg,
! 					  destreg, srcreg, countreg));
! 	    }
! 	  else
! 	    emit_insn (gen_rep_movdi_rex64 (destreg, srcreg, countreg,
! 					    destreg, srcreg, countreg));
! 	}
!       if (size == 8 && (count & 0x04))
! 	emit_insn (gen_strmovsi (destreg, srcreg));
!       if (count & 0x02)
! 	emit_insn (gen_strmovhi (destreg, srcreg));
!       if (count & 0x01)
! 	emit_insn (gen_strmovqi (destreg, srcreg));
!     }
!   /* The generic code based on the glibc implementation:
!      - align destination to 4 bytes (8 byte alignment is used for PentiumPro
!      allowing accelerated copying there)
!      - copy the data using rep movsl
!      - copy the rest.  */
!   else
!     {
!       rtx countreg2;
!       rtx label = NULL;
! 
!       /* In case we don't know anything about the alignment, default to
!          library version, since it is usually equally fast and results in
!          shorter code.  */
!       if (!TARGET_INLINE_ALL_STRINGOPS && align < UNITS_PER_WORD)
! 	{
! 	  end_sequence ();
! 	  return 0;  /* sequence closed without being emitted */
! 	}
! 
!       if (TARGET_SINGLE_STRINGOP)
! 	emit_insn (gen_cld ());
! 
!       countreg2 = gen_reg_rtx (Pmode);
!       countreg = copy_to_mode_reg (counter_mode, count_exp);
! 
!       /* We don't use loops to align destination and to copy parts smaller
!          than 4 bytes, because gcc is able to optimize such code better (in
!          the case the destination or the count really is aligned, gcc is often
!          able to predict the branches) and also it is friendlier to the
!          hardware branch prediction.  
! 
!          Using loops is beneficial for generic case, because we can
!          handle small counts using the loops.  Many CPUs (such as Athlon)
!          have large REP prefix setup costs.
! 
!          This is quite costly.  Maybe we can revisit this decision later or
!          add some customizability to this code.  */
! 
!       if (count == 0
! 	  && align < (TARGET_PENTIUMPRO && (count == 0
! 					    || count >= (unsigned int)260)
! 		      ? 8 : UNITS_PER_WORD))
! 	{
! 	  label = gen_label_rtx ();
! 	  emit_cmp_and_jump_insns (countreg, GEN_INT (UNITS_PER_WORD - 1),
! 				   LEU, 0, counter_mode, 1, 0, label);
! 	}
!       if (align <= 1)
! 	{
! 	  rtx label = ix86_expand_aligntest (destreg, 1);
! 	  emit_insn (gen_strmovqi (destreg, srcreg));
! 	  ix86_adjust_counter (countreg, 1);
! 	  emit_label (label);
! 	  LABEL_NUSES (label) = 1;
! 	}
!       if (align <= 2)
! 	{
! 	  rtx label = ix86_expand_aligntest (destreg, 2);
! 	  emit_insn (gen_strmovhi (destreg, srcreg));
! 	  ix86_adjust_counter (countreg, 2);
! 	  emit_label (label);
! 	  LABEL_NUSES (label) = 1;
! 	}
!       if (align <= 4
! 	  && ((TARGET_PENTIUMPRO && (count == 0
! 				     || count >= (unsigned int)260))
! 	      || TARGET_64BIT))
! 	{
! 	  rtx label = ix86_expand_aligntest (destreg, 4);
! 	  emit_insn (gen_strmovsi (destreg, srcreg));
! 	  ix86_adjust_counter (countreg, 4);
! 	  emit_label (label);
! 	  LABEL_NUSES (label) = 1;
! 	}
! 
!       if (!TARGET_SINGLE_STRINGOP)
! 	emit_insn (gen_cld ());
!       if (TARGET_64BIT)
! 	{
! 	  emit_insn (gen_lshrdi3 (countreg2, ix86_zero_extend_to_Pmode (countreg),
! 				  GEN_INT (3)));
! 	  emit_insn (gen_rep_movdi_rex64 (destreg, srcreg, countreg2,
! 					  destreg, srcreg, countreg2));
! 	}
!       else
! 	{
! 	  emit_insn (gen_lshrsi3 (countreg2, countreg, GEN_INT (2)));
! 	  emit_insn (gen_rep_movsi (destreg, srcreg, countreg2,
! 				    destreg, srcreg, countreg2));
! 	}
! 
!       if (label)
! 	{
! 	  emit_label (label);
! 	  LABEL_NUSES (label) = 1;
! 	}
!       if (TARGET_64BIT && align > 4 && count != 0 && (count & 4))
! 	emit_insn (gen_strmovsi (destreg, srcreg));
!       if ((align <= 4 || count == 0) && TARGET_64BIT)
! 	{
! 	  rtx label = ix86_expand_aligntest (countreg, 4);
! 	  emit_insn (gen_strmovsi (destreg, srcreg));
! 	  emit_label (label);
! 	  LABEL_NUSES (label) = 1;
! 	}
!       if (align > 2 && count != 0 && (count & 2))
! 	emit_insn (gen_strmovhi (destreg, srcreg));
!       if (align <= 2 || count == 0)
! 	{
! 	  rtx label = ix86_expand_aligntest (countreg, 2);
! 	  emit_insn (gen_strmovhi (destreg, srcreg));
! 	  emit_label (label);
! 	  LABEL_NUSES (label) = 1;
! 	}
!       if (align > 1 && count != 0 && (count & 1))
! 	emit_insn (gen_strmovqi (destreg, srcreg));
!       if (align <= 1 || count == 0)
! 	{
! 	  rtx label = ix86_expand_aligntest (countreg, 1);
! 	  emit_insn (gen_strmovqi (destreg, srcreg));
! 	  emit_label (label);
! 	  LABEL_NUSES (label) = 1;
! 	}
!     }
! 
!   insns = get_insns ();
!   end_sequence ();
! 
!   ix86_set_move_mem_attrs (insns, dst, src, destreg, srcreg);
!   emit_insns (insns);
!   return 1;  /* the collected insns were emitted */
! }
! 
! /* Expand string clear operation (bzero).  Use i386 string operations when
!    profitable.  ix86_expand_movstr contains similar code.  */
! int
! ix86_expand_clrstr (src, count_exp, align_exp)
!      rtx src, count_exp, align_exp;
! {
!   rtx destreg, zeroreg, countreg;
!   enum machine_mode counter_mode;
!   HOST_WIDE_INT align = 0;
!   unsigned HOST_WIDE_INT count = 0;
! 
!   if (GET_CODE (align_exp) == CONST_INT)
!     align = INTVAL (align_exp);
! 
!   /* This simple hack avoids all inlining code and simplifies code below.  */
!   if (!TARGET_ALIGN_STRINGOPS)
!     align = 32;
! 
!   if (GET_CODE (count_exp) == CONST_INT)
!     count = INTVAL (count_exp);
!   /* Figure out proper mode for counter.  For 32bits it is always SImode,
!      for 64bits use SImode when possible, otherwise DImode.
!      Set count to number of bytes copied when known at compile time.  */
!   if (!TARGET_64BIT || GET_MODE (count_exp) == SImode
!       || x86_64_zero_extended_value (count_exp))
!     counter_mode = SImode;
!   else
!     counter_mode = DImode;
! 
!   destreg = copy_to_mode_reg (Pmode, XEXP (src, 0));
! 
!   emit_insn (gen_cld ());
! 
!   /* When optimizing for size emit simple rep ; movsb instruction for
!      counts not divisible by 4.  */
! 
!   if ((!optimize || optimize_size) && (count == 0 || (count & 0x03)))
!     {
!       countreg = ix86_zero_extend_to_Pmode (count_exp);
!       zeroreg = copy_to_mode_reg (QImode, const0_rtx);
!       if (TARGET_64BIT)
! 	emit_insn (gen_rep_stosqi_rex64 (destreg, countreg, zeroreg,
! 				         destreg, countreg));
!       else
! 	emit_insn (gen_rep_stosqi (destreg, countreg, zeroreg,
! 				   destreg, countreg));
!     }
!   else if (count != 0
! 	   && (align >= 8
! 	       || (!TARGET_PENTIUMPRO && !TARGET_64BIT && align >= 4)
! 	       || optimize_size || count < (unsigned int)64))
!     {
!       int size = TARGET_64BIT && !optimize_size ? 8 : 4;
!       zeroreg = copy_to_mode_reg (size == 4 ? SImode : DImode, const0_rtx);
!       if (count & ~(size - 1))
! 	{
! 	  countreg = copy_to_mode_reg (counter_mode,
! 				       GEN_INT ((count >> (size == 4 ? 2 : 3))
! 						& (TARGET_64BIT ? -1 : 0x3fffffff)));
! 	  countreg = ix86_zero_extend_to_Pmode (countreg);
! 	  if (size == 4)
! 	    {
! 	      if (TARGET_64BIT)
! 		emit_insn (gen_rep_stossi_rex64 (destreg, countreg, zeroreg,
! 					         destreg, countreg));
! 	      else
! 		emit_insn (gen_rep_stossi (destreg, countreg, zeroreg,
! 					   destreg, countreg));
! 	    }
! 	  else
! 	    emit_insn (gen_rep_stosdi_rex64 (destreg, countreg, zeroreg,
! 					     destreg, countreg));
! 	}
!       if (size == 8 && (count & 0x04))
! 	emit_insn (gen_strsetsi (destreg,
! 				 gen_rtx_SUBREG (SImode, zeroreg, 0)));
!       if (count & 0x02)
! 	emit_insn (gen_strsethi (destreg,
! 				 gen_rtx_SUBREG (HImode, zeroreg, 0)));
!       if (count & 0x01)
! 	emit_insn (gen_strsetqi (destreg,
! 				 gen_rtx_SUBREG (QImode, zeroreg, 0)));
!     }
!   else
!     {
!       rtx countreg2;
!       rtx label = NULL;
! 
!       /* In case we don't know anything about the alignment, default to
!          library version, since it is usually equally fast and result in
!          shorter code.  */
!       if (!TARGET_INLINE_ALL_STRINGOPS && align < UNITS_PER_WORD)
! 	return 0;
! 
!       if (TARGET_SINGLE_STRINGOP)
! 	emit_insn (gen_cld ());
! 
!       countreg2 = gen_reg_rtx (Pmode);
!       countreg = copy_to_mode_reg (counter_mode, count_exp);
!       zeroreg = copy_to_mode_reg (Pmode, const0_rtx);
! 
!       if (count == 0
! 	  && align < (TARGET_PENTIUMPRO && (count == 0
! 					    || count >= (unsigned int)260)
! 		      ? 8 : UNITS_PER_WORD))
! 	{
! 	  label = gen_label_rtx ();
! 	  emit_cmp_and_jump_insns (countreg, GEN_INT (UNITS_PER_WORD - 1),
! 				   LEU, 0, counter_mode, 1, 0, label);
! 	}
!       if (align <= 1)
! 	{
! 	  rtx label = ix86_expand_aligntest (destreg, 1);
! 	  emit_insn (gen_strsetqi (destreg,
! 				   gen_rtx_SUBREG (QImode, zeroreg, 0)));
! 	  ix86_adjust_counter (countreg, 1);
! 	  emit_label (label);
! 	  LABEL_NUSES (label) = 1;
! 	}
!       if (align <= 2)
! 	{
! 	  rtx label = ix86_expand_aligntest (destreg, 2);
! 	  emit_insn (gen_strsethi (destreg,
! 				   gen_rtx_SUBREG (HImode, zeroreg, 0)));
! 	  ix86_adjust_counter (countreg, 2);
! 	  emit_label (label);
! 	  LABEL_NUSES (label) = 1;
! 	}
!       if (align <= 4 && TARGET_PENTIUMPRO && (count == 0
! 					      || count >= (unsigned int)260))
! 	{
! 	  rtx label = ix86_expand_aligntest (destreg, 4);
! 	  emit_insn (gen_strsetsi (destreg, (TARGET_64BIT
! 					     ? gen_rtx_SUBREG (SImode, zeroreg, 0)
! 					     : zeroreg)));
! 	  ix86_adjust_counter (countreg, 4);
! 	  emit_label (label);
! 	  LABEL_NUSES (label) = 1;
! 	}
! 
!       if (!TARGET_SINGLE_STRINGOP)
! 	emit_insn (gen_cld ());
!       if (TARGET_64BIT)
! 	{
! 	  emit_insn (gen_lshrdi3 (countreg2, ix86_zero_extend_to_Pmode (countreg),
! 				  GEN_INT (3)));
! 	  emit_insn (gen_rep_stosdi_rex64 (destreg, countreg2, zeroreg,
! 					   destreg, countreg2));
! 	}
!       else
! 	{
! 	  emit_insn (gen_lshrsi3 (countreg2, countreg, GEN_INT (2)));
! 	  emit_insn (gen_rep_stossi (destreg, countreg2, zeroreg,
! 				     destreg, countreg2));
! 	}
! 
!       if (label)
! 	{
! 	  emit_label (label);
! 	  LABEL_NUSES (label) = 1;
! 	}
!       if (TARGET_64BIT && align > 4 && count != 0 && (count & 4))
! 	emit_insn (gen_strsetsi (destreg,
! 				 gen_rtx_SUBREG (SImode, zeroreg, 0)));
!       if (TARGET_64BIT && (align <= 4 || count == 0))
! 	{
! 	  rtx label = ix86_expand_aligntest (destreg, 2);
! 	  emit_insn (gen_strsetsi (destreg,
! 				   gen_rtx_SUBREG (SImode, zeroreg, 0)));
! 	  emit_label (label);
! 	  LABEL_NUSES (label) = 1;
! 	}
!       if (align > 2 && count != 0 && (count & 2))
! 	emit_insn (gen_strsethi (destreg,
! 				 gen_rtx_SUBREG (HImode, zeroreg, 0)));
!       if (align <= 2 || count == 0)
! 	{
! 	  rtx label = ix86_expand_aligntest (destreg, 2);
! 	  emit_insn (gen_strsethi (destreg,
! 				   gen_rtx_SUBREG (HImode, zeroreg, 0)));
! 	  emit_label (label);
! 	  LABEL_NUSES (label) = 1;
! 	}
!       if (align > 1 && count != 0 && (count & 1))
! 	emit_insn (gen_strsetqi (destreg,
! 				 gen_rtx_SUBREG (QImode, zeroreg, 0)));
!       if (align <= 1 || count == 0)
! 	{
! 	  rtx label = ix86_expand_aligntest (destreg, 1);
! 	  emit_insn (gen_strsetqi (destreg,
! 				   gen_rtx_SUBREG (QImode, zeroreg, 0)));
! 	  emit_label (label);
! 	  LABEL_NUSES (label) = 1;
! 	}
!     }
!   return 1;
! }
! /* Expand strlen.  OUT receives the computed length, SRC is the MEM whose
!    address is the start of the string, EOSCHAR is the terminating character
!    and ALIGN the known alignment.
!    Return 0 without emitting anything when the unrolled expansion would be
!    selected (TARGET_UNROLL_STRLEN, zero terminator, optimize > 1, not
!    optimizing for size) but ALIGN is unknown or below 4 and
!    TARGET_INLINE_ALL_STRINGOPS is off; return 1 after emitting the
!    insns otherwise.
!    Two strategies are used: the unrolled scan done by
!    ix86_expand_strlensi_unroll_1, or the strlenqi_1 / strlenqi_rex_1
!    pattern (presumably a repnz ; scasb scan with the counter starting at
!    -1 -- confirm against i386.md) after which OUT is computed as the
!    one's complement of the pattern's result plus -1.  */
! int
! ix86_expand_strlen (out, src, eoschar, align)
!      rtx out, src, eoschar, align;
! {
!   rtx addr, scratch1, scratch2, scratch3, scratch4;
! 
!   /* The generic case of strlen expander is long.  Avoid expanding it
!      unless TARGET_INLINE_ALL_STRINGOPS.  */
! 
!   if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
!       && !TARGET_INLINE_ALL_STRINGOPS
!       && !optimize_size
!       && (GET_CODE (align) != CONST_INT || INTVAL (align) < 4))
!     return 0;
! 
!   addr = force_reg (Pmode, XEXP (src, 0));
!   scratch1 = gen_reg_rtx (Pmode);
! 
!   if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
!       && !optimize_size)
      {
!       /* Well it seems that some optimizer does not combine a call like
!          foo(strlen(bar), strlen(bar));
!          when the move and the subtraction is done here.  It does calculate
!          the length just once when these instructions are done inside of
!          output_strlen_unroll().  But I think since &bar[strlen(bar)] is
!          often used and I use one fewer register for the lifetime of
!          output_strlen_unroll() this is better.  */
  
!       emit_move_insn (out, addr);
  
!       ix86_expand_strlensi_unroll_1 (out, align);
! 
!       /* strlensi_unroll_1 returns the address of the zero at the end of
!          the string, like memchr(), so compute the length by subtracting
!          the start address.  */
!       if (TARGET_64BIT)
! 	emit_insn (gen_subdi3 (out, out, addr));
        else
! 	emit_insn (gen_subsi3 (out, out, addr));
      }
    else
      {
!       scratch2 = gen_reg_rtx (Pmode);
!       scratch3 = gen_reg_rtx (Pmode);
!       scratch4 = force_reg (Pmode, constm1_rtx);
  
!       emit_move_insn (scratch3, addr);
!       eoschar = force_reg (QImode, eoschar);
  
!       emit_insn (gen_cld ());
!       if (TARGET_64BIT)
  	{
! 	  emit_insn (gen_strlenqi_rex_1 (scratch1, scratch3, eoschar,
! 					 align, scratch4, scratch3));
! 	  emit_insn (gen_one_cmpldi2 (scratch2, scratch1));
! 	  emit_insn (gen_adddi3 (out, scratch2, constm1_rtx));
  	}
        else
! 	{
! 	  emit_insn (gen_strlenqi_1 (scratch1, scratch3, eoschar,
! 				     align, scratch4, scratch3));
! 	  emit_insn (gen_one_cmplsi2 (scratch2, scratch1));
! 	  emit_insn (gen_addsi3 (out, scratch2, constm1_rtx));
! 	}
      }
+   return 1;
  }
  
  /* Expand the appropriate insns for doing strlen if not just doing
*************** ix86_split_lshrdi (operands, scratch)
*** 6992,7000 ****
     This is just the body. It needs the initialisations mentioned above and
     some address computing at the end.  These things are done in i386.md.  */
  
! void
! ix86_expand_strlensi_unroll_1 (out, align_rtx, scratch)
!      rtx out, align_rtx, scratch;
  {
    int align;
    rtx tmp;
--- 8816,8824 ----
     This is just the body. It needs the initialisations mentioned above and
     some address computing at the end.  These things are done in i386.md.  */
  
! static void
! ix86_expand_strlensi_unroll_1 (out, align_rtx)
!      rtx out, align_rtx;
  {
    int align;
    rtx tmp;
*************** ix86_expand_strlensi_unroll_1 (out, alig
*** 7004,7009 ****
--- 8828,8834 ----
    rtx end_0_label = gen_label_rtx ();
    rtx mem;
    rtx tmpreg = gen_reg_rtx (SImode);
+   rtx scratch = gen_reg_rtx (SImode);
  
    align = 0;
    if (GET_CODE (align_rtx) == CONST_INT)
*************** ix86_expand_strlensi_unroll_1 (out, alig
*** 7014,7019 ****
--- 8839,8846 ----
    /* Is there a known alignment and is it less than 4?  */
    if (align < 4)
      {
+       rtx scratch1 = gen_reg_rtx (Pmode);
+       emit_move_insn (scratch1, out);
        /* Is there a known alignment and is it not 2? */
        if (align != 2)
  	{
*************** ix86_expand_strlensi_unroll_1 (out, alig
*** 7021,7046 ****
  	  align_2_label = gen_label_rtx (); /* Label when aligned to 2-byte */
  
  	  /* Leave just the 3 lower bits.  */
! 	  align_rtx = expand_binop (SImode, and_optab, scratch, GEN_INT (3),
  				    NULL_RTX, 0, OPTAB_WIDEN);
  
  	  emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
! 				   SImode, 1, 0, align_4_label);
  	  emit_cmp_and_jump_insns (align_rtx, GEN_INT (2), EQ, NULL,
! 				   SImode, 1, 0, align_2_label);
  	  emit_cmp_and_jump_insns (align_rtx, GEN_INT (2), GTU, NULL,
! 				   SImode, 1, 0, align_3_label);
  	}
        else
          {
  	  /* Since the alignment is 2, we have to check 2 or 0 bytes;
  	     check if is aligned to 4 - byte.  */
  
! 	  align_rtx = expand_binop (SImode, and_optab, scratch, GEN_INT (2),
  				    NULL_RTX, 0, OPTAB_WIDEN);
  
  	  emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
! 				   SImode, 1, 0, align_4_label);
          }
  
        mem = gen_rtx_MEM (QImode, out);
--- 8848,8873 ----
  	  align_2_label = gen_label_rtx (); /* Label when aligned to 2-byte */
  
  	  /* Leave just the 3 lower bits.  */
! 	  align_rtx = expand_binop (Pmode, and_optab, scratch1, GEN_INT (3),
  				    NULL_RTX, 0, OPTAB_WIDEN);
  
  	  emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
! 				   Pmode, 1, 0, align_4_label);
  	  emit_cmp_and_jump_insns (align_rtx, GEN_INT (2), EQ, NULL,
! 				   Pmode, 1, 0, align_2_label);
  	  emit_cmp_and_jump_insns (align_rtx, GEN_INT (2), GTU, NULL,
! 				   Pmode, 1, 0, align_3_label);
  	}
        else
          {
  	  /* Since the alignment is 2, we have to check 2 or 0 bytes;
  	     check if is aligned to 4 - byte.  */
  
! 	  align_rtx = expand_binop (Pmode, and_optab, scratch1, GEN_INT (2),
  				    NULL_RTX, 0, OPTAB_WIDEN);
  
  	  emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
! 				   Pmode, 1, 0, align_4_label);
          }
  
        mem = gen_rtx_MEM (QImode, out);
*************** ix86_expand_strlensi_unroll_1 (out, alig
*** 7052,7058 ****
  			       QImode, 1, 0, end_0_label);
  
        /* Increment the address.  */
!       emit_insn (gen_addsi3 (out, out, const1_rtx));
  
        /* Not needed with an alignment of 2 */
        if (align != 2)
--- 8879,8888 ----
  			       QImode, 1, 0, end_0_label);
  
        /* Increment the address.  */
!       if (TARGET_64BIT)
! 	emit_insn (gen_adddi3 (out, out, const1_rtx));
!       else
! 	emit_insn (gen_addsi3 (out, out, const1_rtx));
  
        /* Not needed with an alignment of 2 */
        if (align != 2)
*************** ix86_expand_strlensi_unroll_1 (out, alig
*** 7062,7068 ****
  	  emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL,
  				   QImode, 1, 0, end_0_label);
  
! 	  emit_insn (gen_addsi3 (out, out, const1_rtx));
  
  	  emit_label (align_3_label);
  	}
--- 8892,8901 ----
  	  emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL,
  				   QImode, 1, 0, end_0_label);
  
! 	  if (TARGET_64BIT)
! 	    emit_insn (gen_adddi3 (out, out, const1_rtx));
! 	  else
! 	    emit_insn (gen_addsi3 (out, out, const1_rtx));
  
  	  emit_label (align_3_label);
  	}
*************** ix86_expand_strlensi_unroll_1 (out, alig
*** 7070,7076 ****
        emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL,
  			       QImode, 1, 0, end_0_label);
  
!       emit_insn (gen_addsi3 (out, out, const1_rtx));
      }
  
    /* Generate loop to check 4 bytes at a time.  It is not a good idea to
--- 8903,8912 ----
        emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL,
  			       QImode, 1, 0, end_0_label);
  
!       if (TARGET_64BIT)
! 	emit_insn (gen_adddi3 (out, out, const1_rtx));
!       else
! 	emit_insn (gen_addsi3 (out, out, const1_rtx));
      }
  
    /* Generate loop to check 4 bytes at a time.  It is not a good idea to
*************** ix86_expand_strlensi_unroll_1 (out, alig
*** 7080,7086 ****
  
    mem = gen_rtx_MEM (SImode, out);
    emit_move_insn (scratch, mem);
!   emit_insn (gen_addsi3 (out, out, GEN_INT (4)));
  
    /* This formula yields a nonzero result iff one of the bytes is zero.
       This saves three branches inside loop and many cycles.  */
--- 8916,8925 ----
  
    mem = gen_rtx_MEM (SImode, out);
    emit_move_insn (scratch, mem);
!   if (TARGET_64BIT)
!     emit_insn (gen_adddi3 (out, out, GEN_INT (4)));
!   else
!     emit_insn (gen_addsi3 (out, out, GEN_INT (4)));
  
    /* This formula yields a nonzero result iff one of the bytes is zero.
       This saves three branches inside loop and many cycles.  */
*************** ix86_expand_strlensi_unroll_1 (out, alig
*** 7095,7100 ****
--- 8934,8940 ----
    if (TARGET_CMOVE)
      {
         rtx reg = gen_reg_rtx (SImode);
+        rtx reg2 = gen_reg_rtx (Pmode);
         emit_move_insn (reg, tmpreg);
         emit_insn (gen_lshrsi3 (reg, reg, GEN_INT (16)));
  
*************** ix86_expand_strlensi_unroll_1 (out, alig
*** 7107,7121 ****
  						     reg,
  						     tmpreg)));
         /* Emit lea manually to avoid clobbering of flags.  */
!        emit_insn (gen_rtx_SET (SImode, reg,
! 			       gen_rtx_PLUS (SImode, out, GEN_INT (2))));
  
         tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
         tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
         emit_insn (gen_rtx_SET (VOIDmode, out,
! 			       gen_rtx_IF_THEN_ELSE (SImode, tmp,
! 						     reg,
! 						     out)));
  
      }
    else
--- 8947,8961 ----
  						     reg,
  						     tmpreg)));
         /* Emit lea manually to avoid clobbering of flags.  */
!        emit_insn (gen_rtx_SET (SImode, reg2,
! 			       gen_rtx_PLUS (Pmode, out, GEN_INT (2))));
  
         tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
         tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
         emit_insn (gen_rtx_SET (VOIDmode, out,
! 			       gen_rtx_IF_THEN_ELSE (Pmode, tmp,
! 				       		     reg2,
! 				       		     out)));
  
      }
    else
*************** ix86_expand_strlensi_unroll_1 (out, alig
*** 7134,7140 ****
  
         /* Not in the first two.  Move two bytes forward.  */
         emit_insn (gen_lshrsi3 (tmpreg, tmpreg, GEN_INT (16)));
!        emit_insn (gen_addsi3 (out, out, GEN_INT (2)));
  
         emit_label (end_2_label);
  
--- 8974,8983 ----
  
         /* Not in the first two.  Move two bytes forward.  */
         emit_insn (gen_lshrsi3 (tmpreg, tmpreg, GEN_INT (16)));
!        if (TARGET_64BIT)
! 	 emit_insn (gen_adddi3 (out, out, GEN_INT (2)));
!        else
! 	 emit_insn (gen_addsi3 (out, out, GEN_INT (2)));
  
         emit_label (end_2_label);
  
*************** ix86_expand_strlensi_unroll_1 (out, alig
*** 7143,7149 ****
    /* Avoid branch in fixing the byte.  */
    tmpreg = gen_lowpart (QImode, tmpreg);
    emit_insn (gen_addqi3_cc (tmpreg, tmpreg, tmpreg));
!   emit_insn (gen_subsi3_carry (out, out, GEN_INT (3)));
  
    emit_label (end_0_label);
  }
--- 8986,8995 ----
    /* Avoid branch in fixing the byte.  */
    tmpreg = gen_lowpart (QImode, tmpreg);
    emit_insn (gen_addqi3_cc (tmpreg, tmpreg, tmpreg));
!   if (TARGET_64BIT)
!     emit_insn (gen_subdi3_carry_rex64 (out, out, GEN_INT (3)));
!   else
!     emit_insn (gen_subsi3_carry (out, out, GEN_INT (3)));
  
    emit_label (end_0_label);
  }
*** i386-protos.h	Wed Mar 21 19:53:08 2001
--- /p1/new/x86-64/gcc/gcc/config/i386/i386-protos.h	Wed Mar 21 10:28:12 2001
*************** extern int promotable_binary_operator PA
*** 77,85 ****
--- 78,89 ----
  extern int memory_displacement_operand PARAMS ((rtx, enum machine_mode));
  extern int cmpsi_operand PARAMS ((rtx, enum machine_mode));
  extern int long_memory_operand PARAMS ((rtx, enum machine_mode));
  extern int aligned_operand PARAMS ((rtx, enum machine_mode));
  extern enum machine_mode ix86_cc_mode PARAMS ((enum rtx_code, rtx, rtx));
  
+ extern int ix86_expand_movstr PARAMS ((rtx, rtx, rtx, rtx));
+ extern int ix86_expand_clrstr PARAMS ((rtx, rtx, rtx));
+ extern int ix86_expand_strlen PARAMS ((rtx, rtx, rtx, rtx));
  
  extern int legitimate_pic_address_disp_p PARAMS ((rtx));
  extern int legitimate_address_p PARAMS ((enum machine_mode, rtx, int));
*************** extern int ix86_split_long_move PARAMS (
*** 119,125 ****
  extern void ix86_split_ashldi PARAMS ((rtx *, rtx));
  extern void ix86_split_ashrdi PARAMS ((rtx *, rtx));
  extern void ix86_split_lshrdi PARAMS ((rtx *, rtx));
- extern void ix86_expand_strlensi_unroll_1 PARAMS ((rtx, rtx, rtx));
  extern int ix86_address_cost PARAMS ((rtx));
  extern rtx ix86_find_base_term PARAMS ((rtx));
  
--- 124,129 ----



More information about the Gcc-patches mailing list