More i386 string ops changes

Jan Hubicka hubicka@atrey.karlin.mff.cuni.cz
Thu Jan 13 05:09:00 GMT 2000


Hi
This patch rewrites the rest of the string operations in the same spirit as I did
for memcpy.
It also adds MOVE_RATIO to the ix86_costs structure so it can be set on a per-cpu
basis, and includes my rewrite of the strlen expander.
It basically changes the internal loop to emit code similar to glibc's strlen,
which is approx. 60% faster than our original implementation.

Thu Jan 13 13:51:28 MET 2000  Jan Hubicka  <jh@suse.cz>
	* i386.md (memstr): Do not use rep stosb for counts divisible by 4
	when optimize_size.
	(clrstrsi): Rewrite.
	(strsethi, strsetqi): New expanders.
	(strsethi_1, strsetqi_1, rep_stossi, rep_stosqi): New insn patterns.
	(cmpstrsi): Emit compare insn before cmpstrsi_1
	(cmpstrsi_nz): Use flags, set type to str, prefix_length to 1.
	(strlensi_1): Likewise.
	(cmpstrsi_1): Likewise; do not output compare.
	(strlen expander): Do not unroll when optimizing for size.
	(*subsi3_carry): Rename to subsi3_carry
	(*ashlqi3_cmpno): Likewise.
	* i386.h (processor_costs): Add move_ratio field.
	(MOVE_RATIO): Use move_ratio field, set to 3 for optimize_size.
	* i386.c (*_cost): Set move_ratio.
	(x86_unroll_strlen): Enable for Athlon, PPro and K6 too.
	(x86_expand_strlensi_1): Rewrite the main loop.

? egcs/gcc/config/i386/moje
Index: egcs/gcc/config/i386/i386.c
===================================================================
RCS file: /cvs/gcc/egcs/gcc/config/i386/i386.c,v
retrieving revision 1.122
diff -c -3 -p -r1.122 i386.c
*** i386.c	2000/01/11 23:52:07	1.122
--- i386.c	2000/01/13 12:50:48
*************** struct processor_costs i386_cost = {	/* 
*** 64,69 ****
--- 64,70 ----
    1,					/* cost of multiply per each bit set */
    23,					/* cost of a divide/mod */
    15,					/* "large" insn */
+   3,					/* MOVE_RATIO */
    4,					/* cost for loading QImode using movzbl */
    {2, 4, 2},				/* cost of loading integer registers
  					   in QImode, HImode and SImode.
*************** struct processor_costs i486_cost = {	/* 
*** 84,89 ****
--- 85,91 ----
    1,					/* cost of multiply per each bit set */
    40,					/* cost of a divide/mod */
    15,					/* "large" insn */
+   3,					/* MOVE_RATIO */
    4,					/* cost for loading QImode using movzbl */
    {2, 4, 2},				/* cost of loading integer registers
  					   in QImode, HImode and SImode.
*************** struct processor_costs pentium_cost = {
*** 104,109 ****
--- 106,112 ----
    0,					/* cost of multiply per each bit set */
    25,					/* cost of a divide/mod */
    8,					/* "large" insn */
+   6,					/* MOVE_RATIO */
    6,					/* cost for loading QImode using movzbl */
    {2, 4, 2},				/* cost of loading integer registers
  					   in QImode, HImode and SImode.
*************** struct processor_costs pentiumpro_cost =
*** 124,129 ****
--- 127,133 ----
    0,					/* cost of multiply per each bit set */
    17,					/* cost of a divide/mod */
    8,					/* "large" insn */
+   6,					/* MOVE_RATIO */
    2,					/* cost for loading QImode using movzbl */
    {4, 4, 4},				/* cost of loading integer registers
  					   in QImode, HImode and SImode.
*************** struct processor_costs k6_cost = {
*** 144,149 ****
--- 148,154 ----
    0,					/* cost of multiply per each bit set */
    18,					/* cost of a divide/mod */
    8,					/* "large" insn */
+   4,					/* MOVE_RATIO */
    3,					/* cost for loading QImode using movzbl */
    {4, 5, 4},				/* cost of loading integer registers
  					   in QImode, HImode and SImode.
*************** struct processor_costs athlon_cost = {
*** 164,169 ****
--- 169,175 ----
    0,					/* cost of multiply per each bit set */
    19,					/* cost of a divide/mod */
    8,					/* "large" insn */
+   9,					/* MOVE_RATIO */
    4,					/* cost for loading QImode using movzbl */
    {4, 5, 4},				/* cost of loading integer registers
  					   in QImode, HImode and SImode.
*************** const int x86_zero_extend_with_and = m_4
*** 191,197 ****
  const int x86_movx = m_ATHLON /* m_386 | m_PPRO | m_K6 */;
  const int x86_double_with_add = ~m_386;
  const int x86_use_bit_test = m_386;
! const int x86_unroll_strlen = m_486 | m_PENT;
  const int x86_use_q_reg = m_PENT | m_PPRO | m_K6;
  const int x86_use_any_reg = m_486;
  const int x86_cmove = m_PPRO | m_ATHLON;
--- 197,203 ----
  const int x86_movx = m_ATHLON /* m_386 | m_PPRO | m_K6 */;
  const int x86_double_with_add = ~m_386;
  const int x86_use_bit_test = m_386;
! const int x86_unroll_strlen = m_486 | m_PENT | m_PPRO | m_ATHLON | m_K6;
  const int x86_use_q_reg = m_PENT | m_PPRO | m_K6;
  const int x86_use_any_reg = m_486;
  const int x86_cmove = m_PPRO | m_ATHLON;
*************** ix86_expand_strlensi_unroll_1 (out, alig
*** 5149,5158 ****
    rtx align_3_label = NULL_RTX;
    rtx align_4_label = gen_label_rtx ();
    rtx end_0_label = gen_label_rtx ();
-   rtx end_2_label = gen_label_rtx ();
-   rtx end_3_label = gen_label_rtx ();
    rtx mem;
    rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
  
    align = 0;
    if (GET_CODE (align_rtx) == CONST_INT)
--- 5155,5163 ----
    rtx align_3_label = NULL_RTX;
    rtx align_4_label = gen_label_rtx ();
    rtx end_0_label = gen_label_rtx ();
    rtx mem;
    rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
+   rtx tmpreg = gen_reg_rtx (SImode);
  
    align = 0;
    if (GET_CODE (align_rtx) == CONST_INT)
*************** ix86_expand_strlensi_unroll_1 (out, alig
*** 5269,5316 ****
  
    mem = gen_rtx_MEM (SImode, out);
    emit_move_insn (scratch, mem);
  
!   /* Check first byte. */
!   emit_insn (gen_cmpqi_0 (gen_lowpart (QImode, scratch), const0_rtx));
!   tmp = gen_rtx_EQ (VOIDmode, flags, const0_rtx);
!   tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp, 
! 			      gen_rtx_LABEL_REF (VOIDmode, end_0_label),
! 			      pc_rtx);
!   emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
! 
!   /* Check second byte. */
!   emit_insn (gen_cmpqi_ext_3 (scratch, const0_rtx));
!   tmp = gen_rtx_EQ (VOIDmode, flags, const0_rtx);
!   tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp, 
! 			      gen_rtx_LABEL_REF (VOIDmode, end_3_label),
! 			      pc_rtx);
!   emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
! 
!   /* Check third byte. */
!   emit_insn (gen_testsi_1 (scratch, GEN_INT (0x00ff0000)));
!   tmp = gen_rtx_EQ (VOIDmode, flags, const0_rtx);
!   tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp, 
! 			      gen_rtx_LABEL_REF (VOIDmode, end_2_label),
! 			      pc_rtx);
!   emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
  
!   /* Check fourth byte and increment address. */
!   emit_insn (gen_addsi3 (out, out, GEN_INT (4)));
!   emit_insn (gen_testsi_1 (scratch, GEN_INT (0xff000000)));
!   tmp = gen_rtx_NE (VOIDmode, flags, const0_rtx);
!   tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp, 
! 			      gen_rtx_LABEL_REF (VOIDmode, align_4_label),
! 			      pc_rtx);
!   emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
  
!   /* Now generate fixups when the compare stops within a 4-byte word. */
!   emit_insn (gen_subsi3 (out, out, GEN_INT (3)));
!   
!   emit_label (end_2_label);
!   emit_insn (gen_addsi3 (out, out, const1_rtx));
  
!   emit_label (end_3_label);
!   emit_insn (gen_addsi3 (out, out, const1_rtx));
  
    emit_label (end_0_label);
  }
--- 5274,5343 ----
  
    mem = gen_rtx_MEM (SImode, out);
    emit_move_insn (scratch, mem);
+   emit_insn (gen_addsi3 (out, out, GEN_INT (4)));
  
!   /* Use formula that gives nonzero result if one of the bytes is zero.
!      This saves three branches inside loop and many cycles.  */
  
!   emit_insn (gen_addsi3 (tmpreg, scratch, GEN_INT (-0x01010101)));
!   emit_insn (gen_one_cmplsi2 (scratch, scratch));
!   emit_insn (gen_andsi3 (tmpreg, tmpreg, scratch));
!   emit_insn (gen_andsi3 (tmpreg, tmpreg, GEN_INT (0x80808080)));
!   emit_cmp_and_jump_insns (tmpreg, const0_rtx, EQ, 0, SImode, 1, 0, align_4_label);
! 
!   if (TARGET_CMOVE)
!     {
!        rtx reg = gen_reg_rtx (SImode);
!        emit_move_insn (reg, tmpreg);
!        emit_insn (gen_lshrsi3 (reg, reg, GEN_INT (16)));
! 
!        /* If zero is not in the first two bytes, move two bytes forward. */
!        emit_insn (gen_testsi_1 (tmpreg, GEN_INT (0x8080)));
!        tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
!        tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
!        emit_insn (gen_rtx_SET (VOIDmode, tmpreg,
! 			       gen_rtx_IF_THEN_ELSE (SImode, tmp,
! 				       		     reg, 
! 				       		     tmpreg)));
!        /* Emit lea manually to avoid clobbering of flags.  */
!        emit_insn (gen_rtx_SET (SImode, reg,
! 			       gen_rtx_PLUS (SImode, out, GEN_INT (2))));
! 
!        tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
!        tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
!        emit_insn (gen_rtx_SET (VOIDmode, out,
! 			       gen_rtx_IF_THEN_ELSE (SImode, tmp,
! 				       		     reg,
! 				       		     out)));
  
!     }
!   else
!     {
!        rtx end_2_label = gen_label_rtx ();
!        /* Is zero in the first two bytes? */
! 
!        emit_insn (gen_testsi_1 (tmpreg, GEN_INT (0x8080)));
!        tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
!        tmp = gen_rtx_NE (VOIDmode, tmp, const0_rtx);
!        tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
!                             gen_rtx_LABEL_REF (VOIDmode, end_2_label),
!                             pc_rtx);
!        tmp = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
!        JUMP_LABEL (tmp) = end_2_label;
! 
!        /* Not in the first two.  Move two bytes forward. */
!        emit_insn (gen_lshrsi3 (tmpreg, tmpreg, GEN_INT (16)));
!        emit_insn (gen_addsi3 (out, out, GEN_INT (2)));
! 
!        emit_label (end_2_label);
! 
!     }
  
!   /* Avoid branch in fixing the byte. */
!   emit_insn (gen_ashlqi3_cmpno (gen_rtx_SUBREG (QImode, tmpreg, 0),
! 				gen_rtx_SUBREG (QImode, tmpreg, 0),
! 				const1_rtx));
!   emit_insn (gen_subsi3_carry (out, out, GEN_INT (3)));
  
    emit_label (end_0_label);
  }
Index: egcs/gcc/config/i386/i386.h
===================================================================
RCS file: /cvs/gcc/egcs/gcc/config/i386/i386.h,v
retrieving revision 1.89
diff -c -3 -p -r1.89 i386.h
*** i386.h	2000/01/11 18:01:35	1.89
--- i386.h	2000/01/13 12:50:49
*************** struct processor_costs {
*** 62,67 ****
--- 62,69 ----
    int mult_bit;			/* cost of multiply per each bit set */
    int divide;			/* cost of a divide/mod */
    int large_insn;		/* insns larger than this cost more */
+   int move_ratio;		/* The threshold of number of scalar memory-to-memory
+ 				   move insns.  */
    int movzbl_load;		/* cost of loading using movzbl */
    int int_load[3];		/* cost of loading integer registers
  				   in QImode, HImode and SImode relative
*************** while (0)
*** 1709,1721 ****
     Increasing the value will always make code faster, but eventually
     incurs high cost in increased code size.
  
!    If you don't define this, a reasonable default is used.
  
!    Make this large on i386, since the block move is very inefficient with small
!    blocks, and the hard register needs of the block move require much reload
!    work. */
! 
! #define MOVE_RATIO 5
  
  /* Define if shifts truncate the shift count
     which implies one can omit a sign-extension or zero-extension
--- 1711,1719 ----
     Increasing the value will always make code faster, but eventually
     incurs high cost in increased code size.
  
!    If you don't define this, a reasonable default is used.  */
  
! #define MOVE_RATIO (optimize_size ? 3 : ix86_cost->move_ratio)
  
  /* Define if shifts truncate the shift count
     which implies one can omit a sign-extension or zero-extension
Index: egcs/gcc/config/i386/i386.md
===================================================================
RCS file: /cvs/gcc/egcs/gcc/config/i386/i386.md,v
retrieving revision 1.128
diff -c -3 -p -r1.128 i386.md
*** i386.md	2000/01/11 18:01:35	1.128
--- i386.md	2000/01/13 12:50:51
***************
*** 3736,3742 ****
    "sub{l}\\t{%2, %0|%0, %2}"
    [(set_attr "type" "alu")])
  
! (define_insn "*subsi3_carry"
    [(set (match_operand:SI 0 "nonimmediate_operand" "=rm,r")
  	  (minus:SI (match_operand:SI 1 "nonimmediate_operand" "0,0")
  	    (plus:SI (match_operand:SI 2 "general_operand" "ri,rm")
--- 3736,3742 ----
    "sub{l}\\t{%2, %0|%0, %2}"
    [(set_attr "type" "alu")])
  
! (define_insn "subsi3_carry"
    [(set (match_operand:SI 0 "nonimmediate_operand" "=rm,r")
  	  (minus:SI (match_operand:SI 1 "nonimmediate_operand" "0,0")
  	    (plus:SI (match_operand:SI 2 "general_operand" "ri,rm")
***************
*** 5861,5867 ****
  ;; This pattern can't accept a variable shift count, since shifts by
  ;; zero don't affect the flags.  We assume that shifts by constant
  ;; zero are optimized away.
! (define_insn "*ashlqi3_cmpno"
    [(set (reg:CCNO 17)
  	(compare:CCNO
  	  (ashift:QI (match_operand:QI 1 "nonimmediate_operand" "0")
--- 5861,5867 ----
  ;; This pattern can't accept a variable shift count, since shifts by
  ;; zero don't affect the flags.  We assume that shifts by constant
  ;; zero are optimized away.
! (define_insn "ashlqi3_cmpno"
    [(set (reg:CCNO 17)
  	(compare:CCNO
  	  (ashift:QI (match_operand:QI 1 "nonimmediate_operand" "0")
***************
*** 7841,7848 ****
    srcreg = copy_to_mode_reg (Pmode, XEXP (operands[1], 0));
  
    emit_insn (gen_cld());
!   /* When optimizing for size emit simple rep ; movsb instruction.  */
!   if (!optimize || optimize_size)
      {
        countreg = copy_to_mode_reg (SImode, operands[2]);
        emit_insn (gen_rep_movqi (destreg, srcreg, countreg,
--- 7841,7849 ----
    srcreg = copy_to_mode_reg (Pmode, XEXP (operands[1], 0));
  
    emit_insn (gen_cld());
!   /* When optimizing for size emit simple rep ; movsb instruction for
!      counts not divisible by 4.  */
!   if ((!optimize || optimize_size) && (INTVAL (operands[2]) & 0x03))
      {
        countreg = copy_to_mode_reg (SImode, operands[2]);
        emit_insn (gen_rep_movqi (destreg, srcreg, countreg,
***************
*** 7983,8066 ****
     (set_attr "memory" "both")])
  
  (define_expand "clrstrsi"
!   [(set (reg:SI 19) (const_int 0))
!    (set (match_dup 3) (const_int 0))
!    (parallel [(set (match_operand:BLK 0 "memory_operand" "")
! 		   (const_int 0))
! 	      (use (match_operand:SI 1 "const_int_operand" ""))
! 	      (use (match_operand:SI 2 "const_int_operand" ""))
! 	      (use (match_dup 3))
! 	      (use (reg:SI 19))
! 	      (clobber (match_scratch:SI 4 ""))
! 	      (clobber (match_dup 5))])]
    ""
    "
  {
!   rtx addr0;
  
    if (GET_CODE (operands[1]) != CONST_INT)
      FAIL;
  
!   addr0 = copy_to_mode_reg (Pmode, XEXP (operands[0], 0));
  
!   operands[3] = gen_reg_rtx (SImode);
!   operands[5] = addr0;
  
!   operands[0] = gen_rtx_MEM (BLKmode, addr0);
  }")
  
  ;; It might seem that operand 0 could use predicate register_operand.
  ;; But strength reduction might offset the MEM expression.  So we let
  ;; reload put the address into %edi.
  
! (define_insn "*clrstrsi_1"
!   [(set (mem:BLK (match_operand:SI 0 "address_operand" "D"))
  	(const_int 0))
!    (use (match_operand:SI 1 "const_int_operand" "n"))
!    (use (match_operand:SI 2 "immediate_operand" "i"))
!    (use (match_operand:SI 3 "register_operand" "a"))
!    (use (reg:SI 19))
!    (clobber (match_scratch:SI 4 "=&c"))
!    (clobber (match_dup 0))]
    ""
!   "*
! {
!   rtx xops[2];
! 
!   if (GET_CODE (operands[1]) == CONST_INT)
!     {
!       unsigned int count = INTVAL (operands[1]) & 0xffffffff;
!       if (count & ~0x03)
! 	{
! 	  xops[0] = GEN_INT (count / 4);
! 	  xops[1] = operands[4];
  
! 	  /* K6: stos takes 1 cycle, rep stos takes 8 + %ecx cycles.
! 	     80386: 4/5+5n (+2 for set of ecx)
! 	     80486: 5/7+5n (+1 for set of ecx)
! 	     */
! 	  if (count / 4 < ((int) ix86_cpu < (int)PROCESSOR_PENTIUM ? 4 : 6))
! 	    {
! 	      do
! 		output_asm_insn (\"{stosl|stosd}\", xops);
! 	      while ((count -= 4) > 3);
! 	    }
! 	  else
! 	    {
! 	      output_asm_insn (\"mov{l}\\t{%0, %1|%1, %0}\", xops);
! 	      output_asm_insn (\"{rep\;stosl|rep stosd}\", xops);
! 	    }
! 	}
!       if (INTVAL (operands[1]) & 0x02)
! 	output_asm_insn (\"stosw\", operands);
!       if (INTVAL (operands[1]) & 0x01)
! 	output_asm_insn (\"stosb\", operands);
!     }
!   else
!     abort ();
!   RET;
! }"
!   [(set_attr "type" "multi")])
  
  (define_expand "cmpstrsi"
    [(set (match_operand:SI 0 "register_operand" "")
--- 7984,8126 ----
     (set_attr "memory" "both")])
  
  (define_expand "clrstrsi"
!    [(use (match_operand:BLK 0 "memory_operand" ""))
!     (use (match_operand:SI 1 "const_int_operand" ""))
!     (use (match_operand:SI 2 "const_int_operand" ""))]
    ""
    "
  {
!   rtx destreg, zeroreg, countreg;
  
    if (GET_CODE (operands[1]) != CONST_INT)
      FAIL;
+ 
+   destreg = copy_to_mode_reg (Pmode, XEXP (operands[0], 0));
+ 
+   emit_insn (gen_cld());
+ 
+   /* When optimizing for size emit simple rep ; movsb instruction for
+      counts not divisible by 4.  */
+   if ((!optimize || optimize_size) && (INTVAL (operands[1]) & 0x03))
+     {
+       countreg = copy_to_mode_reg (SImode, operands[1]);
+       zeroreg = copy_to_mode_reg (QImode, const0_rtx);
+       emit_insn (gen_rep_stosqi (destreg, countreg, zeroreg,
+ 				 destreg, countreg));
+     }
+   else
+     {
+       zeroreg = copy_to_mode_reg (SImode, const0_rtx);
+       if (INTVAL (operands[1]) & ~0x03)
+ 	{
+ 	  countreg = copy_to_mode_reg (SImode,
+ 	  			       GEN_INT ((INTVAL (operands[1]) >> 2)
+ 						& 0x3fffffff));
+ 	  emit_insn (gen_rep_stossi (destreg, countreg, zeroreg,
+ 				     destreg, countreg));
+ 	}
+       if (INTVAL (operands[1]) & 0x02)
+ 	emit_insn (gen_strsethi (destreg,
+ 				 gen_rtx_SUBREG (HImode, zeroreg, 0)));
+       if (INTVAL (operands[1]) & 0x01)
+ 	emit_insn (gen_strsetqi (destreg,
+ 				 gen_rtx_SUBREG (QImode, zeroreg, 0)));
+     }
+   DONE;
+ }")
  
! ;; Most CPUs don't like single string operations
! ;; Handle this case here to simplify previous expander.
  
! (define_expand "strsethi"
!   [(set (mem:HI (match_operand:SI 0 "register_operand" ""))
! 	(match_operand:HI 1 "register_operand" ""))
!    (parallel [(set (match_dup 0) (plus:SI (match_dup 0) (const_int 2)))
! 	      (clobber (reg:CC 17))])]
!   ""
!   "
! {
!   if (TARGET_SINGLE_STRINGOP || optimize_size)
!     {
!       emit_insn (gen_strsethi_1 (operands[0], operands[0], operands[1]));
!       DONE;
!     }
! }")
  
! (define_expand "strsetqi"
!   [(set (mem:QI (match_operand:SI 0 "register_operand" ""))
! 	(match_operand:QI 1 "register_operand" ""))
!    (parallel [(set (match_dup 0) (plus:SI (match_dup 0) (const_int 1)))
! 	      (clobber (reg:CC 17))])]
!   ""
!   "
! {
!   if (TARGET_SINGLE_STRINGOP || optimize_size)
!     {
!       emit_insn (gen_strsetqi_1 (operands[0], operands[0], operands[1]));
!       DONE;
!     }
  }")
  
+ (define_insn "strsethi_1"
+   [(set (mem:HI (match_operand:SI 1 "register_operand" "0"))
+ 	(match_operand:HI 2 "register_operand" "a"))
+    (set (match_operand:SI 0 "register_operand" "=D")
+ 	(plus:SI (match_dup 0)
+ 		 (const_int 2)))
+    (use (reg:SI 19))]
+   "TARGET_SINGLE_STRINGOP || optimize_size"
+   "stosw"
+   [(set_attr "type" "str")
+    (set_attr "memory" "store")
+    (set_attr "length_prefix" "1")])
+ 
+ (define_insn "strsetqi_1"
+   [(set (mem:QI (match_operand:SI 1 "register_operand" "0"))
+ 	(match_operand:QI 2 "register_operand" "a"))
+    (set (match_operand:SI 0 "register_operand" "=D")
+ 	(plus:SI (match_dup 0)
+ 		 (const_int 1)))
+    (use (reg:SI 19))]
+   "TARGET_SINGLE_STRINGOP || optimize_size"
+   "stosb"
+   [(set_attr "type" "str")
+    (set_attr "memory" "store")])
+ 
  ;; It might seem that operand 0 could use predicate register_operand.
  ;; But strength reduction might offset the MEM expression.  So we let
  ;; reload put the address into %edi.
  
! (define_insn "rep_stossi"
!   [(set (match_operand:SI 1 "register_operand" "=c") (const_int 0))
!    (use (match_operand:SI 2 "register_operand" "a"))
!    (use (match_operand:SI 4 "register_operand" "1"))
!    (set (match_operand:SI 0 "register_operand" "=D") 
!         (plus:SI (match_operand:SI 3 "address_operand" "0")
! 	         (ashift:SI (match_dup 3) (const_int 2))))
!    (set (mem:BLK (match_dup 3))
  	(const_int 0))
!    (use (reg:SI 19))]
    ""
!   "rep\;stosl|rep stosd"
!   [(set_attr "type" "str")
!    (set_attr "length_prefix" "1")
!    (set_attr "memory" "store")])
  
! (define_insn "rep_stosqi"
!   [(set (match_operand:SI 1 "register_operand" "=c") (const_int 0))
!    (use (match_operand:QI 2 "register_operand" "a"))
!    (use (match_operand:SI 4 "register_operand" "1"))
!    (set (match_operand:SI 0 "register_operand" "=D") 
!         (plus:SI (match_operand:SI 3 "address_operand" "0") (match_dup 3)))
!    (set (mem:BLK (match_dup 3))
! 	(const_int 0))
!    (use (reg:SI 19))]
!   ""
!   "rep\;stosb|rep stosb"
!   [(set_attr "type" "str")
!    (set_attr "length_prefix" "1")
!    (set_attr "memory" "store")])
  
  (define_expand "cmpstrsi"
    [(set (match_operand:SI 0 "register_operand" "")
***************
*** 8099,8105 ****
        emit_insn (gen_cmpstrsi_nz_1 (addr1, addr2, countreg, align));
      }
    else
!     emit_insn (gen_cmpstrsi_1 (addr1, addr2, countreg, align));
  
    outlow = gen_lowpart (QImode, out);
    emit_insn (gen_cmpintqi (outlow));
--- 8159,8168 ----
        emit_insn (gen_cmpstrsi_nz_1 (addr1, addr2, countreg, align));
      }
    else
!     {
!       emit_insn (gen_cmpsi_1 (countreg, countreg));
!       emit_insn (gen_cmpstrsi_1 (addr1, addr2, countreg, align));
!     }
  
    outlow = gen_lowpart (QImode, out);
    emit_insn (gen_cmpintqi (outlow));
***************
*** 8145,8152 ****
     (clobber (match_dup 2))]
    ""
    "repz{\;| }cmpsb"
!   [(set_attr "type" "multi")
!    (set_attr "length" "3")])
  
  ;; The same, but the count is not known to not be zero.
  
--- 8208,8215 ----
     (clobber (match_dup 2))]
    ""
    "repz{\;| }cmpsb"
!   [(set_attr "type" "str")
!    (set_attr "length_prefix" "1")])
  
  ;; The same, but the count is not known to not be zero.
  
***************
*** 8158,8172 ****
  		      (mem:BLK (match_operand:SI 1 "address_operand" "D")))
  	  (const_int 0)))
     (use (match_operand:SI 3 "immediate_operand" "i"))
     (use (reg:SI 19))
     (clobber (match_dup 0))
     (clobber (match_dup 1))
     (clobber (match_dup 2))]
    ""
!   ;; The initial compare sets the zero flag.
!   "cmp{l}\\t%2, %2\;repz{\;| }cmpsb"
!   [(set_attr "type" "multi")
!    (set_attr "length" "5")])
  
  (define_expand "strlensi"
    [(set (match_operand:SI 0 "register_operand" "")
--- 8221,8235 ----
  		      (mem:BLK (match_operand:SI 1 "address_operand" "D")))
  	  (const_int 0)))
     (use (match_operand:SI 3 "immediate_operand" "i"))
+    (use (reg:CC 17))
     (use (reg:SI 19))
     (clobber (match_dup 0))
     (clobber (match_dup 1))
     (clobber (match_dup 2))]
    ""
!   "repz{\;| }cmpsb"
!   [(set_attr "type" "str")
!    (set_attr "length_prefix" "1")])
  
  (define_expand "strlensi"
    [(set (match_operand:SI 0 "register_operand" "")
***************
*** 8184,8190 ****
    align = operands[3];
    scratch1 = gen_reg_rtx (SImode);
  
!   if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1)
      {
        /* Well it seems that some optimizer does not combine a call like
  	     foo(strlen(bar), strlen(bar));
--- 8247,8254 ----
    align = operands[3];
    scratch1 = gen_reg_rtx (SImode);
  
!   if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
!       && !optimize_size)
      {
        /* Well it seems that some optimizer does not combine a call like
  	     foo(strlen(bar), strlen(bar));
***************
*** 8236,8243 ****
     (clobber (reg:CC 17))]
    ""
    "repnz{\;| }scasb"
!   [(set_attr "type" "multi")
!    (set_attr "length" "3")])
  
  ;; Conditional move instructions.
  
--- 8300,8307 ----
     (clobber (reg:CC 17))]
    ""
    "repnz{\;| }scasb"
!   [(set_attr "type" "str")
!    (set_attr "length_prefix" "1")])
  
  ;; Conditional move instructions.
  


More information about the Gcc-patches mailing list