
i386 stringops inlining patch


Hi,

This patch adds customizability to the i386 string operation expanders:
movstrsi and clrstrsi now handle variable-sized operations and align the
destination when needed, and the new -mno-align-stringops and
-minline-all-stringops switches control the inlining behaviour.  A short
usage example follows the ChangeLog below, and a C-level sketch of the
generic inline expansion appears after the patch.

Honza

Fri Jan 20 12:35:55 CET 2000  Jan Hubicka  <jh@suse.cz>
	* i386.md (movstrsi, clrstrsi): Support variable sized copies, align
	destination when needed.
	(strmovsi, strsetsi): New expanders.
	(strmovsi_1, strsetsi_1): New patterns.
	* i386.h (MASK_NO_ALIGN_STROPS, MASK_INLINE_ALL_STROPS,
	TARGET_ALIGN_STRINGOPS, TARGET_INLINE_ALL_STRINGOPS): New macros.
	(TARGET_SWITCHES): Add align-stringops and inline-all-stringops.
	* invoke.texi (align-stringops, inline-all-stringops): Document.
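
For illustration, here is a minimal usage sketch (the file name and
invocations below are examples, not part of the patch; the code actually
emitted depends on -march, the optimization level and the alignment known
to the compiler):

    /* stringops-example.c -- block operations that go through the
       movstrsi and clrstrsi expanders on i386.  */
    #include <string.h>

    struct s { int data[16]; };            /* destination aligned to 4 bytes */

    void copy_aligned (struct s *d, const struct s *s)
    {
      /* Constant count, alignment known to be >= 4: typically inlined
         as rep movsl plus a short tail even without the new switches.  */
      memcpy (d, s, sizeof *d);
    }

    void copy_unknown (char *d, const char *s, unsigned int n)
    {
      /* Variable count, unknown alignment: without this patch (or without
         -minline-all-stringops) this stays a library call; with
         -minline-all-stringops it is expanded inline, aligning the
         destination first unless -mno-align-stringops is given.  */
      memcpy (d, s, n);
    }

    void clear_unknown (char *d, unsigned int n)
    {
      memset (d, 0, n);                    /* expands through clrstrsi */
    }

Possible invocations (assuming a compiler built with this patch):

    gcc -O2 -S stringops-example.c
    gcc -O2 -minline-all-stringops -S stringops-example.c
    gcc -O2 -minline-all-stringops -mno-align-stringops -S stringops-example.c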

*** i386.md.orig	Wed Jan 19 05:56:59 2000
--- i386.md	Thu Jan 20 22:10:41 2000
***************
*** 7836,7884 ****
  (define_expand "movstrsi"
    [(use (match_operand:BLK 0 "memory_operand" ""))
     (use (match_operand:BLK 1 "memory_operand" ""))
!    (use (match_operand:SI 2 "const_int_operand" ""))
     (use (match_operand:SI 3 "const_int_operand" ""))]
    ""
    "
  {
    rtx srcreg, destreg, countreg;
  
!   if (GET_CODE (operands[2]) != CONST_INT)
!     FAIL;
  
    destreg = copy_to_mode_reg (Pmode, XEXP (operands[0], 0));
    srcreg = copy_to_mode_reg (Pmode, XEXP (operands[1], 0));
  
    emit_insn (gen_cld());
    /* When optimizing for size emit simple rep ; movsb instruction for
       counts not divisible by 4.  */
!   if ((!optimize || optimize_size) && (INTVAL (operands[2]) & 0x03))
      {
        countreg = copy_to_mode_reg (SImode, operands[2]);
        emit_insn (gen_rep_movqi (destreg, srcreg, countreg,
        				destreg, srcreg, countreg));
      }
!   else
      {
!       if (INTVAL (operands[2]) & ~0x03)
  	{
  	  countreg = copy_to_mode_reg (SImode,
! 	  			       GEN_INT ((INTVAL (operands[2]) >> 2)
  						& 0x3fffffff));
  	  emit_insn (gen_rep_movsi (destreg, srcreg, countreg,
  				    destreg, srcreg, countreg));
  	}
!       if (INTVAL (operands[2]) & 0x02)
  	emit_insn (gen_strmovhi (destreg, srcreg));
!       if (INTVAL (operands[2]) & 0x01)
  	emit_insn (gen_strmovqi (destreg, srcreg));
      }
    DONE;
  }")
  
  ;; Most CPUs don't like single string operations
  ;; Handle this case here to simplify previous expander.
  
  (define_expand "strmovhi"
    [(set (match_dup 2)
    	(mem:HI (match_operand:SI 1 "register_operand" "")))
--- 7833,8040 ----
  (define_expand "movstrsi"
    [(use (match_operand:BLK 0 "memory_operand" ""))
     (use (match_operand:BLK 1 "memory_operand" ""))
!    (use (match_operand:SI 2 "nonmemory_operand" ""))
     (use (match_operand:SI 3 "const_int_operand" ""))]
    ""
    "
  {
    rtx srcreg, destreg, countreg;
+   int align = 0;
+   int count = -1;
  
!   if (GET_CODE (operands[3]) == CONST_INT)
!     align = INTVAL (operands[3]);
! 
!   /* This simple hack avoids all inlining code and simplifies the code below.  */
!   if (!TARGET_ALIGN_STRINGOPS)
!     align = 32;
! 
!   if (GET_CODE (operands[2]) == CONST_INT)
!     count = INTVAL (operands[2]);
  
    destreg = copy_to_mode_reg (Pmode, XEXP (operands[0], 0));
    srcreg = copy_to_mode_reg (Pmode, XEXP (operands[1], 0));
  
    emit_insn (gen_cld());
+ 
    /* When optimizing for size emit simple rep ; movsb instruction for
       counts not divisible by 4.  */
! 
!   if ((!optimize || optimize_size) 
!       && (count < 0 || (count & 0x03)))
      {
        countreg = copy_to_mode_reg (SImode, operands[2]);
        emit_insn (gen_rep_movqi (destreg, srcreg, countreg,
        				destreg, srcreg, countreg));
      }
! 
!   /* For constant aligned (or small unaligned) copies use rep movsl
!      followed by code copying the rest.  For PentiumPro ensure 8 byte
!      alignment to allow rep movsl acceleration.  */
! 
!   else if (count >= 0 
! 	   && (align >= 8
! 	       || (!TARGET_PENTIUMPRO && align >= 4)
! 	       || optimize_size || count < 64))
      {
!       if (count & ~0x03)
  	{
  	  countreg = copy_to_mode_reg (SImode,
! 	  			       GEN_INT ((count >> 2)
  						& 0x3fffffff));
  	  emit_insn (gen_rep_movsi (destreg, srcreg, countreg,
  				    destreg, srcreg, countreg));
  	}
!       if (count & 0x02)
  	emit_insn (gen_strmovhi (destreg, srcreg));
!       if (count & 0x01)
  	emit_insn (gen_strmovqi (destreg, srcreg));
      }
+   /* The generic code is based on the glibc implementation:
+      - align destination to 4 bytes (8 byte alignment is used for PentiumPro
+        allowing accelerated copying there)
+      - copy the data using rep movsl
+      - copy the rest.  */
+   else
+     {
+       rtx countreg2;
+       rtx label = NULL;
+ 
+       /* In case we don't know anything about the alignment, default to
+          the library version, since it is usually equally fast and results
+ 	 in shorter code.  */
+       if (!TARGET_INLINE_ALL_STRINGOPS && align < 4)
+ 	FAIL;
+ 
+       if (TARGET_SINGLE_STRINGOP)
+ 	emit_insn (gen_cld());
+ 
+       countreg2 = gen_reg_rtx (SImode);
+       countreg = copy_to_mode_reg (SImode, operands[2]);
+ 
+       /* We don't use loops to align destination and to copy parts smaller
+ 	 than 4 bytes, because gcc is able to optimize such code better (in
+ 	 the case the destination or the count really is aligned, gcc is often
+ 	 able to predict the branches) and also it is friendlier to the
+ 	 hardware branch prediction.
+ 
+ 	 Using loops is beneficial for the generic case, because we can
+ 	 handle small counts using the loops.  Many CPUs (such as the Athlon)
+ 	 have large REP prefix setup costs.
+ 
+ 	 This is quite costly.  Maybe we can revisit this decision later or
+ 	 add some customizability to this code.  */
+ 
+       if (count < 0
+ 	  && align < (TARGET_PENTIUMPRO && (count < 0 || count >= 260) ? 8 : 4))
+ 	{
+ 	  label = gen_label_rtx ();
+ 	  emit_cmp_and_jump_insns (countreg, GEN_INT (3),
+ 				   LEU, 0, SImode, 1, 0, label);
+ 	}
+       if (align <= 1)
+ 	{
+ 	  rtx label = gen_label_rtx ();
+ 	  rtx tmpcount = gen_reg_rtx (SImode);
+ 	  emit_insn (gen_andsi3 (tmpcount, destreg, GEN_INT (1)));
+ 	  emit_cmp_and_jump_insns (tmpcount, GEN_INT (0), EQ, 0,
+ 				   SImode, 1, 0, label);
+ 	  emit_insn (gen_strmovqi (destreg, srcreg));
+ 	  emit_insn (gen_addsi3 (countreg, countreg, constm1_rtx));
+ 	  emit_label (label);
+ 	  LABEL_NUSES (label) = 1;
+ 	}
+       if (align <= 2)
+ 	{
+ 	  rtx label = gen_label_rtx ();
+ 	  rtx tmpcount = gen_reg_rtx (SImode);
+ 	  emit_insn (gen_andsi3 (tmpcount, destreg, GEN_INT (2)));
+ 	  emit_cmp_and_jump_insns (tmpcount, GEN_INT (0), EQ, 0,
+ 				   SImode, 1, 0, label);
+ 	  emit_insn (gen_strmovhi (destreg, srcreg));
+ 	  emit_insn (gen_addsi3 (countreg, countreg, GEN_INT (-2)));
+ 	  emit_label (label);
+ 	  LABEL_NUSES (label) = 1;
+ 	}
+       if (align <= 4 && TARGET_PENTIUMPRO && (count < 1 || count >= 260))
+ 	{
+ 	  rtx label = gen_label_rtx ();
+ 	  rtx tmpcount = gen_reg_rtx (SImode);
+ 	  emit_insn (gen_andsi3 (tmpcount, destreg, GEN_INT (4)));
+ 	  emit_cmp_and_jump_insns (tmpcount, GEN_INT (0), EQ, 0,
+ 				   SImode, 1, 0, label);
+ 	  emit_insn (gen_strmovsi (destreg, srcreg));
+ 	  emit_insn (gen_addsi3 (countreg, countreg, GEN_INT (-4)));
+ 	  emit_label (label);
+ 	  LABEL_NUSES (label) = 1;
+ 	}
+ 
+       if (!TARGET_SINGLE_STRINGOP)
+ 	emit_insn (gen_cld());
+       emit_insn (gen_lshrsi3 (countreg2, countreg, GEN_INT (2)));
+       emit_insn (gen_rep_movsi (destreg, srcreg, countreg2,
+ 				destreg, srcreg, countreg2));
+ 
+       if (label)
+ 	{
+ 	  emit_label (label);
+ 	  LABEL_NUSES (label) = 1;
+ 	}
+       if (align > 2 && count > 0 && (count & 2))
+ 	emit_insn (gen_strmovhi (destreg, srcreg));
+       if (align <= 2 || count < 0)
+ 	{
+ 	  rtx label = gen_label_rtx ();
+ 	  rtx tmpcount = gen_reg_rtx (SImode);
+ 	  emit_insn (gen_andsi3 (tmpcount, countreg, GEN_INT (2)));
+ 	  emit_cmp_and_jump_insns (tmpcount, GEN_INT (0), EQ, 0,
+ 				   SImode, 1, 0, label);
+ 	  emit_insn (gen_strmovhi (destreg, srcreg));
+ 	  emit_label (label);
+ 	  LABEL_NUSES (label) = 1;
+ 	}
+       if (align > 1 && count > 0 && (count & 1))
+ 	emit_insn (gen_strmovqi (destreg, srcreg));
+       if (align <= 1 || count < 0)
+ 	{
+ 	  rtx label = gen_label_rtx ();
+ 	  rtx tmpcount = gen_reg_rtx (SImode);
+ 	  emit_insn (gen_andsi3 (tmpcount, countreg, GEN_INT (1)));
+ 	  emit_cmp_and_jump_insns (tmpcount, GEN_INT (0), EQ, 0,
+ 				   SImode, 1, 0, label);
+ 	  emit_insn (gen_strmovqi (destreg, srcreg));
+ 	  emit_label (label);
+ 	  LABEL_NUSES (label) = 1;
+ 	}
+     }
    DONE;
  }")
  
  ;; Most CPUs don't like single string operations
  ;; Handle this case here to simplify previous expander.
  
+ (define_expand "strmovsi"
+   [(set (match_dup 2)
+   	(mem:SI (match_operand:SI 1 "register_operand" "")))
+    (set (mem:SI (match_operand:SI 0 "register_operand" ""))
+         (match_dup 2))
+    (parallel [(set (match_dup 0) (plus:SI (match_dup 0) (const_int 4)))
+ 	      (clobber (reg:CC 17))])
+    (parallel [(set (match_dup 1) (plus:SI (match_dup 1) (const_int 4)))
+ 	      (clobber (reg:CC 17))])]
+   ""
+   "
+ {
+   if (TARGET_SINGLE_STRINGOP || optimize_size)
+     {
+       emit_insn (gen_strmovsi_1 (operands[0], operands[1], operands[0],
+ 				operands[1]));
+       DONE;
+     }
+   else 
+     operands[2] = gen_reg_rtx (SImode);
+ }")
+ 
  (define_expand "strmovhi"
    [(set (match_dup 2)
    	(mem:HI (match_operand:SI 1 "register_operand" "")))
***************
*** 7923,7928 ****
--- 8079,8099 ----
      operands[2] = gen_reg_rtx (QImode);
  }")
  
+ (define_insn "strmovsi_1"
+   [(set (mem:SI (match_operand:SI 2 "register_operand" "0"))
+ 	(mem:SI (match_operand:SI 3 "register_operand" "1")))
+    (set (match_operand:SI 0 "register_operand" "=D")
+ 	(plus:SI (match_dup 0)
+ 		 (const_int 4)))
+    (set (match_operand:SI 1 "register_operand" "=S")
+ 	(plus:SI (match_dup 1)
+ 		 (const_int 4)))
+    (use (reg:SI 19))]
+   "TARGET_SINGLE_STRINGOP || optimize_size"
+   "movsl"
+   [(set_attr "type" "str")
+    (set_attr "memory" "both")])
+ 
  (define_insn "strmovhi_1"
    [(set (mem:HI (match_operand:SI 2 "register_operand" "0"))
  	(mem:HI (match_operand:SI 3 "register_operand" "1")))
***************
*** 7994,8008 ****
  
  (define_expand "clrstrsi"
     [(use (match_operand:BLK 0 "memory_operand" ""))
!     (use (match_operand:SI 1 "const_int_operand" ""))
      (use (match_operand:SI 2 "const_int_operand" ""))]
    ""
    "
  {
    rtx destreg, zeroreg, countreg;
  
!   if (GET_CODE (operands[1]) != CONST_INT)
!     FAIL;
  
    destreg = copy_to_mode_reg (Pmode, XEXP (operands[0], 0));
  
--- 8165,8190 ----
  
  (define_expand "clrstrsi"
     [(use (match_operand:BLK 0 "memory_operand" ""))
!     (use (match_operand:SI 1 "nonmemory_operand" ""))
      (use (match_operand:SI 2 "const_int_operand" ""))]
    ""
    "
  {
+   /* See comments in the movstrsi expander.  The code is mostly identical.  */
+ 
    rtx destreg, zeroreg, countreg;
+   int align = 0;
+   int count = -1;
  
!   if (GET_CODE (operands[2]) == CONST_INT)
!     align = INTVAL (operands[2]);
! 
!   /* This simple hack avoids all inlining code and simplifies the code below.  */
!   if (!TARGET_ALIGN_STRINGOPS)
!     align = 32;
! 
!   if (GET_CODE (operands[1]) == CONST_INT)
!     count = INTVAL (operands[1]);
  
    destreg = copy_to_mode_reg (Pmode, XEXP (operands[0], 0));
  
***************
*** 8010,8023 ****
  
    /* When optimizing for size emit simple rep ; movsb instruction for
       counts not divisible by 4.  */
!   if ((!optimize || optimize_size) && (INTVAL (operands[1]) & 0x03))
      {
        countreg = copy_to_mode_reg (SImode, operands[1]);
        zeroreg = copy_to_mode_reg (QImode, const0_rtx);
        emit_insn (gen_rep_stosqi (destreg, countreg, zeroreg,
  				 destreg, countreg));
      }
!   else
      {
        zeroreg = copy_to_mode_reg (SImode, const0_rtx);
        if (INTVAL (operands[1]) & ~0x03)
--- 8192,8210 ----
  
    /* When optimizing for size emit simple rep ; movsb instruction for
       counts not divisible by 4.  */
! 
!   if ((!optimize || optimize_size) 
!       && (count < 0 || (count & 0x03)))
      {
        countreg = copy_to_mode_reg (SImode, operands[1]);
        zeroreg = copy_to_mode_reg (QImode, const0_rtx);
        emit_insn (gen_rep_stosqi (destreg, countreg, zeroreg,
  				 destreg, countreg));
      }
!   else if (count >= 0 
! 	   && (align >= 8
! 	       || (!TARGET_PENTIUMPRO && align >= 4)
! 	       || optimize_size || count < 64))
      {
        zeroreg = copy_to_mode_reg (SImode, const0_rtx);
        if (INTVAL (operands[1]) & ~0x03)
***************
*** 8035,8046 ****
--- 8222,8354 ----
  	emit_insn (gen_strsetqi (destreg,
  				 gen_rtx_SUBREG (QImode, zeroreg, 0)));
      }
+   else
+     {
+       rtx countreg2;
+       rtx label = NULL;
+ 
+       /* In case we don't know anything about the alignment, default to
+          the library version, since it is usually equally fast and results
+ 	 in shorter code.  */
+       if (!TARGET_INLINE_ALL_STRINGOPS && align < 4)
+ 	FAIL;
+ 
+       if (TARGET_SINGLE_STRINGOP)
+ 	emit_insn (gen_cld());
+ 
+       countreg2 = gen_reg_rtx (SImode);
+       countreg = copy_to_mode_reg (SImode, operands[1]);
+       zeroreg = copy_to_mode_reg (SImode, const0_rtx);
+ 
+       if (count < 0
+ 	  && align < (TARGET_PENTIUMPRO && (count < 0 || count >= 260) ? 8 : 4))
+ 	{
+ 	  label = gen_label_rtx ();
+ 	  emit_cmp_and_jump_insns (countreg, GEN_INT (3),
+ 				   LEU, 0, SImode, 1, 0, label);
+ 	}
+       if (align <= 1)
+ 	{
+ 	  rtx label = gen_label_rtx ();
+ 	  rtx tmpcount = gen_reg_rtx (SImode);
+ 	  emit_insn (gen_andsi3 (tmpcount, destreg, GEN_INT (1)));
+ 	  emit_cmp_and_jump_insns (tmpcount, GEN_INT (0), EQ, 0,
+ 				   SImode, 1, 0, label);
+ 	  emit_insn (gen_strsetqi (destreg,
+ 				   gen_rtx_SUBREG (QImode, zeroreg, 0)));
+ 	  emit_insn (gen_addsi3 (countreg, countreg, constm1_rtx));
+ 	  emit_label (label);
+ 	  LABEL_NUSES (label) = 1;
+ 	}
+       if (align <= 2)
+ 	{
+ 	  rtx label = gen_label_rtx ();
+ 	  rtx tmpcount = gen_reg_rtx (SImode);
+ 	  emit_insn (gen_andsi3 (tmpcount, destreg, GEN_INT (2)));
+ 	  emit_cmp_and_jump_insns (tmpcount, GEN_INT (0), EQ, 0,
+ 				   SImode, 1, 0, label);
+ 	  emit_insn (gen_strsethi (destreg,
+ 				   gen_rtx_SUBREG (HImode, zeroreg, 0)));
+ 	  emit_insn (gen_addsi3 (countreg, countreg, GEN_INT (-2)));
+ 	  emit_label (label);
+ 	  LABEL_NUSES (label) = 1;
+ 	}
+       if (align <= 4 && TARGET_PENTIUMPRO && (count < 1 || count >= 260))
+ 	{
+ 	  rtx label = gen_label_rtx ();
+ 	  rtx tmpcount = gen_reg_rtx (SImode);
+ 	  emit_insn (gen_andsi3 (tmpcount, destreg, GEN_INT (4)));
+ 	  emit_cmp_and_jump_insns (tmpcount, GEN_INT (0), EQ, 0,
+ 				   SImode, 1, 0, label);
+ 	  emit_insn (gen_strsetsi (destreg, zeroreg));
+ 	  emit_insn (gen_addsi3 (countreg, countreg, GEN_INT (-4)));
+ 	  emit_label (label);
+ 	  LABEL_NUSES (label) = 1;
+ 	}
+ 
+       if (!TARGET_SINGLE_STRINGOP)
+ 	emit_insn (gen_cld());
+       emit_insn (gen_lshrsi3 (countreg2, countreg, GEN_INT (2)));
+       emit_insn (gen_rep_stossi (destreg, countreg2, zeroreg,
+ 				 destreg, countreg2));
+ 
+       if (label)
+ 	{
+ 	  emit_label (label);
+ 	  LABEL_NUSES (label) = 1;
+ 	}
+       if (align > 2 && count > 0 && (count & 2))
+ 	emit_insn (gen_strsethi (destreg,
+ 				 gen_rtx_SUBREG (HImode, zeroreg, 0)));
+       if (align <= 2 || count < 0)
+ 	{
+ 	  rtx label = gen_label_rtx ();
+ 	  rtx tmpcount = gen_reg_rtx (SImode);
+ 	  emit_insn (gen_andsi3 (tmpcount, countreg, GEN_INT (2)));
+ 	  emit_cmp_and_jump_insns (tmpcount, GEN_INT (0), EQ, 0,
+ 				   SImode, 1, 0, label);
+ 	  emit_insn (gen_strsethi (destreg,
+ 				   gen_rtx_SUBREG (HImode, zeroreg, 0)));
+ 	  emit_label (label);
+ 	  LABEL_NUSES (label) = 1;
+ 	}
+       if (align > 1 && count > 0 && (count & 1))
+ 	emit_insn (gen_strsetqi (destreg,
+ 				 gen_rtx_SUBREG (QImode, zeroreg, 0)));
+       if (align <= 1 || count < 0)
+ 	{
+ 	  rtx label = gen_label_rtx ();
+ 	  rtx tmpcount = gen_reg_rtx (SImode);
+ 	  emit_insn (gen_andsi3 (tmpcount, countreg, GEN_INT (1)));
+ 	  emit_cmp_and_jump_insns (tmpcount, GEN_INT (0), EQ, 0,
+ 				   SImode, 1, 0, label);
+ 	  emit_insn (gen_strsetqi (destreg,
+ 				   gen_rtx_SUBREG (QImode, zeroreg, 0)));
+ 	  emit_label (label);
+ 	  LABEL_NUSES (label) = 1;
+ 	}
+     }
    DONE;
  }")
  
  ;; Most CPUs don't like single string operations
  ;; Handle this case here to simplify previous expander.
  
+ (define_expand "strsetsi"
+   [(set (mem:SI (match_operand:SI 0 "register_operand" ""))
+ 	(match_operand:SI 1 "register_operand" ""))
+    (parallel [(set (match_dup 0) (plus:SI (match_dup 0) (const_int 4)))
+ 	      (clobber (reg:CC 17))])]
+   ""
+   "
+ {
+   if (TARGET_SINGLE_STRINGOP || optimize_size)
+     {
+       emit_insn (gen_strsetsi_1 (operands[0], operands[0], operands[1]));
+       DONE;
+     }
+ }")
+ 
  (define_expand "strsethi"
    [(set (mem:HI (match_operand:SI 0 "register_operand" ""))
  	(match_operand:HI 1 "register_operand" ""))
***************
*** 8071,8076 ****
--- 8379,8396 ----
      }
  }")
  
+ (define_insn "strsetsi_1"
+   [(set (mem:SI (match_operand:SI 1 "register_operand" "0"))
+ 	(match_operand:SI 2 "register_operand" "a"))
+    (set (match_operand:SI 0 "register_operand" "=D")
+ 	(plus:SI (match_dup 0)
+ 		 (const_int 4)))
+    (use (reg:SI 19))]
+   "TARGET_SINGLE_STRINGOP || optimize_size"
+   "stosl"
+   [(set_attr "type" "str")
+    (set_attr "memory" "store")])
+ 
  (define_insn "strsethi_1"
    [(set (mem:HI (match_operand:SI 1 "register_operand" "0"))
  	(match_operand:HI 2 "register_operand" "a"))
***************
*** 8268,8274 ****
  	 output_strlen_unroll() this is better.  */
  
        if (GET_CODE (align) != CONST_INT || INTVAL (align) < 4)
! 	emit_move_insn (scratch1, addr);
        emit_move_insn (out, addr);
  
        ix86_expand_strlensi_unroll_1 (out, align, scratch1);
--- 8588,8601 ----
  	 output_strlen_unroll() this is better.  */
  
        if (GET_CODE (align) != CONST_INT || INTVAL (align) < 4)
! 	{
! 	  emit_move_insn (scratch1, addr);
! 
! 	  /* The generic case of the strlen expander is long.  Avoid
! 	     expanding it unless TARGET_INLINE_ALL_STRINGOPS.  */
! 	  if (!TARGET_INLINE_ALL_STRINGOPS)
! 	    FAIL;
! 	}
        emit_move_insn (out, addr);
  
        ix86_expand_strlensi_unroll_1 (out, align, scratch1);
*** i386.h.old	Thu Jan 20 20:45:15 2000
--- i386.h	Thu Jan 20 21:18:27 2000
*************** extern int target_flags;
*** 101,106 ****
--- 101,108 ----
  #define MASK_NO_FANCY_MATH_387	0x00000040	/* Disable sin, cos, sqrt */
  #define MASK_OMIT_LEAF_FRAME_POINTER 0x080      /* omit leaf frame pointers */
  #define MASK_STACK_PROBE	0x00000100	/* Enable stack probing */
+ #define MASK_NO_ALIGN_STROPS	0x00001000	/* Disable aligning of string ops */
+ #define MASK_INLINE_ALL_STROPS	0x00002000	/* Inline stringops in all cases */
  
  /* Temporary codegen switches */
  #define MASK_INTEL_SYNTAX	0x00000200
*************** extern const int x86_promote_QImode, x86
*** 190,195 ****
--- 192,200 ----
  
  #define TARGET_STACK_PROBE (target_flags & MASK_STACK_PROBE)
  
+ #define TARGET_ALIGN_STRINGOPS (!(target_flags & MASK_NO_ALIGN_STROPS))
+ #define TARGET_INLINE_ALL_STRINGOPS (target_flags & MASK_INLINE_ALL_STROPS)
+ 
  #define ASSEMBLER_DIALECT ((target_flags & MASK_INTEL_SYNTAX) != 0)
  
  #define TARGET_SWITCHES							      \
*************** extern const int x86_promote_QImode, x86
*** 238,243 ****
--- 243,256 ----
    { "intel-syntax",		MASK_INTEL_SYNTAX,			      \
      "Emit Intel syntax assembler opcodes" },				      \
    { "no-intel-syntax",		-MASK_INTEL_SYNTAX, "" },		      \
+   { "align-stringops",		-MASK_NO_ALIGN_STROPS,			      \
+     "Align destination of the string operations" },			      \
+   { "no-align-stringops",	 MASK_NO_ALIGN_STROPS,			      \
+     "Do not align destination of the string operations" },		      \
+   { "inline-all-strinops",	 MASK_INLINE_ALL_STROPS,		      \
+     "Inline all known string operations" },				      \
+   { "no-inline-all-stringops",	-MASK_INLINE_ALL_STROPS,		      \
+     "Do not inline all known string operations" },			      \
    SUBTARGET_SWITCHES							      \
    { "", TARGET_DEFAULT, 0 }}
  
*** invoke.texi.old	Thu Jan 20 22:54:01 2000
--- invoke.texi	Thu Jan 20 22:58:22 2000
*************** in the following sections.
*** 360,366 ****
  -mreg-alloc=@var{list}  -mregparm=@var{num}
  -malign-jumps=@var{num}  -malign-loops=@var{num}
  -malign-functions=@var{num} -mpreferred-stack-boundary=@var{num}
! -mthreads
  
  @emph{HPPA Options}
  -march=@var{architecture type}
--- 360,366 ----
  -mreg-alloc=@var{list}  -mregparm=@var{num}
  -malign-jumps=@var{num}  -malign-loops=@var{num}
  -malign-functions=@var{num} -mpreferred-stack-boundary=@var{num}
! -mthreads -mno-align-stringops -minline-all-stringops
  
  @emph{HPPA Options}
  -march=@var{architecture type}
*************** on thread-safe exception handling must c
*** 5954,5959 ****
--- 5954,5972 ----
  @samp{-mthreads} option. When compiling, @samp{-mthreads} defines 
  @samp{-D_MT}; when linking, it links in a special thread helper library 
  @samp{-lmingwthrd} which cleans up per thread exception handling data.
+ 
+ @item -mno-align-stringops
+ @kindex -mno-align-stringops
+ Do not align the destination of inlined string operations.  This switch
+ reduces code size and improves performance when the destination is already
+ aligned but GCC does not know it.
+ 
+ @item -minline-all-stringops
+ @kindex -minline-all-stringops
+ By default GCC inlines string operations only when the destination is known
+ to be aligned to at least a 4 byte boundary.  This enables more inlining and
+ increases code size, but may improve the performance of code that depends on
+ fast memcpy, strlen and memset for short lengths.
  @end table
  
  @node HPPA Options
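
For readers following the movstrsi/clrstrsi changes above: in the generic
case (count not known at compile time and alignment below what rep movsl
wants) the expander emits code roughly equivalent to the C sketch below.
This is an illustration only, not the literal RTL sequence; the pointer
casts stand in for movsw/movsl, and the extra 8-byte alignment step is
used on PentiumPro only.

    /* Sketch of the inlined copy: align the destination, do the bulk
       with "rep movsl" (a word loop stands in for it here), then copy
       the remaining 0-3 bytes.  */
    static void
    inline_copy_sketch (unsigned char *dst, const unsigned char *src,
                        unsigned long count)
    {
      unsigned long words;

      if (count > 3)                  /* small counts jump straight to the tail */
        {
          if ((unsigned long) dst & 1)      /* align destination to 2 bytes */
            { *dst++ = *src++; count--; }
          if ((unsigned long) dst & 2)      /* align destination to 4 bytes */
            {
              *(unsigned short *) dst = *(const unsigned short *) src;
              dst += 2; src += 2; count -= 2;
            }
          /* PentiumPro: one more 4-byte step here reaches 8-byte alignment
             before the rep movsl.  */

          words = count >> 2;               /* rep movsl equivalent */
          while (words--)
            {
              *(unsigned long *) dst = *(const unsigned long *) src;
              dst += 4; src += 4;
            }
        }

      if (count & 2)                        /* remaining halfword */
        {
          *(unsigned short *) dst = *(const unsigned short *) src;
          dst += 2; src += 2;
        }
      if (count & 1)                        /* remaining byte */
        *dst = *src;
    }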
