This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

Re: [PATCH,i386] fix PR 11001


On Thu, Oct 04, 2007 at 08:34:11AM +0200, Uros Bizjak wrote:
> > +  /* Can't use this if the user has appropriated ecx, esi, or edi.  */
> > +  if ((alg == rep_prefix_1_byte
> > +       || alg == rep_prefix_4_byte
> > +       || alg == rep_prefix_8_byte)
> > +      && (global_regs[2] || global_regs[4] || global_regs[5]))
> > +    return 0;
> > +
> 
> I think that you should put this check into decide_alg(). There you
> can decide between copy loop and libcall, also taking into account
> optimize_size flag, as well as TARGET_INLINE_ALL_STRINGOPS and
> TARGET_INLINE_ALL_STRINGOPS_DYNAMICALLY.
> 
> Please note, that rep_prefix_* algorithms are used for larger blocks.
> Perhaps in your case, we should scan algorithms table backwards until
> we hit other non-rep_prefix algorithm. If we found none, we should
> default to a loop algorithm for optimize_size or to a libcall for
> other cases.

OK, I think this patch implements what you want:

- decide_alg now consults global_regs;

- loops are used if optimize_size and we cannot use rep prefixes;

- we don't run the heuristic if we cannot use rep prefixes and default
  to libcalls if we cannot find another strategy (loops or unrolled
  loops) for the size of block we are setting/copying--for situations
  where rep prefixes cannot be used.  I am unsure of what you were
  suggesting here--I read your first paragraph as saying that
  TARGET_INLINE_ALL_STRINGOPS* be respected whereas the second paragraph
  seems to say that it's ok to punt and choose libcalls if we can't do
  rep-based copying.

Bootstrapped and tested on x86_64-unknown-linux-gnu.  Modified ChangeLog
entry below; entries for testcases not included.

-Nathan

	PR 11001
	gcc/
	* config/i386/i386.md (strmov): Check for esi and edi usage.
	* config/i386/i386.c (decide_alg): Check whether we can use a
	rep prefix and adjust algorithm choice accordingly.
	(ix86_expand_strlen): Check for eax, ecx, and edi usage.

Index: i386.c
===================================================================
--- i386.c	(revision 128981)
+++ i386.c	(working copy)
@@ -15057,21 +15057,32 @@ decide_alg (HOST_WIDE_INT count, HOST_WI
 	    int *dynamic_check)
 {
   const struct stringop_algs * algs;
+  /* Algorithms using the rep prefix want at least edi and ecx;
+     additionally, memset wants eax and memcpy wants esi.  Don't
+     consider such algorithms if the user has appropriated those
+     registers for their own purposes.	*/
+  int rep_prefix_usable = !(global_regs[2] || global_regs[5]
+			    || (memset ? global_regs[0] : global_regs[4]));
+
+#define ALG_USABLE_P(alg) ((alg != rep_prefix_1_byte		\
+			    && alg != rep_prefix_4_byte		\
+			    && alg != rep_prefix_8_byte)	\
+			   || rep_prefix_usable)
 
   *dynamic_check = -1;
   if (memset)
     algs = &ix86_cost->memset[TARGET_64BIT != 0];
   else
     algs = &ix86_cost->memcpy[TARGET_64BIT != 0];
-  if (stringop_alg != no_stringop)
+  if (stringop_alg != no_stringop && ALG_USABLE_P (stringop_alg))
     return stringop_alg;
   /* rep; movq or rep; movl is the smallest variant.  */
   else if (optimize_size)
     {
       if (!count || (count & 3))
-	return rep_prefix_1_byte;
+	return rep_prefix_usable ? rep_prefix_1_byte : loop_1_byte;
       else
-	return rep_prefix_4_byte;
+	return rep_prefix_usable ? rep_prefix_4_byte : loop;
     }
   /* Very tiny blocks are best handled via the loop, REP is expensive to setup.
    */
@@ -15083,27 +15094,34 @@ decide_alg (HOST_WIDE_INT count, HOST_WI
       enum stringop_alg alg = libcall;
      for (i = 0; i < MAX_STRINGOP_ALGS; i++)
 	{
-	  gcc_assert (algs->size[i].max);
+	  /* We get here if the algorithms that were not libcall-based
+	     were rep-prefix based and we are unable to use rep prefixes
+	     based on global register usage.  Break out of the loop and
+	     use the heuristic below.  */
+	  if (algs->size[i].max == 0)
+	    break;
 	  if (algs->size[i].max >= expected_size || algs->size[i].max == -1)
 	    {
-	      if (algs->size[i].alg != libcall)
-		alg = algs->size[i].alg;
+	      enum stringop_alg candidate = algs->size[i].alg;
+
+	      if (candidate != libcall && ALG_USABLE_P (candidate))
+		alg = candidate;
 	      /* Honor TARGET_INLINE_ALL_STRINGOPS by picking
-	         last non-libcall inline algorithm.  */
+		 last non-libcall inline algorithm.  */
 	      if (TARGET_INLINE_ALL_STRINGOPS)
 		{
 		  /* When the current size is best to be copied by a libcall,
-		     but we are still forced to inline, run the heuristic bellow
+		     but we are still forced to inline, run the heuristic below
 		     that will pick code for medium sized blocks.  */
 		  if (alg != libcall)
 		    return alg;
 		  break;
 		}
-	      else
-		return algs->size[i].alg;
+	      else if (ALG_USABLE_P (candidate))
+		return candidate;
 	    }
 	}
-      gcc_assert (TARGET_INLINE_ALL_STRINGOPS);
+      gcc_assert (TARGET_INLINE_ALL_STRINGOPS || !rep_prefix_usable);
     }
   /* When asked to inline the call anyway, try to pick meaningful choice.
      We look for maximal size of block that is faster to copy by hand and
@@ -15113,14 +15131,15 @@ decide_alg (HOST_WIDE_INT count, HOST_WI
      If this turns out to be bad, we might simply specify the preferred
      choice in ix86_costs.  */
   if ((TARGET_INLINE_ALL_STRINGOPS || TARGET_INLINE_STRINGOPS_DYNAMICALLY)
-      && algs->unknown_size == libcall)
+      && algs->unknown_size == libcall && rep_prefix_usable)
     {
       int max = -1;
       enum stringop_alg alg;
       int i;
 
      for (i = 0; i < MAX_STRINGOP_ALGS; i++)
-	if (algs->size[i].alg != libcall && algs->size[i].alg)
+	if (algs->size[i].alg != libcall && algs->size[i].alg
+	    && ALG_USABLE_P (algs->size[i].alg))
 	  max = algs->size[i].max;
       if (max == -1)
 	max = 4096;
@@ -15131,7 +15150,8 @@ decide_alg (HOST_WIDE_INT count, HOST_WI
 	*dynamic_check = max;
       return alg;
     }
-  return algs->unknown_size;
+  return ALG_USABLE_P (algs->unknown_size) ? algs->unknown_size : libcall;
+#undef ALG_USABLE_P
 }
 
 /* Decide on alignment.  We know that the operand is already aligned to ALIGN
@@ -15985,6 +16005,11 @@ ix86_expand_strlen (rtx out, rtx src, rt
   else
     {
       rtx unspec;
+
+      /* Can't use this if the user has appropriated eax, ecx, or edi.  */
+      if (global_regs[0] || global_regs[2] || global_regs[5])
+        return false;
+
       scratch2 = gen_reg_rtx (Pmode);
       scratch3 = gen_reg_rtx (Pmode);
       scratch4 = force_reg (Pmode, constm1_rtx);


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]