This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

ivopts cost oddities


While investigating a code size regression between our 3.4-based
Blackfin compiler and a 4.1-based one, I came across some oddities in
how costs are calculated in tree-ssa-loop-ivopts.c.

Attached is zlib1.i, which is a reduced testcase that exhibits the
problem.  Basically what happens is that ivopts transforms something like

 char *p;
 loop {
   a = p + 1;
   x = [a];
   b = p + 2;
   x = [b];
   ...
   p += 16;
 }

into

 char *p;
 char *a = p + 1;
 char *b = p + 2;
 ...
 loop {
   x = [a];
   y = [b];
   ...
   a += 16;
   b += 16;
   ...
   p += 16;
 }

There are a couple of things wrong with this.  First of all, this
transformation can never be a win, since no additions are saved, no
memory addresses are changed, and setup costs and register pressure are
higher.

Also, ivopts fails to recognize that [reg + constant] is a valid
addressing mode.  This is because a) it always uses Pmode as the mode of
the memory reference, not taking into account possible differences
between addresses for different size mem refs, and b) the first constant
offset it tests is 1, but [reg + 1] is not valid for SImode on the
Blackfin - only [reg + 4*n] is; [reg + 2*n] for HImode.  The allowable
ranges differ between QI, HI and SImode on the Blackfin.

Tracing through the code trying to find where the costs are generated, I
found that get_addresses gets called with an out-of-bounds constant, and
symbol_present == var_present == 0 and ratio == 1.  This case falls into
a small if statement which uses the cost of a Pmode addition as a base
cost, which is sensible, but also sets var_present to 1, which is not.
It causes us to additionally add the cost of a reg+reg addition, leading
to an exaggerated cost for using the original biv and causes us to
create additional ones until the register pressure heuristics kick in.
Just eliminating this one line is enough to solve a big part of the
problem on the Blackfin, and is done in patch1.

patch2 adds code to track machine_mode-specific data about addressing
mode capabilities.

patch3 modifies the way we look for the bounds of [reg + constant]
addressing.  Instead of starting at [reg + 1], we start at [reg +
BIGGEST_ALIGNMENT / BITS_PER_UNIT], which I hope works for all machines.

Not committed yet - don't know whether this is a regression on any
target (bfin wasn't in official 3.4).  So far I've regression tested a
slightly earlier version on a 4.1-based bfin-elf compiler, and
bootstrapped some languages on i686-linux - it doesn't seem to affect
code generation on that target.

Zdenek, any comments?  Would anyone care to run the sum of the three
patches through SPEC on a non-x86 target?


Bernd
unsigned long adler32(adler, buf, len)
     unsigned long adler;
     unsigned char *buf;
     unsigned int len;
{
    unsigned long s1 = adler & 0xffff;
    unsigned long s2 = (adler >> 16) & 0xffff;
    int k;

    if (buf == 0) return 1L;

    while (len >= 16) {
	{s1 += *buf++; s2 += s1;};
	{s1 += *buf++; s2 += s1;};
	{s1 += *buf++; s2 += s1;};
	{s1 += *buf++; s2 += s1;};
	{s1 += *buf++; s2 += s1;};
	{s1 += *buf++; s2 += s1;};
	{s1 += *buf++; s2 += s1;};
	{s1 += *buf++; s2 += s1;};
	{s1 += *buf++; s2 += s1;};
	{s1 += *buf++; s2 += s1;};;
	{s1 += *buf++; s2 += s1;};
	{s1 += *buf++; s2 += s1;};;;
	{s1 += *buf++; s2 += s1;};
	{s1 += *buf++; s2 += s1;};;
	{s1 += *buf++; s2 += s1;};
	{s1 += *buf++; s2 += s1;};;;;
	len -= 16;
    }
    s1 %= 65521L;
    s2 %= 65521L;

    return (s2 << 16) | s1;
}
Index: gcc/tree-ssa-loop-ivopts.c
===================================================================
--- gcc.orig/tree-ssa-loop-ivopts.c
+++ gcc/tree-ssa-loop-ivopts.c
@@ -3368,10 +3368,7 @@ get_address_cost (bool symbol_present, b
     cost += multiply_by_cost (ratio, Pmode);
 
   if (s_offset && !offset_p && !symbol_present)
-    {
-      cost += add_cost (Pmode);
-      var_present = true;
-    }
+    cost += add_cost (Pmode);
 
   acost = costs[symbol_present][var_present][offset_p][ratio_p];
   if (!acost)
Index: gcc/tree-flow.h
===================================================================
--- gcc.orig/tree-flow.h
+++ gcc/tree-flow.h
@@ -823,7 +823,7 @@ extern void linear_transform_loops (stru
 
 /* In tree-ssa-loop-ivopts.c  */
 bool expr_invariant_in_loop_p (struct loop *, tree);
-bool multiplier_allowed_in_address_p (HOST_WIDE_INT);
+bool multiplier_allowed_in_address_p (HOST_WIDE_INT, enum machine_mode);
 unsigned multiply_by_cost (HOST_WIDE_INT, enum machine_mode);
 
 /* In tree-ssa-threadupdate.c.  */
Index: gcc/tree-ssa-address.c
===================================================================
--- gcc.orig/tree-ssa-address.c
+++ gcc/tree-ssa-address.c
@@ -397,8 +397,9 @@ most_expensive_mult_to_index (struct mem
 
   for (i = 0; i < addr->n; i++)
     {
+      /* FIXME: Should use the correct memory mode rather than Pmode.  */
       if (addr->coefs[i] == 1
-	  || !multiplier_allowed_in_address_p (addr->coefs[i]))
+	  || !multiplier_allowed_in_address_p (addr->coefs[i], Pmode))
 	continue;
       
       acost = multiply_by_cost (addr->coefs[i], Pmode);
Index: gcc/tree-ssa-loop-ivopts.c
===================================================================
--- gcc.orig/tree-ssa-loop-ivopts.c
+++ gcc/tree-ssa-loop-ivopts.c
@@ -3250,32 +3250,32 @@ multiply_by_cost (HOST_WIDE_INT cst, enu
 /* Returns true if multiplying by RATIO is allowed in address.  */
 
 bool
-multiplier_allowed_in_address_p (HOST_WIDE_INT ratio)
+multiplier_allowed_in_address_p (HOST_WIDE_INT ratio, enum machine_mode mode)
 {
 #define MAX_RATIO 128
-  static sbitmap valid_mult;
+  static sbitmap valid_mult[MAX_MACHINE_MODE];
   
-  if (!valid_mult)
+  if (!valid_mult[mode])
     {
       rtx reg1 = gen_raw_REG (Pmode, LAST_VIRTUAL_REGISTER + 1);
       rtx addr;
       HOST_WIDE_INT i;
 
-      valid_mult = sbitmap_alloc (2 * MAX_RATIO + 1);
-      sbitmap_zero (valid_mult);
+      valid_mult[mode] = sbitmap_alloc (2 * MAX_RATIO + 1);
+      sbitmap_zero (valid_mult[mode]);
       addr = gen_rtx_fmt_ee (MULT, Pmode, reg1, NULL_RTX);
       for (i = -MAX_RATIO; i <= MAX_RATIO; i++)
 	{
 	  XEXP (addr, 1) = gen_int_mode (i, Pmode);
-	  if (memory_address_p (Pmode, addr))
-	    SET_BIT (valid_mult, i + MAX_RATIO);
+	  if (memory_address_p (mode, addr))
+	    SET_BIT (valid_mult[mode], i + MAX_RATIO);
 	}
 
       if (dump_file && (dump_flags & TDF_DETAILS))
 	{
 	  fprintf (dump_file, "  allowed multipliers:");
 	  for (i = -MAX_RATIO; i <= MAX_RATIO; i++)
-	    if (TEST_BIT (valid_mult, i + MAX_RATIO))
+	    if (TEST_BIT (valid_mult[mode], i + MAX_RATIO))
 	      fprintf (dump_file, " %d", (int) i);
 	  fprintf (dump_file, "\n");
 	  fprintf (dump_file, "\n");
@@ -3285,7 +3285,7 @@ multiplier_allowed_in_address_p (HOST_WI
   if (ratio > MAX_RATIO || ratio < -MAX_RATIO)
     return false;
 
-  return TEST_BIT (valid_mult, ratio + MAX_RATIO);
+  return TEST_BIT (valid_mult[mode], ratio + MAX_RATIO);
 }
 
 /* Returns cost of address in shape symbol + var + OFFSET + RATIO * index.
@@ -3296,12 +3296,13 @@ multiplier_allowed_in_address_p (HOST_WI
 
 static unsigned
 get_address_cost (bool symbol_present, bool var_present,
-		  unsigned HOST_WIDE_INT offset, HOST_WIDE_INT ratio)
+		  unsigned HOST_WIDE_INT offset, HOST_WIDE_INT ratio,
+		  enum machine_mode mem_mode)
 {
-  static bool initialized = false;
-  static HOST_WIDE_INT rat, off;
-  static HOST_WIDE_INT min_offset, max_offset;
-  static unsigned costs[2][2][2][2];
+  static bool initialized[MAX_MACHINE_MODE];
+  static HOST_WIDE_INT rat[MAX_MACHINE_MODE], off[MAX_MACHINE_MODE];
+  static HOST_WIDE_INT min_offset[MAX_MACHINE_MODE], max_offset[MAX_MACHINE_MODE];
+  static unsigned costs[MAX_MACHINE_MODE][2][2][2][2];
   unsigned cost, acost;
   rtx seq, addr, base;
   bool offset_p, ratio_p;
@@ -3310,10 +3311,10 @@ get_address_cost (bool symbol_present, b
   unsigned HOST_WIDE_INT mask;
   unsigned bits;
 
-  if (!initialized)
+  if (!initialized[mem_mode])
     {
       HOST_WIDE_INT i;
-      initialized = true;
+      initialized[mem_mode] = true;
 
       reg1 = gen_raw_REG (Pmode, LAST_VIRTUAL_REGISTER + 1);
 
@@ -3321,32 +3322,36 @@ get_address_cost (bool symbol_present, b
       for (i = 1; i <= 1 << 20; i <<= 1)
 	{
 	  XEXP (addr, 1) = gen_int_mode (i, Pmode);
-	  if (!memory_address_p (Pmode, addr))
+	  if (!memory_address_p (mem_mode, addr))
 	    break;
 	}
-      max_offset = i >> 1;
-      off = max_offset;
+      max_offset[mem_mode] = i >> 1;
+      off[mem_mode] = max_offset[mem_mode];
 
       for (i = 1; i <= 1 << 20; i <<= 1)
 	{
 	  XEXP (addr, 1) = gen_int_mode (-i, Pmode);
-	  if (!memory_address_p (Pmode, addr))
+	  if (!memory_address_p (mem_mode, addr))
 	    break;
 	}
-      min_offset = -(i >> 1);
+      min_offset[mem_mode] = -(i >> 1);
 
       if (dump_file && (dump_flags & TDF_DETAILS))
 	{
 	  fprintf (dump_file, "get_address_cost:\n");
-	  fprintf (dump_file, "  min offset %d\n", (int) min_offset);
-	  fprintf (dump_file, "  max offset %d\n", (int) max_offset);
+	  fprintf (dump_file, "  min offset %s %d\n",
+		   GET_MODE_NAME (mem_mode),
+		   (int) min_offset[mem_mode]);
+	  fprintf (dump_file, "  max offset %s %d\n",
+		   GET_MODE_NAME (mem_mode),
+		   (int) max_offset[mem_mode]);
 	}
 
-      rat = 1;
+      rat[mem_mode] = 1;
       for (i = 2; i <= MAX_RATIO; i++)
-	if (multiplier_allowed_in_address_p (i))
+	if (multiplier_allowed_in_address_p (i, mem_mode))
 	  {
-	    rat = i;
+	    rat[mem_mode] = i;
 	    break;
 	  }
     }
@@ -3360,9 +3365,10 @@ get_address_cost (bool symbol_present, b
 
   cost = 0;
   offset_p = (s_offset != 0
-	      && min_offset <= s_offset && s_offset <= max_offset);
+	      && min_offset[mem_mode] <= s_offset
+	      && s_offset <= max_offset[mem_mode]);
   ratio_p = (ratio != 1
-	     && multiplier_allowed_in_address_p (ratio));
+	     && multiplier_allowed_in_address_p (ratio, mem_mode));
 
   if (ratio != 1 && !ratio_p)
     cost += multiply_by_cost (ratio, Pmode);
@@ -3370,7 +3376,7 @@ get_address_cost (bool symbol_present, b
   if (s_offset && !offset_p && !symbol_present)
     cost += add_cost (Pmode);
 
-  acost = costs[symbol_present][var_present][offset_p][ratio_p];
+  acost = costs[mem_mode][symbol_present][var_present][offset_p][ratio_p];
   if (!acost)
     {
       int old_cse_not_expected;
@@ -3379,7 +3385,8 @@ get_address_cost (bool symbol_present, b
       addr = gen_raw_REG (Pmode, LAST_VIRTUAL_REGISTER + 1);
       reg1 = gen_raw_REG (Pmode, LAST_VIRTUAL_REGISTER + 2);
       if (ratio_p)
-	addr = gen_rtx_fmt_ee (MULT, Pmode, addr, gen_int_mode (rat, Pmode));
+	addr = gen_rtx_fmt_ee (MULT, Pmode, addr,
+			       gen_int_mode (rat[mem_mode], Pmode));
 
       if (var_present)
 	addr = gen_rtx_fmt_ee (PLUS, Pmode, addr, reg1);
@@ -3391,10 +3398,11 @@ get_address_cost (bool symbol_present, b
 	    base = gen_rtx_fmt_e (CONST, Pmode,
 				  gen_rtx_fmt_ee (PLUS, Pmode,
 						  base,
-						  gen_int_mode (off, Pmode)));
+						  gen_int_mode (off[mem_mode],
+								Pmode)));
 	}
       else if (offset_p)
-	base = gen_int_mode (off, Pmode);
+	base = gen_int_mode (off[mem_mode], Pmode);
       else
 	base = NULL_RTX;
     
@@ -3406,7 +3414,7 @@ get_address_cost (bool symbol_present, b
  	 follow.  */
       old_cse_not_expected = cse_not_expected;
       cse_not_expected = true;
-      addr = memory_address (Pmode, addr);
+      addr = memory_address (mem_mode, addr);
       cse_not_expected = old_cse_not_expected;
       seq = get_insns ();
       end_sequence ();
@@ -3416,7 +3424,7 @@ get_address_cost (bool symbol_present, b
 
       if (!acost)
 	acost = 1;
-      costs[symbol_present][var_present][offset_p][ratio_p] = acost;
+      costs[mem_mode][symbol_present][var_present][offset_p][ratio_p] = acost;
     }
 
   return cost + acost;
@@ -3832,7 +3840,8 @@ get_computation_cost_at (struct ivopts_d
      (symbol/var/const parts may be omitted).  If we are looking for an address,
      find the cost of addressing this.  */
   if (address_p)
-    return cost + get_address_cost (symbol_present, var_present, offset, ratio);
+    return cost + get_address_cost (symbol_present, var_present, offset, ratio,
+				    TYPE_MODE (TREE_TYPE (*use->op_p)));
 
   /* Otherwise estimate the costs for computing the expression.  */
   aratio = ratio > 0 ? ratio : -ratio;
Index: gcc/tree-ssa-loop-ivopts.c
===================================================================
--- gcc.orig/tree-ssa-loop-ivopts.c
+++ gcc/tree-ssa-loop-ivopts.c
@@ -3314,27 +3314,28 @@ get_address_cost (bool symbol_present, b
   if (!initialized[mem_mode])
     {
       HOST_WIDE_INT i;
+      HOST_WIDE_INT start = BIGGEST_ALIGNMENT / BITS_PER_UNIT;
       initialized[mem_mode] = true;
 
       reg1 = gen_raw_REG (Pmode, LAST_VIRTUAL_REGISTER + 1);
 
       addr = gen_rtx_fmt_ee (PLUS, Pmode, reg1, NULL_RTX);
-      for (i = 1; i <= 1 << 20; i <<= 1)
+      for (i = start; i <= 1 << 20; i <<= 1)
 	{
 	  XEXP (addr, 1) = gen_int_mode (i, Pmode);
 	  if (!memory_address_p (mem_mode, addr))
 	    break;
 	}
-      max_offset[mem_mode] = i >> 1;
+      max_offset[mem_mode] = i == start ? 0 : i >> 1;
       off[mem_mode] = max_offset[mem_mode];
 
-      for (i = 1; i <= 1 << 20; i <<= 1)
+      for (i = start; i <= 1 << 20; i <<= 1)
 	{
 	  XEXP (addr, 1) = gen_int_mode (-i, Pmode);
 	  if (!memory_address_p (mem_mode, addr))
 	    break;
 	}
-      min_offset[mem_mode] = -(i >> 1);
+      min_offset[mem_mode] = i == start ? 0 : -(i >> 1);
 
       if (dump_file && (dump_flags & TDF_DETAILS))
 	{

Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]