This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

Re: Yet another attempt to solve multiplication by 11 and many (most?) other constants problem


Hi,
> 
> Finally, I've a minor concern about compile-time impact.  synth_mult
> is already highly recursive and we've previously had PRs about the
> time it takes to synthesize 29 insn sequences on the EV4 alpha.
> Since then we've added much better result caching to synth_mult, so
> it probably shouldn't be an issue.

Unfortunately the compilation time is a concern.  I am attaching a more
polished patch that replaces the current rules instead of hacking in the
new ones.  Unfortunately, on the testcase:

/* Stress test for the compiler's multiplication-synthesis code
   (synth_mult): multiply a volatile int by each of the 4096 constants
   0..4095, forcing a distinct synthesized sequence for every one.
   The volatile qualifier keeps the multiplications from being folded
   away.  Note: m starts at 0, so the runtime result is trivially 0 —
   only the compile-time work matters here.  */
volatile int m;

/* Multiply m by the 16 consecutive constants n .. n+15.  */
#define test16(n) \
   m*=n; \
   m*=n + 1; \
   m*=n + 2; \
   m*=n + 3; \
   m*=n + 4; \
   m*=n + 5; \
   m*=n + 6; \
   m*=n + 7; \
   m*=n + 8; \
   m*=n + 9; \
   m*=n + 10; \
   m*=n + 11; \
   m*=n + 12; \
   m*=n + 13; \
   m*=n + 14; \
   m*=n + 15; 
/* Multiply m by the 256 consecutive constants n .. n+255.  */
#define test256(n) \
	test16(n+0); \
	test16(n+1*16); \
	test16(n+2*16); \
	test16(n+3*16); \
	test16(n+4*16); \
	test16(n+5*16); \
	test16(n+6*16); \
	test16(n+7*16); \
	test16(n+8*16); \
	test16(n+9*16); \
	test16(n+10*16); \
	test16(n+11*16); \
	test16(n+12*16); \
	test16(n+13*16); \
	test16(n+14*16); \
	test16(n+15*16);

/* Explicit return type: implicit int was removed in C99 and is a hard
   error in newer compilers.  */
int
main (void)
{
	test256(0);
	test256(1*256);
	test256(2*256);
	test256(3*256);
	test256(4*256);
	test256(5*256);
	test256(6*256);
	test256(7*256);
	test256(8*256);
	test256(9*256);
	test256(10*256);
	test256(11*256);
	test256(12*256);
	test256(13*256);
	test256(14*256);
	test256(15*256);
	return 0;
}

it ramps expansion up to about 70% of compilation time for -march=pentium4
(with checking enabled, but I guess that does not matter much), even though
I added extra code to cache the log argument.  Perhaps this is acceptable
given the somewhat extreme nature of the testcase, but probably not.

One idea I ran across is to only try log 0, as in the current code, and the
log representing the highest bit, as needed for 11.  This works fairly well
(for the first 30 numbers it brings optimal results), but it misses quite a
few cases, resulting in more temporaries.  Do you have any ideas how to cut
the algorithm short without making the results worse?

Honza

Index: expmed.c
===================================================================
-u -L expmed.c	(revision 109978) -L expmed.c	(working copy) .svn/text-base/expmed.c.svn-base expmed.c
--- expmed.c	(revision 109978)
+++ expmed.c	(working copy)
@@ -2378,6 +2378,8 @@ struct alg_hash_entry {
 
   /* The best multiplication algorithm for t.  */
   enum alg_code alg;
+  /* The best log argument for algorithms where it makes sense.  */
+  int log;
 
   /* The cost of multiplication if ALG_CODE is not alg_impossible.
      Otherwise, the cost within which multiplication by T is
@@ -2432,6 +2434,7 @@ synth_mult (struct algorithm *alg_out, u
   int hash_index;
   bool cache_hit = false;
   enum alg_code cache_alg = alg_zero;
+  int cache_log = 0;
 
   /* Indicate that no algorithm is yet found.  If no algorithm
      is found, this value will be returned and indicate failure.  */
@@ -2486,6 +2489,7 @@ synth_mult (struct algorithm *alg_out, u
       && alg_hash[hash_index].alg != alg_unknown)
     {
       cache_alg = alg_hash[hash_index].alg;
+      cache_log = alg_hash[hash_index].log;
 
       if (cache_alg == alg_impossible)
 	{
@@ -2520,8 +2524,10 @@ synth_mult (struct algorithm *alg_out, u
 	      goto do_alg_shift;
 
 	    case alg_add_t_m2:
+	      goto do_alg_add_t_m2;
+
 	    case alg_sub_t_m2:
-	      goto do_alg_addsub_t_m2;
+	      goto do_alg_sub_t_m2;
 
 	    case alg_add_factor:
 	    case alg_sub_factor:
@@ -2574,12 +2580,15 @@ synth_mult (struct algorithm *alg_out, u
 	goto done;
     }
 
-  /* If we have an odd number, add or subtract one.  */
+  do_alg_sub_t_m2:
+
+  /* Try eliminating 1s in the multiplier using alg_add_t_m2.
+     This operation allows parallelism on superscalar CPUs and thus is rather
+     effective.  */
   if ((t & 1) != 0)
     {
       unsigned HOST_WIDE_INT w;
 
-    do_alg_addsub_t_m2:
       for (w = 1; (w & t) != 0; w <<= 1)
 	;
       /* If T was -1, then W will be zero after the loop.  This is another
@@ -2595,13 +2604,13 @@ synth_mult (struct algorithm *alg_out, u
 	{
 	  /* T ends with ...111.  Multiply by (T + 1) and subtract 1.  */
 
-	  op_cost = add_cost[mode];
+	  op_cost = op_latency = add_cost[mode];
 	  new_limit.cost = best_cost.cost - op_cost;
-	  new_limit.latency = best_cost.latency - op_cost;
+	  new_limit.latency = best_cost.latency - op_latency;
 	  synth_mult (alg_in, t + 1, &new_limit, mode);
 
 	  alg_in->cost.cost += op_cost;
-	  alg_in->cost.latency += op_cost;
+	  alg_in->cost.latency += op_latency;
 	  if (CHEAPER_MULT_COST (&alg_in->cost, &best_cost))
 	    {
 	      struct algorithm *x;
@@ -2611,28 +2620,58 @@ synth_mult (struct algorithm *alg_out, u
 	      best_alg->op[best_alg->ops] = alg_sub_t_m2;
 	    }
 	}
-      else
+    }
+  if (cache_hit)
+    goto done;
+
+  do_alg_add_t_m2:
+
+  for (m = cache_hit ? cache_log : floor_log2 (t); m >= 0; m--)
+    {
+      unsigned HOST_WIDE_INT d;
+
+      d = ((unsigned HOST_WIDE_INT) 1 << m);
+      if ((t & d) != 0)
 	{
-	  /* T ends with ...01 or ...011.  Multiply by (T - 1) and add 1.  */
+	  /* Multiply by (T - m) and add m.  */
 
-	  op_cost = add_cost[mode];
+	  if (m)
+	    {
+	      op_cost = add_cost[mode] + shift_cost[mode][m];
+	      /* If the target has a cheap shift-and-add insn use
+		 that in preference to a shift insn followed by a sub insn.
+		 Assume that the shift-and-sub is "atomic" with a latency
+		 equal to it's cost, otherwise assume that on superscalar
+		 hardware the shift may be executed concurrently with the
+		 earlier steps in the algorithm.  */
+	      if (shiftadd_cost[mode][m] >= op_cost)
+		op_latency = add_cost[mode];
+	      else
+		op_latency = op_cost = shiftadd_cost[mode][m];
+	    }
+	  else
+	    op_cost = op_latency = add_cost[mode];
 	  new_limit.cost = best_cost.cost - op_cost;
-	  new_limit.latency = best_cost.latency - op_cost;
-	  synth_mult (alg_in, t - 1, &new_limit, mode);
+	  new_limit.latency = best_cost.latency - op_latency;
+	  synth_mult (alg_in, t - d, &new_limit, mode);
 
 	  alg_in->cost.cost += op_cost;
-	  alg_in->cost.latency += op_cost;
+	  /* Ensure that the CPU had time to perform the shift.  */
+	  if (m && alg_in->cost.latency < shift_cost[mode][m] + add_cost[mode]
+	      && op_cost != op_latency)
+	    alg_in->cost.latency = shift_cost[mode][m] + add_cost[mode];
+	  alg_in->cost.latency += op_latency;
 	  if (CHEAPER_MULT_COST (&alg_in->cost, &best_cost))
 	    {
 	      struct algorithm *x;
 	      best_cost = alg_in->cost;
 	      x = alg_in, alg_in = best_alg, best_alg = x;
-	      best_alg->log[best_alg->ops] = 0;
+	      best_alg->log[best_alg->ops] = m;
 	      best_alg->op[best_alg->ops] = alg_add_t_m2;
 	    }
+	  if (cache_hit)
+	    goto done;
 	}
-      if (cache_hit)
-	goto done;
     }
 
   /* Look for factors of t of the form
@@ -2646,28 +2685,20 @@ synth_mult (struct algorithm *alg_out, u
      COST_LIMIT) the search.  */
 
  do_alg_addsub_factor:
-  for (m = floor_log2 (t - 1); m >= 2; m--)
+  for (m = cache_hit ? cache_log : floor_log2 (t - 1); m >= 2; m--)
     {
       unsigned HOST_WIDE_INT d;
 
       d = ((unsigned HOST_WIDE_INT) 1 << m) + 1;
-      if (t % d == 0 && t > d && m < maxm
+      if (t % d == 0 && t > d && m < maxm 
 	  && (!cache_hit || cache_alg == alg_add_factor))
 	{
 	  /* If the target has a cheap shift-and-add instruction use
-	     that in preference to a shift insn followed by an add insn.
-	     Assume that the shift-and-add is "atomic" with a latency
-	     equal to its cost, otherwise assume that on superscalar
-	     hardware the shift may be executed concurrently with the
-	     earlier steps in the algorithm.  */
+	     that in preference to a shift insn followed by an add insn.  */
 	  op_cost = add_cost[mode] + shift_cost[mode][m];
 	  if (shiftadd_cost[mode][m] < op_cost)
-	    {
-	      op_cost = shiftadd_cost[mode][m];
-	      op_latency = op_cost;
-	    }
-	  else
-	    op_latency = add_cost[mode];
+	    op_cost = shiftadd_cost[mode][m];
+	  op_latency = op_cost;
 
 	  new_limit.cost = best_cost.cost - op_cost;
 	  new_limit.latency = best_cost.latency - op_latency;
@@ -2690,23 +2721,15 @@ synth_mult (struct algorithm *alg_out, u
 	}
 
       d = ((unsigned HOST_WIDE_INT) 1 << m) - 1;
-      if (t % d == 0 && t > d && m < maxm
+      if (t % d == 0 && t > d && m < maxm 
 	  && (!cache_hit || cache_alg == alg_sub_factor))
 	{
 	  /* If the target has a cheap shift-and-subtract insn use
-	     that in preference to a shift insn followed by a sub insn.
-	     Assume that the shift-and-sub is "atomic" with a latency
-	     equal to it's cost, otherwise assume that on superscalar
-	     hardware the shift may be executed concurrently with the
-	     earlier steps in the algorithm.  */
+	     that in preference to a shift insn followed by a sub insn.  */
 	  op_cost = add_cost[mode] + shift_cost[mode][m];
-	  if (shiftsub_cost[mode][m] < op_cost)
-	    {
-	      op_cost = shiftsub_cost[mode][m];
-	      op_latency = op_cost;
-	    }
-	  else
-	    op_latency = add_cost[mode];
+	  if (shiftadd_cost[mode][m] < op_cost)
+	    op_cost = shiftadd_cost[mode][m];
+	  op_latency = op_cost;
 
 	  new_limit.cost = best_cost.cost - op_cost;
 	  new_limit.latency = best_cost.latency - op_latency;
@@ -2726,15 +2749,16 @@ synth_mult (struct algorithm *alg_out, u
 	    }
 	  break;
 	}
+      if (cache_hit)
+	goto done;
     }
-  if (cache_hit)
-    goto done;
+
+  do_alg_add_t2_m:
 
   /* Try shift-and-add (load effective address) instructions,
      i.e. do a*3, a*5, a*9.  */
   if ((t & 1) != 0)
     {
-    do_alg_add_t2_m:
       q = t - 1;
       q = q & -q;
       m = exact_log2 (q);
@@ -2807,6 +2831,7 @@ synth_mult (struct algorithm *alg_out, u
       alg_hash[hash_index].t = t;
       alg_hash[hash_index].mode = mode;
       alg_hash[hash_index].alg = best_alg->op[best_alg->ops];
+      alg_hash[hash_index].log = best_alg->log[best_alg->ops];
       alg_hash[hash_index].cost.cost = best_cost.cost;
       alg_hash[hash_index].cost.latency = best_cost.latency;
     }


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]