[patch] expmed.c: Speed up synth_mult.
Kazu Hirata
kazu@cs.umass.edu
Wed Nov 17 22:11:00 GMT 2004
Hi,
Attached is a patch to speed up synth_mult.
synth_mult synthesizes a best algorithm for multiplying a variable by
some integer constant. The problem is that it is massively recursive.
That is, it makes several recursive calls within one call.
The patch speeds up synth_mult by caching the results of recent calls
to synth_mult. Specifically, given integer T to multiply a variable
by, what we cache is the first algorithm to tackle T. That is, if
"x * T" should be computed by a left-shift followed by some other
operation, we cache "left-shift" along with T. The code to handle
"left-shift" will still make a recursive call to figure out what to do
after shifting, but we do *not* try other methods of multiplication,
thereby reducing the branch factor to 1.
To make the cache efficient, I use it like a hash table. In other
words, if T is the integer to multiply a variable by, we compute a
hash value from T and use that to index into the cache. If I have a
collision, I simply kick out the older entry.
Here is timing on two runs of "./cc1 -quiet -O2 -o /dev/null ir.ii"
from PR 13776.
original patched
real: 160.998 158.924 ( 1.288% down)
user: 159.533 157.780 ( 1.098% down)
synt: 9631817 53766 (99.441% down)
"real" and "user" are in seconds. "synt" is the number of calls to
synth_mult, including recursive calls.
I tried cc1-files as well to make sure that this patch was not
"backfiring".
original patched
real: 378.022 378.388 (0.096% up)
user: 374.182 373.407 (0.207% down)
I would call these noise.
Admittedly, the resulting code is a little ugly. I should probably
clean up the goto mess, but I would like to do that in a separate
patch.
Tested on i686-pc-linux-gnu. OK to apply?
Kazu Hirata
2004-11-17 Kazu Hirata <kazu@cs.umass.edu>
* expmed.c (alg_code): Add alg_unknown.
(alg_hash_entry): New.
(NUM_ALG_HASH_ENTRIES): Likewise.
(alg_hash): Likewise.
(synth_mult): Cache the result into alg_hash.
Index: expmed.c
===================================================================
RCS file: /cvs/gcc/gcc/gcc/expmed.c,v
retrieving revision 1.205
diff -u -d -p -r1.205 expmed.c
--- expmed.c 13 Nov 2004 22:10:47 -0000 1.205
+++ expmed.c 15 Nov 2004 23:52:43 -0000
@@ -2297,7 +2297,7 @@ expand_shift (enum tree_code code, enum
return temp;
}
-enum alg_code { alg_zero, alg_m, alg_shift,
+enum alg_code { alg_unknown, alg_zero, alg_m, alg_shift,
alg_add_t_m2, alg_sub_t_m2,
alg_add_factor, alg_sub_factor,
alg_add_t2_m, alg_sub_t2_m };
@@ -2364,6 +2364,26 @@ struct algorithm
char log[MAX_BITS_PER_WORD];
};
+/* The entry for our multiplication cache/hash table. */
+struct alg_hash_entry {
+ /* The number we are multiplying by. */
+ unsigned int t;
+
+ /* The mode in which we are multiplying something by T. */
+ enum machine_mode mode;
+
+  /* The best multiplication algorithm for T. */
+ enum alg_code alg;
+};
+
+/* The number of cache/hash entries. */
+#define NUM_ALG_HASH_ENTRIES 307
+
+/* Each entry of ALG_HASH caches alg_code for some integer. This is
+   actually a hash table. If we have a collision, the older
+ entry is kicked out. */
+static struct alg_hash_entry alg_hash[NUM_ALG_HASH_ENTRIES];
+
/* Indicates the type of fixup needed after a constant multiplication.
BASIC_VARIANT means no fixup is needed, NEGATE_VARIANT means that
the result should be negated, and ADD_VARIANT means that the
@@ -2400,6 +2420,9 @@ synth_mult (struct algorithm *alg_out, u
int op_cost, op_latency;
unsigned HOST_WIDE_INT q;
int maxm = MIN (BITS_PER_WORD, GET_MODE_BITSIZE (mode));
+ int hash_index;
+ bool cache_hit = false;
+ enum alg_code cache_alg = alg_zero;
/* Indicate that no algorithm is yet found. If no algorithm
is found, this value will be returned and indicate failure. */
@@ -2445,11 +2468,46 @@ synth_mult (struct algorithm *alg_out, u
best_alg = alloca (sizeof (struct algorithm));
best_cost = *cost_limit;
+ /* Compute the hash index. */
+ hash_index = (t ^ (unsigned int) mode) % NUM_ALG_HASH_ENTRIES;
+
+ /* See if we already know what to do for T. */
+ if (alg_hash[hash_index].t == t
+ && alg_hash[hash_index].mode == mode
+ && alg_hash[hash_index].alg != alg_unknown)
+ {
+ cache_hit = true;
+ cache_alg = alg_hash[hash_index].alg;
+ switch (cache_alg)
+ {
+ case alg_shift:
+ goto do_alg_shift;
+
+ case alg_add_t_m2:
+ case alg_sub_t_m2:
+ goto do_alg_addsub_t_m2;
+
+ case alg_add_factor:
+ case alg_sub_factor:
+ goto do_alg_addsub_factor;
+
+ case alg_add_t2_m:
+ goto do_alg_add_t2_m;
+
+ case alg_sub_t2_m:
+ goto do_alg_sub_t2_m;
+
+ default:
+ gcc_unreachable ();
+ }
+ }
+
/* If we have a group of zero bits at the low-order part of T, try
multiplying by the remaining bits and then doing a shift. */
if ((t & 1) == 0)
{
+ do_alg_shift:
m = floor_log2 (t & -t); /* m = number of low zero bits */
if (m < maxm)
{
@@ -2475,6 +2533,8 @@ synth_mult (struct algorithm *alg_out, u
best_alg->op[best_alg->ops] = alg_shift;
}
}
+ if (cache_hit)
+ goto done;
}
/* If we have an odd number, add or subtract one. */
@@ -2482,6 +2542,7 @@ synth_mult (struct algorithm *alg_out, u
{
unsigned HOST_WIDE_INT w;
+ do_alg_addsub_t_m2:
for (w = 1; (w & t) != 0; w <<= 1)
;
/* If T was -1, then W will be zero after the loop. This is another
@@ -2533,6 +2594,8 @@ synth_mult (struct algorithm *alg_out, u
best_alg->op[best_alg->ops] = alg_add_t_m2;
}
}
+ if (cache_hit)
+ goto done;
}
/* Look for factors of t of the form
@@ -2545,12 +2608,14 @@ synth_mult (struct algorithm *alg_out, u
good sequence quickly, and therefore be able to prune (by decreasing
COST_LIMIT) the search. */
+ do_alg_addsub_factor:
for (m = floor_log2 (t - 1); m >= 2; m--)
{
unsigned HOST_WIDE_INT d;
d = ((unsigned HOST_WIDE_INT) 1 << m) + 1;
- if (t % d == 0 && t > d && m < maxm)
+ if (t % d == 0 && t > d && m < maxm
+ && (!cache_hit || cache_alg == alg_add_factor))
{
/* If the target has a cheap shift-and-add instruction use
that in preference to a shift insn followed by an add insn.
@@ -2588,7 +2653,8 @@ synth_mult (struct algorithm *alg_out, u
}
d = ((unsigned HOST_WIDE_INT) 1 << m) - 1;
- if (t % d == 0 && t > d && m < maxm)
+ if (t % d == 0 && t > d && m < maxm
+ && (!cache_hit || cache_alg == alg_sub_factor))
{
/* If the target has a cheap shift-and-subtract insn use
that in preference to a shift insn followed by a sub insn.
@@ -2624,11 +2690,14 @@ synth_mult (struct algorithm *alg_out, u
break;
}
}
+ if (cache_hit)
+ goto done;
/* Try shift-and-add (load effective address) instructions,
i.e. do a*3, a*5, a*9. */
if ((t & 1) != 0)
{
+ do_alg_add_t2_m:
q = t - 1;
q = q & -q;
m = exact_log2 (q);
@@ -2650,7 +2719,10 @@ synth_mult (struct algorithm *alg_out, u
best_alg->op[best_alg->ops] = alg_add_t2_m;
}
}
+ if (cache_hit)
+ goto done;
+ do_alg_sub_t2_m:
q = t + 1;
q = q & -q;
m = exact_log2 (q);
@@ -2672,12 +2744,23 @@ synth_mult (struct algorithm *alg_out, u
best_alg->op[best_alg->ops] = alg_sub_t2_m;
}
}
+ if (cache_hit)
+ goto done;
}
+ done:
/* If best_cost has not decreased, we have not found any algorithm. */
if (!CHEAPER_MULT_COST (&best_cost, cost_limit))
return;
+ /* Cache the result. */
+ if (!cache_hit)
+ {
+ alg_hash[hash_index].t = t;
+ alg_hash[hash_index].mode = mode;
+ alg_hash[hash_index].alg = best_alg->op[best_alg->ops];
+ }
+
/* If we are getting a too long sequence for `struct algorithm'
to record, make this search fail. */
if (best_alg->ops == MAX_BITS_PER_WORD)
More information about the Gcc-patches
mailing list