Yet another attempt to solve multiplication by 11 and many (most?) other constants problem
Jan Hubicka
jh@suse.cz
Wed Jan 18 22:51:00 GMT 2006
Hi,
I've been playing a bit further with the mult expansion as this seems a bit ugly in
my generic patch and here is some update. I've added i386 pattern to decompose
LEA when doing so provably reduces latency (this is needed to get optimal *11
sequence not "optimized" into one with one extra cycle). This issue is however
a bit independent of the fact that synth_mult sometimes goes very wrong.
Looking at the synth_mult latency heuristic, one bit I really don't like about
it is that it is trying to introduce the parallelism by:
alg_add_factor total := total * coeff + total;
alg_sub_factor total := total * coeff - total;
assuming their latency to be the cost of addition. This is not really correct; the
sequence it translates into:
t2 = t;
t = t<<2;
t += t2
has no parallelism in it. If it is preceded by an extra shift operation (like t>>8),
it eventually translates as:
t2 = t<<8
t = t>>10
t += t2
resulting in parallelism on 3 address machine, not on i386. The parallelism is
however just implicit in the alg_* sequence and thus there is not much to play
about it. The alg_* contain:
alg_add_t_m2 total := total + multiplicand * coeff;
alg_sub_t_m2 total := total - multiplicand * coeff;
This one naturally allows parallelism without any CSE interference:
t2 = t<<8
t = somealgorithm
t = t+t2
that seems like more plausible way to represent sequences above, but
unfortunately this is never considered by synth_mult since it only considers
those two patterns with coef==1 (i.e., in the cases that are useless parallelism-wise).
The patch below adds some code (not cleaned up, just an experiment) to
consider all possible coeffs and latencies. When the latency code in the
alg_add_factor is disabled it optimizes the resulting sequences quite a bit.
I seem to believe that it is possible to disable alg_add_factor code since
we will represent the CSEed sequences above using alg_add_t_m2/alg_sub_t_m2
I.e., checking constants between 11 and 22:
for 11:
leal (%eax,%eax,2), %edx
sall $3, %eax
addl %edx, %eax
(3 cycles, 1 temporary), instead of:
leal 0(,%eax,4), %ecx
movl %eax, %edx
sall $4, %edx
subl %ecx, %edx
subl %eax, %edx
(4 cycles, 2 temporaries)
For 12:
leal (%eax,%eax,2), %eax
sall $2, %eax
(3 cycles, no temporary) instead of:
leal 0(,%eax,4), %edx
sall $4, %eax
subl %edx, %eax
(3 cycles, 1 temporary)
For 13:
leal (%eax,%eax,4), %edx
sall $3, %eax
addl %edx, %eax
(3 cycles, 1 temporary) instead of:
leal 0(,%eax,4), %ecx
movl %eax, %edx
sall $4, %edx
subl %ecx, %edx
addl %eax, %edx
For 18:
leal (%eax,%eax,8), %eax
addl %eax, %eax
(3 cycles, 0 temporaries) instead of:
movl %eax, %edx
sall $4, %edx
leal (%edx,%eax,2), %eax
(4 cycles, 1 temporary)
For 19:
leal (%eax,%eax,2), %edx
sall $4, %eax
addl %eax, %edx
(3 cycles, 1 temporary) instead of:
movl %eax, %edx
sall $4, %edx
leal (%edx,%eax,4), %edx
subl %eax, %edx
(4 cycles, 1 temporary)
For 20:
leal (%eax,%eax,4), %eax
sall $2, %eax
(3 cycles, 0 temporaries) instead of:
movl %eax, %edx
sall $4, %edx
leal (%edx,%eax,4), %eax
(4 cycles, 1 temporary)
For 21:
leal (%eax,%eax,4), %edx
sall $4, %eax
addl %eax, %edx
(3 cycles, 1 temporary) instead of:
movl %eax, %edx
sall $4, %edx
leal (%edx,%eax,4), %edx
addl %eax, %edx
(5 cycles, 1 temporary)
For 22:
leal (%eax,%eax,2), %edx
sall $3, %eax
addl %edx, %eax
addl %eax, %eax
(4 cycles, 1 temporary) instead of:
leal 0(,%eax,4), %ecx
movl %eax, %edx
sall $4, %edx
subl %ecx, %edx
subl %eax, %edx
addl %edx, %edx
(5 cycles, 2 temporaries)
and so on. The list of improved constants is rather dense.
It loses on *14:
leal 0(,%edx,8), %eax
subl %edx, %eax
addl %eax, %eax
(4 cycles, 1 temporary)
instead of:
leal (%eax,%eax), %edx
sall $4, %eax
subl %edx, %eax
(3 cycles, 1 temporary)
because the 2-address nature of i386 (with the lea exception) is not very precisely
modeled, but perhaps it can be hacked around.
I checked other 30 constants and I don't know of any sequence where old
code would win (even on 3 address machine like PPC). On first 256
numbers we also never use 3rd temporary.
We still lose on *3/*7/*15 relative to AMD recommendation. We do:
leal 0(,%edx,8), %eax
subl %edx, %eax
that is one instruction shorter than:
mov %eax, %edx
sall $4, %eax
subl %edx, %eax
but requires one extra cycle. This is a result of regalloc being smart enough to figure
out it can save a register by making the destination different from the source, but not
knowing that this increases latency...
What do you think of this approach?
Honza
Index: expmed.c
===================================================================
-c -L expmed.c (revision 109639) -L expmed.c (working copy) .svn/text-base/expmed.c.svn-base expmed.c
*** expmed.c (revision 109639)
--- expmed.c (working copy)
*************** synth_mult (struct algorithm *alg_out, u
*** 2650,2657 ****
{
unsigned HOST_WIDE_INT d;
d = ((unsigned HOST_WIDE_INT) 1 << m) + 1;
! if (t % d == 0 && t > d && m < maxm
&& (!cache_hit || cache_alg == alg_add_factor))
{
/* If the target has a cheap shift-and-add instruction use
--- 2650,2720 ----
{
unsigned HOST_WIDE_INT d;
+ if (1)
+ {
+ /* If the target has a cheap shift-and-add instruction use
+ that in preference to a shift insn followed by an add insn.
+ Assume that the shift-and-add is "atomic" with a latency
+ equal to its cost */
+ op_cost = add_cost[mode] + shift_cost[mode][m];
+ if (shiftadd_cost[mode][m] < op_cost)
+ op_cost = op_latency = shiftadd_cost[mode][m];
+ else
+ op_latency = add_cost[mode];
+
+ new_limit.cost = best_cost.cost - op_cost;
+ new_limit.latency = best_cost.latency - op_latency;
+ synth_mult (alg_in, t - ((unsigned HOST_WIDE_INT) 1 << m),
+ &new_limit, mode);
+
+ alg_in->cost.cost += op_cost;
+ if (alg_in->cost.latency < shift_cost[mode][m] + add_cost[mode]
+ && op_cost != op_latency)
+ alg_in->cost.latency = shift_cost[mode][m] + add_cost[mode];
+ alg_in->cost.latency += op_latency;
+ if (CHEAPER_MULT_COST (&alg_in->cost, &best_cost))
+ {
+ struct algorithm *x;
+ best_cost = alg_in->cost;
+ x = alg_in, alg_in = best_alg, best_alg = x;
+ best_alg->log[best_alg->ops] = m;
+ best_alg->op[best_alg->ops] = alg_add_t_m2;
+ }
+ }
+ if (1)
+ {
+ /* If the target has a cheap shift-and-add instruction use
+ that in preference to a shift insn followed by an add insn.
+ Assume that the shift-and-add is "atomic" with a latency
+ equal to its cost */
+ op_cost = add_cost[mode] + shift_cost[mode][m];
+ if (shiftadd_cost[mode][m] < op_cost)
+ op_cost = op_latency = shiftadd_cost[mode][m];
+ else
+ op_latency = add_cost[mode];
+
+ new_limit.cost = best_cost.cost - op_cost;
+ new_limit.latency = best_cost.latency - op_latency;
+ synth_mult (alg_in, t + ((unsigned HOST_WIDE_INT) 1 << m),
+ &new_limit, mode);
+
+ alg_in->cost.cost += op_cost;
+ if (alg_in->cost.latency < shift_cost[mode][m] + add_cost[mode]
+ && op_cost != op_latency)
+ alg_in->cost.latency = shift_cost[mode][m] + add_cost[mode];
+ alg_in->cost.latency += op_latency;
+ if (CHEAPER_MULT_COST (&alg_in->cost, &best_cost))
+ {
+ struct algorithm *x;
+ best_cost = alg_in->cost;
+ x = alg_in, alg_in = best_alg, best_alg = x;
+ best_alg->log[best_alg->ops] = m;
+ best_alg->op[best_alg->ops] = alg_sub_t_m2;
+ }
+ }
+
d = ((unsigned HOST_WIDE_INT) 1 << m) + 1;
! if (t % d == 0 && t > d && m < maxm
&& (!cache_hit || cache_alg == alg_add_factor))
{
/* If the target has a cheap shift-and-add instruction use
*************** synth_mult (struct algorithm *alg_out, u
*** 2662,2673 ****
earlier steps in the algorithm. */
op_cost = add_cost[mode] + shift_cost[mode][m];
if (shiftadd_cost[mode][m] < op_cost)
! {
! op_cost = shiftadd_cost[mode][m];
! op_latency = op_cost;
! }
! else
! op_latency = add_cost[mode];
new_limit.cost = best_cost.cost - op_cost;
new_limit.latency = best_cost.latency - op_latency;
--- 2725,2732 ----
earlier steps in the algorithm. */
op_cost = add_cost[mode] + shift_cost[mode][m];
if (shiftadd_cost[mode][m] < op_cost)
! op_cost = shiftadd_cost[mode][m];
! op_latency = op_cost;
new_limit.cost = best_cost.cost - op_cost;
new_limit.latency = best_cost.latency - op_latency;
*************** synth_mult (struct algorithm *alg_out, u
*** 2690,2696 ****
}
d = ((unsigned HOST_WIDE_INT) 1 << m) - 1;
! if (t % d == 0 && t > d && m < maxm
&& (!cache_hit || cache_alg == alg_sub_factor))
{
/* If the target has a cheap shift-and-subtract insn use
--- 2749,2755 ----
}
d = ((unsigned HOST_WIDE_INT) 1 << m) - 1;
! if (t % d == 0 && t > d && m < maxm
&& (!cache_hit || cache_alg == alg_sub_factor))
{
/* If the target has a cheap shift-and-subtract insn use
*************** synth_mult (struct algorithm *alg_out, u
*** 2700,2712 ****
hardware the shift may be executed concurrently with the
earlier steps in the algorithm. */
op_cost = add_cost[mode] + shift_cost[mode][m];
! if (shiftsub_cost[mode][m] < op_cost)
! {
! op_cost = shiftsub_cost[mode][m];
! op_latency = op_cost;
! }
! else
! op_latency = add_cost[mode];
new_limit.cost = best_cost.cost - op_cost;
new_limit.latency = best_cost.latency - op_latency;
--- 2759,2767 ----
hardware the shift may be executed concurrently with the
earlier steps in the algorithm. */
op_cost = add_cost[mode] + shift_cost[mode][m];
! if (shiftadd_cost[mode][m] < op_cost)
! op_cost = shiftadd_cost[mode][m];
! op_latency = op_cost;
new_limit.cost = best_cost.cost - op_cost;
new_limit.latency = best_cost.latency - op_latency;
Index: config/i386/i386.md
===================================================================
-c -L config/i386/i386.md (revision 109639) -L config/i386/i386.md (working copy) config/i386/.svn/text-base/i386.md.svn-base config/i386/i386.md
*** config/i386/i386.md (revision 109639)
--- config/i386/i386.md (working copy)
***************
*** 5130,5135 ****
--- 5130,5156 ----
(const_string "alu")))
(set_attr "mode" "DI")])
+ ;; Canonicalize plus operations. Since the PLUS operation allows
+ ;; the "lea" alternative there is nothing to enforce the canonical operand
+ ;; order after reload.
+ (define_split
+ [(set (match_operand 0 "register_operand" "")
+ (plus (match_operand 1 "register_operand" "")
+ (match_operand 2 "register_operand" "")))
+ (clobber (reg:CC FLAGS_REG))]
+ "rtx_equal_p (operands[0], operands[2])
+ && !rtx_equal_p (operands[0], operands[1])"
+ [(const_int 0)]
+ {
+ rtx pat, clob;
+ pat = gen_rtx_SET (VOIDmode, operands[0],
+ gen_rtx_PLUS (GET_MODE (operands[0]),
+ operands[2], operands[1]));
+ clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
+ emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, pat, clob)));
+ DONE;
+ })
+
;; Convert lea to the lea pattern to avoid flags dependency.
(define_split
[(set (match_operand:DI 0 "register_operand" "")
***************
*** 19876,19881 ****
--- 19897,19945 ----
operands[1] = gen_rtx_SUBREG (mode, operands[1], 0);
operands[0] = dest;
})
+
+ ;; On Athlon and K8 CPUs the sequence:
+ ;; leal 0(,%edx,8), %eax
+ ;; leal (%eax,%edx,2), %eax
+ ;; Has latency of 4, while the second lea can be decomposed
+ ;; resulting in latency of 3.
+ ;; Ideally we would only check for an arbitrary first instruction
+ ;; that uses the same operand as the second, but this is not easily doable.
+
+ (define_peephole2
+ [(set (match_operand:SI 0 "register_operand" "")
+ (match_operand:SI 1 "no_seg_address_operand" ""))
+ (set (match_operand:SI 2 "register_operand" "")
+ (plus:SI (mult:SI (match_operand 3)
+ (match_operand:SI 4 "const248_operand" ""))
+ (match_dup 0)))]
+ "!optimize_size
+ && TARGET_ATHLON_K8
+ /* Destination must match one of operands of last plus so we won't end up
+ with lea again. */
+ && (rtx_equal_p (operands[0], operands[2])
+ || rtx_equal_p (operands[3], operands[2]))
+ /* Avoid degenerate cases. */
+ && !reg_overlap_mentioned_p (operands[0], operands[3])
+ /* When there is no use of the other value, the transformation might
+ not be profitable. */
+ && reg_overlap_mentioned_p (operands[3], operands[1])
+ && peep2_regno_dead_p (0, FLAGS_REG)"
+ [(set (match_dup 0) (match_dup 1))
+ (parallel [(set (match_dup 3)
+ (ashift:SI (match_dup 3)
+ (match_dup 4)))
+ (clobber (reg:CC FLAGS_REG))])
+ (parallel [(set (match_dup 2)
+ (plus:SI (match_dup 2)
+ (match_dup 5)))
+ (clobber (reg:CC FLAGS_REG))])]
+ {
+ operands[4] = GEN_INT (floor_log2 (INTVAL (operands[4])));
+ operands[5] = (rtx_equal_p (operands[0], operands[2])
+ ? operands[3] : operands[0]);
+ })
+
;; Call-value patterns last so that the wildcard operand does not
;; disrupt insn-recog's switch tables.
More information about the Gcc-patches
mailing list