Yet another attempt to solve the multiplication by 11 (and many, perhaps most, other constants) problem

Jan Hubicka jh@suse.cz
Wed Jan 18 22:51:00 GMT 2006


Hi,
I've been playing a bit further with the mult expansion, as this seems a bit ugly
in my generic patch, and here is an update.  I've added an i386 pattern to decompose
LEA when doing so provably reduces latency (this is needed to keep the optimal *11
sequence from being "optimized" into one with one extra cycle).  This issue is however
somewhat independent of the fact that synth_mult sometimes goes very wrong.

Looking at the synth_mult latency heuristic, one bit I really don't like about
it is that it tries to introduce parallelism via:
   alg_add_factor	total := total * coeff + total;
   alg_sub_factor	total := total * coeff - total;
assuming their latency to be the cost of an addition.  This is not really correct; the
sequence they translate into:
   t2 = t;
   t = t<<2;
   t += t2
has no parallelism in it.  If it is preceded by an extra shift operation (like t<<8),
it eventually (after optimization) gets translated as:

   t2 = t<<8
   t = t<<10
   t += t2

resulting in parallelism on a 3-address machine, though not on i386.  The parallelism
is, however, only implicit in the alg_* sequence and thus there is not much we
can do about it.  The alg_* operations also include:
   alg_add_t_m2		total := total + multiplicand * coeff;
   alg_sub_t_m2		total := total - multiplicand * coeff;
These naturally allow parallelism without any CSE interference:

   t2 = t<<8
   t = somealgorithm
   t = t+t2

which seems like a more plausible way to represent the sequences above, but
unfortunately this is never considered by synth_mult, since it only considers
those two patterns with coeff == 1 (i.e. exactly in the cases that are useless
parallelism-wise).
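
To make the dependency chains concrete, here is a small C sketch of the two
ways x*11 can be expanded (illustration only, not code from the patch; the
function names are mine).  The corresponding i386 sequences are listed further
down:

/* Illustration only: the two expansions of x*11 as straight-line C.  */

unsigned int
mul11_factor (unsigned int x)
{
  /* Current choice: x*11 = (x*16 - x*4) - x.  The two shifts are
     independent, but the two subtractions form a serial chain, so the
     critical path is one cycle longer.  */
  unsigned int t2 = x << 2;     /* x*4  */
  unsigned int t = x << 4;      /* x*16 */
  t -= t2;                      /* x*12 */
  t -= x;                       /* x*11 */
  return t;
}

unsigned int
mul11_t_m2 (unsigned int x)
{
  /* alg_add_t_m2 with coeff != 1: x*11 = x*3 + x*8.  The two halves are
     independent (an lea and a shift on i386) and only the final addition
     depends on both.  */
  unsigned int t2 = x + (x << 1);   /* x*3, a single lea */
  unsigned int t = x << 3;          /* x*8 */
  return t + t2;                    /* x*11 */
}
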
The patch below adds some code (not cleaned up, just an experiment) to consider
all possible coeffs and latencies.  When the latency code in alg_add_factor is
disabled, it improves the resulting sequences quite a bit.
I believe it should be possible to disable the alg_add_factor code, since we can
represent the CSEed sequences above using alg_add_t_m2/alg_sub_t_m2.
I.e. checking the constants between 11 and 22:

for 11:
        leal    (%eax,%eax,2), %edx
        sall    $3, %eax
        addl    %edx, %eax
(3 cycles, 1 temporary) instead of:
        leal    0(,%eax,4), %ecx
        movl    %eax, %edx
        sall    $4, %edx
        subl    %ecx, %edx
        subl    %eax, %edx
(4 cycles, 2 temporaries)

For 12:
        leal    (%eax,%eax,2), %eax
        sall    $2, %eax
(3 cycles, no temporary) instead of:
        leal    0(,%eax,4), %edx
        sall    $4, %eax
        subl    %edx, %eax
(3 cycles, 1 temporary)

For 13:
        leal    (%eax,%eax,4), %edx
        sall    $3, %eax
        addl    %edx, %eax
(3 cycles, 1 temporary) instead of:
        leal    0(,%eax,4), %ecx
        movl    %eax, %edx
        sall    $4, %edx
        subl    %ecx, %edx
        addl    %eax, %edx

For 18:
        leal    (%eax,%eax,8), %eax
        addl    %eax, %eax
(3 cycles, 0 temporaries) instead of:
        movl    %eax, %edx
        sall    $4, %edx
        leal    (%edx,%eax,2), %eax
(4 cycles, 1 temporary)

For 19:
        leal    (%eax,%eax,2), %edx
        sall    $4, %eax
        addl    %eax, %edx
(3 cycles, 1 temporary) instead of:
        movl    %eax, %edx
        sall    $4, %edx
        leal    (%edx,%eax,4), %edx
        subl    %eax, %edx
(4 cycles, 1 temporary)

For 20:
        leal    (%eax,%eax,4), %eax
        sall    $2, %eax
(3 cycles, 0 temporaries) instead of:
        movl    %eax, %edx
        sall    $4, %edx
        leal    (%edx,%eax,4), %eax
(4 cycles, 1 temporary)

For 21:
        leal    (%eax,%eax,4), %edx
        sall    $4, %eax
        addl    %eax, %edx
(3 cycles, 1 temporary) instead of:
        movl    %eax, %edx
        sall    $4, %edx
        leal    (%edx,%eax,4), %edx
        addl    %eax, %edx
(5 cycles, 1 temporary)


For 22:
        leal    (%eax,%eax,2), %edx
        sall    $3, %eax
        addl    %edx, %eax
        addl    %eax, %eax
(4 cycles, 1 temporary) instead of:
        leal    0(,%eax,4), %ecx
        movl    %eax, %edx
        sall    $4, %edx
        subl    %ecx, %edx
        subl    %eax, %edx
        addl    %edx, %edx
(5 cycles, 2 temporaries)

and so on.  The list of improved constants is rather dense.
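
As a quick cross-check of the new decompositions (11 = 3 + 8, 13 = 5 + 8,
19 = 3 + 16, 21 = 5 + 16), a throwaway harness along these lines (illustration
only, not part of the patch) confirms they compute the right products:

/* Illustration only: verify the shift/add decompositions behind the
   sequences above.  */
#include <assert.h>

static unsigned int mul11 (unsigned int x) { return (x + (x << 1)) + (x << 3); }
static unsigned int mul13 (unsigned int x) { return (x + (x << 2)) + (x << 3); }
static unsigned int mul19 (unsigned int x) { return (x + (x << 1)) + (x << 4); }
static unsigned int mul21 (unsigned int x) { return (x + (x << 2)) + (x << 4); }

int
main (void)
{
  unsigned int x;
  for (x = 0; x < 1000000; x++)
    {
      assert (mul11 (x) == x * 11);
      assert (mul13 (x) == x * 13);
      assert (mul19 (x) == x * 19);
      assert (mul21 (x) == x * 21);
    }
  return 0;
}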

It loses on *14:
        leal    0(,%edx,8), %eax
        subl    %edx, %eax
        addl    %eax, %eax
(4 cycles, 1 temporary)
instead of:
        leal    (%eax,%eax), %edx
        sall    $4, %eax
        subl    %edx, %eax
(3 cycles, 1 temporary)

because the 2-address nature of i386 (with the lea exception) is not modeled
very precisely, but perhaps that can be hacked around.
I checked 30 other constants and I don't know of any sequence where the old
code would win (even on a 3-address machine like PPC).  On the first 256
numbers we also never use a 3rd temporary.

We still lose on *3/*7/*15 relative to the AMD recommendation.  We do:
        leal    0(,%edx,8), %eax
        subl    %edx, %eax
which is one instruction shorter than:
	mov	%eax, %edx
	sall	$4, %eax
	subl	%edx, %eax
but requires one extra cycle.  This is a result of regalloc being smart enough to
figure out that it can save a register by making the destination different from the
source, but not knowing that doing so increases latency...

What do you think of this approach?

Honza

Index: expmed.c
===================================================================
-c -L expmed.c	(revision 109639) -L expmed.c	(working copy) .svn/text-base/expmed.c.svn-base expmed.c
*** expmed.c	(revision 109639)
--- expmed.c	(working copy)
*************** synth_mult (struct algorithm *alg_out, u
*** 2650,2657 ****
      {
        unsigned HOST_WIDE_INT d;
  
        d = ((unsigned HOST_WIDE_INT) 1 << m) + 1;
!       if (t % d == 0 && t > d && m < maxm
  	  && (!cache_hit || cache_alg == alg_add_factor))
  	{
  	  /* If the target has a cheap shift-and-add instruction use
--- 2650,2720 ----
      {
        unsigned HOST_WIDE_INT d;
  
+       if (1)
+ 	{
+ 	  /* If the target has a cheap shift-and-add instruction use
+ 	     that in preference to a shift insn followed by an add insn.
+ 	     Assume that the shift-and-add is "atomic" with a latency
+ 	     equal to its cost */
+ 	  op_cost = add_cost[mode] + shift_cost[mode][m];
+ 	  if (shiftadd_cost[mode][m] < op_cost)
+ 	    op_cost = op_latency = shiftadd_cost[mode][m];
+ 	  else
+ 	    op_latency = add_cost[mode];
+ 
+ 	  new_limit.cost = best_cost.cost - op_cost;
+ 	  new_limit.latency = best_cost.latency - op_latency;
+ 	  synth_mult (alg_in, t - ((unsigned HOST_WIDE_INT) 1 << m),
+ 		      &new_limit, mode);
+ 
+ 	  alg_in->cost.cost += op_cost;
+ 	  if (alg_in->cost.latency < shift_cost[mode][m] + add_cost[mode]
+ 	      && op_cost != op_latency)
+ 	    alg_in->cost.latency = shift_cost[mode][m] + add_cost[mode];
+ 	  alg_in->cost.latency += op_latency;
+ 	  if (CHEAPER_MULT_COST (&alg_in->cost, &best_cost))
+ 	    {
+ 	      struct algorithm *x;
+ 	      best_cost = alg_in->cost;
+ 	      x = alg_in, alg_in = best_alg, best_alg = x;
+ 	      best_alg->log[best_alg->ops] = m;
+ 	      best_alg->op[best_alg->ops] = alg_add_t_m2;
+ 	    }
+ 	}
+       if (1)
+ 	{
+ 	  /* If the target has a cheap shift-and-add instruction use
+ 	     that in preference to a shift insn followed by an add insn.
+ 	     Assume that the shift-and-add is "atomic" with a latency
+ 	     equal to its cost */
+ 	  op_cost = add_cost[mode] + shift_cost[mode][m];
+ 	  if (shiftadd_cost[mode][m] < op_cost)
+ 	    op_cost = op_latency = shiftadd_cost[mode][m];
+ 	  else
+ 	    op_latency = add_cost[mode];
+ 
+ 	  new_limit.cost = best_cost.cost - op_cost;
+ 	  new_limit.latency = best_cost.latency - op_latency;
+ 	  synth_mult (alg_in, t + ((unsigned HOST_WIDE_INT) 1 << m),
+ 		      &new_limit, mode);
+ 
+ 	  alg_in->cost.cost += op_cost;
+ 	  if (alg_in->cost.latency < shift_cost[mode][m] + add_cost[mode]
+ 	      && op_cost != op_latency)
+ 	    alg_in->cost.latency = shift_cost[mode][m] + add_cost[mode];
+ 	  alg_in->cost.latency += op_latency;
+ 	  if (CHEAPER_MULT_COST (&alg_in->cost, &best_cost))
+ 	    {
+ 	      struct algorithm *x;
+ 	      best_cost = alg_in->cost;
+ 	      x = alg_in, alg_in = best_alg, best_alg = x;
+ 	      best_alg->log[best_alg->ops] = m;
+ 	      best_alg->op[best_alg->ops] = alg_sub_t_m2;
+ 	    }
+ 	}
+ 
        d = ((unsigned HOST_WIDE_INT) 1 << m) + 1;
!       if (t % d == 0 && t > d && m < maxm 
  	  && (!cache_hit || cache_alg == alg_add_factor))
  	{
  	  /* If the target has a cheap shift-and-add instruction use
*************** synth_mult (struct algorithm *alg_out, u
*** 2662,2673 ****
  	     earlier steps in the algorithm.  */
  	  op_cost = add_cost[mode] + shift_cost[mode][m];
  	  if (shiftadd_cost[mode][m] < op_cost)
! 	    {
! 	      op_cost = shiftadd_cost[mode][m];
! 	      op_latency = op_cost;
! 	    }
! 	  else
! 	    op_latency = add_cost[mode];
  
  	  new_limit.cost = best_cost.cost - op_cost;
  	  new_limit.latency = best_cost.latency - op_latency;
--- 2725,2732 ----
  	     earlier steps in the algorithm.  */
  	  op_cost = add_cost[mode] + shift_cost[mode][m];
  	  if (shiftadd_cost[mode][m] < op_cost)
! 	    op_cost = shiftadd_cost[mode][m];
! 	  op_latency = op_cost;
  
  	  new_limit.cost = best_cost.cost - op_cost;
  	  new_limit.latency = best_cost.latency - op_latency;
*************** synth_mult (struct algorithm *alg_out, u
*** 2690,2696 ****
  	}
  
        d = ((unsigned HOST_WIDE_INT) 1 << m) - 1;
!       if (t % d == 0 && t > d && m < maxm
  	  && (!cache_hit || cache_alg == alg_sub_factor))
  	{
  	  /* If the target has a cheap shift-and-subtract insn use
--- 2749,2755 ----
  	}
  
        d = ((unsigned HOST_WIDE_INT) 1 << m) - 1;
!       if (t % d == 0 && t > d && m < maxm 
  	  && (!cache_hit || cache_alg == alg_sub_factor))
  	{
  	  /* If the target has a cheap shift-and-subtract insn use
*************** synth_mult (struct algorithm *alg_out, u
*** 2700,2712 ****
  	     hardware the shift may be executed concurrently with the
  	     earlier steps in the algorithm.  */
  	  op_cost = add_cost[mode] + shift_cost[mode][m];
! 	  if (shiftsub_cost[mode][m] < op_cost)
! 	    {
! 	      op_cost = shiftsub_cost[mode][m];
! 	      op_latency = op_cost;
! 	    }
! 	  else
! 	    op_latency = add_cost[mode];
  
  	  new_limit.cost = best_cost.cost - op_cost;
  	  new_limit.latency = best_cost.latency - op_latency;
--- 2759,2767 ----
  	     hardware the shift may be executed concurrently with the
  	     earlier steps in the algorithm.  */
  	  op_cost = add_cost[mode] + shift_cost[mode][m];
! 	  if (shiftsub_cost[mode][m] < op_cost)
! 	    op_cost = shiftsub_cost[mode][m];
! 	  op_latency = op_cost;
  
  	  new_limit.cost = best_cost.cost - op_cost;
  	  new_limit.latency = best_cost.latency - op_latency;
Index: config/i386/i386.md
===================================================================
-c -L config/i386/i386.md	(revision 109639) -L config/i386/i386.md	(working copy) config/i386/.svn/text-base/i386.md.svn-base config/i386/i386.md
*** config/i386/i386.md	(revision 109639)
--- config/i386/i386.md	(working copy)
***************
*** 5130,5135 ****
--- 5130,5156 ----
  	   (const_string "alu")))
     (set_attr "mode" "DI")])
  
+ ;; Canonicalize plus operations.  Since the PLUS operation allows
+ ;; the "lea" alternative there is nothing to enforce the canonical operand
+ ;; order after reload.
+ (define_split
+   [(set (match_operand 0 "register_operand" "")
+ 	(plus (match_operand 1 "register_operand" "")
+ 	      (match_operand 2 "register_operand" "")))
+    (clobber (reg:CC FLAGS_REG))]
+   "rtx_equal_p (operands[0],  operands[2])
+    && !rtx_equal_p (operands[0], operands[1])"
+   [(const_int 0)]
+ {
+   rtx pat, clob;
+   pat = gen_rtx_SET (VOIDmode, operands[0],
+ 		     gen_rtx_PLUS (GET_MODE (operands[0]),
+ 				   operands[2], operands[1]));
+   clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
+   emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, pat, clob)));
+   DONE;
+ })
+ 
  ;; Convert lea to the lea pattern to avoid flags dependency.
  (define_split
    [(set (match_operand:DI 0 "register_operand" "")
***************
*** 19876,19881 ****
--- 19897,19945 ----
      operands[1] = gen_rtx_SUBREG (mode, operands[1], 0);
    operands[0] = dest;
  })
+ 
+ ;; On Athlon and K8 CPUs the sequence:
+ ;;   leal    0(,%edx,8), %eax
+ ;;   leal    (%eax,%edx,2), %eax
+ ;; has latency of 4, while the second lea can be decomposed,
+ ;; resulting in a latency of 3.
+ ;; Ideally we would only check for an arbitrary first instruction
+ ;; that uses the same operand as the second, but this is not easily doable.
+ 
+ (define_peephole2
+   [(set (match_operand:SI 0 "register_operand" "")
+ 	(match_operand:SI 1 "no_seg_address_operand" ""))
+    (set (match_operand:SI 2 "register_operand" "")
+ 	(plus:SI (mult:SI (match_operand 3)
+ 			  (match_operand:SI 4 "const248_operand" ""))
+ 		 (match_dup 0)))]
+   "!optimize_size
+    && TARGET_ATHLON_K8
+    /* Destination must match one of the operands of the last plus so we
+       won't end up with an lea again.  */
+    && (rtx_equal_p (operands[0], operands[2])
+        || rtx_equal_p (operands[3], operands[2]))
+    /* Avoid degenerate cases.  */
+    && !reg_overlap_mentioned_p (operands[0], operands[3])
+    /* When there is no use of the other value, the transformation might
+       not be profitable.  */
+    && reg_overlap_mentioned_p (operands[3], operands[1])
+    && peep2_regno_dead_p (0, FLAGS_REG)"
+   [(set (match_dup 0) (match_dup 1))
+    (parallel [(set (match_dup 3)
+ 		   (ashift:SI (match_dup 3)
+ 			      (match_dup 4)))
+ 	      (clobber (reg:CC FLAGS_REG))])
+    (parallel [(set (match_dup 2)
+ 	      (plus:SI (match_dup 2)
+ 		       (match_dup 5)))
+ 	      (clobber (reg:CC FLAGS_REG))])]
+ {
+    operands[4] = GEN_INT (floor_log2 (INTVAL (operands[4])));
+    operands[5] = (rtx_equal_p (operands[0], operands[2])
+ 		  ? operands[3] : operands[0]);
+ })
+ 
  
  ;; Call-value patterns last so that the wildcard operand does not
  ;; disrupt insn-recog's switch tables.


