This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

[PATCH] Tweak signed division by power of two


The following patch splits "signed integer division by a power of
two" out of expand_divmod into its own function, expand_sdiv_pow2.
This is similar to the recently introduced expand_smod_pow2.

Additionally, there are several minor improvements.  The first is
that we lower the threshold for using a branch-less implementation
from "BRANCH_COST>=3" to "BRANCH_COST>=2".  This allows the use of
GCC's straight-line code on PentiumPro and Pentium4 architectures
as done by Intel's and Microsoft's C/C++ compilers for IA-32.

The second change is to special case the division by two code.
Currently, GCC expands initial RTL as a signed right shift by 31,
followed by an unsigned right shift by 31, to implement the
equivalent of "x < 0 ? 1 : 0".  The first shift is of course
redundant, and gets cleaned up by GCC's RTL optimizers.  In the
code below, we not only avoid generating unnecessary RTL, but by
using emit_store_flag we can take advantage of special instructions
on machines that support setcc.

[Thinking about it a bit more, as a follow-up patch I could check
the target's STORE_FLAG_VALUE and choose between an addition or a
subtraction  depending upon whether "x < 0" is natively handled as
one or minus one.  If someone could suggest a target that could
benefit from this, let me know and I'll test a cross-compiler].

Finally, the current branchless implementation uses two consecutive
shift operations to calculate "x < 0 ? d-1 : 0".  On platforms
where shifts are relatively expensive, such as the pentium4, this
is more efficiently implemented as a shift followed by a bit-wise
AND.  There's a trade-off here, the AND requires an larger immediate
constant but is often faster and schedules better than a shift.
Cautiously, if rtx_costs are equal I stick with the current code.


These improvements can be demonstrated on x86 by the simple function

int div8(int x)
{
  return x/8;
}

Previously on mainline, with "-O2 -fomit-frame-pointer" on all x86
CPUs this becomes:

div8:   movl    4(%esp), %eax
        testl   %eax, %eax
        js      .L4
        sarl    $3, %eax
        ret
.L4:    addl    $7, %eax
        sarl    $3, %eax
        ret


And indeed, we continue to generate exactly this sequence on IA-32
when tuning for the i486 and lower.  Adding the command line option
"-mtune=i586", we now generate the branchless sequence:

div8:   movl    4(%esp), %edx
        movl    %edx, %eax
        sarl    $31, %eax
        shrl    $29, %eax
        addl    %edx, %eax
        sarl    $3, %eax
        ret

And using "-mtune=pentium4" instead, we generate the new improved
branchless sequence:

div8:   movl    4(%esp), %edx
        movl    %edx, %eax
        sarl    $31, %eax
        andl    $7, %eax
        addl    %edx, %eax
        sarl    $3, %eax
        ret


The following patch has been tested on i686-pc-linux-gnu with a full
"make bootstrap", all default languages, and regression tested with
a top-level "make -k check" with no new failures.

Ok for mainline?



2004-07-13  Roger Sayle  <roger@eyesopen.com>

	* expmed.c (expand_sdiv_pow2): New function to expand signed division
	by a positive power of two, split out from expand_divmod.  Provide
	an alternate implementation when shifts are expensive.  Lower the
	threshold for using a branchless implementation to BRANCH_COST >= 2.
	(expand_divmod): Call expand_sdiv_pow2 for suitable divisions.


Index: expmed.c
===================================================================
RCS file: /cvs/gcc/gcc/gcc/expmed.c,v
retrieving revision 1.181
diff -c -3 -p -r1.181 expmed.c
*** expmed.c	11 Jul 2004 11:20:44 -0000	1.181
--- expmed.c	13 Jul 2004 13:37:32 -0000
*************** static rtx extract_split_bit_field (rtx,
*** 52,57 ****
--- 52,58 ----
  				    unsigned HOST_WIDE_INT, int);
  static void do_cmp_and_jump (rtx, rtx, enum rtx_code, enum machine_mode, rtx);
  static rtx expand_smod_pow2 (enum machine_mode, rtx, HOST_WIDE_INT);
+ static rtx expand_sdiv_pow2 (enum machine_mode, rtx, HOST_WIDE_INT);

  /* Nonzero means divides or modulus operations are relatively cheap for
     powers of two, so don't use branches; emit the operation instead.
*************** expand_smod_pow2 (enum machine_mode mode
*** 3185,3190 ****
--- 3186,3238 ----
    emit_label (label);
    return result;
  }
+
+ /* Expand signed division of OP0 by a power of two D in mode MODE.
+    This routine is only called for positive values of D.  */
+
+ static rtx
+ expand_sdiv_pow2 (enum machine_mode mode, rtx op0, HOST_WIDE_INT d)
+ {
+   rtx temp, label;
+   tree shift;
+   int logd;
+
+   logd = floor_log2 (d);
+   shift = build_int_2 (logd, 0);
+
+   if (d == 2 && BRANCH_COST >= 1)
+     {
+       temp = gen_reg_rtx (mode);
+       temp = emit_store_flag (temp, LT, op0, const0_rtx, mode, 0, 1);
+       temp = expand_binop (mode, add_optab, temp, op0, NULL_RTX,
+ 			   0, OPTAB_LIB_WIDEN);
+       return expand_shift (RSHIFT_EXPR, mode, temp, shift, NULL_RTX, 0);
+     }
+
+   if (BRANCH_COST >= 2)
+     {
+       int ushift = GET_MODE_BITSIZE (mode) - logd;
+
+       temp = gen_reg_rtx (mode);
+       temp = emit_store_flag (temp, LT, op0, const0_rtx, mode, 0, -1);
+       if (shift_cost[mode][ushift] > COSTS_N_INSNS (1))
+ 	temp = expand_binop (mode, and_optab, temp, GEN_INT (d - 1),
+ 			     NULL_RTX, 0, OPTAB_LIB_WIDEN);
+       else
+ 	temp = expand_shift (RSHIFT_EXPR, mode, temp,
+ 			     build_int_2 (ushift, 0), NULL_RTX, 1);
+       temp = expand_binop (mode, add_optab, temp, op0, NULL_RTX,
+ 			   0, OPTAB_LIB_WIDEN);
+       return expand_shift (RSHIFT_EXPR, mode, temp, shift, NULL_RTX, 0);
+     }
+
+   label = gen_label_rtx ();
+   temp = copy_to_mode_reg (mode, op0);
+   do_cmp_and_jump (temp, const0_rtx, GE, mode, label);
+   expand_inc (temp, GEN_INT (d - 1));
+   emit_label (label);
+   return expand_shift (RSHIFT_EXPR, mode, temp, shift, NULL_RTX, 0);
+ }

  /* Emit the code to divide OP0 by OP1, putting the result in TARGET
     if that is convenient, and returning where the result is.
*************** expand_divmod (int rem_flag, enum tree_c
*** 3597,3634 ****
  			if (remainder)
  			  return gen_lowpart (mode, remainder);
  		      }
! 		    lgup = floor_log2 (abs_d);
! 		    if (BRANCH_COST < 1 || (abs_d != 2 && BRANCH_COST < 3))
! 		      {
! 			rtx label = gen_label_rtx ();
! 			rtx t1;
!
! 			t1 = copy_to_mode_reg (compute_mode, op0);
! 			do_cmp_and_jump (t1, const0_rtx, GE,
! 					 compute_mode, label);
! 			expand_inc (t1, gen_int_mode (abs_d - 1,
! 						      compute_mode));
! 			emit_label (label);
! 			quotient = expand_shift (RSHIFT_EXPR, compute_mode, t1,
! 						 build_int_2 (lgup, 0),
! 						 tquotient, 0);
! 		      }
! 		    else
! 		      {
! 			rtx t1, t2, t3;
! 			t1 = expand_shift (RSHIFT_EXPR, compute_mode, op0,
! 					   build_int_2 (size - 1, 0),
! 					   NULL_RTX, 0);
! 			t2 = expand_shift (RSHIFT_EXPR, compute_mode, t1,
! 					   build_int_2 (size - lgup, 0),
! 					   NULL_RTX, 1);
! 			t3 = force_operand (gen_rtx_PLUS (compute_mode,
! 							  op0, t2),
! 					    NULL_RTX);
! 			quotient = expand_shift (RSHIFT_EXPR, compute_mode, t3,
! 						 build_int_2 (lgup, 0),
! 						 tquotient, 0);
! 		      }

  		    /* We have computed OP0 / abs(OP1).  If OP1 is negative,
  		       negate the quotient.  */
--- 3645,3651 ----
  			if (remainder)
  			  return gen_lowpart (mode, remainder);
  		      }
! 		    quotient = expand_sdiv_pow2 (compute_mode, op0, abs_d);

  		    /* We have computed OP0 / abs(OP1).  If OP1 is negative,
  		       negate the quotient.  */

Roger
--
Roger Sayle,                         E-mail: roger@eyesopen.com
OpenEye Scientific Software,         WWW: http://www.eyesopen.com/
Suite 1107, 3600 Cerrillos Road,     Tel: (+1) 505-473-7385
Santa Fe, New Mexico, 87507.         Fax: (+1) 505-473-0833


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]