This is the mail archive of the
gcc-patches@gcc.gnu.org
mailing list for the GCC project.
[PATCH] Tweak signed division by power of two
- From: Roger Sayle <roger at eyesopen dot com>
- To: gcc-patches at gcc dot gnu dot org
- Date: Tue, 13 Jul 2004 11:48:46 -0600 (MDT)
- Subject: [PATCH] Tweak signed division by power of two
The following patch splits "signed integer division by a power of
two" out of expand_divmod into its own function, expand_sdiv_pow2.
This is similar to the recently introduced expand_smod_pow2.
Additionally, there are several minor improvements. The first is
that we lower the threshold for using a branch-less implementation
from "BRANCH_COST>=3" to "BRANCH_COST>=2". This allows the use of
GCC's straight-line code on PentiumPro and Pentium4 architectures
as done by Intel's and Microsoft's C/C++ compilers for IA-32.
The second change is to special case the division by two code.
Currently, GCC expands initial RTL as a signed right shift by 31,
followed by an unsigned right shift by 31, to implement the
equivalent of "x < 0 ? 1 : 0". The first shift is of course
redundant, and gets cleaned up by GCC's RTL optimizers. In the
code below, we not only avoid generating unnecessary RTL, but by
using emit_store_flag we can take advantage of special instructions
on machines that support setcc.
[Thinking about it a bit more, as a follow-up patch I could check
the target's STORE_FLAG_VALUE and choose between an addition or a
subtraction depending upon whether "x < 0" is natively handled as
one or minus one. If someone could suggest a target that could
benefit from this, let me know and I'll test a cross-compiler].
Finally, the current branchless implementation uses two consecutive
shift operations to calculate "x < 0 ? d-1 : 0". On platforms
where shifts are relatively expensive, such as the pentium4, this
is more efficiently implemented as a shift followed by a bit-wise
AND. There's a trade-off here, the AND requires an larger immediate
constant but is often faster and schedules better than a shift.
Cautiously, if rtx_costs are equal I stick with the current code.
These improvements can be demonstrated on x86 by the simple function
int div8(int x)
{
return x/8;
}
Previously on mainline, with "-O2 -fomit-frame-pointer" on all x86
CPUs this becomes:
div8: movl 4(%esp), %eax
testl %eax, %eax
js .L4
sarl $3, %eax
ret
.L4: addl $7, %eax
sarl $3, %eax
ret
And indeed, we continue to generate exactly this sequence on IA-32
when tuning for the i486 and lower. Adding the command line option
"-mtune=i586", we now generate the branchless sequence:
div8: movl 4(%esp), %edx
movl %edx, %eax
sarl $31, %eax
shrl $29, %eax
addl %edx, %eax
sarl $3, %eax
ret
And using "-mtune=pentium4" instead, we generate the new improved
branchless sequence:
div8: movl 4(%esp), %edx
movl %edx, %eax
sarl $31, %eax
andl $7, %eax
addl %edx, %eax
sarl $3, %eax
ret
The following patch has been tested on i686-pc-linux-gnu with a full
"make bootstrap", all default languages, and regression tested with
a top-level "make -k check" with no new failures.
Ok for mainline?
2004-07-13 Roger Sayle <roger@eyesopen.com>
* expmed.c (expand_sdiv_pow2): New function to expand signed division
by a positive power of two, split out from expand_divmod. Provide
an alternate implementation when shifts are expensive. Lower the
threshold for using a branchless implementation to BRANCH_COST >= 2.
(expand_divmod): Call expand_sdiv_pow2 for suitable divisions.
Index: expmed.c
===================================================================
RCS file: /cvs/gcc/gcc/gcc/expmed.c,v
retrieving revision 1.181
diff -c -3 -p -r1.181 expmed.c
*** expmed.c 11 Jul 2004 11:20:44 -0000 1.181
--- expmed.c 13 Jul 2004 13:37:32 -0000
*************** static rtx extract_split_bit_field (rtx,
*** 52,57 ****
--- 52,58 ----
unsigned HOST_WIDE_INT, int);
static void do_cmp_and_jump (rtx, rtx, enum rtx_code, enum machine_mode, rtx);
static rtx expand_smod_pow2 (enum machine_mode, rtx, HOST_WIDE_INT);
+ static rtx expand_sdiv_pow2 (enum machine_mode, rtx, HOST_WIDE_INT);
/* Nonzero means divides or modulus operations are relatively cheap for
powers of two, so don't use branches; emit the operation instead.
*************** expand_smod_pow2 (enum machine_mode mode
*** 3185,3190 ****
--- 3186,3238 ----
emit_label (label);
return result;
}
+
+ /* Expand signed division of OP0 by a power of two D in mode MODE.
+ This routine is only called for positive values of D. */
+
+ static rtx
+ expand_sdiv_pow2 (enum machine_mode mode, rtx op0, HOST_WIDE_INT d)
+ {
+ rtx temp, label;
+ tree shift;
+ int logd;
+
+ logd = floor_log2 (d);
+ shift = build_int_2 (logd, 0);
+
+ if (d == 2 && BRANCH_COST >= 1)
+ {
+ temp = gen_reg_rtx (mode);
+ temp = emit_store_flag (temp, LT, op0, const0_rtx, mode, 0, 1);
+ temp = expand_binop (mode, add_optab, temp, op0, NULL_RTX,
+ 0, OPTAB_LIB_WIDEN);
+ return expand_shift (RSHIFT_EXPR, mode, temp, shift, NULL_RTX, 0);
+ }
+
+ if (BRANCH_COST >= 2)
+ {
+ int ushift = GET_MODE_BITSIZE (mode) - logd;
+
+ temp = gen_reg_rtx (mode);
+ temp = emit_store_flag (temp, LT, op0, const0_rtx, mode, 0, -1);
+ if (shift_cost[mode][ushift] > COSTS_N_INSNS (1))
+ temp = expand_binop (mode, and_optab, temp, GEN_INT (d - 1),
+ NULL_RTX, 0, OPTAB_LIB_WIDEN);
+ else
+ temp = expand_shift (RSHIFT_EXPR, mode, temp,
+ build_int_2 (ushift, 0), NULL_RTX, 1);
+ temp = expand_binop (mode, add_optab, temp, op0, NULL_RTX,
+ 0, OPTAB_LIB_WIDEN);
+ return expand_shift (RSHIFT_EXPR, mode, temp, shift, NULL_RTX, 0);
+ }
+
+ label = gen_label_rtx ();
+ temp = copy_to_mode_reg (mode, op0);
+ do_cmp_and_jump (temp, const0_rtx, GE, mode, label);
+ expand_inc (temp, GEN_INT (d - 1));
+ emit_label (label);
+ return expand_shift (RSHIFT_EXPR, mode, temp, shift, NULL_RTX, 0);
+ }
/* Emit the code to divide OP0 by OP1, putting the result in TARGET
if that is convenient, and returning where the result is.
*************** expand_divmod (int rem_flag, enum tree_c
*** 3597,3634 ****
if (remainder)
return gen_lowpart (mode, remainder);
}
! lgup = floor_log2 (abs_d);
! if (BRANCH_COST < 1 || (abs_d != 2 && BRANCH_COST < 3))
! {
! rtx label = gen_label_rtx ();
! rtx t1;
!
! t1 = copy_to_mode_reg (compute_mode, op0);
! do_cmp_and_jump (t1, const0_rtx, GE,
! compute_mode, label);
! expand_inc (t1, gen_int_mode (abs_d - 1,
! compute_mode));
! emit_label (label);
! quotient = expand_shift (RSHIFT_EXPR, compute_mode, t1,
! build_int_2 (lgup, 0),
! tquotient, 0);
! }
! else
! {
! rtx t1, t2, t3;
! t1 = expand_shift (RSHIFT_EXPR, compute_mode, op0,
! build_int_2 (size - 1, 0),
! NULL_RTX, 0);
! t2 = expand_shift (RSHIFT_EXPR, compute_mode, t1,
! build_int_2 (size - lgup, 0),
! NULL_RTX, 1);
! t3 = force_operand (gen_rtx_PLUS (compute_mode,
! op0, t2),
! NULL_RTX);
! quotient = expand_shift (RSHIFT_EXPR, compute_mode, t3,
! build_int_2 (lgup, 0),
! tquotient, 0);
! }
/* We have computed OP0 / abs(OP1). If OP1 is negative,
negate the quotient. */
--- 3645,3651 ----
if (remainder)
return gen_lowpart (mode, remainder);
}
! quotient = expand_sdiv_pow2 (compute_mode, op0, abs_d);
/* We have computed OP0 / abs(OP1). If OP1 is negative,
negate the quotient. */
Roger
--
Roger Sayle, E-mail: roger@eyesopen.com
OpenEye Scientific Software, WWW: http://www.eyesopen.com/
Suite 1107, 3600 Cerrillos Road, Tel: (+1) 505-473-7385
Santa Fe, New Mexico, 87507. Fax: (+1) 505-473-0833