This is the mail archive of the
gcc-patches@gcc.gnu.org
mailing list for the GCC project.
[PATCH] Improved signed integer remainder by power of two
- From: Roger Sayle <roger at eyesopen dot com>
- To: gcc-patches at gcc dot gnu dot org
- Cc: Steven Bosscher <stevenb at suse dot de>
- Date: Sun, 27 Jun 2004 19:23:31 -0600 (MDT)
- Subject: [PATCH] Improved signed integer remainder by power of two
The following patch improves the code that GCC generates for signed
integer remainder, modulo a constant which is a power of two. Many
thanks for Steven Bosscher for bringing this to my attention, pointing
out that GCC generates inferior code to other x86 compilers, including
the IBM compiler for OS/2 and the OpenWatcom compiler. Thanks also
to Andi Kleen who apparently pointed out this failing to Steven.
Consider the simple test program below:
int smod16(int x)
{
return x % 16;
}
On x86, using -O2 -fomit-frame-pointer mainline CVS currently generates
the rather poor:
smod16: movl 4(%esp), %eax
testl %eax, %eax
movl %eax, %edx
js .L4
andl $-16, %edx
subl %edx, %eax
ret
.L4:
leal 15(%eax), %edx
andl $-16, %edx
subl %edx, %eax
ret
Examining the middle-end code in expmed.c's expand_divmod, I'm actually
impressed that we do this well, as there's currently no special case code
for handling this form of TRUNC_MOD_EXPR. Instead, the initial RTL that
we generate is calculated as "return x - 16*(x/16);", where the divisions
and multiplications are handled by arithmetic shifts.
The patch below introduces a new expand_smod_pow2 function, as a helper
function to expand_divmod, to generate improved RTL for TRUNC_MOD_EXPR.
With this patch, we now generate the following code with the above
options.
smod16: movl 4(%esp), %eax
andl $-2147483633, %eax
js .L4
ret
.L4:
decl %eax
orl $-16, %eax
incl %eax
ret
The first "andl" is actually 0x8000000f, i.e. the signbit of the mode of
this operation, and the significant low-order bits of the modulus. By
performing this bitwise AND early, many targets (including x86) can avoid
an explicit compare/test instruction when checking for the signedness of
the operand. This happens to be the same idiom as used by Microsoft's
Visual C/C++ version 6 compilers.
And for targets with a significant conditional branch penalty, when
BRANCH_COST >= 2, the patch below can also generate a straight-line
sequence without conditional jumps.
For example, using "-O2 -fomit-frame-pointer -mtune=pentium4", we now
generate the following code:
smod16: movl 4(%esp), %eax
cltd
xorl %edx, %eax
subl %edx, %eax
andl $15, %eax
xorl %edx, %eax
subl %edx, %eax
ret
which as luck would have it is identical to the code generated by the
Intel version 7 compiler (and the IBM OS/2 and OpenWatcom compilers
mentioned above).
For the curious, the above assembly code is approximately equal to the
following function:
int smod16a(int x)
{
int signmask = x >> 31;
x = (x ^ signmask) - signmask; // x = fabs(x)
x = x & 15;
x = (x ^ signmask) - signmask;
return x;
}
This is relying on the idiom that "(x ^ -1) - -1" is equal to "-x",
but that "(x ^ 0) - 0" is the identity operation. So the second
and fourth lines above are effectively "conditional negation"
instructions, controlled by the original sign of x.
Or to make things clearer by removing this conditional execution:
int smod16b(int x)
{
if (x < 0)
{
x = -x;
x = x & 15;
x = -x;
}
else
x = x & 15;
return x;
}
The following patch has been tested on i686-pc-linux-gnu, with a full
"make bootstrap", all default languages, and regression tested with a
top-level "make -k check" with no new failures.
Ok for mainline?
2004-06-27 Roger Sayle <roger@eyesopen.com>
* expmed.c (expand_smod_pow2): New function to expand signed
remainder by a constant power of 2, such as "x % 16".
(expand_divmod): Call new expand_smod_pow2 when appropriate.
Minor corrections to comments, e.g. splitting long lines.
Index: expmed.c
===================================================================
RCS file: /cvs/gcc/gcc/gcc/expmed.c,v
retrieving revision 1.169
diff -c -3 -p -r1.169 expmed.c
*** expmed.c 24 Jun 2004 20:38:59 -0000 1.169
--- expmed.c 27 Jun 2004 21:53:12 -0000
*************** static rtx lshift_value (enum machine_mo
*** 51,56 ****
--- 51,57 ----
static rtx extract_split_bit_field (rtx, unsigned HOST_WIDE_INT,
unsigned HOST_WIDE_INT, int);
static void do_cmp_and_jump (rtx, rtx, enum rtx_code, enum machine_mode, rtx);
+ static rtx expand_smod_pow2 (enum machine_mode, rtx, HOST_WIDE_INT);
/* Nonzero means divides or modulus operations are relatively cheap for
powers of two, so don't use branches; emit the operation instead.
*************** expand_mult_highpart (enum machine_mode
*** 3055,3060 ****
--- 3056,3127 ----
return expand_mult_highpart_optab (mode, op0, op1, target,
unsignedp, max_cost);
}
+
+
+ /* Expand signed modulus of OP0 by a power of two D in mode MODE. */
+
+ static rtx
+ expand_smod_pow2 (enum machine_mode mode, rtx op0, HOST_WIDE_INT d)
+ {
+ unsigned HOST_WIDE_INT mask;
+ rtx result, temp, label;
+ int logd;
+
+ logd = floor_log2 (d);
+
+ /* Avoid conditional branches when they're expensive. */
+ if (BRANCH_COST >= 2
+ && !optimize_size
+ && ashr_optab->handlers[mode].insn_code != CODE_FOR_nothing)
+ {
+ rtx signmask = expand_binop (mode, ashr_optab, op0,
+ GEN_INT (GET_MODE_BITSIZE (mode) - 1),
+ NULL_RTX, 0, OPTAB_LIB_WIDEN);
+ signmask = force_reg (mode, signmask);
+
+ temp = expand_binop (mode, xor_optab, op0, signmask,
+ NULL_RTX, 1, OPTAB_LIB_WIDEN);
+ temp = expand_binop (mode, sub_optab, temp, signmask,
+ NULL_RTX, 0, OPTAB_LIB_WIDEN);
+ mask = ((HOST_WIDE_INT) 1 << logd) - 1;
+ temp = expand_binop (mode, and_optab, temp, GEN_INT (mask),
+ NULL_RTX, 1, OPTAB_LIB_WIDEN);
+ temp = expand_binop (mode, xor_optab, temp, signmask,
+ NULL_RTX, 1, OPTAB_LIB_WIDEN);
+ temp = expand_binop (mode, sub_optab, temp, signmask,
+ NULL_RTX, 1, OPTAB_LIB_WIDEN);
+ return temp;
+ }
+
+ /* Mask contains the mode's signbit and the significant bits of the
+ modulus. By including the signbit in the operation, many targets
+ can avoid an explicit compare operation in the following comparison
+ against zero. */
+
+ result = gen_reg_rtx (mode);
+ mask = (HOST_WIDE_INT) 1 << (GET_MODE_BITSIZE (mode) - 1)
+ | (((HOST_WIDE_INT) 1 << logd) - 1);
+
+ temp = expand_binop (mode, and_optab, op0, GEN_INT (mask), result,
+ 1, OPTAB_LIB_WIDEN);
+ if (temp != result)
+ emit_move_insn (result, temp);
+
+ label = gen_label_rtx ();
+ do_cmp_and_jump (result, const0_rtx, GE, mode, label);
+
+ temp = expand_binop (mode, sub_optab, result, const1_rtx, result,
+ 0, OPTAB_LIB_WIDEN);
+ mask = (HOST_WIDE_INT) -1 << logd;
+ temp = expand_binop (mode, ior_optab, temp, GEN_INT (mask), result,
+ 1, OPTAB_LIB_WIDEN);
+ temp = expand_binop (mode, add_optab, temp, const1_rtx, result,
+ 0, OPTAB_LIB_WIDEN);
+ if (temp != result)
+ emit_move_insn (result, temp);
+ emit_label (label);
+ return result;
+ }
/* Emit the code to divide OP0 by OP1, putting the result in TARGET
if that is convenient, and returning where the result is.
*************** expand_divmod (int rem_flag, enum tree_c
*** 3451,3460 ****
else if (EXACT_POWER_OF_2_OR_ZERO_P (d)
&& (rem_flag ? smod_pow2_cheap[compute_mode]
: sdiv_pow2_cheap[compute_mode])
! /* ??? The cheap metric is computed only for
! word_mode. If this operation is wider, this may
! not be so. Assume true if the optab has an
! expander for this mode. */
&& (((rem_flag ? smod_optab : sdiv_optab)
->handlers[compute_mode].insn_code
!= CODE_FOR_nothing)
--- 3518,3525 ----
else if (EXACT_POWER_OF_2_OR_ZERO_P (d)
&& (rem_flag ? smod_pow2_cheap[compute_mode]
: sdiv_pow2_cheap[compute_mode])
! /* We assume that cheap metric is true if the
! optab has an expander for this mode. */
&& (((rem_flag ? smod_optab : sdiv_optab)
->handlers[compute_mode].insn_code
!= CODE_FOR_nothing)
*************** expand_divmod (int rem_flag, enum tree_c
*** 3463,3468 ****
--- 3528,3539 ----
;
else if (EXACT_POWER_OF_2_OR_ZERO_P (abs_d))
{
+ if (rem_flag)
+ {
+ remainder = expand_smod_pow2 (compute_mode, op0, d);
+ if (remainder)
+ return gen_lowpart (mode, remainder);
+ }
lgup = floor_log2 (abs_d);
if (BRANCH_COST < 1 || (abs_d != 2 && BRANCH_COST < 3))
{
*************** expand_divmod (int rem_flag, enum tree_c
*** 3496,3503 ****
tquotient, 0);
}
! /* We have computed OP0 / abs(OP1). If OP1 is negative, negate
! the quotient. */
if (d < 0)
{
insn = get_last_insn ();
--- 3567,3574 ----
tquotient, 0);
}
! /* We have computed OP0 / abs(OP1). If OP1 is negative,
! negate the quotient. */
if (d < 0)
{
insn = get_last_insn ();
Roger
--
Roger Sayle, E-mail: roger@eyesopen.com
OpenEye Scientific Software, WWW: http://www.eyesopen.com/
Suite 1107, 3600 Cerrillos Road, Tel: (+1) 505-473-7385
Santa Fe, New Mexico, 87507. Fax: (+1) 505-473-0833