This is the mail archive of the
gcc-patches@gcc.gnu.org
mailing list for the GCC project.
Arch-independent double-word clz expansion and a few other tweaks for bitscanning
- From: Zack Weinberg <zack at codesourcery dot com>
- To: gcc-patches at gcc dot gnu dot org
- Date: Fri, 10 Aug 2007 23:45:33 -0700
- Subject: Arch-independent double-word clz expansion and a few other tweaks for bitscanning
This patch builds on the work done by Sandra Loosemore on open-coding
bit-scan operations which have no direct hardware support. The main
purpose is to be able to open-code a DImode clz operation on a target
with only SImode clz instructions. That's implemented in
expand_doubleword_clz(). I made some other generic improvements: we can
now open-code ffs() on targets with undefined or inconvenient
CTZ_DEFINED_VALUE_AT_ZERO, we share code between expand_ctz and
expand_ffs, REG_EQUAL notes are correctly tacked onto the result of all
these operations, and there is now an expand_unop_direct() which doesn't
do libcalls or attempt to widen/narrow -- looks like a bunch of other
places in optabs.c could use that.
I also removed the now-redundant ffssi2 and ctzsi2 expanders from the
arm and i386 back ends, gave the i386 back end a definition of
CLZ_DEFINED_VALUE_AT_ZERO so we know it's well-behaved when TARGET_ABM
is true, and added CC-setting variants of all the bit-scan insns to the
i386 back end. We now get optimal code for DImode clz on ARM:
cmp r1, #0
clzeq r3, r0
clzne r0, r1
addeq r0, r3, #32
and SI/DImode ffs on i386:
bsfl %edi, %eax
movl $-1, %edx
cmove %edx, %eax
addl $1, %eax
(formerly there would have been a testl %edi,%edi in there too).
Bootstrapped without regression on amd64-linux (64-bit C/C++ only) and
arm-elf (newlib simulator, --with-cpu=cortex-a8, --disable-multilib,
C/C++ only). OK to commit?
zw
* optabs.c: Remove unnecessary forward declarations.
(expand_unop_direct): New, broken out of expand_unop.
(expand_doubleword_clz): New.
(expand_ctz): Move above expand_ffs. Use
start_sequence, end_sequence, add_equal_note, and
expand_unop_direct. Add more commentary.
(expand_ffs): Try both ctz optab and expand_ctz.
Generate a test and branch if the hardware doesn't give us
a useful value for input zero. Style improvements similar to
expand_ctz.
* config/arm/arm.md (ffssi2, ctzsi2): Delete.
* config/i386/i386.h (CLZ_DEFINED_VALUE_AT_ZERO): Define.
* config/i386/i386.md (ffssi2, ffs_cmove, *ffs_no_cmove)
(*ffssi_1, ffsdi2, *ffsdi_1): Delete.
(*ctzsi2_ccz, *bsr_ccz, *ctzdi2_ccz, *bsr_rex64_ccz)
(*bsrhi_ccz): New insns.
(ctzdi2): Move next to clzdi2.
==================================================================
--- config/arm/arm.md (revision 127388)
+++ config/arm/arm.md (local)
@@ -10780,46 +10780,6 @@ (define_insn "clzsi2"
[(set_attr "predicable" "yes")
(set_attr "insn" "clz")])
-(define_expand "ffssi2"
- [(set (match_operand:SI 0 "s_register_operand" "")
- (ffs:SI (match_operand:SI 1 "s_register_operand" "")))]
- "TARGET_32BIT && arm_arch5"
- "
- {
- rtx t1, t2, t3;
-
- t1 = gen_reg_rtx (SImode);
- t2 = gen_reg_rtx (SImode);
- t3 = gen_reg_rtx (SImode);
-
- emit_insn (gen_negsi2 (t1, operands[1]));
- emit_insn (gen_andsi3 (t2, operands[1], t1));
- emit_insn (gen_clzsi2 (t3, t2));
- emit_insn (gen_subsi3 (operands[0], GEN_INT (32), t3));
- DONE;
- }"
-)
-
-(define_expand "ctzsi2"
- [(set (match_operand:SI 0 "s_register_operand" "")
- (ctz:SI (match_operand:SI 1 "s_register_operand" "")))]
- "TARGET_32BIT && arm_arch5"
- "
- {
- rtx t1, t2, t3;
-
- t1 = gen_reg_rtx (SImode);
- t2 = gen_reg_rtx (SImode);
- t3 = gen_reg_rtx (SImode);
-
- emit_insn (gen_negsi2 (t1, operands[1]));
- emit_insn (gen_andsi3 (t2, operands[1], t1));
- emit_insn (gen_clzsi2 (t3, t2));
- emit_insn (gen_subsi3 (operands[0], GEN_INT (31), t3));
- DONE;
- }"
-)
-
;; V5E instructions.
(define_insn "prefetch"
==================================================================
--- config/i386/i386.h (revision 127388)
+++ config/i386/i386.h (local)
@@ -1960,6 +1960,10 @@ do { \
is done just by pretending it is already truncated. */
#define TRULY_NOOP_TRUNCATION(OUTPREC, INPREC) 1
+/* clz is defined at zero if we are using lzcnt. */
+#define CLZ_DEFINED_VALUE_AT_ZERO(MODE, VAL) \
+ (TARGET_ABM ? ((VAL) = GET_MODE_BITSIZE (MODE), 2) : 0)
+
/* A macro to update M and UNSIGNEDP when an object whose type is
TYPE and which has the specified mode and signedness is to be
stored in a register. This macro is only called when TYPE is a
==================================================================
--- config/i386/i386.md (revision 127388)
+++ config/i386/i386.md (local)
@@ -14583,98 +14583,16 @@ (define_insn "leave_rex64"
"leave"
[(set_attr "type" "leave")])
-(define_expand "ffssi2"
- [(parallel
- [(set (match_operand:SI 0 "register_operand" "")
- (ffs:SI (match_operand:SI 1 "nonimmediate_operand" "")))
- (clobber (match_scratch:SI 2 ""))
- (clobber (reg:CC FLAGS_REG))])]
- ""
-{
- if (TARGET_CMOVE)
- {
- emit_insn (gen_ffs_cmove (operands[0], operands[1]));
- DONE;
- }
-})
-
-(define_expand "ffs_cmove"
- [(set (match_dup 2) (const_int -1))
- (parallel [(set (reg:CCZ FLAGS_REG)
- (compare:CCZ (match_operand:SI 1 "register_operand" "")
- (const_int 0)))
- (set (match_operand:SI 0 "nonimmediate_operand" "")
- (ctz:SI (match_dup 1)))])
- (set (match_dup 0) (if_then_else:SI
- (eq (reg:CCZ FLAGS_REG) (const_int 0))
- (match_dup 2)
- (match_dup 0)))
- (parallel [(set (match_dup 0) (plus:SI (match_dup 0) (const_int 1)))
- (clobber (reg:CC FLAGS_REG))])]
- "TARGET_CMOVE"
- "operands[2] = gen_reg_rtx (SImode);")
-
-(define_insn_and_split "*ffs_no_cmove"
- [(set (match_operand:SI 0 "nonimmediate_operand" "=r")
- (ffs:SI (match_operand:SI 1 "nonimmediate_operand" "rm")))
- (clobber (match_scratch:SI 2 "=&q"))
- (clobber (reg:CC FLAGS_REG))]
- "!TARGET_CMOVE"
- "#"
- "&& reload_completed"
- [(parallel [(set (reg:CCZ FLAGS_REG)
- (compare:CCZ (match_dup 1) (const_int 0)))
- (set (match_dup 0) (ctz:SI (match_dup 1)))])
- (set (strict_low_part (match_dup 3))
- (eq:QI (reg:CCZ FLAGS_REG) (const_int 0)))
- (parallel [(set (match_dup 2) (neg:SI (match_dup 2)))
- (clobber (reg:CC FLAGS_REG))])
- (parallel [(set (match_dup 0) (ior:SI (match_dup 0) (match_dup 2)))
- (clobber (reg:CC FLAGS_REG))])
- (parallel [(set (match_dup 0) (plus:SI (match_dup 0) (const_int 1)))
- (clobber (reg:CC FLAGS_REG))])]
-{
- operands[3] = gen_lowpart (QImode, operands[2]);
- ix86_expand_clear (operands[2]);
-})
-
-(define_insn "*ffssi_1"
- [(set (reg:CCZ FLAGS_REG)
- (compare:CCZ (match_operand:SI 1 "nonimmediate_operand" "rm")
- (const_int 0)))
- (set (match_operand:SI 0 "register_operand" "=r")
- (ctz:SI (match_dup 1)))]
+(define_insn "ctzsi2"
+ [(set (match_operand:SI 0 "register_operand" "=r")
+ (ctz:SI (match_operand:SI 1 "nonimmediate_operand" "rm")))
+ (set (reg:CCZ FLAGS_REG)
+ (compare:CCZ (match_dup 1) (const_int 0)))]
""
"bsf{l}\t{%1, %0|%0, %1}"
[(set_attr "prefix_0f" "1")])
-(define_expand "ffsdi2"
- [(set (match_dup 2) (const_int -1))
- (parallel [(set (reg:CCZ FLAGS_REG)
- (compare:CCZ (match_operand:DI 1 "register_operand" "")
- (const_int 0)))
- (set (match_operand:DI 0 "nonimmediate_operand" "")
- (ctz:DI (match_dup 1)))])
- (set (match_dup 0) (if_then_else:DI
- (eq (reg:CCZ FLAGS_REG) (const_int 0))
- (match_dup 2)
- (match_dup 0)))
- (parallel [(set (match_dup 0) (plus:DI (match_dup 0) (const_int 1)))
- (clobber (reg:CC FLAGS_REG))])]
- "TARGET_64BIT"
- "operands[2] = gen_reg_rtx (DImode);")
-
-(define_insn "*ffsdi_1"
- [(set (reg:CCZ FLAGS_REG)
- (compare:CCZ (match_operand:DI 1 "nonimmediate_operand" "rm")
- (const_int 0)))
- (set (match_operand:DI 0 "register_operand" "=r")
- (ctz:DI (match_dup 1)))]
- "TARGET_64BIT"
- "bsf{q}\t{%1, %0|%0, %1}"
- [(set_attr "prefix_0f" "1")])
-
-(define_insn "ctzsi2"
+(define_insn "*ctzsi2_ccz"
[(set (match_operand:SI 0 "register_operand" "=r")
(ctz:SI (match_operand:SI 1 "nonimmediate_operand" "rm")))
(clobber (reg:CC FLAGS_REG))]
@@ -14682,14 +14600,6 @@ (define_insn "ctzsi2"
"bsf{l}\t{%1, %0|%0, %1}"
[(set_attr "prefix_0f" "1")])
-(define_insn "ctzdi2"
- [(set (match_operand:DI 0 "register_operand" "=r")
- (ctz:DI (match_operand:DI 1 "nonimmediate_operand" "rm")))
- (clobber (reg:CC FLAGS_REG))]
- "TARGET_64BIT"
- "bsf{q}\t{%1, %0|%0, %1}"
- [(set_attr "prefix_0f" "1")])
-
(define_expand "clzsi2"
[(parallel
[(set (match_operand:SI 0 "register_operand" "")
@@ -14728,6 +14638,17 @@ (define_insn "*bsr"
[(set_attr "prefix_0f" "1")
(set_attr "mode" "SI")])
+(define_insn "*bsr_ccz"
+ [(set (match_operand:SI 0 "register_operand" "=r")
+ (minus:SI (const_int 31)
+ (clz:SI (match_operand:SI 1 "nonimmediate_operand" "rm"))))
+ (set (reg:CCZ FLAGS_REG)
+ (compare:CCZ (match_dup 1) (const_int 0)))]
+ ""
+ "bsr{l}\t{%1, %0|%0, %1}"
+ [(set_attr "prefix_0f" "1")
+ (set_attr "mode" "SI")])
+
(define_insn "popcountsi2"
[(set (match_operand:SI 0 "register_operand" "=r")
(popcount:SI (match_operand:SI 1 "nonimmediate_operand" "")))
@@ -14817,6 +14738,23 @@ (define_insn "bswapdi2"
[(set_attr "prefix_0f" "1")
(set_attr "length" "3")])
+(define_insn "ctzdi2"
+ [(set (match_operand:DI 0 "register_operand" "=r")
+ (ctz:DI (match_operand:DI 1 "nonimmediate_operand" "rm")))
+ (clobber (reg:CC FLAGS_REG))]
+ "TARGET_64BIT"
+ "bsf{q}\t{%1, %0|%0, %1}"
+ [(set_attr "prefix_0f" "1")])
+
+(define_insn "*ctzdi2_ccz"
+ [(set (match_operand:DI 0 "register_operand" "=r")
+ (ctz:DI (match_operand:DI 1 "nonimmediate_operand" "rm")))
+ (set (reg:CCZ FLAGS_REG)
+ (compare:CCZ (match_dup 1) (const_int 0)))]
+ "TARGET_64BIT"
+ "bsf{q}\t{%1, %0|%0, %1}"
+ [(set_attr "prefix_0f" "1")])
+
(define_expand "clzdi2"
[(parallel
[(set (match_operand:DI 0 "register_operand" "")
@@ -14855,6 +14793,17 @@ (define_insn "*bsr_rex64"
[(set_attr "prefix_0f" "1")
(set_attr "mode" "DI")])
+(define_insn "*bsr_rex64_ccz"
+ [(set (match_operand:DI 0 "register_operand" "=r")
+ (minus:DI (const_int 63)
+ (clz:DI (match_operand:DI 1 "nonimmediate_operand" "rm"))))
+ (set (reg:CCZ FLAGS_REG)
+ (compare:CCZ (match_dup 1) (const_int 0)))]
+ "TARGET_64BIT"
+ "bsr{q}\t{%1, %0|%0, %1}"
+ [(set_attr "prefix_0f" "1")
+ (set_attr "mode" "DI")])
+
(define_insn "popcountdi2"
[(set (match_operand:DI 0 "register_operand" "=r")
(popcount:DI (match_operand:DI 1 "nonimmediate_operand" "")))
@@ -14916,6 +14865,17 @@ (define_insn "*bsrhi"
[(set_attr "prefix_0f" "1")
(set_attr "mode" "HI")])
+(define_insn "*bsrhi_ccz"
+ [(set (match_operand:HI 0 "register_operand" "=r")
+ (minus:HI (const_int 15)
+ (clz:HI (match_operand:HI 1 "nonimmediate_operand" "rm"))))
+ (set (reg:CCZ FLAGS_REG)
+ (compare:CCZ (match_dup 1) (const_int 0)))]
+ ""
+ "bsr{w}\t{%1, %0|%0, %1}"
+ [(set_attr "prefix_0f" "1")
+ (set_attr "mode" "HI")])
+
(define_insn "popcounthi2"
[(set (match_operand:HI 0 "register_operand" "=r")
(popcount:HI (match_operand:HI 1 "nonimmediate_operand" "")))
==================================================================
--- optabs.c (revision 127388)
+++ optabs.c (local)
@@ -95,37 +95,9 @@ enum insn_code vcondu_gen_code[NUM_MACHI
the code to be used in the trap insn and all other fields are ignored. */
static GTY(()) rtx trap_rtx;
-static int add_equal_note (rtx, rtx, enum rtx_code, rtx, rtx);
-static rtx widen_operand (rtx, enum machine_mode, enum machine_mode, int,
- int);
-static void prepare_cmp_insn (rtx *, rtx *, enum rtx_code *, rtx,
- enum machine_mode *, int *,
- enum can_compare_purpose);
-static enum insn_code can_fix_p (enum machine_mode, enum machine_mode, int,
- int *);
-static enum insn_code can_float_p (enum machine_mode, enum machine_mode, int);
-static optab new_optab (void);
-static convert_optab new_convert_optab (void);
-static inline optab init_optab (enum rtx_code);
-static inline optab init_optabv (enum rtx_code);
-static inline convert_optab init_convert_optab (enum rtx_code);
-static void init_libfuncs (optab, int, int, const char *, int);
-static void init_integral_libfuncs (optab, const char *, int);
-static void init_floating_libfuncs (optab, const char *, int);
-static void init_interclass_conv_libfuncs (convert_optab, const char *,
- enum mode_class, enum mode_class);
-static void init_intraclass_conv_libfuncs (convert_optab, const char *,
- enum mode_class, bool);
-static void emit_cmp_and_jump_insn_1 (rtx, rtx, enum machine_mode,
- enum rtx_code, int, rtx);
static void prepare_float_lib_cmp (rtx *, rtx *, enum rtx_code *,
enum machine_mode *, int *);
-static rtx widen_clz (enum machine_mode, rtx, rtx);
-static rtx expand_parity (enum machine_mode, rtx, rtx);
-static rtx expand_ffs (enum machine_mode, rtx, rtx);
-static rtx expand_ctz (enum machine_mode, rtx, rtx);
-static enum rtx_code get_rtx_code (enum tree_code, bool);
-static rtx vector_compare_rtx (tree, bool, enum insn_code);
+static rtx expand_unop_direct (enum machine_mode, optab, rtx, rtx, int);
/* Current libcall id. It doesn't matter what these are, as long
as they are unique to each libcall that is emitted. */
@@ -2459,6 +2431,76 @@ widen_clz (enum machine_mode mode, rtx o
return 0;
}
+/* Try calculating clz of a double-word quantity as two clz's of word-sized
+ quantities, choosing which based on whether the high word is nonzero. */
+static rtx
+expand_doubleword_clz (enum machine_mode mode, rtx op0, rtx target)
+{
+ rtx xop0 = force_reg (mode, op0);
+ rtx subhi = gen_highpart (word_mode, xop0);
+ rtx sublo = gen_lowpart (word_mode, xop0);
+ rtx hi0_label = gen_label_rtx ();
+ rtx after_label = gen_label_rtx ();
+ rtx seq, temp, result;
+
+ /* If we were not given a target, use a word_mode register, not a
+ 'mode' register. The result will fit, and nobody is expecting
+ anything bigger (the return type of __builtin_clz* is int). */
+ if (!target)
+ target = gen_reg_rtx (word_mode);
+
+ /* In any case, write to a word_mode scratch in both branches of the
+ conditional, so we can ensure there is a single move insn setting
+ 'target' to tag a REG_EQUAL note on. */
+ result = gen_reg_rtx (word_mode);
+
+ start_sequence ();
+
+ /* If the high word is not equal to zero,
+ then clz of the full value is clz of the high word. */
+ emit_cmp_and_jump_insns (subhi, CONST0_RTX (word_mode), EQ, 0,
+ word_mode, true, hi0_label);
+
+ temp = expand_unop_direct (word_mode, clz_optab, subhi, result, true);
+ if (!temp)
+ goto fail;
+
+ if (temp != result)
+ convert_move (result, temp, true);
+
+ emit_jump_insn (gen_jump (after_label));
+ emit_barrier ();
+
+ /* Else clz of the full value is clz of the low word plus the number
+ of bits in the high word. */
+ emit_label (hi0_label);
+
+ temp = expand_unop_direct (word_mode, clz_optab, sublo, 0, true);
+ if (!temp)
+ goto fail;
+ temp = expand_binop (word_mode, add_optab, temp,
+ GEN_INT (GET_MODE_BITSIZE (word_mode)),
+ result, true, OPTAB_DIRECT);
+ if (!temp)
+ goto fail;
+ if (temp != result)
+ convert_move (result, temp, true);
+
+ emit_label (after_label);
+ convert_move (target, result, true);
+
+ seq = get_insns ();
+ end_sequence ();
+
+ add_equal_note (seq, target, CLZ, xop0, 0);
+ emit_insn (seq);
+ return target;
+
+ fail:
+ end_sequence ();
+ return 0;
+}
+
/* Try calculating
(bswap:narrow x)
as
@@ -2563,65 +2605,130 @@ expand_parity (enum machine_mode mode, r
return 0;
}
-/* Try calculating ffs(x) using clz(x). Since the ffs builtin promises
- to return zero for a zero value and clz may have an undefined value
- in that case, only do this if we know clz returns the right thing so
- that we don't have to generate a test and branch. */
+/* Try calculating ctz(x) as K - clz(x & -x) ,
+ where K is GET_MODE_BITSIZE(mode) - 1.
+
+ Both __builtin_ctz and __builtin_clz are undefined at zero, so we
+ don't have to worry about what the hardware does in that case. (If
+ the clz instruction produces the usual value at 0, which is K, the
+ result of this code sequence will be -1; expand_ffs, below, relies
+ on this. It might be nice to have it be K instead, for consistency
+ with the (very few) processors that provide a ctz with a defined
+ value, but that would take one more instruction, and it would be
+ less convenient for expand_ffs anyway. */
+
static rtx
-expand_ffs (enum machine_mode mode, rtx op0, rtx target)
+expand_ctz (enum machine_mode mode, rtx op0, rtx target)
{
- HOST_WIDE_INT val;
- if (clz_optab->handlers[(int) mode].insn_code != CODE_FOR_nothing
- && CLZ_DEFINED_VALUE_AT_ZERO (mode, val) == 2
- && val == GET_MODE_BITSIZE (mode))
- {
- rtx last = get_last_insn ();
- rtx temp;
+ rtx seq, temp;
+
+ if (optab_handler (clz_optab, mode)->insn_code == CODE_FOR_nothing)
+ return 0;
+
+ start_sequence ();
- temp = expand_unop (mode, neg_optab, op0, NULL_RTX, true);
- if (temp)
- temp = expand_binop (mode, and_optab, op0, temp, NULL_RTX,
- true, OPTAB_DIRECT);
- if (temp)
- temp = expand_unop (mode, clz_optab, temp, NULL_RTX, true);
- if (temp)
- temp = expand_binop (mode, sub_optab,
- GEN_INT (GET_MODE_BITSIZE (mode)),
- temp,
- target, true, OPTAB_DIRECT);
- if (temp == 0)
- delete_insns_since (last);
- return temp;
+ temp = expand_unop_direct (mode, neg_optab, op0, NULL_RTX, true);
+ if (temp)
+ temp = expand_binop (mode, and_optab, op0, temp, NULL_RTX,
+ true, OPTAB_DIRECT);
+ if (temp)
+ temp = expand_unop_direct (mode, clz_optab, temp, NULL_RTX, true);
+ if (temp)
+ temp = expand_binop (mode, sub_optab, GEN_INT (GET_MODE_BITSIZE (mode) - 1),
+ temp, target,
+ true, OPTAB_DIRECT);
+ if (temp == 0)
+ {
+ end_sequence ();
+ return 0;
}
- return 0;
+
+ seq = get_insns ();
+ end_sequence ();
+
+ add_equal_note (seq, temp, CTZ, op0, 0);
+ emit_insn (seq);
+ return temp;
}
-/* We can compute ctz(x) using clz(x) with a similar recipe. Here the ctz
- builtin has an undefined result on zero, just like clz, so we don't have
- to do that check. */
+
+/* Try calculating ffs(x) using ctz(x) if we have that instruction, or
+ else with the sequence used by expand_clz.
+
+ The ffs builtin promises to return zero for a zero value and ctz/clz
+ may have an undefined value in that case. If they do not give us a
+ convenient value, we have to generate a test and branch. */
static rtx
-expand_ctz (enum machine_mode mode, rtx op0, rtx target)
+expand_ffs (enum machine_mode mode, rtx op0, rtx target)
{
- if (clz_optab->handlers[(int) mode].insn_code != CODE_FOR_nothing)
+ HOST_WIDE_INT val;
+ bool defined_at_zero;
+ rtx temp, seq;
+
+ if (optab_handler (ctz_optab, mode)->insn_code != CODE_FOR_nothing)
{
- rtx last = get_last_insn ();
- rtx temp;
+ start_sequence ();
- temp = expand_unop (mode, neg_optab, op0, NULL_RTX, true);
- if (temp)
- temp = expand_binop (mode, and_optab, op0, temp, NULL_RTX,
- true, OPTAB_DIRECT);
- if (temp)
- temp = expand_unop (mode, clz_optab, temp, NULL_RTX, true);
- if (temp)
- temp = expand_binop (mode, xor_optab, temp,
- GEN_INT (GET_MODE_BITSIZE (mode) - 1),
- target,
- true, OPTAB_DIRECT);
- if (temp == 0)
- delete_insns_since (last);
- return temp;
+ temp = expand_unop_direct (mode, ctz_optab, op0, 0, true);
+ if (!temp)
+ goto fail;
+
+ defined_at_zero = (CTZ_DEFINED_VALUE_AT_ZERO (mode, val) == 2);
+ }
+ else if (optab_handler (clz_optab, mode)->insn_code != CODE_FOR_nothing)
+ {
+ start_sequence ();
+ temp = expand_ctz (mode, op0, 0);
+ if (!temp)
+ goto fail;
+
+ if (CLZ_DEFINED_VALUE_AT_ZERO (mode, val) == 2)
+ {
+ defined_at_zero = true;
+ val = (GET_MODE_BITSIZE (mode) - 1) - val;
+ }
}
+ else
+ return 0;
+
+ if (defined_at_zero && val == -1)
+ /* No correction needed at zero. */;
+ else
+ {
+ /* We don't try to do anything clever with the situation found
+ on some processors (eg Alpha) where clz(0:mode) ==
+ bitsize(mode). If someone can think of a way to send N to -1
+ and leave alone all values in the range 0..N-1 (where N is a
+ power of two), cheaper than this test-and-branch, please add it.
+
+ The test-and-branch is done after the operation itself, in case
+ the operation sets condition codes that can be recycled for this.
+ (This is true on i386, for instance.) */
+
+ rtx nonzero_label = gen_label_rtx ();
+ emit_cmp_and_jump_insns (op0, CONST0_RTX (mode), NE, 0,
+ mode, true, nonzero_label);
+
+ convert_move (temp, GEN_INT (-1), false);
+ emit_label (nonzero_label);
+ }
+
+ /* temp now has a value in the range -1..bitsize-1. ffs is supposed
+ to produce a value in the range 0..bitsize. */
+ temp = expand_binop (mode, add_optab, temp, GEN_INT (1),
+ target, false, OPTAB_DIRECT);
+ if (!temp)
+ goto fail;
+
+ seq = get_insns ();
+ end_sequence ();
+
+ add_equal_note (seq, temp, FFS, op0, 0);
+ emit_insn (seq);
+ return temp;
+
+ fail:
+ end_sequence ();
return 0;
}
@@ -2750,34 +2857,19 @@ expand_absneg_bit (enum rtx_code code, e
return target;
}
-/* Generate code to perform an operation specified by UNOPTAB
- on operand OP0, with result having machine-mode MODE.
-
- UNSIGNEDP is for the case where we have to widen the operands
- to perform the operation. It says to use zero-extension.
-
- If TARGET is nonzero, the value
- is generated there, if it is convenient to do so.
- In all cases an rtx is returned for the locus of the value;
- this may or may not be TARGET. */
-
-rtx
-expand_unop (enum machine_mode mode, optab unoptab, rtx op0, rtx target,
+/* As expand_unop, but will fail rather than attempt the operation in a
+ different mode or with a libcall. */
+static rtx
+expand_unop_direct (enum machine_mode mode, optab unoptab, rtx op0, rtx target,
int unsignedp)
{
- enum mode_class class;
- enum machine_mode wider_mode;
- rtx temp;
- rtx last = get_last_insn ();
- rtx pat;
-
- class = GET_MODE_CLASS (mode);
-
if (optab_handler (unoptab, mode)->insn_code != CODE_FOR_nothing)
{
int icode = (int) optab_handler (unoptab, mode)->insn_code;
enum machine_mode mode0 = insn_data[icode].operand[1].mode;
rtx xop0 = op0;
+ rtx last = get_last_insn ();
+ rtx pat, temp;
if (target)
temp = target;
@@ -2813,16 +2905,49 @@ expand_unop (enum machine_mode mode, opt
else
delete_insns_since (last);
}
+ return 0;
+}
+
+/* Generate code to perform an operation specified by UNOPTAB
+ on operand OP0, with result having machine-mode MODE.
+
+ UNSIGNEDP is for the case where we have to widen the operands
+ to perform the operation. It says to use zero-extension.
+
+ If TARGET is nonzero, the value
+ is generated there, if it is convenient to do so.
+ In all cases an rtx is returned for the locus of the value;
+ this may or may not be TARGET. */
+
+rtx
+expand_unop (enum machine_mode mode, optab unoptab, rtx op0, rtx target,
+ int unsignedp)
+{
+ enum mode_class class = GET_MODE_CLASS (mode);
+ enum machine_mode wider_mode;
+ rtx temp;
+
+ temp = expand_unop_direct (mode, unoptab, op0, target, unsignedp);
+ if (temp)
+ return temp;
/* It can't be done in this mode. Can we open-code it in a wider mode? */
- /* Widening clz needs special treatment. */
+ /* Widening (or narrowing) clz needs special treatment. */
if (unoptab == clz_optab)
{
temp = widen_clz (mode, op0, target);
if (temp)
return temp;
- else
+
+ if (GET_MODE_SIZE (mode) == 2 * UNITS_PER_WORD
+ && optab_handler (unoptab, word_mode)->insn_code != CODE_FOR_nothing)
+ {
+ temp = expand_doubleword_clz (mode, op0, target);
+ if (temp)
+ return temp;
+ }
+
goto try_libcall;
}
@@ -2852,6 +2977,7 @@ expand_unop (enum machine_mode mode, opt
if (optab_handler (unoptab, wider_mode)->insn_code != CODE_FOR_nothing)
{
rtx xop0 = op0;
+ rtx last = get_last_insn ();
/* For certain operations, we need not actually extend
the narrow operand, as long as we will truncate the
@@ -3011,6 +3137,7 @@ expand_unop (enum machine_mode mode, opt
|| optab_handler (unoptab, wider_mode)->libfunc)
{
rtx xop0 = op0;
+ rtx last = get_last_insn ();
/* For certain operations, we need not actually extend
the narrow operand, as long as we will truncate the