[gcc(refs/users/aoliva/heads/testme)] add memcmp loop expander
Alexandre Oliva
aoliva@gcc.gnu.org
Thu Jan 26 08:21:39 GMT 2023
https://gcc.gnu.org/g:c234c022348c1ada7594294ade986951521a6a5b
commit c234c022348c1ada7594294ade986951521a6a5b
Author: Alexandre Oliva <oliva@gnu.org>
Date: Fri Jan 20 22:01:15 2023 -0300
add memcmp loop expander
Diff:
---
gcc/builtins.cc | 3 +-
gcc/expr.cc | 165 +++++++++++++++++++++++++++++++++++++++++++++++++++++++-
gcc/expr.h | 3 +-
3 files changed, 167 insertions(+), 4 deletions(-)
diff --git a/gcc/builtins.cc b/gcc/builtins.cc
index ca4d4721cdc..e55b53485e2 100644
--- a/gcc/builtins.cc
+++ b/gcc/builtins.cc
@@ -4809,7 +4809,8 @@ expand_builtin_memcmp (tree exp, rtx target, bool result_eq)
result = emit_block_cmp_hints (arg1_rtx, arg2_rtx, len_rtx,
TREE_TYPE (len), target,
result_eq, constfn,
- CONST_CAST (char *, rep));
+ CONST_CAST (char *, rep),
+ tree_ctz (len));
if (result)
{
diff --git a/gcc/expr.cc b/gcc/expr.cc
index aceb3f514fc..7cb693525ba 100644
--- a/gcc/expr.cc
+++ b/gcc/expr.cc
@@ -83,6 +83,8 @@ static bool emit_block_move_via_pattern (rtx, rtx, rtx, unsigned, unsigned,
static void emit_block_move_via_loop (rtx, rtx, rtx, unsigned, int);
static void emit_block_move_via_sized_loop (rtx, rtx, rtx, unsigned, unsigned);
static void emit_block_move_via_oriented_loop (rtx, rtx, rtx, unsigned, unsigned);
+static rtx emit_block_cmp_via_loop (rtx, rtx, rtx, tree, rtx, bool,
+ unsigned, unsigned);
static void clear_by_pieces (rtx, unsigned HOST_WIDE_INT, unsigned int);
static rtx_insn *compress_float_constant (rtx, rtx);
static rtx get_subtarget (rtx);
@@ -2569,7 +2571,8 @@ emit_block_cmp_via_cmpmem (rtx x, rtx y, rtx len, tree len_type, rtx target,
Both X and Y must be MEM rtx's. LEN is an rtx that says how long
they are. LEN_TYPE is the type of the expression that was used to
- calculate it.
+ calculate it, and CTZ_LEN is the known trailing-zeros count of LEN,
+ so LEN must be a multiple of 1<<CTZ_LEN even if it's not constant.
If EQUALITY_ONLY is true, it means we don't have to return the tri-state
value of a normal memcmp call, instead we can just compare for equality.
@@ -2585,7 +2588,7 @@ emit_block_cmp_via_cmpmem (rtx x, rtx y, rtx len, tree len_type, rtx target,
rtx
emit_block_cmp_hints (rtx x, rtx y, rtx len, tree len_type, rtx target,
bool equality_only, by_pieces_constfn y_cfn,
- void *y_cfndata)
+ void *y_cfndata, unsigned ctz_len)
{
rtx result = 0;
@@ -2607,8 +2610,166 @@ emit_block_cmp_hints (rtx x, rtx y, rtx len, tree len_type, rtx target,
else
result = emit_block_cmp_via_cmpmem (x, y, len, len_type, target, align);
+ if (!result && (flag_inline_stringops & ILSOP_MEMCMP))
+ result = emit_block_cmp_via_loop (x, y, len, len_type,
+ target, equality_only,
+ align, ctz_len);
+
return result;
}
+/* Expand a comparison of the LEN units at X and at Y as an inline loop
+ and return the register that holds the result.
+
+ Each iteration compares INCR units.  INCR starts out as ALIGN /
+ BITS_PER_UNIT, is capped at 1 << CTZ_LEN (CTZ_LEN being the known
+ trailing-zeros count of LEN), and is then halved until
+ can_do_by_pieces accepts it for COMPARE_BY_PIECES or it reaches 1.
+
+ The result is zero when no chunk differs.  When a chunk differs and
+ EQUALITY_ONLY is true, the result is 1.  Otherwise the differing
+ chunk is compared again in smaller pieces by recursive calls, down to
+ single units, where the result is either the subtraction of the two
+ units carried out in TARGET's wider mode or an explicit 1 / -1
+ selected by an LTU test.
+
+ TARGET is only a suggestion: it is used when it is a register
+ numbered at or above FIRST_PSEUDO_REGISTER, and replaced by a fresh
+ register otherwise.  SIZE_TYPE is not inspected here; it is only
+ forwarded to the recursive calls.  */
+rtx
+emit_block_cmp_via_loop (rtx x, rtx y, rtx len, tree size_type, rtx target,
+ bool equality_only, unsigned align, unsigned ctz_len)
+{
+ unsigned incr = align / BITS_PER_UNIT;
+ /* For a constant LEN, take the larger of the caller's CTZ_LEN hint
+ and the count computed from the constant itself.  */
+ if (CONST_INT_P (len))
+ ctz_len = MAX (ctz_len, (unsigned) wi::ctz (UINTVAL (len)));
+ /* Keep INCR from exceeding 1 << CTZ_LEN, of which LEN is known to be
+ a multiple.  NOTE(review): if CTZ_LEN can reach the bit width of
+ HOST_WIDE_INT (possibly for a constant-zero LEN), these shifts
+ would be out of range -- confirm callers rule that out.  */
+ if (HOST_WIDE_INT_1U << ctz_len < (unsigned HOST_WIDE_INT) incr)
+ incr = HOST_WIDE_INT_1U << ctz_len;
+ /* Halve INCR until compare-by-pieces can handle a chunk of that size
+ at this alignment, stopping at a single unit.  */
+ while (incr > 1
+ && !can_do_by_pieces (incr, align, COMPARE_BY_PIECES))
+ incr >>= 1;
+
+ rtx_code_label *cmp_label, *top_label, *ne_label, *res_label;
+ rtx iter, x_addr, y_addr, tmp;
+ machine_mode x_addr_mode = get_address_mode (x);
+ machine_mode y_addr_mode = get_address_mode (y);
+ machine_mode iter_mode;
+ /* ITER takes LEN's mode, or word_mode when LEN has VOIDmode.  */
+ iter_mode = GET_MODE (len);
+ if (iter_mode == VOIDmode)
+ iter_mode = word_mode;
+ /* TOP_LABEL heads the loop body, CMP_LABEL the loop test, NE_LABEL
+ the code run once a difference is found, and RES_LABEL the common
+ exit.  */
+ top_label = gen_label_rtx ();
+ cmp_label = gen_label_rtx ();
+ ne_label = gen_label_rtx ();
+ res_label = gen_label_rtx ();
+ iter = gen_reg_rtx (iter_mode);
+ /* ITER starts at zero and advances by INCR for as long as it compares
+ less than LEN.  */
+ rtx iter_init = const0_rtx;
+ rtx_code iter_cond = LT;
+ rtx iter_limit = len;
+ rtx iter_incr = GEN_INT (incr);
+ machine_mode cmp_mode;
+
+ emit_move_insn (iter, iter_init);
+ /* Compare in an integer mode that is exactly INCR units wide and can
+ be tested with an NE jump; failing that, fall back to BLKmode
+ chunks, which are compared by pieces below.  */
+ scalar_int_mode int_cmp_mode
+ = smallest_int_mode_for_size (incr * BITS_PER_UNIT);
+ if (GET_MODE_BITSIZE (int_cmp_mode) != incr * BITS_PER_UNIT
+ || !can_compare_p (NE, int_cmp_mode, ccp_jump))
+ {
+ cmp_mode = BLKmode;
+ gcc_checking_assert (incr != 1);
+ }
+ else
+ cmp_mode = int_cmp_mode;
+ /* Compute the base addresses once, ahead of the loop.  */
+ x_addr = force_operand (XEXP (x, 0), NULL_RTX);
+ y_addr = force_operand (XEXP (y, 0), NULL_RTX);
+ do_pending_stack_adjust ();
+ /* Enter the loop at its test, so that nothing is compared when ITER
+ is not below LEN to begin with.  */
+ emit_jump (cmp_label);
+ emit_label (top_label);
+ /* Offset both base addresses by ITER, converting ITER to each
+ address mode as needed.  */
+ tmp = convert_modes (x_addr_mode, iter_mode, iter, true);
+ x_addr = simplify_gen_binary (PLUS, x_addr_mode, x_addr, tmp);
+
+ if (x_addr_mode != y_addr_mode)
+ tmp = convert_modes (y_addr_mode, iter_mode, iter, true);
+ y_addr = simplify_gen_binary (PLUS, y_addr_mode, y_addr, tmp);
+ /* From here on X and Y denote the current CMP_MODE chunk rather than
+ the whole blocks.  */
+ x = change_address (x, cmp_mode, x_addr);
+ y = change_address (y, cmp_mode, y_addr);
+ /* Compare one chunk.  */
+ rtx part_res;
+ if (cmp_mode == BLKmode)
+ part_res = compare_by_pieces (x, y, incr, target, align, 0, 0);
+ else
+ part_res = expand_binop (cmp_mode, sub_optab, x, y, NULL_RTX,
+ true, OPTAB_LIB_WIDEN);
+ /* Leave the loop as soon as a chunk yields a nonzero PART_RES.  */
+ emit_cmp_and_jump_insns (part_res, GEN_INT (0), NE, NULL_RTX,
+ GET_MODE (part_res), true, ne_label,
+ profile_probability::guessed_always ()
+ .apply_scale (1, 10));
+ /* Advance ITER by INCR.  */
+ tmp = expand_simple_binop (iter_mode, PLUS, iter, iter_incr, iter,
+ true, OPTAB_LIB_WIDEN);
+ if (tmp != iter)
+ emit_move_insn (iter, tmp);
+ /* Loop test: go back to the top while ITER has not reached LEN.  */
+ emit_label (cmp_label);
+ emit_cmp_and_jump_insns (iter, iter_limit, iter_cond, NULL_RTX, iter_mode,
+ true, top_label,
+ profile_probability::guessed_always ()
+ .apply_scale (9, 10));
+ /* Falling out of the loop means no chunk differed, so the result is
+ zero.  TARGET is kept only if it is a register numbered at or
+ above FIRST_PSEUDO_REGISTER; otherwise a fresh register in
+ integer_type_node's mode takes its place.  */
+ if (target == NULL_RTX
+ || !REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
+ target = gen_reg_rtx (TYPE_MODE (integer_type_node));
+
+ emit_move_insn (target, const0_rtx);
+ emit_jump (res_label);
+ emit_barrier ();
+
+ emit_label (ne_label);
+ /* A chunk differed.  For equality-only callers 1 is enough.
+ Otherwise pin the difference down: X and Y are the differing chunk
+ at this point, so the recursive calls below rescan just that chunk
+ (LEN = INCR) in smaller pieces.  */
+ if (equality_only)
+ emit_move_insn (target, const1_rtx);
+ else
+ {
+ if (incr > UNITS_PER_WORD)
+ /* ??? Re-compare the block found to be different one word at a
+ time. */
+ part_res = emit_block_cmp_via_loop (x, y, GEN_INT (incr), size_type,
+ target, equality_only,
+ BITS_PER_WORD, 0);
+ else if (incr > 1)
+ /* ??? Re-compare the block found to be different one byte at a
+ time. We could do better using part_res, and being careful
+ about endianness. */
+ part_res = emit_block_cmp_via_loop (x, y, GEN_INT (incr), size_type,
+ target, equality_only,
+ BITS_PER_UNIT, 0);
+ else if (GET_MODE_BITSIZE (GET_MODE (target))
+ > GET_MODE_BITSIZE (cmp_mode))
+ part_res = expand_binop (GET_MODE (target), sub_optab, x, y, target,
+ true, OPTAB_LIB_WIDEN);
+ else
+ {
+ /* In the odd chance target is QImode, we can't count on
+ widening subtract to capture the result of the unsigned
+ compares. */
+ rtx_code_label *ltu_label;
+ ltu_label = gen_label_rtx ();
+ emit_cmp_and_jump_insns (x, y, LTU, NULL_RTX,
+ cmp_mode, true, ltu_label,
+ profile_probability::guessed_always ()
+ .apply_scale (5, 10));
+
+ emit_move_insn (target, const1_rtx);
+ emit_jump (res_label);
+ emit_barrier ();
+
+ emit_label (ltu_label);
+ emit_move_insn (target, constm1_rtx);
+ part_res = target;
+ }
+ /* If the result was produced somewhere other than TARGET, convert
+ it into TARGET.  */
+ if (target != part_res)
+ convert_move (target, part_res, false);
+ }
+
+ emit_label (res_label);
+
+ return target;
+}
+
/* Copy all or part of a value X into registers starting at REGNO.
The number of registers to be filled is NREGS. */
diff --git a/gcc/expr.h b/gcc/expr.h
index d9fc47c9114..976c8b69fc1 100644
--- a/gcc/expr.h
+++ b/gcc/expr.h
@@ -138,7 +138,8 @@ extern rtx emit_block_move_hints (rtx, rtx, rtx, enum block_op_methods,
bool might_overlap = false,
unsigned ctz_size = 0);
extern rtx emit_block_cmp_hints (rtx, rtx, rtx, tree, rtx, bool,
- by_pieces_constfn, void *);
+ by_pieces_constfn, void *,
+ unsigned ctz_len = 0);
extern bool emit_storent_insn (rtx to, rtx from);
/* Copy all or part of a value X into registers starting at REGNO.
More information about the Gcc-cvs
mailing list