[gcc(refs/users/aoliva/heads/testme)] [PR94092] introduce try store by multiple pieces
Alexandre Oliva
aoliva@gcc.gnu.org
Thu Feb 11 05:19:12 GMT 2021
https://gcc.gnu.org/g:4257c4da95860043d600cfb9474f4a4ba5bddc64
commit 4257c4da95860043d600cfb9474f4a4ba5bddc64
Author: Alexandre Oliva <oliva@adacore.com>
Date: Wed Feb 3 17:12:07 2021 -0300
[PR94092] introduce try store by multiple pieces
The ldist pass turns even very short loops into memset calls. E.g.,
the TFmode emulation calls end with a loop of up to 3 iterations, to
zero out trailing words, and the loop distribution pass turns them
into calls of the memset builtin.
Though short constant-length memsets are usually dealt with
efficiently, for non-constant-length ones, the options are setmemM, or
a function call.
RISC-V doesn't have any setmemM pattern, so the loops above end up
"optimized" into memset calls, incurring not only the overhead of an
explicit call, but also discarding the information the compiler has
about the alignment of the destination, and that the length is a
multiple of the word alignment.
This patch handles variable lengths with multiple conditional
power-of-2-constant-sized stores-by-pieces, so as to reduce the
overhead of length compares.
It also propagates the alignment of the memset dest, introduced by
loop-distribution, to the SSA pointer used to hold it, so that it is
not discarded right away, and may be recovered by the memset builtin
expander.
Finally, it computes the ctz of the length, and uses it to omit blocks
for stores of small byte counts when we're storing whole words.
tree_ctz is improved so as to look at the length DEF stmt, and zero
out the least-significant bits if it's a multiply by a power of two.
for gcc/ChangeLog
PR tree-optimization/94092
* builtins.c (try_store_by_multiple_pieces): New.
(expand_builtin_memset_args): Use it. If target_char_cast
fails, proceed as for non-constant val. Pass len's ctz to...
* expr.c (clear_storage_hints): ... this. Try store by
multiple pieces after setmem.
(clear_storage): Adjust.
* expr.h (clear_storage_hints): Likewise.
(try_store_by_multiple_pieces): Declare.
* tree-loop-distribution.c: Include builtins.h.
(generate_memset_builtin): Propagate dst_base alignmen to mem.
* tree-ssanames.c (get_nonzero_bits): Zero out low bits of
integral types, when a MULT_EXPR INTEGER_CST operand ensures
the result will be a multiple of a power of two.
Diff:
---
gcc/builtins.c | 113 ++++++++++++++++++++++++++++++++++++++++---
gcc/expr.c | 9 +++-
gcc/expr.h | 12 ++++-
gcc/tree-loop-distribution.c | 11 +++++
gcc/tree-ssanames.c | 23 ++++++++-
5 files changed, 156 insertions(+), 12 deletions(-)
diff --git a/gcc/builtins.c b/gcc/builtins.c
index 0aed008687c..44f3af92536 100644
--- a/gcc/builtins.c
+++ b/gcc/builtins.c
@@ -6600,6 +6600,97 @@ expand_builtin_memset (tree exp, rtx target, machine_mode mode)
return expand_builtin_memset_args (dest, val, len, target, mode, exp);
}
+/* Try to store VAL (or, if NULL_RTX, VALC) in LEN bytes starting at TO.
+ Return TRUE if successful, FALSE otherwise. TO is assumed to be
+ aligned at an ALIGN boundary. LEN is assumed to be a multiple of
+ 1<<CTZ_LEN, and between min_len and max_len.
+
+ The strategy is to issue one store_by_pieces for each power of two,
+ for most to least significant, guarded by a test on whether there are
+ at least that many bytes to copy. */
+
+bool
+try_store_by_multiple_pieces (rtx to, rtx len, int ctz_len,
+ unsigned HOST_WIDE_INT min_len,
+ unsigned HOST_WIDE_INT max_len,
+ rtx val, char valc, unsigned int align)
+{
+ int max_bits = floor_log2 (max_len);
+ int min_bits = floor_log2 (min_len);
+
+ if (val)
+ valc = 1;
+
+ if (!can_store_by_pieces ((HOST_WIDE_INT_1U << max_bits) * 2
+ - (HOST_WIDE_INT_1U << ctz_len),
+ builtin_memset_read_str, &valc, align, true))
+ return false;
+
+ rtx (*constfun) (void *, HOST_WIDE_INT, scalar_int_mode);
+ void *constfundata;
+ if (val)
+ {
+ constfun = builtin_memset_gen_str;
+ constfundata = val = force_reg (TYPE_MODE (unsigned_char_type_node),
+ val);
+ }
+ else
+ {
+ constfun = builtin_memset_read_str;
+ constfundata = &valc;
+ }
+
+ /* Bits above SHR_BITS don't need to be tested. */
+ int shr_bits = (max_bits != min_bits
+ ? max_bits
+ : floor_log2 (max_len ^ min_len));
+
+ rtx ptr = copy_addr_to_reg (convert_to_mode (ptr_mode, XEXP (to, 0), 0));
+ rtx rem = copy_to_mode_reg (ptr_mode, convert_to_mode (ptr_mode, len, 0));
+ set_mem_align (to, align);
+
+ for (int i = max_bits; i >= ctz_len; i--)
+ {
+ unsigned HOST_WIDE_INT blksize = HOST_WIDE_INT_1U << i;
+ rtx_code_label *label = NULL;
+
+ if (i <= shr_bits)
+ {
+ label = gen_label_rtx ();
+ emit_cmp_and_jump_insns (rem, GEN_INT (blksize), LT, NULL,
+ ptr_mode, 1, label,
+ profile_probability::even ());
+ }
+ else if ((max_len & blksize) == 0)
+ continue;
+
+ if (align > blksize * BITS_PER_UNIT)
+ {
+ align = blksize * BITS_PER_UNIT;
+ set_mem_align (to, align);
+ }
+
+ to = store_by_pieces (to, blksize,
+ constfun, constfundata,
+ align, true, RETURN_END);
+
+ if (label)
+ {
+ emit_move_insn (ptr, XEXP (to, 0));
+ XEXP (to, 0) = ptr;
+
+ clear_mem_offset (to);
+ clear_mem_size (to);
+
+ emit_move_insn (rem, plus_constant (ptr_mode, rem, -blksize));
+
+ emit_label (label);
+ }
+ }
+
+ return true;
+}
+
/* Helper function to do the actual work for expand_builtin_memset. The
arguments to the builtin_memset call DEST, VAL, and LEN are broken out
so that this can also be called without constructing an actual CALL_EXPR.
@@ -6654,7 +6745,8 @@ expand_builtin_memset_args (tree dest, tree val, tree len,
dest_mem = get_memory_rtx (dest, len);
val_mode = TYPE_MODE (unsigned_char_type_node);
- if (TREE_CODE (val) != INTEGER_CST)
+ if (TREE_CODE (val) != INTEGER_CST
+ || target_char_cast (val, &c))
{
rtx val_rtx;
@@ -6678,7 +6770,12 @@ expand_builtin_memset_args (tree dest, tree val, tree len,
else if (!set_storage_via_setmem (dest_mem, len_rtx, val_rtx,
dest_align, expected_align,
expected_size, min_size, max_size,
- probable_max_size))
+ probable_max_size)
+ && !try_store_by_multiple_pieces (dest_mem, len_rtx,
+ tree_ctz (len),
+ min_size, max_size,
+ val_rtx, 0,
+ dest_align))
goto do_libcall;
dest_mem = force_operand (XEXP (dest_mem, 0), NULL_RTX);
@@ -6686,9 +6783,6 @@ expand_builtin_memset_args (tree dest, tree val, tree len,
return dest_mem;
}
- if (target_char_cast (val, &c))
- goto do_libcall;
-
if (c)
{
if (tree_fits_uhwi_p (len)
@@ -6702,7 +6796,12 @@ expand_builtin_memset_args (tree dest, tree val, tree len,
gen_int_mode (c, val_mode),
dest_align, expected_align,
expected_size, min_size, max_size,
- probable_max_size))
+ probable_max_size)
+ && !try_store_by_multiple_pieces (dest_mem, len_rtx,
+ tree_ctz (len),
+ min_size, max_size,
+ NULL_RTX, c,
+ dest_align))
goto do_libcall;
dest_mem = force_operand (XEXP (dest_mem, 0), NULL_RTX);
@@ -6716,7 +6815,7 @@ expand_builtin_memset_args (tree dest, tree val, tree len,
? BLOCK_OP_TAILCALL : BLOCK_OP_NORMAL,
expected_align, expected_size,
min_size, max_size,
- probable_max_size);
+ probable_max_size, tree_ctz (len));
if (dest_addr == 0)
{
diff --git a/gcc/expr.c b/gcc/expr.c
index 86dc1b6c973..f92e4136f67 100644
--- a/gcc/expr.c
+++ b/gcc/expr.c
@@ -3056,7 +3056,8 @@ clear_storage_hints (rtx object, rtx size, enum block_op_methods method,
unsigned int expected_align, HOST_WIDE_INT expected_size,
unsigned HOST_WIDE_INT min_size,
unsigned HOST_WIDE_INT max_size,
- unsigned HOST_WIDE_INT probable_max_size)
+ unsigned HOST_WIDE_INT probable_max_size,
+ unsigned ctz_size)
{
machine_mode mode = GET_MODE (object);
unsigned int align;
@@ -3103,6 +3104,10 @@ clear_storage_hints (rtx object, rtx size, enum block_op_methods method,
expected_align, expected_size,
min_size, max_size, probable_max_size))
;
+ else if (try_store_by_multiple_pieces (object, size, ctz_size,
+ min_size, max_size,
+ NULL_RTX, 0, align))
+ ;
else if (ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (object)))
return set_storage_via_libcall (object, size, const0_rtx,
method == BLOCK_OP_TAILCALL);
@@ -3120,7 +3125,7 @@ clear_storage (rtx object, rtx size, enum block_op_methods method)
min = max = UINTVAL (size);
else
max = GET_MODE_MASK (GET_MODE (size));
- return clear_storage_hints (object, size, method, 0, -1, min, max, max);
+ return clear_storage_hints (object, size, method, 0, -1, min, max, max, 0);
}
diff --git a/gcc/expr.h b/gcc/expr.h
index 1f0177a4cfa..6ff2384df63 100644
--- a/gcc/expr.h
+++ b/gcc/expr.h
@@ -193,7 +193,8 @@ extern rtx clear_storage_hints (rtx, rtx, enum block_op_methods,
unsigned int, HOST_WIDE_INT,
unsigned HOST_WIDE_INT,
unsigned HOST_WIDE_INT,
- unsigned HOST_WIDE_INT);
+ unsigned HOST_WIDE_INT,
+ unsigned);
/* The same, but always output an library call. */
extern rtx set_storage_via_libcall (rtx, rtx, rtx, bool = false);
@@ -224,6 +225,15 @@ extern int can_store_by_pieces (unsigned HOST_WIDE_INT,
extern rtx store_by_pieces (rtx, unsigned HOST_WIDE_INT, by_pieces_constfn,
void *, unsigned int, bool, memop_ret);
+/* If can_store_by_pieces passes for worst-case values near MAX_LEN, call
+ store_by_pieces within conditionals so as to handle variable LEN efficiently,
+ storing VAL, if non-NULL_RTX, or valc instead. */
+extern bool try_store_by_multiple_pieces (rtx to, rtx len, int ctz_len,
+ unsigned HOST_WIDE_INT min_len,
+ unsigned HOST_WIDE_INT max_len,
+ rtx val, char valc,
+ unsigned int align);
+
/* Emit insns to set X from Y. */
extern rtx_insn *emit_move_insn (rtx, rtx);
extern rtx_insn *gen_move_insn (rtx, rtx);
diff --git a/gcc/tree-loop-distribution.c b/gcc/tree-loop-distribution.c
index 7ee19fc8677..0e219cbcee3 100644
--- a/gcc/tree-loop-distribution.c
+++ b/gcc/tree-loop-distribution.c
@@ -115,6 +115,7 @@ along with GCC; see the file COPYING3. If not see
#include "tree-vectorizer.h"
#include "tree-eh.h"
#include "gimple-fold.h"
+#include "builtins.h"
#define MAX_DATAREFS_NUM \
@@ -1155,6 +1156,16 @@ generate_memset_builtin (class loop *loop, partition *partition)
mem = force_gimple_operand_gsi (&gsi, mem, true, NULL_TREE,
false, GSI_CONTINUE_LINKING);
+ if (TREE_CODE (mem) == SSA_NAME)
+ if (ptr_info_def *pi = get_ptr_info (mem))
+ {
+ unsigned al;
+ unsigned HOST_WIDE_INT misal;
+ if (get_pointer_alignment_1 (builtin->dst_base, &al, &misal))
+ set_ptr_info_alignment (pi, al / BITS_PER_UNIT,
+ misal / BITS_PER_UNIT);
+ }
+
/* This exactly matches the pattern recognition in classify_partition. */
val = gimple_assign_rhs1 (DR_STMT (builtin->dst_dr));
/* Handle constants like 0x15151515 and similarly
diff --git a/gcc/tree-ssanames.c b/gcc/tree-ssanames.c
index 51a26d2fce1..c4b5bf2a499 100644
--- a/gcc/tree-ssanames.c
+++ b/gcc/tree-ssanames.c
@@ -546,10 +546,29 @@ get_nonzero_bits (const_tree name)
}
range_info_def *ri = SSA_NAME_RANGE_INFO (name);
+ wide_int ret;
if (!ri)
- return wi::shwi (-1, precision);
+ ret = wi::shwi (-1, precision);
+ else
+ ret = ri->get_nonzero_bits ();
+
+ /* If NAME is defined as a multiple of a constant C, we know the ctz(C) low
+ bits are zero. ??? Should we handle LSHIFT_EXPR too? Non-constants,
+ e.g. the minimum shift count, and ctz from both MULT_EXPR operands? That
+ could make for deep recursion. */
+ if (INTEGRAL_TYPE_P (TREE_TYPE (name))
+ && SSA_NAME_DEF_STMT (name)
+ && is_gimple_assign (SSA_NAME_DEF_STMT (name))
+ && gimple_assign_rhs_code (SSA_NAME_DEF_STMT (name)) == MULT_EXPR
+ && TREE_CODE (gimple_assign_rhs2 (SSA_NAME_DEF_STMT (name))) == INTEGER_CST)
+ {
+ unsigned HOST_WIDE_INT bits
+ = tree_ctz (gimple_assign_rhs2 (SSA_NAME_DEF_STMT (name)));
+ wide_int mask = wi::shwi (-1, precision) << bits;
+ ret &= mask;
+ }
- return ri->get_nonzero_bits ();
+ return ret;
}
/* Return TRUE is OP, an SSA_NAME has a range of values [0..1], false
More information about the Gcc-cvs
mailing list