[PATCH v3] Add QI vector mode support to by-pieces for memset
Richard Sandiford
richard.sandiford@arm.com
Wed Jul 21 14:50:15 GMT 2021
"H.J. Lu" <hjl.tools@gmail.com> writes:
> diff --git a/gcc/builtins.c b/gcc/builtins.c
> index 39ab139b7e1..1972301ce3c 100644
> --- a/gcc/builtins.c
> +++ b/gcc/builtins.c
> @@ -3890,13 +3890,16 @@ expand_builtin_strnlen (tree exp, rtx target, machine_mode target_mode)
>
> static rtx
> builtin_memcpy_read_str (void *data, void *, HOST_WIDE_INT offset,
> - scalar_int_mode mode)
> + fixed_size_mode mode)
> {
> /* The REPresentation pointed to by DATA need not be a nul-terminated
> string but the caller guarantees it's large enough for MODE. */
> const char *rep = (const char *) data;
>
> - return c_readstr (rep + offset, mode, /*nul_terminated=*/false);
> + /* NB: Vector mode in the by-pieces infrastructure is only used by
> + the memset expander. */
Sorry to nitpick, but I guess this might get out out-of-date. Maybe:
/* The by-pieces infrastructure does not try to pick a vector mode
for memcpy expansion. */
> + return c_readstr (rep + offset, as_a <scalar_int_mode> (mode),
> + /*nul_terminated=*/false);
> }
>
> /* LEN specify length of the block of memcpy/memset operation.
> @@ -6478,14 +6481,16 @@ expand_builtin_stpncpy (tree exp, rtx)
>
> rtx
> builtin_strncpy_read_str (void *data, void *, HOST_WIDE_INT offset,
> - scalar_int_mode mode)
> + fixed_size_mode mode)
> {
> const char *str = (const char *) data;
>
> if ((unsigned HOST_WIDE_INT) offset > strlen (str))
> return const0_rtx;
>
> - return c_readstr (str + offset, mode);
> + /* NB: Vector mode in the by-pieces infrastructure is only used by
> + the memset expander. */
Similarly here for strncpy expansion.
> + return c_readstr (str + offset, as_a <scalar_int_mode> (mode));
> }
>
> /* Helper to check the sizes of sequences and the destination of calls
> @@ -6686,30 +6691,117 @@ expand_builtin_strncpy (tree exp, rtx target)
> return NULL_RTX;
> }
>
> -/* Callback routine for store_by_pieces. Read GET_MODE_BITSIZE (MODE)
> - bytes from constant string DATA + OFFSET and return it as target
> - constant. If PREV isn't nullptr, it has the RTL info from the
> +/* Return the RTL of a register in MODE generated from PREV in the
> previous iteration. */
>
> -rtx
> -builtin_memset_read_str (void *data, void *prevp,
> - HOST_WIDE_INT offset ATTRIBUTE_UNUSED,
> - scalar_int_mode mode)
> +static rtx
> +gen_memset_value_from_prev (by_pieces_prev *prev, fixed_size_mode mode)
> {
> - by_pieces_prev *prev = (by_pieces_prev *) prevp;
> + rtx target = nullptr;
> if (prev != nullptr && prev->data != nullptr)
> {
> /* Use the previous data in the same mode. */
> if (prev->mode == mode)
> return prev->data;
> +
> + fixed_size_mode prev_mode = prev->mode;
> +
> + /* Don't use the previous data to write QImode if it is in a
> + vector mode. */
> + if (VECTOR_MODE_P (prev_mode) && mode == QImode)
> + return target;
> +
> + rtx prev_rtx = prev->data;
> +
> + if (REG_P (prev_rtx)
> + && HARD_REGISTER_P (prev_rtx)
> + && lowpart_subreg_regno (REGNO (prev_rtx), prev_mode, mode) < 0)
> + {
> + /* If we can't put a hard register in MODE, first generate a
> + subreg of word mode if the previous mode is wider than word
> + mode and word mode is wider than MODE. */
> + if (UNITS_PER_WORD < GET_MODE_SIZE (prev_mode)
> + && UNITS_PER_WORD > GET_MODE_SIZE (mode))
> + {
> + prev_rtx = lowpart_subreg (word_mode, prev_rtx,
> + prev_mode);
> + if (prev_rtx != nullptr)
> + prev_mode = word_mode;
> + }
> + else
> + prev_rtx = nullptr;
I don't understand this. Why not just do the:
if (REG_P (prev_rtx)
&& HARD_REGISTER_P (prev_rtx)
&& lowpart_subreg_regno (REGNO (prev_rtx), prev_mode, mode) < 0)
prev_rtx = copy_to_reg (prev_rtx);
that I suggested in the previous review?
IMO anything that relies on a sequence of two subreg operations is
doing something wrong.
> + }
> + if (prev_rtx != nullptr)
> + target = lowpart_subreg (mode, prev_rtx, prev_mode);
> }
> + return target;
> +}
> +
> […]
> @@ -769,21 +769,41 @@ alignment_for_piecewise_move (unsigned int max_pieces, unsigned int align)
> return align;
> }
>
> -/* Return the widest integer mode that is narrower than SIZE bytes. */
> +/* Return the widest QI vector, if QI_MODE is true, or integer mode
> + that is narrower than SIZE bytes. */
>
> -static scalar_int_mode
> -widest_int_mode_for_size (unsigned int size)
> +static fixed_size_mode
> +widest_fixed_size_mode_for_size (unsigned int size, bool qi_vector)
> {
> - scalar_int_mode result = NARROWEST_INT_MODE;
> + machine_mode result = NARROWEST_INT_MODE;
>
> gcc_checking_assert (size > 1);
>
> + /* Use QI vector only if size is wider than a WORD. */
> + if (qi_vector && size > UNITS_PER_WORD)
> + {
> + machine_mode mode;
> + FOR_EACH_MODE_IN_CLASS (mode, MODE_VECTOR_INT)
> + if (GET_MODE_INNER (mode) == QImode
> + && GET_MODE_SIZE (mode).is_constant ())
> + {
> + if (GET_MODE_SIZE (mode).to_constant () >= size)
> + break;
> + if (optab_handler (vec_duplicate_optab, mode)
> + != CODE_FOR_nothing)
> + result = mode;
> + }
> +
> + if (result != NARROWEST_INT_MODE)
> + return as_a <fixed_size_mode> (result);
> + }
> +
> opt_scalar_int_mode tmode;
> FOR_EACH_MODE_IN_CLASS (tmode, MODE_INT)
> if (GET_MODE_SIZE (tmode.require ()) < size)
> result = tmode.require ();
>
> - return result;
> + return as_a <fixed_size_mode> (result);
> }
A less cast-heavy way to write this would be:
fixed_size_mode result = NARROWEST_INT_MODE;
gcc_checking_assert (size > 1);
/* Use QI vector only if size is wider than a WORD. */
if (qi_vector && size > UNITS_PER_WORD)
{
machine_mode mode;
fixed_size_mode candidate;
FOR_EACH_MODE_IN_CLASS (mode, MODE_VECTOR_INT)
if (is_a<fixed_size_mode> (mode, &candidate)
&& GET_MODE_INNER (candidate) == QImode)
{
if (GET_MODE_SIZE (candidate) >= size)
break;
if (optab_handler (vec_duplicate_optab, candidate)
!= CODE_FOR_nothing)
result = candidate;
}
if (result != NARROWEST_INT_MODE)
return result;
}
opt_scalar_int_mode tmode;
FOR_EACH_MODE_IN_CLASS (tmode, MODE_INT)
if (GET_MODE_SIZE (tmode.require ()) < size)
result = tmode.require ();
return result;
> @@ -1148,13 +1178,43 @@ op_by_pieces_d::get_usable_mode (scalar_int_mode mode, unsigned int len)
> size = GET_MODE_SIZE (mode);
> if (len >= size && prepare_mode (mode, m_align))
> break;
> - /* NB: widest_int_mode_for_size checks SIZE > 1. */
> - mode = widest_int_mode_for_size (size);
> + /* NB: widest_fixed_size_mode_for_size checks SIZE > 1. */
> + mode = widest_fixed_size_mode_for_size (size, m_qi_vector_mode);
> }
> while (1);
> return mode;
> }
>
> +/* Return the smallest integer or QI vector mode that is not narrower
> + than SIZE bytes. */
> +
> +fixed_size_mode
> +op_by_pieces_d::smallest_fixed_size_mode_for_size (unsigned int size)
> +{
> + /* Use QI vector only for > size of WORD. */
> + if (m_qi_vector_mode && size > UNITS_PER_WORD)
> + {
> + machine_mode mode;
> + FOR_EACH_MODE_IN_CLASS (mode, MODE_VECTOR_INT)
> + if (GET_MODE_INNER (mode) == QImode
> + && GET_MODE_SIZE (mode).is_constant ())
> + {
> + unsigned int mode_size = GET_MODE_SIZE (mode).to_constant ();
> +
> + /* NB: Don't return a mode wider than M_LEN. */
> + if (mode_size > m_len)
> + break;
> +
> + if (mode_size >= size
> + && (optab_handler (vec_duplicate_optab, mode)
> + != CODE_FOR_nothing))
> + return as_a <fixed_size_mode> (mode);
> + }
> + }
Same idea here.
I realise this is pre-existing, but the heavy use of “NB:” seems a bit odd.
I don't think it's something that the patch needs to adopt.
Otherwise it looks good, thanks. I think the main sticking point is
the subreg thing.
Richard
More information about the Gcc-patches
mailing list