[PATCH v3] Add QI vector mode support to by-pieces for memset

Wed Jul 21 14:50:15 GMT 2021

"H.J. Lu" <hjl.tools@gmail.com> writes:
> diff --git a/gcc/builtins.c b/gcc/builtins.c
> index 39ab139b7e1..1972301ce3c 100644
> --- a/gcc/builtins.c
> +++ b/gcc/builtins.c
> @@ -3890,13 +3890,16 @@ expand_builtin_strnlen (tree exp, rtx target, machine_mode target_mode)
>  
>  static rtx
>  builtin_memcpy_read_str (void *data, void *, HOST_WIDE_INT offset,
> -			 scalar_int_mode mode)
> +			 fixed_size_mode mode)
>  {
>    /* The REPresentation pointed to by DATA need not be a nul-terminated
>       string but the caller guarantees it's large enough for MODE.  */
>    const char *rep = (const char *) data;
>  
> -  return c_readstr (rep + offset, mode, /*nul_terminated=*/false);
> +  /* NB: Vector mode in the by-pieces infrastructure is only used by
> +     the memset expander.  */

Sorry to nitpick, but I guess this might get out out-of-date.  Maybe:

  /* The by-pieces infrastructure does not try to pick a vector mode
     for memcpy expansion.  */

> +  return c_readstr (rep + offset, as_a <scalar_int_mode> (mode),
> +		    /*nul_terminated=*/false);
>  }
>  
>  /* LEN specify length of the block of memcpy/memset operation.
> @@ -6478,14 +6481,16 @@ expand_builtin_stpncpy (tree exp, rtx)
>  
>  rtx
>  builtin_strncpy_read_str (void *data, void *, HOST_WIDE_INT offset,
> -			  scalar_int_mode mode)
> +			  fixed_size_mode mode)
>  {
>    const char *str = (const char *) data;
>  
>    if ((unsigned HOST_WIDE_INT) offset > strlen (str))
>      return const0_rtx;
>  
> -  return c_readstr (str + offset, mode);
> +  /* NB: Vector mode in the by-pieces infrastructure is only used by
> +     the memset expander.  */

Similarly here for strncpy expansion.

> +  return c_readstr (str + offset, as_a <scalar_int_mode> (mode));
>  }
>  
>  /* Helper to check the sizes of sequences and the destination of calls
> @@ -6686,30 +6691,117 @@ expand_builtin_strncpy (tree exp, rtx target)
>    return NULL_RTX;
>  }
>  
> -/* Callback routine for store_by_pieces.  Read GET_MODE_BITSIZE (MODE)
> -   bytes from constant string DATA + OFFSET and return it as target
> -   constant.  If PREV isn't nullptr, it has the RTL info from the
> +/* Return the RTL of a register in MODE generated from PREV in the
>     previous iteration.  */
>  
> -rtx
> -builtin_memset_read_str (void *data, void *prevp,
> -			 HOST_WIDE_INT offset ATTRIBUTE_UNUSED,
> -			 scalar_int_mode mode)
> +static rtx
> +gen_memset_value_from_prev (by_pieces_prev *prev, fixed_size_mode mode)
>  {
> -  by_pieces_prev *prev = (by_pieces_prev *) prevp;
> +  rtx target = nullptr;
>    if (prev != nullptr && prev->data != nullptr)
>      {
>        /* Use the previous data in the same mode.  */
>        if (prev->mode == mode)
>  	return prev->data;
> +
> +      fixed_size_mode prev_mode = prev->mode;
> +
> +      /* Don't use the previous data to write QImode if it is in a
> +	 vector mode.  */
> +      if (VECTOR_MODE_P (prev_mode) && mode == QImode)
> +	return target;
> +
> +      rtx prev_rtx = prev->data;
> +
> +      if (REG_P (prev_rtx)
> +	  && HARD_REGISTER_P (prev_rtx)
> +	  && lowpart_subreg_regno (REGNO (prev_rtx), prev_mode, mode) < 0)
> +	{
> +	  /* If we can't put a hard register in MODE, first generate a
> +	     subreg of word mode if the previous mode is wider than word
> +	     mode and word mode is wider than MODE.  */
> +	  if (UNITS_PER_WORD < GET_MODE_SIZE (prev_mode)
> +	      && UNITS_PER_WORD > GET_MODE_SIZE (mode))
> +	    {
> +	      prev_rtx = lowpart_subreg (word_mode, prev_rtx,
> +					 prev_mode);
> +	      if (prev_rtx != nullptr)
> +		prev_mode = word_mode;
> +	    }
> +	  else
> +	    prev_rtx = nullptr;

I don't understand this.  Why not just do the:

      if (REG_P (prev_rtx)
	  && HARD_REGISTER_P (prev_rtx)
	  && lowpart_subreg_regno (REGNO (prev_rtx), prev_mode, mode) < 0)
        prev_rtx = copy_to_reg (prev_rtx);

that I suggested in the previous review?

IMO anything that relies on a sequence of two subreg operations is
doing something wrong.

> +	}
> +      if (prev_rtx != nullptr)
> +	target = lowpart_subreg (mode, prev_rtx, prev_mode);
>      }
> +  return target;
> +}
> +
> […]
> @@ -769,21 +769,41 @@ alignment_for_piecewise_move (unsigned int max_pieces, unsigned int align)
>    return align;
>  }
>  
> -/* Return the widest integer mode that is narrower than SIZE bytes.  */
> +/* Return the widest QI vector, if QI_MODE is true, or integer mode
> +   that is narrower than SIZE bytes.  */
>  
> -static scalar_int_mode
> -widest_int_mode_for_size (unsigned int size)
> +static fixed_size_mode
> +widest_fixed_size_mode_for_size (unsigned int size, bool qi_vector)
>  {
> -  scalar_int_mode result = NARROWEST_INT_MODE;
> +  machine_mode result = NARROWEST_INT_MODE;
>  
>    gcc_checking_assert (size > 1);
>  
> +  /* Use QI vector only if size is wider than a WORD.  */
> +  if (qi_vector && size > UNITS_PER_WORD)
> +    {
> +      machine_mode mode;
> +      FOR_EACH_MODE_IN_CLASS (mode, MODE_VECTOR_INT)
> +	if (GET_MODE_INNER (mode) == QImode
> +	    && GET_MODE_SIZE (mode).is_constant ())
> +	  {
> +	    if (GET_MODE_SIZE (mode).to_constant () >= size)
> +	      break;
> +	    if (optab_handler (vec_duplicate_optab, mode)
> +		!= CODE_FOR_nothing)
> +	      result = mode;
> +	  }
> +
> +      if (result != NARROWEST_INT_MODE)
> +	return as_a <fixed_size_mode> (result);
> +    }
> +
>    opt_scalar_int_mode tmode;
>    FOR_EACH_MODE_IN_CLASS (tmode, MODE_INT)
>      if (GET_MODE_SIZE (tmode.require ()) < size)
>        result = tmode.require ();
>  
> -  return result;
> +  return as_a <fixed_size_mode> (result);
>  }

A less cast-heavy way to write this would be:

  fixed_size_mode result = NARROWEST_INT_MODE;

  gcc_checking_assert (size > 1);

  /* Use QI vector only if size is wider than a WORD.  */
  if (qi_vector && size > UNITS_PER_WORD)
    {
      machine_mode mode;
      fixed_size_mode candidate;
      FOR_EACH_MODE_IN_CLASS (mode, MODE_VECTOR_INT)
	if (is_a<fixed_size_mode> (mode, &candidate)
	    && GET_MODE_INNER (candidate) == QImode)
	  {
	    if (GET_MODE_SIZE (candidate) >= size)
	      break;
	    if (optab_handler (vec_duplicate_optab, candidate)
		!= CODE_FOR_nothing)
	      result = candidate;
	  }

      if (result != NARROWEST_INT_MODE)
	return result;
    }

  opt_scalar_int_mode tmode;
  FOR_EACH_MODE_IN_CLASS (tmode, MODE_INT)
    if (GET_MODE_SIZE (tmode.require ()) < size)
      result = tmode.require ();

  return result;

> @@ -1148,13 +1178,43 @@ op_by_pieces_d::get_usable_mode (scalar_int_mode mode, unsigned int len)
>        size = GET_MODE_SIZE (mode);
>        if (len >= size && prepare_mode (mode, m_align))
>  	break;
> -      /* NB: widest_int_mode_for_size checks SIZE > 1.  */
> -      mode = widest_int_mode_for_size (size);
> +      /* NB: widest_fixed_size_mode_for_size checks SIZE > 1.  */
> +      mode = widest_fixed_size_mode_for_size (size, m_qi_vector_mode);
>      }
>    while (1);
>    return mode;
>  }
>  
> +/* Return the smallest integer or QI vector mode that is not narrower
> +   than SIZE bytes.  */
> +
> +fixed_size_mode
> +op_by_pieces_d::smallest_fixed_size_mode_for_size (unsigned int size)
> +{
> +  /* Use QI vector only for > size of WORD.  */
> +  if (m_qi_vector_mode && size > UNITS_PER_WORD)
> +    {
> +      machine_mode mode;
> +      FOR_EACH_MODE_IN_CLASS (mode, MODE_VECTOR_INT)
> +	if (GET_MODE_INNER (mode) == QImode
> +	    && GET_MODE_SIZE (mode).is_constant ())
> +	  {
> +	    unsigned int mode_size = GET_MODE_SIZE (mode).to_constant ();
> +
> +	    /* NB: Don't return a mode wider than M_LEN.  */
> +	    if (mode_size > m_len)
> +	      break;
> +
> +	    if (mode_size >= size
> +		&& (optab_handler (vec_duplicate_optab, mode)
> +		    != CODE_FOR_nothing))
> +	      return as_a <fixed_size_mode> (mode);
> +	  }
> +    }

Same idea here.

I realise this is pre-existing, but the heavy use of “NB:” seems a bit odd.
I don't think it's something that the patch needs to adopt.

Otherwise it looks good, thanks.  I think the main sticking point is
the subreg thing.

Richard