This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

Re: [PATCH] RISC-V: Implement movmemsi


Committed.

On Mon, 06 Nov 2017 17:31:05 PST (-0800), Palmer Dabbelt wrote:
> From: Andrew Waterman <andrew@sifive.com>
>
> Without this we aren't getting proper memcpy inlining on RISC-V systems,
> which is particularly disastrous for Dhrystone performance on RV32IM
> systems.
>
> gcc/ChangeLog
>
> 2017-11-06  Andrew Waterman  <andrew@sifive.com>
>
>         * config/riscv/riscv-protos.h (riscv_hard_regno_nregs): New
>         prototype.
>         (riscv_expand_block_move): Likewise.
>         * config/riscv/riscv.h (MOVE_RATIO): Tune cost to the movmemsi
>         implementation.
>         (RISCV_MAX_MOVE_BYTES_PER_LOOP_ITER): New define.
>         (RISCV_MAX_MOVE_BYTES_STRAIGHT): New define.
>         * config/riscv/riscv.c (riscv_block_move_straight): New
>         function.
>         (riscv_adjust_block_mem): Likewise.
>         (riscv_block_move_loop): Likewise.
>         (riscv_expand_block_move): Likewise.
>         * config/riscv/riscv.md (movmemsi): New pattern.
> ---
>  gcc/config/riscv/riscv-protos.h |   4 +-
>  gcc/config/riscv/riscv.c        | 156 ++++++++++++++++++++++++++++++++++++++++
>  gcc/config/riscv/riscv.h        |  21 +++++-
>  gcc/config/riscv/riscv.md       |  13 ++++
>  4 files changed, 190 insertions(+), 4 deletions(-)
>
> diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h
> index ae551fb39775..34f9859928e2 100644
> --- a/gcc/config/riscv/riscv-protos.h
> +++ b/gcc/config/riscv/riscv-protos.h
> @@ -67,7 +67,9 @@ extern HOST_WIDE_INT riscv_initial_elimination_offset (int, int);
>  extern void riscv_expand_prologue (void);
>  extern void riscv_expand_epilogue (bool);
>  extern bool riscv_can_use_return_insn (void);
> -extern rtx riscv_function_value (const_tree, const_tree, machine_mode);
> +extern rtx riscv_function_value (const_tree, const_tree, enum machine_mode);
> +extern unsigned int riscv_hard_regno_nregs (int, enum machine_mode);
> +extern bool riscv_expand_block_move (rtx, rtx, rtx);
>
>  /* Routines implemented in riscv-c.c.  */
>  void riscv_cpu_cpp_builtins (cpp_reader *);
> diff --git a/gcc/config/riscv/riscv.c b/gcc/config/riscv/riscv.c
> index 4a16a75fbafa..e9783e920ef6 100644
> --- a/gcc/config/riscv/riscv.c
> +++ b/gcc/config/riscv/riscv.c
> @@ -2642,6 +2642,162 @@ riscv_legitimize_call_address (rtx addr)
>    return addr;
>  }
>
> +/* Emit straight-line code to move LENGTH bytes from SRC to DEST.
> +   Assume that the areas do not overlap.  */
> +
> +static void
> +riscv_block_move_straight (rtx dest, rtx src, HOST_WIDE_INT length)
> +{
> +  HOST_WIDE_INT offset, delta;
> +  unsigned HOST_WIDE_INT bits;
> +  int i;
> +  enum machine_mode mode;
> +  rtx *regs;
> +
> +  bits = MAX (BITS_PER_UNIT,
> +	      MIN (BITS_PER_WORD, MIN (MEM_ALIGN (src), MEM_ALIGN (dest))));
> +
> +  mode = mode_for_size (bits, MODE_INT, 0);
> +  delta = bits / BITS_PER_UNIT;
> +
> +  /* Allocate a buffer for the temporary registers.  */
> +  regs = XALLOCAVEC (rtx, length / delta);
> +
> +  /* Load as many BITS-sized chunks as possible.  Use a normal load if
> +     the source has enough alignment, otherwise use left/right pairs.  */
> +  for (offset = 0, i = 0; offset + delta <= length; offset += delta, i++)
> +    {
> +      regs[i] = gen_reg_rtx (mode);
> +      riscv_emit_move (regs[i], adjust_address (src, mode, offset));
> +    }
> +
> +  /* Copy the chunks to the destination.  */
> +  for (offset = 0, i = 0; offset + delta <= length; offset += delta, i++)
> +    riscv_emit_move (adjust_address (dest, mode, offset), regs[i]);
> +
> +  /* Mop up any left-over bytes.  */
> +  if (offset < length)
> +    {
> +      src = adjust_address (src, BLKmode, offset);
> +      dest = adjust_address (dest, BLKmode, offset);
> +      move_by_pieces (dest, src, length - offset,
> +		      MIN (MEM_ALIGN (src), MEM_ALIGN (dest)), 0);
> +    }
> +}
> +
> +/* Helper function for doing a loop-based block operation on memory
> +   reference MEM.  Each iteration of the loop will operate on LENGTH
> +   bytes of MEM.
> +
> +   Create a new base register for use within the loop and point it to
> +   the start of MEM.  Create a new memory reference that uses this
> +   register.  Store them in *LOOP_REG and *LOOP_MEM respectively.  */
> +
> +static void
> +riscv_adjust_block_mem (rtx mem, HOST_WIDE_INT length,
> +		       rtx *loop_reg, rtx *loop_mem)
> +{
> +  *loop_reg = copy_addr_to_reg (XEXP (mem, 0));
> +
> +  /* Although the new mem does not refer to a known location,
> +     it does keep up to LENGTH bytes of alignment.  */
> +  *loop_mem = change_address (mem, BLKmode, *loop_reg);
> +  set_mem_align (*loop_mem, MIN (MEM_ALIGN (mem), length * BITS_PER_UNIT));
> +}
> +
> +/* Move LENGTH bytes from SRC to DEST using a loop that moves BYTES_PER_ITER
> +   bytes at a time.  LENGTH must be at least BYTES_PER_ITER.  Assume that
> +   the memory regions do not overlap.  */
> +
> +static void
> +riscv_block_move_loop (rtx dest, rtx src, HOST_WIDE_INT length,
> +		      HOST_WIDE_INT bytes_per_iter)
> +{
> +  rtx label, src_reg, dest_reg, final_src, test;
> +  HOST_WIDE_INT leftover;
> +
> +  leftover = length % bytes_per_iter;
> +  length -= leftover;
> +
> +  /* Create registers and memory references for use within the loop.  */
> +  riscv_adjust_block_mem (src, bytes_per_iter, &src_reg, &src);
> +  riscv_adjust_block_mem (dest, bytes_per_iter, &dest_reg, &dest);
> +
> +  /* Calculate the value that SRC_REG should have after the last iteration
> +     of the loop.  */
> +  final_src = expand_simple_binop (Pmode, PLUS, src_reg, GEN_INT (length),
> +				   0, 0, OPTAB_WIDEN);
> +
> +  /* Emit the start of the loop.  */
> +  label = gen_label_rtx ();
> +  emit_label (label);
> +
> +  /* Emit the loop body.  */
> +  riscv_block_move_straight (dest, src, bytes_per_iter);
> +
> +  /* Move on to the next block.  */
> +  riscv_emit_move (src_reg, plus_constant (Pmode, src_reg, bytes_per_iter));
> +  riscv_emit_move (dest_reg, plus_constant (Pmode, dest_reg, bytes_per_iter));
> +
> +  /* Emit the loop condition.  */
> +  test = gen_rtx_NE (VOIDmode, src_reg, final_src);
> +  if (Pmode == DImode)
> +    emit_jump_insn (gen_cbranchdi4 (test, src_reg, final_src, label));
> +  else
> +    emit_jump_insn (gen_cbranchsi4 (test, src_reg, final_src, label));
> +
> +  /* Mop up any left-over bytes.  */
> +  if (leftover)
> +    riscv_block_move_straight (dest, src, leftover);
> +  else
> +    emit_insn(gen_nop ());
> +}
> +
> +/* Expand a movmemsi instruction, which copies LENGTH bytes from
> +   memory reference SRC to memory reference DEST.  */
> +
> +bool
> +riscv_expand_block_move (rtx dest, rtx src, rtx length)
> +{
> +  if (CONST_INT_P (length))
> +    {
> +      HOST_WIDE_INT factor, align;
> +
> +      align = MIN (MIN (MEM_ALIGN (src), MEM_ALIGN (dest)), BITS_PER_WORD);
> +      factor = BITS_PER_WORD / align;
> +
> +      if (optimize_function_for_size_p (cfun)
> +	  && INTVAL (length) * factor * UNITS_PER_WORD > MOVE_RATIO (false))
> +	return false;
> +
> +      if (INTVAL (length) <= RISCV_MAX_MOVE_BYTES_STRAIGHT / factor)
> +	{
> +	  riscv_block_move_straight (dest, src, INTVAL (length));
> +	  return true;
> +	}
> +      else if (optimize && align >= BITS_PER_WORD)
> +	{
> +	  unsigned min_iter_words
> +	    = RISCV_MAX_MOVE_BYTES_PER_LOOP_ITER / UNITS_PER_WORD;
> +	  unsigned iter_words = min_iter_words;
> +	  HOST_WIDE_INT bytes = INTVAL (length), words = bytes / UNITS_PER_WORD;
> +
> +	  /* Lengthen the loop body if it shortens the tail.  */
> +	  for (unsigned i = min_iter_words; i < min_iter_words * 2 - 1; i++)
> +	    {
> +	      unsigned cur_cost = iter_words + words % iter_words;
> +	      unsigned new_cost = i + words % i;
> +	      if (new_cost <= cur_cost)
> +		iter_words = i;
> +	    }
> +
> +	  riscv_block_move_loop (dest, src, bytes, iter_words * UNITS_PER_WORD);
> +	  return true;
> +	}
> +    }
> +  return false;
> +}
> +
>  /* Print symbolic operand OP, which is part of a HIGH or LO_SUM
>     in context CONTEXT.  HI_RELOC indicates a high-part reloc.  */
>
> diff --git a/gcc/config/riscv/riscv.h b/gcc/config/riscv/riscv.h
> index a802a3f8cbbb..c0901a093033 100644
> --- a/gcc/config/riscv/riscv.h
> +++ b/gcc/config/riscv/riscv.h
> @@ -808,10 +808,25 @@ while (0)
>  #undef PTRDIFF_TYPE
>  #define PTRDIFF_TYPE (POINTER_SIZE == 64 ? "long int" : "int")
>
> -/* If a memory-to-memory move would take MOVE_RATIO or more simple
> -   move-instruction pairs, we will do a movmem or libcall instead.  */
> +/* The maximum number of bytes copied by one iteration of a movmemsi loop.  */
> +
> +#define RISCV_MAX_MOVE_BYTES_PER_LOOP_ITER (UNITS_PER_WORD * 4)
> +
> +/* The maximum number of bytes that can be copied by a straight-line
> +   movmemsi implementation.  */
>
> -#define MOVE_RATIO(speed) (CLEAR_RATIO (speed) / 2)
> +#define RISCV_MAX_MOVE_BYTES_STRAIGHT (RISCV_MAX_MOVE_BYTES_PER_LOOP_ITER * 3)
> +
> +/* If a memory-to-memory move would take MOVE_RATIO or more simple
> +   move-instruction pairs, we will do a movmem or libcall instead.
> +   Do not use move_by_pieces at all when strict alignment is not
> +   in effect but the target has slow unaligned accesses; in this
> +   case, movmem or libcall is more efficient.  */
> +
> +#define MOVE_RATIO(speed)						\
> +  (!STRICT_ALIGNMENT && riscv_slow_unaligned_access ? 1 :		\
> +   (speed) ? RISCV_MAX_MOVE_BYTES_PER_LOOP_ITER / UNITS_PER_WORD :	\
> +   CLEAR_RATIO (speed) / 2)
>
>  /* For CLEAR_RATIO, when optimizing for size, give a better estimate
>     of the length of a memset call, but use the default otherwise.  */
> diff --git a/gcc/config/riscv/riscv.md b/gcc/config/riscv/riscv.md
> index 53e1db97db7d..814ff6ec6ad7 100644
> --- a/gcc/config/riscv/riscv.md
> +++ b/gcc/config/riscv/riscv.md
> @@ -1436,6 +1436,19 @@
>    DONE;
>  })
>
> +(define_expand "movmemsi"
> +  [(parallel [(set (match_operand:BLK 0 "general_operand")
> +		   (match_operand:BLK 1 "general_operand"))
> +	      (use (match_operand:SI 2 ""))
> +	      (use (match_operand:SI 3 "const_int_operand"))])]
> +  ""
> +{
> +  if (riscv_expand_block_move (operands[0], operands[1], operands[2]))
> +    DONE;
> +  else
> +    FAIL;
> +})
> +
>  ;; Expand in-line code to clear the instruction cache between operand[0] and
>  ;; operand[1].
>  (define_expand "clear_cache"


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]