[PATCH 1/6] RISC-V Port: gcc/config/riscv/riscv.c
Richard Henderson
rth@redhat.com
Sat Jan 21 05:22:00 GMT 2017
On 01/11/2017 06:30 PM, Palmer Dabbelt wrote:
> +/* The largest number of operations needed to load an integer constant.
> + The worst case is LUI, ADDI, SLLI, ADDI, SLLI, ADDI, SLLI, ADDI,
> + but we may attempt and reject even worse sequences. */
> +#define RISCV_MAX_INTEGER_OPS 32
Why would you? Surely after you exhaust 8 you'd just abandon that search as
unprofitable.
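Untested, but I'd expect the cutoff to look like:

    /* Treat the 8-op worst case as "too expensive" and stop searching,
       rather than building sequences of up to 32 ops only to reject
       them later.  */
    if (cost >= 8)
      return cost;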
> + if (cost > 2 && (value & 1) == 0)
> + {
> + int shift = 0;
> + while ((value & 1) == 0)
> + shift++, value >>= 1;
shift = ctz_hwi (value);
You also may want to test for

    value | (HOST_WIDE_INT_M1U << (HOST_BITS_PER_WIDE_INT - shift))

i.e. shifting out leading 1s, as well as checking with shift - 12.  The
latter is interesting for shifting a 20-bit value up into the high word.
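Roughly (untested; `try_suffix' here stands in for however the
recursion is spelled at this point in the patch):

    shift = ctz_hwi (value);

    /* A: plain shift-out of the trailing zeros.  */
    cost_a = try_suffix (value >> shift, shift);

    /* B: additionally fill the bits the final SLLI will discard with
       1s, in case that residue is cheaper to build.  */
    cost_b = try_suffix ((value >> shift)
			 | (HOST_WIDE_INT_M1U
			    << (HOST_BITS_PER_WIDE_INT - shift)),
			 shift);

    /* C: reduce the shift by 12; the residue keeps 12 trailing zeros,
       which is exactly LUI's shape when the payload fits in 20 bits.  */
    if (shift > 12)
      cost_c = try_suffix (value >> (shift - 12), shift - 12);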
I once proposed a generic framework for this that cached results for
computation of sequences and costs. Unfortunately it never gained traction.
Perhaps it's time to try again.
> + /* Try filling trailing bits with 1s. */
> + while ((value << shift) >= 0)
> + shift++;
clz_hwi.
> + return GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0;
SYMBOL_REF_P.
> +riscv_symbol_binds_local_p (const_rtx x)
> +{
> + return (SYMBOL_REF_DECL (x)
> + ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
> + : SYMBOL_REF_LOCAL_P (x));
Missing SYMBOL_REF_P?
Surely encode_section_info will already have set SYMBOL_FLAG_LOCAL, and thus
you need not invoke targetm.binds_local_p again.
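With the missing predicate added, that reduces to something like
(untested):

    static bool
    riscv_symbol_binds_local_p (const_rtx x)
    {
      /* encode_section_info has already cached the binding.  */
      return SYMBOL_REF_P (x) && SYMBOL_REF_LOCAL_P (x);
    }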
> + case LABEL_REF:
> + if (LABEL_REF_NONLOCAL_P (x))
> + return SYMBOL_GOT_DISP;
> + break;
Non-local labels are not local to the current function, but they are still
local to the translation unit (they'll be local to one of the outer functions
of a nested function).
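E.g., in this bit of GNU C, the label is non-local to `inner' but
certainly binds within the TU:

    void outer (void)
    {
      __label__ out;			/* local-label decl, required */
      void inner (void) { goto out; }	/* LABEL_REF_NONLOCAL_P here */
      inner ();
    out:;
    }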
> + switch (*symbol_type)
> + {
> + case SYMBOL_ABSOLUTE:
> + case SYMBOL_PCREL:
> + case SYMBOL_TLS_LE:
> + return (int32_t) INTVAL (offset) == INTVAL (offset);
Why?
> + case MINUS:
> + if (float_mode_p
> + && !HONOR_NANS (mode)
> + && !HONOR_SIGNED_ZEROS (mode))
> + {
> + /* See if we can use NMADD or NMSUB. See riscv.md for the
> + associated patterns. */
> + rtx op0 = XEXP (x, 0);
> + rtx op1 = XEXP (x, 1);
> + if (GET_CODE (op0) == MULT && GET_CODE (XEXP (op0, 0)) == NEG)
> + {
> + *total = (tune_info->fp_mul[mode == DFmode]
> + + set_src_cost (XEXP (XEXP (op0, 0), 0), mode, speed)
> + + set_src_cost (XEXP (op0, 1), mode, speed)
> + + set_src_cost (op1, mode, speed));
> + return true;
> + }
> + if (GET_CODE (op1) == MULT)
> + {
> + *total = (tune_info->fp_mul[mode == DFmode]
> + + set_src_cost (op0, mode, speed)
> + + set_src_cost (XEXP (op1, 0), mode, speed)
> + + set_src_cost (XEXP (op1, 1), mode, speed));
> + return true;
> + }
> + }
Do we not fold these to FMA + NEG? If not, that's surprising and maybe should
be fixed. Also, you appear to be missing costs for FMA in riscv_rtx_costs.
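I'd expect something along these lines, reusing the patch's own fp_mul
table (untested):

    case FMA:
      *total = (tune_info->fp_mul[mode == DFmode]
		+ set_src_cost (XEXP (x, 0), mode, speed)
		+ set_src_cost (XEXP (x, 1), mode, speed)
		+ set_src_cost (XEXP (x, 2), mode, speed));
      return true;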
> + case UNORDERED:
> + *code = EQ;
> + /* Fall through. */
> +
> + case ORDERED:
> + /* a == a && b == b */
> + tmp0 = gen_reg_rtx (SImode);
> + riscv_emit_binary (EQ, tmp0, cmp_op0, cmp_op0);
> + tmp1 = gen_reg_rtx (SImode);
> + riscv_emit_binary (EQ, tmp1, cmp_op1, cmp_op1);
> + *op0 = gen_reg_rtx (SImode);
> + riscv_emit_binary (AND, *op0, tmp0, tmp1);
> + break;
Better with FCLASS + AND? At least for a branch?
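I.e. something like this sketch (register choices arbitrary; FCLASS
sets bits 8/9 for sNaN/qNaN and never raises exceptions):

    fclass.d	t0, fa0
    fclass.d	t1, fa1
    or		t0, t0, t1
    andi	t0, t0, 0x300	# bit 8 = sNaN, bit 9 = qNaN
    beqz	t0, 1f		# ordered iff neither input is NaN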
> +static int
> +riscv_flatten_aggregate_field (const_tree type,
> + riscv_aggregate_field fields[2],
> + int n, HOST_WIDE_INT offset)
I don't see any code within to bound N to 2, so as not to overflow FIELDS. Am
I missing something?
Are you missing code for COMPLEX_TYPE? In the default case I only see
SCALAR_FLOAT_TYPE_P.
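Untested sketch of both fixes, with the riscv_aggregate_field layout
guessed from context:

      case COMPLEX_TYPE:
	/* A complex value consumes both slots.  */
	if (n != 0 || !SCALAR_FLOAT_TYPE_P (TREE_TYPE (type)))
	  return -1;
	fields[0].type = fields[1].type = TREE_TYPE (type);
	fields[0].offset = offset;
	fields[1].offset
	  = offset + GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (type)));
	return 2;

      default:
	if (n == 2)
	  return -1;	/* would overflow FIELDS */
	...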
> + memset (info, 0, sizeof (*info));
> + info->gpr_offset = cum->num_gprs;
> + info->fpr_offset = cum->num_fprs;
Since GPRs and FPRs are allocated sequentially, and indices that are used
for GPRs are unused for FPRs and vice versa, why store both gpr_offset and
fpr_offset?
> +/* Emit straight-line code to move LENGTH bytes from SRC to DEST.
> + Assume that the areas do not overlap. */
> +
> +static void
> +riscv_block_move_straight (rtx dest, rtx src, HOST_WIDE_INT length)
> +{
> + HOST_WIDE_INT offset, delta;
> + unsigned HOST_WIDE_INT bits;
> + int i;
> + enum machine_mode mode;
> + rtx *regs;
> +
> + bits = MAX (BITS_PER_UNIT,
> + MIN (BITS_PER_WORD, MIN (MEM_ALIGN (src), MEM_ALIGN (dest))));
> +
> + mode = mode_for_size (bits, MODE_INT, 0);
> + delta = bits / BITS_PER_UNIT;
> +
> + /* Allocate a buffer for the temporary registers. */
> + regs = XALLOCAVEC (rtx, length / delta);
> +
> + /* Load as many BITS-sized chunks as possible. Use a normal load if
> + the source has enough alignment, otherwise use left/right pairs. */
The comment sounds like it was copied from MIPS.
Do you actually need to implement anything related to block moves at all?
There's nothing processor-specific in RISC-V wrt moves...
> + 'z' Print $0 if OP is zero, otherwise print OP normally. */
Global replace $0 with x0?  I assume that's a copy from elsewhere.
> +static bool
> +riscv_size_ok_for_small_data_p (int size)
> +{
> + return g_switch_value && IN_RANGE (size, 1, g_switch_value);
> +}
Huh. With only a 4k displacement from gp, it's still worthwhile to manage
small data?
> + /* Move past any dynamic stack allocations. */
> + if (cfun->calls_alloca)
> + {
> + rtx adjust = GEN_INT (-frame->hard_frame_pointer_offset);
> + if (!SMALL_OPERAND (INTVAL (adjust)))
> + {
> + riscv_emit_move (RISCV_PROLOGUE_TEMP (Pmode), adjust);
> + adjust = RISCV_PROLOGUE_TEMP (Pmode);
> + }
> +
> + emit_insn (gen_add3_insn (stack_pointer_rtx, hard_frame_pointer_rtx,
> + adjust));
> + }
Normally we require some sort of FP/SP tie, and memory barrier, when
deallocating portions of the stack frame. Otherwise scheduling can produce
accesses to addresses below the stack pointer.
If you've got alloca, then you can free up lots of stack all at once; you
needn't simply reset SP to where it ought to be. You can free up everything up
to HFP itself, since that's below the register saves.
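The usual shape, assuming a stack_tie pattern in riscv.md along the
lines other ports use (a set of (mem:BLK (scratch)) wrapping SP and HFP
in an unspec; gen_stack_tie is invented here):

    /* Keep memory accesses above the release of the alloca area.  */
    emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
    emit_insn (gen_add3_insn (stack_pointer_rtx, hard_frame_pointer_rtx,
			      adjust));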
> +static bool
> +riscv_scalar_mode_supported_p (enum machine_mode mode)
> +{
> + if (ALL_FIXED_POINT_MODE_P (mode)
> + && GET_MODE_PRECISION (mode) <= 2 * BITS_PER_WORD)
> + return true;
> +
> + return default_scalar_mode_supported_p (mode);
> +}
You really support fixed-point modes? Not in hardware, certainly, so... why?
> +/* Implement TARGET_TRAMPOLINE_INIT. */
> +
> +static void
> +riscv_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
> +{
> + rtx addr, end_addr, mem;
> + uint32_t trampoline[4];
> + unsigned int i;
> + HOST_WIDE_INT static_chain_offset, target_function_offset;
> +
> + /* Work out the offsets of the pointers from the start of the
> + trampoline code. */
> + gcc_assert (ARRAY_SIZE (trampoline) * 4 == TRAMPOLINE_CODE_SIZE);
> + static_chain_offset = TRAMPOLINE_CODE_SIZE;
> + target_function_offset = static_chain_offset + GET_MODE_SIZE (ptr_mode);
> +
> + /* Get pointers to the beginning and end of the code block. */
> + addr = force_reg (Pmode, XEXP (m_tramp, 0));
> + end_addr = riscv_force_binary (Pmode, PLUS, addr, GEN_INT (TRAMPOLINE_CODE_SIZE));
> +
> + /* auipc t0, 0
> + l[wd] t1, target_function_offset(t0)
> + l[wd] t0, static_chain_offset(t0)
> + jr t1
> + */
For rv32 (or any ilp32), surely

    lui		t0, hi(chain)
    lui		t1, hi(func)
    addi	t0, t0, lo(chain)
    jr		t1, lo(func)

is better.
> +#ifdef HAVE_AS_TLS
> +#undef TARGET_HAVE_TLS
> +#define TARGET_HAVE_TLS true
> +#endif
When would you not have TLS? Nor DTPRELWORD?
r~