[PATCH v2] aarch64: Add split-stack initial support

Adhemerval Zanella adhemerval.zanella@linaro.org
Tue Oct 11 19:39:00 GMT 2016



On 07/10/2016 05:28, Kyrill Tkachov wrote:
> Hi Adhemerval,
> 
> CC'ing the aarch64 maintainers/reviewers.
> I have some comments inline, mostly about the GCC coding conventions.

Thanks for the review.

>> diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
>> index df6514d..c689440 100644
>> --- a/gcc/config/aarch64/aarch64.c
>> +++ b/gcc/config/aarch64/aarch64.c
>> @@ -3149,6 +3149,49 @@ aarch64_restore_callee_saves (machine_mode mode,
>>       }
>>   }
>>   +static bool
>> +split_stack_arg_pointer_used_p (void)
>> +{
> 
> New function needs a function comment.
> 

Ack, I will add one.

>> +  bool using_split_stack = (flag_split_stack
>> +                && (lookup_attribute ("no_split_stack",
>> +                          DECL_ATTRIBUTES (cfun->decl))
>> +                               == NULL));
>> +  if (using_split_stack == false)
>> +    return false;
>> +
>> +  /* If the pseudo holding the arg pointer is no longer a pseudo,
>> +     then the arg pointer is used.  */
>> +  if (cfun->machine->frame.split_stack_arg_pointer != NULL_RTX
>> +      && (!REG_P (cfun->machine->frame.split_stack_arg_pointer)
>> +          || (REGNO (cfun->machine->frame.split_stack_arg_pointer)
>> +              < FIRST_PSEUDO_REGISTER)))
>> +    return true;
>> +
>> +  /* Unfortunately we also need to do some code scanning, since
>> +     x10 may have been substituted for the pseudo.  */
>> +  rtx_insn *insn;
>> +  basic_block bb = ENTRY_BLOCK_PTR_FOR_FN (cfun)->next_bb;
>> +  FOR_BB_INSNS (bb, insn)
>> +    if (NONDEBUG_INSN_P (insn))
>> +      {
>> +        df_ref use;
>> +        FOR_EACH_INSN_USE (use, insn)
>> +          {
>> +            rtx x = DF_REF_REG (use);
>> +            if (REG_P (x) && REGNO (x) == R10_REGNUM)
>> +              return true;
>> +          }
>> +        df_ref def;
>> +        FOR_EACH_INSN_DEF (def, insn)
>> +          {
>> +            rtx x = DF_REF_REG (def);
>> +            if (REG_P (x) && REGNO (x) == R10_REGNUM)
>> +              return false;
>> +          }
>> +      }
>> +  return bitmap_bit_p (DF_LR_OUT (bb), R10_REGNUM);
>> +}
>> +
>>   /* AArch64 stack frames generated by this compiler look like:
>>         +-------------------------------+
>> @@ -3204,6 +3247,7 @@ aarch64_expand_prologue (void)
>>     unsigned reg1 = cfun->machine->frame.wb_candidate1;
>>     unsigned reg2 = cfun->machine->frame.wb_candidate2;
>>     rtx_insn *insn;
>> +  bool split_stack_arg_pointer_used = split_stack_arg_pointer_used_p ();
>>       if (flag_stack_usage_info)
>>       current_function_static_stack_size = frame_size;
>> @@ -3220,6 +3264,10 @@ aarch64_expand_prologue (void)
>>       aarch64_emit_probe_stack_range (STACK_CHECK_PROTECT, frame_size);
>>       }
>>   +  /* Save split-stack argument pointer before stack adjustment.  */
>> +  if (split_stack_arg_pointer_used)
>> +    emit_move_insn (gen_rtx_REG (Pmode, R10_REGNUM), stack_pointer_rtx);
>> +
>>     aarch64_add_constant (Pmode, SP_REGNUM, IP0_REGNUM, -initial_adjust, true);
>>       if (callee_adjust != 0)
>> @@ -3243,6 +3291,30 @@ aarch64_expand_prologue (void)
>>                    callee_adjust != 0 || frame_pointer_needed);
>>     aarch64_add_constant (Pmode, SP_REGNUM, IP1_REGNUM, -final_adjust,
>>               !frame_pointer_needed);
>> +
>> +  if (split_stack_arg_pointer_used_p ())
>> +    {
>> +      /* Setup the argument pointer (x10) for -fsplit-stack code.  If
>> +     __morestack was called, it will left the arg pointer to the
>> +     old stack in x28.  Otherwise, the argument pointer is the top
>> +     of current frame.  */
>> +      rtx x10 = gen_rtx_REG (Pmode, R10_REGNUM);
>> +      rtx x28 = gen_rtx_REG (Pmode, R28_REGNUM);
>> +      rtx not_more = gen_label_rtx ();
>> +      rtx cc_reg = gen_rtx_REG (CCmode, CC_REGNUM);
>> +      rtx jump;
>> +
>> +      jump = gen_rtx_IF_THEN_ELSE (VOIDmode,
>> +                   gen_rtx_GEU (VOIDmode, cc_reg,
>> +                        const0_rtx),
>> +                   gen_rtx_LABEL_REF (VOIDmode, not_more),
>> +                   pc_rtx);
>> +      jump = emit_jump_insn (gen_rtx_SET (pc_rtx, jump));
> 
> Are you trying to emit a conditional jump here?
> If so it would be cleaner to use the pattern name you have in mind
> directly like so (barring typos ;):
> rtx cmp = gen_rtx_fmt_ee (GEU, VOIDmode, cc_reg, const0_rtx);
> jump = emit_jump_insn (gen_condjump (cmp, cc_reg, not_more));
> This will avoid constructing the SET and IF_THEN_ELSE rtxes manually.

Yes, the idea is to emit a conditional jump.  I have no strong preference here;
I used the IF_THEN_ELSE construction mainly because the other architectures
use it, but I agree your suggestion is simpler, so I will switch to it.

> 
>> +      JUMP_LABEL (jump) = not_more;
>> +      LABEL_NUSES (not_more) += 1;
>> +      emit_move_insn (x10, x28);
>> +      emit_label (not_more);
>> +    }
>>   }
>>     /* Return TRUE if we can use a simple_return insn.
>> @@ -9677,7 +9749,7 @@ aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
>>     /* Emit code to initialize STACK, which points to the next varargs stack
>>        argument.  CUM->AAPCS_STACK_SIZE gives the number of stack words used
>>        by named arguments.  STACK is 8-byte aligned.  */
>> -  t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
>> +  t = make_tree (TREE_TYPE (stack), crtl->args.internal_arg_pointer);
>>     if (cum->aapcs_stack_size > 0)
>>       t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
>>     t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
>> @@ -14054,6 +14126,189 @@ aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
>>       }
>>   }
>>   +/* -fsplit-stack support.  */
>> +
>> +/* A SYMBOL_REF for __morestack.  */
>> +static GTY(()) rtx morestack_ref;
>> +
>> +/* Emit -fsplit-stack prologue, which goes before the regular function
>> +   prologue.  */
>> +void
>> +aarch64_expand_split_stack_prologue (void)
>> +{
> 
> Nit: new line between function comment and its type.

Ack.

> 
>> +  HOST_WIDE_INT frame_size;
>> +  rtx_code_label *ok_label = NULL;
>> +  rtx mem, ssvalue, cc, jump, insn, call_fusage;
>> +  rtx reg30, temp;
>> +  rtx new_cfa, cfi_ops = NULL;
>> +  /* Offset from thread pointer to __private_ss.  */
>> +  int psso = 0x10;
>> +  int ninsn;
>> +
>> +  gcc_assert (flag_split_stack && reload_completed);
>> +
>> +  /* A minimal stack frame would be created for __morestack call.  */
>> +  frame_size = cfun->machine->frame.frame_size + 16;
>> +
>> +  /* It limits total maximum stack allocation on 2G so its value can be
>> +     materialized with two instruction at most (movn/movk).  It might be
> 
> s/with two instruction/using two instructions/
> 

Ack.

>> +     used by the linker to add some extra space for split calling non split
>> +     stack functions.  */
>> +  if (frame_size > ((HOST_WIDE_INT) 1 << 31))
>> +    {
>> +      sorry ("Stack frame larger than 2G is not supported for -fsplit-stack");
>> +      return;
>> +    }
>> +
>> +  if (morestack_ref == NULL_RTX)
>> +    {
>> +      morestack_ref = gen_rtx_SYMBOL_REF (Pmode, "__morestack");
>> +      SYMBOL_REF_FLAGS (morestack_ref) |= (SYMBOL_FLAG_LOCAL
>> +                       | SYMBOL_FLAG_FUNCTION);
>> +    }
>> +
>> +  /* Load __private_ss from TCB.  */
>> +  ssvalue = gen_rtx_REG (Pmode, R9_REGNUM);
>> +  emit_insn (gen_aarch64_load_tp_hard (ssvalue));
>> +  mem = gen_rtx_MEM (Pmode, plus_constant (Pmode, ssvalue, psso));
>> +  emit_move_insn (ssvalue, mem);
>> +
>> +  temp = gen_rtx_REG (Pmode, R10_REGNUM);
>> +
>> +  /* Always emit two insns to calculate the requested stack, so the linker
>> +     can edit them when adjusting size for calling non-split-stack code.  */
>> +  ninsn = aarch64_internal_mov_immediate (temp, GEN_INT (-frame_size), true,
>> +                      Pmode);
>> +  gcc_assert (ninsn == 1 || ninsn == 2);
>> +  if (ninsn == 1)
>> +    emit_insn (gen_nop ());
>> +  emit_insn (gen_add3_insn (temp, stack_pointer_rtx, temp));
>> +
>> +  /* Jump to __morestack call if current __private_ss is not suffice.  */
>> +  ok_label = gen_label_rtx ();
>> +  cc = aarch64_gen_compare_reg (LT, temp, ssvalue);
>> +  jump = gen_rtx_IF_THEN_ELSE (VOIDmode,
>> +                   gen_rtx_GEU (VOIDmode, cc, const0_rtx),
>> +                   gen_rtx_LABEL_REF (Pmode, ok_label),
>> +                   pc_rtx);
>> +  jump = emit_jump_insn (gen_rtx_SET (pc_rtx, jump));
> 
> similar to the above about emitting the conditional jump.

Ack.

> 
>> +  /* Mark the jump as very likely to be taken.  */
>> +  add_int_reg_note (jump, REG_BR_PROB,
>> +                    REG_BR_PROB_BASE - REG_BR_PROB_BASE / 100);
>> +  JUMP_LABEL (jump) = ok_label;
>> +  LABEL_NUSES (ok_label) = 1;
>> +
>> +  /* Call __morestack with a non-standard call procedure: x10 will hold
>> +     the requested stack pointer.  */
>> +  call_fusage = NULL_RTX;
>> +
>> +  /* Set up a minimum frame pointer to call __morestack.  The SP is not
>> +     save on x29 prior so in __morestack x29 points to the called SP.  */
>> +  reg30 = gen_rtx_REG (Pmode, R30_REGNUM);
>> +  aarch64_pushwb_single_reg (Pmode, R30_REGNUM, 16);
>> +
>> +  insn = emit_call_insn (gen_call (gen_rtx_MEM (DImode, morestack_ref),
>> +                   const0_rtx, const0_rtx));
>> +  add_function_usage_to (insn, call_fusage);
>> +
>> +  cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg30, cfi_ops);
>> +  mem = plus_constant (Pmode, stack_pointer_rtx, 16);
>> +  cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
>> +
>> +  mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
>> +  mem = gen_rtx_MEM (DImode, mem);
>> +  insn = emit_move_insn (reg30, mem);
>> +
>> +  new_cfa = stack_pointer_rtx;
>> +  new_cfa = plus_constant (Pmode, new_cfa, 16);
>> +  cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
>> +  REG_NOTES (insn) = cfi_ops;
>> +  RTX_FRAME_RELATED_P (insn) = 1;
>> +
>> +  emit_insn (gen_split_stack_return ());
>> +
>> +  /* __morestack will call us here.  */
>> +  emit_label (ok_label);
>> +}
>> +
>> +/* Implement TARGET_ASM_FILE_END.  */
>> +static void
>> +aarch64_file_end (void)
>> +{
> 
> New line between comment and function type.

Ack.

> 
>> +  file_end_indicate_exec_stack ();
>> +
>> +  if (flag_split_stack)
>> +    file_end_indicate_split_stack ();
>> +}
>> +
>> +/* Return the internal arg pointer used for function incoming arguments.  */
>> +static rtx
>> +aarch64_internal_arg_pointer (void)
>> +{
> 
> Likewise.

Ack.

> 
>> +  if (flag_split_stack
>> +     && (lookup_attribute ("no_split_stack", DECL_ATTRIBUTES (cfun->decl))
>> +         == NULL))
>> +    {
>> +      if (cfun->machine->frame.split_stack_arg_pointer == NULL_RTX)
>> +    {
>> +      rtx pat;
>> +
>> +      cfun->machine->frame.split_stack_arg_pointer = gen_reg_rtx (Pmode);
>> +      REG_POINTER (cfun->machine->frame.split_stack_arg_pointer) = 1;
>> +
>> +      /* Put the pseudo initialization right after the note at the
>> +         beginning of the function.  */
>> +      pat = gen_rtx_SET (cfun->machine->frame.split_stack_arg_pointer,
>> +                 gen_rtx_REG (Pmode, R10_REGNUM));
>> +      push_topmost_sequence ();
>> +      emit_insn_after (pat, get_insns ());
>> +      pop_topmost_sequence ();
>> +    }
>> +      return plus_constant (Pmode, cfun->machine->frame.split_stack_arg_pointer,
>> +                FIRST_PARM_OFFSET (current_function_decl));
>> +    }
>> +  return virtual_incoming_args_rtx;
>> +}
>> +
>> +static void
>> +aarch64_live_on_entry (bitmap regs)
>> +{
> 
> Needs function comment.

Ack, I will add one.

> 
>> +  if (flag_split_stack)
>> +    bitmap_set_bit (regs, R10_REGNUM);
>> +}
>> +
>> +/* Emit -fsplit-stack dynamic stack allocation space check.  */
>> +
>> +void
>> +aarch64_split_stack_space_check (rtx size, rtx label)
>> +{
>> +  rtx mem, ssvalue, cc, jump, temp;
>> +  rtx requested = gen_reg_rtx (Pmode);
>> +  /* Offset from thread pointer to __private_ss.  */
>> +  int psso = 0x10;
>> +
>> +  /* Load __private_ss from TCB.  */
>> +  ssvalue = gen_rtx_REG (Pmode, R9_REGNUM);
>> +  emit_insn (gen_aarch64_load_tp_hard (ssvalue));
>> +  mem = gen_rtx_MEM (Pmode, plus_constant (Pmode, ssvalue, psso));
>> +  emit_move_insn (ssvalue, mem);
>> +
>> +  temp = gen_rtx_REG (Pmode, R10_REGNUM);
>> +
>> +  /* And compare it with frame pointer plus required stack.  */
>> +  size = force_reg (Pmode, size);
>> +  emit_move_insn (requested, gen_rtx_MINUS (Pmode, stack_pointer_rtx, size));
>> +
>> +  /* Jump to __morestack call if current __private_ss is not suffice.  */
>> +  cc = aarch64_gen_compare_reg (LT, temp, ssvalue);
>> +  jump = gen_rtx_IF_THEN_ELSE (VOIDmode,
>> +                   gen_rtx_GEU (VOIDmode, cc, const0_rtx),
>> +                   gen_rtx_LABEL_REF (VOIDmode, label),
>> +                   pc_rtx);
>> +  jump = emit_jump_insn (gen_rtx_SET (pc_rtx, jump));
> 
> Likewise about emitting conditional jumps.

Ack.

> 
> <snip>
> 
>>
>> diff --git a/gcc/testsuite/gcc.dg/split-3.c b/gcc/testsuite/gcc.dg/split-3.c
>> index 64bbb8c..5ba7616 100644
>> --- a/gcc/testsuite/gcc.dg/split-3.c
>> +++ b/gcc/testsuite/gcc.dg/split-3.c
>> @@ -40,6 +40,7 @@ down (int i, ...)
>>         || va_arg (ap, int) != 9
>>         || va_arg (ap, int) != 10)
>>       abort ();
>> +  va_end (ap);
>>       if (i > 0)
>>       {
>> diff --git a/gcc/testsuite/gcc.dg/split-6.c b/gcc/testsuite/gcc.dg/split-6.c
>> index b32cf8d..b3016ba 100644
>> --- a/gcc/testsuite/gcc.dg/split-6.c
>> +++ b/gcc/testsuite/gcc.dg/split-6.c
>> @@ -37,6 +37,7 @@ down (int i, ...)
>>         || va_arg (ap, int) != 9
>>         || va_arg (ap, int) != 10)
>>       abort ();
>> +  va_end (ap);
>>       if (i > 0)
>>       {
> 
> Is this just undefined behaviour in existing tests?
> If so, I recommend you split these testsuite fixes into a separate patch
> and submit that separately. It can probably go in much sooner independently
> of slit stack support.
> 
> Also, I think some tests for split-stack support are appropriate.
> At least something that checks that gcc generates the jumps to the appropriate
> helper functions?

In theory yes, although it does not really invalidate the test, since the
intent is to check the handling of variadic arguments.  Based on your comment,
I think it would be better to split this patch in two, submitting these
testsuite modifications first.

Regarding the tests, the Go testsuite itself covers pretty much all of the
__morestack call usage (it did indeed expose a lot of issues with some initial
drafts of this patch).  So I am not sure how much more extensive a testsuite
we would want to add for this.

> 
> Thanks,
> Kyrill



More information about the Gcc-patches mailing list