[PATCH v2] aarch64: Add split-stack initial support
Adhemerval Zanella
adhemerval.zanella@linaro.org
Tue Oct 11 19:39:00 GMT 2016
On 07/10/2016 05:28, Kyrill Tkachov wrote:
> Hi Adhemerval,
>
> CC'ing the aarch64 maintainers/reviewers.
> I have some comments inline, mostly about the GCC coding conventions.
Thanks for the review.
>> diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
>> index df6514d..c689440 100644
>> --- a/gcc/config/aarch64/aarch64.c
>> +++ b/gcc/config/aarch64/aarch64.c
>> @@ -3149,6 +3149,49 @@ aarch64_restore_callee_saves (machine_mode mode,
>> }
>> }
>> +static bool
>> +split_stack_arg_pointer_used_p (void)
>> +{
>
> New function needs a function comment.
>
Ack, I will add one.
>> + bool using_split_stack = (flag_split_stack
>> + && (lookup_attribute ("no_split_stack",
>> + DECL_ATTRIBUTES (cfun->decl))
>> + == NULL));
>> + if (using_split_stack == false)
>> + return false;
>> +
>> + /* If the pseudo holding the arg pointer is no longer a pseudo,
>> + then the arg pointer is used. */
>> + if (cfun->machine->frame.split_stack_arg_pointer != NULL_RTX
>> + && (!REG_P (cfun->machine->frame.split_stack_arg_pointer)
>> + || (REGNO (cfun->machine->frame.split_stack_arg_pointer)
>> + < FIRST_PSEUDO_REGISTER)))
>> + return true;
>> +
>> + /* Unfortunately we also need to do some code scanning, since
>> + x10 may have been substituted for the pseudo. */
>> + rtx_insn *insn;
>> + basic_block bb = ENTRY_BLOCK_PTR_FOR_FN (cfun)->next_bb;
>> + FOR_BB_INSNS (bb, insn)
>> + if (NONDEBUG_INSN_P (insn))
>> + {
>> + df_ref use;
>> + FOR_EACH_INSN_USE (use, insn)
>> + {
>> + rtx x = DF_REF_REG (use);
>> + if (REG_P (x) && REGNO (x) == R10_REGNUM)
>> + return true;
>> + }
>> + df_ref def;
>> + FOR_EACH_INSN_DEF (def, insn)
>> + {
>> + rtx x = DF_REF_REG (def);
>> + if (REG_P (x) && REGNO (x) == R10_REGNUM)
>> + return false;
>> + }
>> + }
>> + return bitmap_bit_p (DF_LR_OUT (bb), R10_REGNUM);
>> +}
>> +
>> /* AArch64 stack frames generated by this compiler look like:
>> +-------------------------------+
>> @@ -3204,6 +3247,7 @@ aarch64_expand_prologue (void)
>> unsigned reg1 = cfun->machine->frame.wb_candidate1;
>> unsigned reg2 = cfun->machine->frame.wb_candidate2;
>> rtx_insn *insn;
>> + bool split_stack_arg_pointer_used = split_stack_arg_pointer_used_p ();
>> if (flag_stack_usage_info)
>> current_function_static_stack_size = frame_size;
>> @@ -3220,6 +3264,10 @@ aarch64_expand_prologue (void)
>> aarch64_emit_probe_stack_range (STACK_CHECK_PROTECT, frame_size);
>> }
>> + /* Save split-stack argument pointer before stack adjustment. */
>> + if (split_stack_arg_pointer_used)
>> + emit_move_insn (gen_rtx_REG (Pmode, R10_REGNUM), stack_pointer_rtx);
>> +
>> aarch64_add_constant (Pmode, SP_REGNUM, IP0_REGNUM, -initial_adjust, true);
>> if (callee_adjust != 0)
>> @@ -3243,6 +3291,30 @@ aarch64_expand_prologue (void)
>> callee_adjust != 0 || frame_pointer_needed);
>> aarch64_add_constant (Pmode, SP_REGNUM, IP1_REGNUM, -final_adjust,
>> !frame_pointer_needed);
>> +
>> + if (split_stack_arg_pointer_used_p ())
>> + {
>> + /* Setup the argument pointer (x10) for -fsplit-stack code. If
>> + __morestack was called, it will have left the arg pointer to the
>> + old stack in x28. Otherwise, the argument pointer is the top
>> + of current frame. */
>> + rtx x10 = gen_rtx_REG (Pmode, R10_REGNUM);
>> + rtx x28 = gen_rtx_REG (Pmode, R28_REGNUM);
>> + rtx not_more = gen_label_rtx ();
>> + rtx cc_reg = gen_rtx_REG (CCmode, CC_REGNUM);
>> + rtx jump;
>> +
>> + jump = gen_rtx_IF_THEN_ELSE (VOIDmode,
>> + gen_rtx_GEU (VOIDmode, cc_reg,
>> + const0_rtx),
>> + gen_rtx_LABEL_REF (VOIDmode, not_more),
>> + pc_rtx);
>> + jump = emit_jump_insn (gen_rtx_SET (pc_rtx, jump));
>
> Are you trying to emit a conditional jump here?
> If so it would be cleaner to use the pattern name you have in mind
> directly like so (barring typos ;):
> rtx cmp = gen_rtx_fmt_ee (GEU, VOIDmode, cc_reg, const0_rtx);
> jump = emit_jump_insn (gen_condjump (cmp, cc_reg, not_more));
> This will avoid constructing the SET and IF_THEN_ELSE rtxes manually.
Yes, the idea is to emit a conditional jump. I have no preference here,
I used the IF_THEN_ELSE functions mainly because the other arches are
using it, but I think your suggestion should be simpler.
>
>> + JUMP_LABEL (jump) = not_more;
>> + LABEL_NUSES (not_more) += 1;
>> + emit_move_insn (x10, x28);
>> + emit_label (not_more);
>> + }
>> }
>> /* Return TRUE if we can use a simple_return insn.
>> @@ -9677,7 +9749,7 @@ aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
>> /* Emit code to initialize STACK, which points to the next varargs stack
>> argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
>> by named arguments. STACK is 8-byte aligned. */
>> - t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
>> + t = make_tree (TREE_TYPE (stack), crtl->args.internal_arg_pointer);
>> if (cum->aapcs_stack_size > 0)
>> t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
>> t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
>> @@ -14054,6 +14126,189 @@ aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
>> }
>> }
>> +/* -fsplit-stack support. */
>> +
>> +/* A SYMBOL_REF for __morestack. */
>> +static GTY(()) rtx morestack_ref;
>> +
>> +/* Emit -fsplit-stack prologue, which goes before the regular function
>> + prologue. */
>> +void
>> +aarch64_expand_split_stack_prologue (void)
>> +{
>
> Nit: new line between function comment and its type.
Ack.
>
>> + HOST_WIDE_INT frame_size;
>> + rtx_code_label *ok_label = NULL;
>> + rtx mem, ssvalue, cc, jump, insn, call_fusage;
>> + rtx reg30, temp;
>> + rtx new_cfa, cfi_ops = NULL;
>> + /* Offset from thread pointer to __private_ss. */
>> + int psso = 0x10;
>> + int ninsn;
>> +
>> + gcc_assert (flag_split_stack && reload_completed);
>> +
>> + /* A minimal stack frame would be created for __morestack call. */
>> + frame_size = cfun->machine->frame.frame_size + 16;
>> +
>> + /* It limits the total maximum stack allocation to 2G so its value can be
>> + materialized with two instruction at most (movn/movk). It might be
>
> s/with two instruction/using two instructions/
>
Ack.
>> + used by the linker to add some extra space for split calling non split
>> + stack functions. */
>> + if (frame_size > ((HOST_WIDE_INT) 1 << 31))
>> + {
>> + sorry ("Stack frame larger than 2G is not supported for -fsplit-stack");
>> + return;
>> + }
>> +
>> + if (morestack_ref == NULL_RTX)
>> + {
>> + morestack_ref = gen_rtx_SYMBOL_REF (Pmode, "__morestack");
>> + SYMBOL_REF_FLAGS (morestack_ref) |= (SYMBOL_FLAG_LOCAL
>> + | SYMBOL_FLAG_FUNCTION);
>> + }
>> +
>> + /* Load __private_ss from TCB. */
>> + ssvalue = gen_rtx_REG (Pmode, R9_REGNUM);
>> + emit_insn (gen_aarch64_load_tp_hard (ssvalue));
>> + mem = gen_rtx_MEM (Pmode, plus_constant (Pmode, ssvalue, psso));
>> + emit_move_insn (ssvalue, mem);
>> +
>> + temp = gen_rtx_REG (Pmode, R10_REGNUM);
>> +
>> + /* Always emit two insns to calculate the requested stack, so the linker
>> + can edit them when adjusting size for calling non-split-stack code. */
>> + ninsn = aarch64_internal_mov_immediate (temp, GEN_INT (-frame_size), true,
>> + Pmode);
>> + gcc_assert (ninsn == 1 || ninsn == 2);
>> + if (ninsn == 1)
>> + emit_insn (gen_nop ());
>> + emit_insn (gen_add3_insn (temp, stack_pointer_rtx, temp));
>> +
>> + /* Jump to __morestack call if current __private_ss is not sufficient. */
>> + ok_label = gen_label_rtx ();
>> + cc = aarch64_gen_compare_reg (LT, temp, ssvalue);
>> + jump = gen_rtx_IF_THEN_ELSE (VOIDmode,
>> + gen_rtx_GEU (VOIDmode, cc, const0_rtx),
>> + gen_rtx_LABEL_REF (Pmode, ok_label),
>> + pc_rtx);
>> + jump = emit_jump_insn (gen_rtx_SET (pc_rtx, jump));
>
> similar to the above about emitting the conditional jump.
Ack.
>
>> + /* Mark the jump as very likely to be taken. */
>> + add_int_reg_note (jump, REG_BR_PROB,
>> + REG_BR_PROB_BASE - REG_BR_PROB_BASE / 100);
>> + JUMP_LABEL (jump) = ok_label;
>> + LABEL_NUSES (ok_label) = 1;
>> +
>> + /* Call __morestack with a non-standard call procedure: x10 will hold
>> + the requested stack pointer. */
>> + call_fusage = NULL_RTX;
>> +
>> + /* Set up a minimum frame pointer to call __morestack. The SP is not
>> + saved in x29 beforehand, so in __morestack x29 points to the called SP. */
>> + reg30 = gen_rtx_REG (Pmode, R30_REGNUM);
>> + aarch64_pushwb_single_reg (Pmode, R30_REGNUM, 16);
>> +
>> + insn = emit_call_insn (gen_call (gen_rtx_MEM (DImode, morestack_ref),
>> + const0_rtx, const0_rtx));
>> + add_function_usage_to (insn, call_fusage);
>> +
>> + cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg30, cfi_ops);
>> + mem = plus_constant (Pmode, stack_pointer_rtx, 16);
>> + cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
>> +
>> + mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
>> + mem = gen_rtx_MEM (DImode, mem);
>> + insn = emit_move_insn (reg30, mem);
>> +
>> + new_cfa = stack_pointer_rtx;
>> + new_cfa = plus_constant (Pmode, new_cfa, 16);
>> + cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
>> + REG_NOTES (insn) = cfi_ops;
>> + RTX_FRAME_RELATED_P (insn) = 1;
>> +
>> + emit_insn (gen_split_stack_return ());
>> +
>> + /* __morestack will call us here. */
>> + emit_label (ok_label);
>> +}
>> +
>> +/* Implement TARGET_ASM_FILE_END. */
>> +static void
>> +aarch64_file_end (void)
>> +{
>
> New line between comment and function type.
Ack.
>
>> + file_end_indicate_exec_stack ();
>> +
>> + if (flag_split_stack)
>> + file_end_indicate_split_stack ();
>> +}
>> +
>> +/* Return the internal arg pointer used for function incoming arguments. */
>> +static rtx
>> +aarch64_internal_arg_pointer (void)
>> +{
>
> Likewise.
Ack.
>
>> + if (flag_split_stack
>> + && (lookup_attribute ("no_split_stack", DECL_ATTRIBUTES (cfun->decl))
>> + == NULL))
>> + {
>> + if (cfun->machine->frame.split_stack_arg_pointer == NULL_RTX)
>> + {
>> + rtx pat;
>> +
>> + cfun->machine->frame.split_stack_arg_pointer = gen_reg_rtx (Pmode);
>> + REG_POINTER (cfun->machine->frame.split_stack_arg_pointer) = 1;
>> +
>> + /* Put the pseudo initialization right after the note at the
>> + beginning of the function. */
>> + pat = gen_rtx_SET (cfun->machine->frame.split_stack_arg_pointer,
>> + gen_rtx_REG (Pmode, R10_REGNUM));
>> + push_topmost_sequence ();
>> + emit_insn_after (pat, get_insns ());
>> + pop_topmost_sequence ();
>> + }
>> + return plus_constant (Pmode, cfun->machine->frame.split_stack_arg_pointer,
>> + FIRST_PARM_OFFSET (current_function_decl));
>> + }
>> + return virtual_incoming_args_rtx;
>> +}
>> +
>> +static void
>> +aarch64_live_on_entry (bitmap regs)
>> +{
>
> Needs function comment.
Ack, I will add one.
>
>> + if (flag_split_stack)
>> + bitmap_set_bit (regs, R10_REGNUM);
>> +}
>> +
>> +/* Emit -fsplit-stack dynamic stack allocation space check. */
>> +
>> +void
>> +aarch64_split_stack_space_check (rtx size, rtx label)
>> +{
>> + rtx mem, ssvalue, cc, jump, temp;
>> + rtx requested = gen_reg_rtx (Pmode);
>> + /* Offset from thread pointer to __private_ss. */
>> + int psso = 0x10;
>> +
>> + /* Load __private_ss from TCB. */
>> + ssvalue = gen_rtx_REG (Pmode, R9_REGNUM);
>> + emit_insn (gen_aarch64_load_tp_hard (ssvalue));
>> + mem = gen_rtx_MEM (Pmode, plus_constant (Pmode, ssvalue, psso));
>> + emit_move_insn (ssvalue, mem);
>> +
>> + temp = gen_rtx_REG (Pmode, R10_REGNUM);
>> +
>> + /* And compare it with frame pointer plus required stack. */
>> + size = force_reg (Pmode, size);
>> + emit_move_insn (requested, gen_rtx_MINUS (Pmode, stack_pointer_rtx, size));
>> +
>> + /* Jump to __morestack call if current __private_ss is not sufficient. */
>> + cc = aarch64_gen_compare_reg (LT, temp, ssvalue);
>> + jump = gen_rtx_IF_THEN_ELSE (VOIDmode,
>> + gen_rtx_GEU (VOIDmode, cc, const0_rtx),
>> + gen_rtx_LABEL_REF (VOIDmode, label),
>> + pc_rtx);
>> + jump = emit_jump_insn (gen_rtx_SET (pc_rtx, jump));
>
> Likewise about emitting conditional jumps.
Ack.
>
> <snip>
>
>>
>> diff --git a/gcc/testsuite/gcc.dg/split-3.c b/gcc/testsuite/gcc.dg/split-3.c
>> index 64bbb8c..5ba7616 100644
>> --- a/gcc/testsuite/gcc.dg/split-3.c
>> +++ b/gcc/testsuite/gcc.dg/split-3.c
>> @@ -40,6 +40,7 @@ down (int i, ...)
>> || va_arg (ap, int) != 9
>> || va_arg (ap, int) != 10)
>> abort ();
>> + va_end (ap);
>> if (i > 0)
>> {
>> diff --git a/gcc/testsuite/gcc.dg/split-6.c b/gcc/testsuite/gcc.dg/split-6.c
>> index b32cf8d..b3016ba 100644
>> --- a/gcc/testsuite/gcc.dg/split-6.c
>> +++ b/gcc/testsuite/gcc.dg/split-6.c
>> @@ -37,6 +37,7 @@ down (int i, ...)
>> || va_arg (ap, int) != 9
>> || va_arg (ap, int) != 10)
>> abort ();
>> + va_end (ap);
>> if (i > 0)
>> {
>
> Is this just undefined behaviour in existing tests?
> If so, I recommend you split these testsuite fixes into a separate patch
> and submit that separately. It can probably go in much sooner independently
> of slit stack support.
>
> Also, I think some tests for split-stack support are appropriate.
> At least something that checks that gcc generates the jumps to the appropriate
> helper functions?
In theory yes, although it does not really invalidate the test, since the
idea is to check the variadic arguments. Based on your comment, I think
it would be better to split this patch in two, with these modifications first.
Regarding the tests, the Go testsuite itself covers pretty much all the
morestack call usage (it indeed exposed a lot of issues with some initial
drafts of these patches). So I am not sure how much more extensive a
testsuite we would want to push for this.
>
> Thanks,
> Kyrill
More information about the Gcc-patches
mailing list