[PATCH][GCC][AArch64] Add support for SVE stack clash probing [patch (2/7)]
Richard Sandiford
richard.sandiford@arm.com
Tue Aug 28 20:40:00 GMT 2018
I'll leave the AArch64 maintainers to do the main review, but here are some comments.
Tamar Christina <tamar.christina@arm.com> writes:
> diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
> index 06451f38b11822ea77323438fe8c7e373eb9e614..e7efde79bb111e820f4df44a276f6f73070ecd17 100644
> --- a/gcc/config/aarch64/aarch64.c
> +++ b/gcc/config/aarch64/aarch64.c
> @@ -3970,6 +3970,90 @@ aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
> return "";
> }
> +/* Emit the probe loop for doing stack clash probes and stack adjustments for
> + SVE. This emits probes from BASE to BASE + ADJUSTMENT based on a guard size
> + of GUARD_SIZE and emits a probe when at least LIMIT bytes are allocated. By
> + the end of this function BASE = BASE + ADJUSTMENT. */
> +
> +const char *
> +aarch64_output_probe_sve_stack_clash (rtx base, rtx adjustment, rtx limit,
> + rtx guard_size)
> +{
> + /* This function is not allowed to use any instruction generation function
> + like gen_ and friends. If you do you'll likely ICE during CFG validation,
> + so instead emit the code you want using output_asm_insn. */
> + gcc_assert (flag_stack_clash_protection);
> + gcc_assert (CONST_INT_P (limit) && CONST_INT_P (guard_size));
> + gcc_assert (aarch64_uimm12_shift (INTVAL (limit)));
> + gcc_assert (aarch64_uimm12_shift (INTVAL (guard_size)));
> +
> + static int labelno = 0;
> + char loop_start_lab[32];
> + char loop_res_lab[32];
> + char loop_end_lab[32];
> + rtx xops[2];
> +
> + ASM_GENERATE_INTERNAL_LABEL (loop_start_lab, "SVLPSRL", labelno);
> + ASM_GENERATE_INTERNAL_LABEL (loop_res_lab, "BRRCHK", labelno);
> + ASM_GENERATE_INTERNAL_LABEL (loop_end_lab, "BERCHK", labelno++);
> +
> + /* Emit loop start label. */
> + ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_start_lab);
> +
> + /* Test if ADJUSTMENT < GUARD_SIZE. */
> + xops[0] = adjustment;
> + xops[1] = guard_size;
> + output_asm_insn ("cmp\t%0, %1", xops);
> +
> + /* Branch to residual loop if it is. */
> + fputs ("\tb.lt\t", asm_out_file);
> + assemble_name_raw (asm_out_file, loop_res_lab);
> + fputc ('\n', asm_out_file);
> +
> + /* BASE = BASE - GUARD_SIZE. */
> + xops[0] = base;
> + xops[1] = guard_size;
> + output_asm_insn ("sub\t%0, %0, %1", xops);
> +
> + /* Probe at BASE + LIMIT. */
> + xops[1] = limit;
> + output_asm_insn ("str\txzr, [%0, %1]", xops);
> +
> + /* ADJUSTMENT = ADJUSTMENT - GUARD_SIZE. */
> + xops[0] = adjustment;
> + xops[1] = guard_size;
> + output_asm_insn ("sub\t%0, %0, %1", xops);
> +
> + /* Branch to loop start. */
> + fputs ("\tb\t", asm_out_file);
> + assemble_name_raw (asm_out_file, loop_start_lab);
> + fputc ('\n', asm_out_file);
> +
> + /* Emit residual check label. */
> + ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_res_lab);
> +
> + /* BASE = BASE - ADJUSTMENT. */
> + xops[0] = base;
> + xops[1] = adjustment;
> + output_asm_insn ("sub\t%0, %0, %1", xops);
> +
> + /* Test if BASE < LIMIT. */
> + xops[1] = limit;
> + output_asm_insn ("cmp\t%0, %1", xops);
I think this comparison should be testing ADJUSTMENT < LIMIT rather than BASE < LIMIT (xops[0] is still BASE here).
> + /* Branch to end. */
> + fputs ("\tb.lt\t", asm_out_file);
> + assemble_name_raw (asm_out_file, loop_end_lab);
> + fputc ('\n', asm_out_file);
> +
> + /* Probe at BASE + LIMIT. */
> + output_asm_insn ("str\txzr, [%0, %1]", xops);
It looks like this would probe at LIMIT when ADJUSTMENT is exactly LIMIT,
which could clobber the caller's frame.
> +
> + /* No probe leave. */
> + ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_end_lab);
> + return "";
With the CFA stuff and constant load, I think this works out as:
---------------------------------------------
# 12 insns
mov r15, base
mov adjustment, N
1:
cmp adjustment, guard_size
b.lt 2f
sub base, base, guard_size
str xzr, [base, limit]
sub adjustment, adjustment, guard_size
b 1b
2:
sub base, base, adjustment
cmp adjustment, limit
b.le 3f
str xzr, [base, limit]
3:
---------------------------------------------
What do you think about something like:
---------------------------------------------
# 10 insns
mov adjustment, N
sub r15, base, adjustment
subs adjustment, adjustment, min_probe_threshold
b.lo 2f
1:
add base, x15, adjustment
str xzr, [base, 0]
subs adjustment, adjustment, 16
and adjustment, adjustment, ~(guard_size-1)
b.hs 1b
2:
mov base, r15
---------------------------------------------
or (with different trade-offs):
---------------------------------------------
# 11 insns
mov adjustment, N
sub r15, base, adjustment
subs adjustment, adjustment, min_probe_threshold
b.lo 2f
# Might be 0, leading to a double probe
and r14, adjustment, guard_size-1
1:
add base, x15, adjustment
str xzr, [base, 0]
subs adjustment, adjustment, r14
mov r14, guard_size
b.hs 1b
2:
mov base, r15
---------------------------------------------
or (longer, but with a simpler loop):
---------------------------------------------
# 12 insns
mov adjustment, N
sub r15, base, adjustment
subs adjustment, adjustment, min_probe_threshold
b.lo 2f
str xzr, [base, -16]!
sub adjustment, adjustment, 32
and adjustment, adjustment, ~(guard_size-1)
1:
add base, x15, adjustment
str xzr, [base, 0]
subs adjustment, adjustment, guard_size
b.hs 1b
2:
mov base, r15
---------------------------------------------
with the CFA based on r15+offset?
These loops probe more often than necessary in some cases,
but they only need a single branch in the common case that
ADJUSTMENT <= MIN_PROBE_THRESHOLD.
> @@ -4826,22 +4910,30 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
> }
> }
>
> - HOST_WIDE_INT size;
> + /* GCC's initialization analysis is broken so initialize size. */
> + HOST_WIDE_INT size = 0;
It's not broken in this case. :-) is_constant only modifies its argument
when returning true. In other cases the variable keeps whatever value
it had originally. And the code does use "size" when !is_constant,
so an explicit initialisation is necessary.
> /* If SIZE is not large enough to require probing, just adjust the stack and
> exit. */
> - if (!poly_size.is_constant (&size)
> - || known_lt (poly_size, min_probe_threshold)
> + if ((poly_size.is_constant (&size)
> + && known_lt (poly_size, min_probe_threshold))
> || !flag_stack_clash_protection)
No need for the is_constant here, just known_lt is enough.
> {
> aarch64_sub_sp (temp1, temp2, poly_size, frame_related_p);
> return;
> }
>
> - if (dump_file)
> + if (dump_file && poly_size.is_constant ())
> fprintf (dump_file,
> "Stack clash AArch64 prologue: " HOST_WIDE_INT_PRINT_DEC " bytes"
> ", probing will be required.\n", size);
>
> + if (dump_file && !poly_size.is_constant ())
> + {
> + fprintf (dump_file, "Stack clash SVE prologue: ");
> + dump_dec (MSG_NOTE, poly_size);
This should be print_dec (poly_size, dump_file);
> + fprintf (dump_file, " bytes, dynamic probing will be required.\n");
> + }
> +
> /* Round size to the nearest multiple of guard_size, and calculate the
> residual as the difference between the original size and the rounded
> size. */
> @@ -4850,7 +4942,8 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
>
> /* We can handle a small number of allocations/probes inline. Otherwise
> punt to a loop. */
> - if (rounded_size <= STACK_CLASH_MAX_UNROLL_PAGES * guard_size)
> + if (poly_size.is_constant ()
> + && rounded_size <= STACK_CLASH_MAX_UNROLL_PAGES * guard_size)
> {
> for (HOST_WIDE_INT i = 0; i < rounded_size; i += guard_size)
> {
> @@ -4861,7 +4954,7 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
> }
> dump_stack_clash_frame_info (PROBE_INLINE, size != rounded_size);
> }
> - else
> + else if (poly_size.is_constant ())
> {
> /* Compute the ending address. */
> aarch64_add_offset (Pmode, temp1, stack_pointer_rtx, -rounded_size,
> @@ -4910,6 +5003,48 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
> emit_insn (gen_blockage ());
> dump_stack_clash_frame_info (PROBE_LOOP, size != rounded_size);
> }
> + else
> + {
It would probably be better to handle "!poly_size.is_constant ()"
after the "!flag_stack_clash_protection" if statement and exit early,
so that we don't do calculations based on "size" when "size" has a
fairly meaningless value. It would also avoid repeated checks for
is_constant.
> + rtx probe_const = gen_rtx_CONST_INT (Pmode, STACK_CLASH_CALLER_GUARD);
> + rtx guard_const = gen_rtx_CONST_INT (Pmode, guard_size);
CONST_INTs don't have a recorded mode, so this should either be GEN_INT or
(better) gen_int_mode.
Thanks,
Richard
More information about the Gcc-patches
mailing list