This is the mail archive of the
gcc-patches@gcc.gnu.org
mailing list for the GCC project.
[PATCH] nvptx per-warp compiler-defined stacks (-msoft-stack)
- From: Alexander Monakov <amonakov at ispras dot ru>
- To: gcc-patches at gcc dot gnu dot org
- Cc: Nathan Sidwell <nathan at acm dot org>
- Date: Wed, 20 Apr 2016 19:59:51 +0300 (MSK)
- Subject: [PATCH] nvptx per-warp compiler-defined stacks (-msoft-stack)
- Authentication-results: sourceware.org; auth=none
This patch implements per-warp compiler-defined stacks under the -msoft-stack
option, and implements alloca on top of that. In a few obvious places, changes
from the -muniform-simt patch (for example need_unisimt_decl and the
__nvptx_unisimt__ define) are also present in the hunks.
Previously posted here:
[PATCH] nvptx: implement automatic storage in custom stacks
https://gcc.gnu.org/ml/gcc-patches/2015-11/msg01519.html
[gomp-nvptx] nvptx backend: implement alloca with -msoft-stack
https://gcc.gnu.org/ml/gcc-patches/2015-12/msg01397.html
[gomp-nvptx 7/7] nvptx backend: define STACK_SIZE_MODE
https://gcc.gnu.org/ml/gcc-patches/2016-03/msg01108.html
2016-03-15 Alexander Monakov <amonakov@ispras.ru>
* config/nvptx/nvptx.h (STACK_SIZE_MODE): Define.
2015-12-14 Alexander Monakov <amonakov@ispras.ru>
* config/nvptx/nvptx.c (nvptx_declare_function_name): Emit %outargs
using .local %outargs_ar only if not TARGET_SOFT_STACK. Emit %outargs
under TARGET_SOFT_STACK by offsetting from %frame.
(nvptx_get_drap_rtx): Return %argp as the DRAP if needed.
* config/nvptx/nvptx.md (nvptx_register_operand): Allow %outargs under
TARGET_SOFT_STACK.
(nvptx_nonimmediate_operand): Ditto.
(allocate_stack): Implement for TARGET_SOFT_STACK. Remove unused code.
(allocate_stack_<mode>): Remove unused pattern.
(set_softstack_insn): New pattern.
(restore_stack_block): Handle for TARGET_SOFT_STACK.
2015-12-09 Alexander Monakov <amonakov@ispras.ru>
* config/nvptx/nvptx.c (need_softstack_decl): New variable.
(nvptx_declare_function_name): Handle TARGET_SOFT_STACK.
(nvptx_output_return): Emit stack restore if needed.
(nvptx_file_end): Handle need_softstack_decl.
* config/nvptx/nvptx.h (TARGET_CPU_CPP_BUILTINS): Define
__nvptx_softstack__ when -msoft-stack is active.
(struct machine_function): New bool field using_softstack.
* config/nvptx/nvptx.opt (msoft-stack): New option.
* doc/invoke.texi (msoft-stack): Document.
diff --git a/gcc/config/nvptx/nvptx.c b/gcc/config/nvptx/nvptx.c
index 2d4dad1..e9e4d06 100644
--- a/gcc/config/nvptx/nvptx.c
+++ b/gcc/config/nvptx/nvptx.c
@@ -139,6 +129,12 @@ static GTY(()) rtx worker_red_sym;
/* Global lock variable, needed for 128bit worker & gang reductions. */
static GTY(()) tree global_lock_var;
+/* True if any function references __nvptx_stacks. */
+static bool need_softstack_decl;
+
+/* True if any function references __nvptx_uni. */
+static bool need_unisimt_decl;
+
/* Allocate a new, cleared machine_function structure. */
static struct machine_function *
@@ -992,16 +1086,55 @@ nvptx_declare_function_name (FILE *file, const char *name, const_tree decl)
fprintf (file, "%s", s.str().c_str());
- /* Declare a local var for outgoing varargs. */
- if (cfun->machine->has_varadic)
- init_frame (file, STACK_POINTER_REGNUM,
- UNITS_PER_WORD, crtl->outgoing_args_size);
-
- /* Declare a local variable for the frame. */
HOST_WIDE_INT sz = get_frame_size ();
- if (sz || cfun->machine->has_chain)
- init_frame (file, FRAME_POINTER_REGNUM,
- crtl->stack_alignment_needed / BITS_PER_UNIT, sz);
+ bool need_frameptr = sz || cfun->machine->has_chain;
+ int alignment = crtl->stack_alignment_needed / BITS_PER_UNIT;
+ if (!TARGET_SOFT_STACK)
+ {
+ /* Declare a local var for outgoing varargs. */
+ if (cfun->machine->has_varadic)
+ init_frame (file, STACK_POINTER_REGNUM,
+ UNITS_PER_WORD, crtl->outgoing_args_size);
+
+ /* Declare a local variable for the frame. */
+ if (need_frameptr)
+ init_frame (file, FRAME_POINTER_REGNUM, alignment, sz);
+ }
+ else if (need_frameptr || cfun->machine->has_varadic || cfun->calls_alloca)
+ {
+ /* Maintain 64-bit stack alignment. */
+ int keep_align = BIGGEST_ALIGNMENT / BITS_PER_UNIT;
+ sz = ROUND_UP (sz, keep_align);
+ int bits = POINTER_SIZE;
+ fprintf (file, "\t.reg.u%d %%frame;\n", bits);
+ fprintf (file, "\t.reg.u32 %%fstmp0;\n");
+ fprintf (file, "\t.reg.u%d %%fstmp1;\n", bits);
+ fprintf (file, "\t.reg.u%d %%fstmp2;\n", bits);
+ fprintf (file, "\tmov.u32 %%fstmp0, %%tid.y;\n");
+ fprintf (file, "\tmul%s.u32 %%fstmp1, %%fstmp0, %d;\n",
+ bits == 64 ? ".wide" : ".lo", bits / 8);
+ fprintf (file, "\tmov.u%d %%fstmp2, __nvptx_stacks;\n", bits);
+ /* fstmp2 = &__nvptx_stacks[tid.y]; */
+ fprintf (file, "\tadd.u%d %%fstmp2, %%fstmp2, %%fstmp1;\n", bits);
+ fprintf (file, "\tld.shared.u%d %%fstmp1, [%%fstmp2];\n", bits);
+ fprintf (file, "\tsub.u%d %%frame, %%fstmp1, "
+ HOST_WIDE_INT_PRINT_DEC ";\n", bits, sz);
+ if (alignment > keep_align)
+ fprintf (file, "\tand.b%d %%frame, %%frame, %d;\n",
+ bits, -alignment);
+ fprintf (file, "\t.reg.u%d %%stack;\n", bits);
+ sz = crtl->outgoing_args_size;
+ gcc_assert (sz % keep_align == 0);
+ fprintf (file, "\tsub.u%d %%stack, %%frame, "
+ HOST_WIDE_INT_PRINT_DEC ";\n", bits, sz);
+ /* crtl->is_leaf is not initialized because RA is not run. */
+ if (!leaf_function_p ())
+ {
+ fprintf (file, "\tst.shared.u%d [%%fstmp2], %%stack;\n", bits);
+ cfun->machine->using_softstack = true;
+ }
+ need_softstack_decl = true;
+ }
/* Declare the pseudos we have as ptx registers. */
int maxregs = max_reg_num ();
@@ -1037,6 +1172,10 @@ nvptx_output_return (void)
{
machine_mode mode = (machine_mode)cfun->machine->return_mode;
+ if (cfun->machine->using_softstack)
+ fprintf (asm_out_file, "\tst.shared.u%d [%%fstmp2], %%fstmp1;\n",
+ POINTER_SIZE);
+
if (mode != VOIDmode)
fprintf (asm_out_file, "\tst.param%s\t[%s_out], %s;\n",
nvptx_ptx_type_from_mode (mode, false),
@@ -1068,6 +1207,8 @@ nvptx_function_ok_for_sibcall (tree, tree)
static rtx
nvptx_get_drap_rtx (void)
{
+ if (TARGET_SOFT_STACK && stack_realign_drap)
+ return arg_pointer_rtx;
return NULL_RTX;
}
@@ -3939,6 +4183,18 @@ nvptx_file_end (void)
if (worker_red_size)
write_worker_buffer (asm_out_file, worker_red_sym,
worker_red_align, worker_red_size);
+
+ if (need_softstack_decl)
+ {
+ write_var_marker (asm_out_file, false, true, "__nvptx_stacks");
+ fprintf (asm_out_file, ".extern .shared .u%d __nvptx_stacks[32];\n",
+ POINTER_SIZE);
+ }
+ if (need_unisimt_decl)
+ {
+ write_var_marker (asm_out_file, false, true, "__nvptx_uni");
+ fprintf (asm_out_file, ".extern .shared .u32 __nvptx_uni[32];\n");
+ }
}
/* Expander for the shuffle builtins. */
diff --git a/gcc/config/nvptx/nvptx.h b/gcc/config/nvptx/nvptx.h
index 381269e..6da4d06 100644
--- a/gcc/config/nvptx/nvptx.h
+++ b/gcc/config/nvptx/nvptx.h
@@ -31,6 +31,10 @@
builtin_assert ("machine=nvptx"); \
builtin_assert ("cpu=nvptx"); \
builtin_define ("__nvptx__"); \
+ if (TARGET_SOFT_STACK) \
+ builtin_define ("__nvptx_softstack__"); \
+ if (TARGET_UNIFORM_SIMT) \
+ builtin_define ("__nvptx_unisimt__"); \
} while (0)
/* Avoid the default in ../../gcc.c, which adds "-pthread", which is not
@@ -79,6 +83,7 @@
#define POINTER_SIZE (TARGET_ABI64 ? 64 : 32)
#define Pmode (TARGET_ABI64 ? DImode : SImode)
+#define STACK_SIZE_MODE Pmode
/* Registers. Since ptx is a virtual target, we just define a few
hard registers for special purposes and leave pseudos unallocated.
@@ -200,10 +205,13 @@ struct GTY(()) machine_function
bool is_varadic; /* This call is varadic */
bool has_varadic; /* Current function has a varadic call. */
bool has_chain; /* Current function has outgoing static chain. */
+ bool using_softstack; /* Need to restore __nvptx_stacks[tid.y]. */
int num_args; /* Number of args of current call. */
int return_mode; /* Return mode of current fn.
(machine_mode not defined yet.) */
rtx axis_predicate[2]; /* Neutering predicates. */
+ rtx unisimt_master; /* Master lane index for "uniform simt" mode. */
+ rtx unisimt_predicate; /* Predicate register for "uniform simt". */
};
#endif
diff --git a/gcc/config/nvptx/nvptx.md b/gcc/config/nvptx/nvptx.md
index 33a4862..e5650b6 100644
--- a/gcc/config/nvptx/nvptx.md
+++ b/gcc/config/nvptx/nvptx.md
@@ -961,31 +986,41 @@ (define_expand "allocate_stack"
(match_operand 1 "nvptx_register_operand")]
""
{
+ if (TARGET_SOFT_STACK)
+ {
+ emit_move_insn (stack_pointer_rtx,
+ gen_rtx_MINUS (Pmode, stack_pointer_rtx, operands[1]));
+ emit_insn (gen_set_softstack_insn (stack_pointer_rtx));
+ emit_move_insn (operands[0], virtual_stack_dynamic_rtx);
+ DONE;
+ }
/* The ptx documentation specifies an alloca intrinsic (for 32 bit
only) but notes it is not implemented. The assembler emits a
confused error message. Issue a blunt one now instead. */
sorry ("target cannot support alloca.");
emit_insn (gen_nop ());
DONE;
- if (TARGET_ABI64)
- emit_insn (gen_allocate_stack_di (operands[0], operands[1]));
- else
- emit_insn (gen_allocate_stack_si (operands[0], operands[1]));
- DONE;
})
-(define_insn "allocate_stack_<mode>"
- [(set (match_operand:P 0 "nvptx_register_operand" "=R")
- (unspec:P [(match_operand:P 1 "nvptx_register_operand" "R")]
- UNSPEC_ALLOCA))]
- ""
- "%.\\tcall (%0), %%alloca, (%1);")
+(define_insn "set_softstack_insn"
+ [(unspec [(match_operand 0 "nvptx_register_operand" "R")] UNSPEC_ALLOCA)]
+ "TARGET_SOFT_STACK"
+{
+ return (cfun->machine->using_softstack
+ ? "%.\\tst.shared%t0\\t[%%fstmp2], %0;"
+ : "");
+})
(define_expand "restore_stack_block"
[(match_operand 0 "register_operand" "")
(match_operand 1 "register_operand" "")]
""
{
+ if (TARGET_SOFT_STACK)
+ {
+ emit_move_insn (operands[0], operands[1]);
+ emit_insn (gen_set_softstack_insn (operands[0]));
+ }
DONE;
})