[PATCH 4/5] nvptx: implement SIMT enter/exit insns
Alexander Monakov
amonakov@ispras.ru
Tue Jan 17 20:15:00 GMT 2017
This patch adds handling of new omp_simt_enter/omp_simt_exit named insns
in the NVPTX backend.
* config/nvptx/nvptx-protos.h (nvptx_output_simt_enter): Declare.
(nvptx_output_simt_exit): Declare.
* config/nvptx/nvptx.c (nvptx_init_unisimt_predicate): Use
cfun->machine->unisimt_location. Handle NULL unisimt_predicate.
(init_softstack_frame): Move initialization of crtl->is_leaf to...
(nvptx_declare_function_name): ...here. Emit declaration of local
memory space buffer for omp_simt_enter insn.
(nvptx_output_unisimt_switch): New.
(nvptx_output_softstack_switch): New.
(nvptx_output_simt_enter): New.
(nvptx_output_simt_exit): New.
* config/nvptx/nvptx.h (struct machine_function): New fields
has_simtreg, unisimt_location, simt_stack_size, simt_stack_align.
* config/nvptx/nvptx.md (UNSPECV_SIMT_ENTER): New unspec.
(UNSPECV_SIMT_EXIT): Ditto.
(omp_simt_enter_insn): New insn.
(omp_simt_enter): New expansion.
(omp_simt_exit): New insn.
* config/nvptx/nvptx.opt (msoft-stack-reserve-local): New option.
---
gcc/config/nvptx/nvptx-protos.h | 2 +
gcc/config/nvptx/nvptx.c | 163 +++++++++++++++++++++++++++++++++++-----
gcc/config/nvptx/nvptx.h | 6 ++
gcc/config/nvptx/nvptx.md | 39 ++++++++++
gcc/config/nvptx/nvptx.opt | 4 +
5 files changed, 196 insertions(+), 18 deletions(-)
diff --git a/gcc/config/nvptx/nvptx-protos.h b/gcc/config/nvptx/nvptx-protos.h
index 331ec0a..2f836c1 100644
--- a/gcc/config/nvptx/nvptx-protos.h
+++ b/gcc/config/nvptx/nvptx-protos.h
@@ -53,5 +53,7 @@ extern const char *nvptx_output_mov_insn (rtx, rtx);
extern const char *nvptx_output_call_insn (rtx_insn *, rtx, rtx);
extern const char *nvptx_output_return (void);
extern const char *nvptx_output_set_softstack (unsigned);
+extern const char *nvptx_output_simt_enter (rtx, rtx, rtx);
+extern const char *nvptx_output_simt_exit (rtx);
#endif
#endif
diff --git a/gcc/config/nvptx/nvptx.c b/gcc/config/nvptx/nvptx.c
index b3f025f..f132845 100644
--- a/gcc/config/nvptx/nvptx.c
+++ b/gcc/config/nvptx/nvptx.c
@@ -1047,11 +1047,6 @@ init_softstack_frame (FILE *file, unsigned alignment, HOST_WIDE_INT size)
fprintf (file, "\t\tsub.u%d %s, %s, " HOST_WIDE_INT_PRINT_DEC ";\n",
bits, reg_stack, reg_frame, size);
- /* Usually 'crtl->is_leaf' is computed during register allocator
- initialization (which is not done on NVPTX) or for pressure-sensitive
- optimizations. Initialize it here, except if already set. */
- if (!crtl->is_leaf)
- crtl->is_leaf = leaf_function_p ();
if (!crtl->is_leaf)
fprintf (file, "\t\tst.shared.u%d [%s], %s;\n",
bits, reg_sspslot, reg_stack);
@@ -1079,24 +1074,29 @@ nvptx_init_axis_predicate (FILE *file, int regno, const char *name)
static void
nvptx_init_unisimt_predicate (FILE *file)
{
+ cfun->machine->unisimt_location = gen_reg_rtx (Pmode);
+ int loc = REGNO (cfun->machine->unisimt_location);
int bits = POINTER_SIZE;
- int master = REGNO (cfun->machine->unisimt_master);
- int pred = REGNO (cfun->machine->unisimt_predicate);
+ fprintf (file, "\t.reg.u%d %%r%d;\n", bits, loc);
fprintf (file, "\t{\n");
fprintf (file, "\t\t.reg.u32 %%ustmp0;\n");
fprintf (file, "\t\t.reg.u%d %%ustmp1;\n", bits);
- fprintf (file, "\t\t.reg.u%d %%ustmp2;\n", bits);
fprintf (file, "\t\tmov.u32 %%ustmp0, %%tid.y;\n");
fprintf (file, "\t\tmul%s.u32 %%ustmp1, %%ustmp0, 4;\n",
bits == 64 ? ".wide" : ".lo");
- fprintf (file, "\t\tmov.u%d %%ustmp2, __nvptx_uni;\n", bits);
- fprintf (file, "\t\tadd.u%d %%ustmp2, %%ustmp2, %%ustmp1;\n", bits);
- fprintf (file, "\t\tld.shared.u32 %%r%d, [%%ustmp2];\n", master);
- fprintf (file, "\t\tmov.u32 %%ustmp0, %%tid.x;\n");
- /* Compute 'master lane index' as 'tid.x & __nvptx_uni[tid.y]'. */
- fprintf (file, "\t\tand.b32 %%r%d, %%r%d, %%ustmp0;\n", master, master);
- /* Compute predicate as 'tid.x == master'. */
- fprintf (file, "\t\tsetp.eq.u32 %%r%d, %%r%d, %%ustmp0;\n", pred, master);
+ fprintf (file, "\t\tmov.u%d %%r%d, __nvptx_uni;\n", bits, loc);
+ fprintf (file, "\t\tadd.u%d %%r%d, %%r%d, %%ustmp1;\n", bits, loc, loc); /* loc = __nvptx_uni + tid.y * 4. */
+ if (cfun->machine->unisimt_predicate) /* May be NULL; then only the location register is set up. */
+ {
+ int master = REGNO (cfun->machine->unisimt_master);
+ int pred = REGNO (cfun->machine->unisimt_predicate);
+ fprintf (file, "\t\tld.shared.u32 %%r%d, [%%r%d];\n", master, loc);
+ fprintf (file, "\t\tmov.u32 %%ustmp0, %%laneid;\n");
+ /* Compute 'master lane index' as 'laneid & __nvptx_uni[tid.y]'. */
+ fprintf (file, "\t\tand.b32 %%r%d, %%r%d, %%ustmp0;\n", master, master);
+ /* Compute predicate as 'laneid == master'. */
+ fprintf (file, "\t\tsetp.eq.u32 %%r%d, %%r%d, %%ustmp0;\n", pred, master);
+ }
fprintf (file, "\t}\n");
need_unisimt_decl = true;
}
@@ -1220,6 +1220,12 @@ nvptx_declare_function_name (FILE *file, const char *name, const_tree decl)
fprintf (file, "%s", s.str().c_str());
+ /* Usually 'crtl->is_leaf' is computed during register allocator
+ initialization (which is not done on NVPTX) or for pressure-sensitive
+ optimizations. Initialize it here, except if already set. */
+ if (!crtl->is_leaf)
+ crtl->is_leaf = leaf_function_p ();
+
HOST_WIDE_INT sz = get_frame_size ();
bool need_frameptr = sz || cfun->machine->has_chain;
int alignment = crtl->stack_alignment_needed / BITS_PER_UNIT;
@@ -1236,9 +1242,28 @@ nvptx_declare_function_name (FILE *file, const char *name, const_tree decl)
init_frame (file, FRAME_POINTER_REGNUM, alignment,
ROUND_UP (sz, GET_MODE_SIZE (DImode)));
}
- else if (need_frameptr || cfun->machine->has_varadic || cfun->calls_alloca)
+ else if (need_frameptr || cfun->machine->has_varadic || cfun->calls_alloca
+ || (cfun->machine->has_simtreg && !crtl->is_leaf))
init_softstack_frame (file, alignment, sz);
+ if (cfun->machine->has_simtreg)
+ {
+ unsigned HOST_WIDE_INT &simtsz = cfun->machine->simt_stack_size;
+ unsigned HOST_WIDE_INT &align = cfun->machine->simt_stack_align;
+ align = MAX (align, GET_MODE_SIZE (DImode));
+ if (!crtl->is_leaf || cfun->calls_alloca)
+ simtsz = HOST_WIDE_INT_M1U;
+ if (simtsz == HOST_WIDE_INT_M1U)
+ simtsz = nvptx_softstack_size;
+ if (cfun->machine->has_softstack)
+ simtsz += POINTER_SIZE / 8;
+ simtsz = ROUND_UP (simtsz, GET_MODE_SIZE (DImode));
+ if (align > GET_MODE_SIZE (DImode))
+ simtsz += align - GET_MODE_SIZE (DImode);
+ if (simtsz)
+ fprintf (file, "\t.local.align 8 .b8 %%simtstack_ar["
+ HOST_WIDE_INT_PRINT_DEC "];\n", simtsz);
+ }
/* Declare the pseudos we have as ptx registers. */
int maxregs = max_reg_num ();
for (int i = LAST_VIRTUAL_REGISTER + 1; i < maxregs; i++)
@@ -1263,10 +1288,112 @@ nvptx_declare_function_name (FILE *file, const char *name, const_tree decl)
if (cfun->machine->axis_predicate[1])
nvptx_init_axis_predicate (file,
REGNO (cfun->machine->axis_predicate[1]), "x");
- if (cfun->machine->unisimt_predicate)
+ if (cfun->machine->unisimt_predicate
+ || (cfun->machine->has_simtreg && !crtl->is_leaf))
nvptx_init_unisimt_predicate (file);
}
+/* Output code for switching uniform-simt state. ENTERING indicates whether
+ we are entering or leaving non-uniform execution region. */
+
+static void
+nvptx_output_unisimt_switch (FILE *file, bool entering)
+{
+ if (crtl->is_leaf && !cfun->machine->unisimt_predicate)
+ return; /* Leaf function without a predicate: nothing to update. */
+ fprintf (file, "\t{\n");
+ fprintf (file, "\t\t.reg.u32 %%ustmp2;\n");
+ fprintf (file, "\t\tmov.u32 %%ustmp2, %d;\n", entering ? -1 : 0); /* -1 on entering, 0 on leaving. */
+ if (!crtl->is_leaf) /* Non-leaf: store that value at the address held in unisimt_location. */
+ {
+ int loc = REGNO (cfun->machine->unisimt_location);
+ fprintf (file, "\t\tst.shared.u32 [%%r%d], %%ustmp2;\n", loc);
+ }
+ if (cfun->machine->unisimt_predicate)
+ {
+ int master = REGNO (cfun->machine->unisimt_master);
+ int pred = REGNO (cfun->machine->unisimt_predicate);
+ fprintf (file, "\t\tmov.u32 %%ustmp2, %%laneid;\n");
+ fprintf (file, "\t\tmov.u32 %%r%d, %s;\n", /* master = laneid on entering, 0 on leaving. */
+ master, entering ? "%ustmp2" : "0");
+ fprintf (file, "\t\tsetp.eq.u32 %%r%d, %%r%d, %%ustmp2;\n", pred, master); /* pred = (master == laneid). */
+ }
+ fprintf (file, "\t}\n");
+}
+
+/* Output code for allocating per-lane storage and switching soft-stack pointer.
+ ENTERING indicates whether we are entering or leaving non-uniform execution.
+ PTR is the register pointing to allocated storage, it is assigned to on
+ entering and used to restore state on leaving. SIZE and ALIGN are used only
+ on entering; they are not read when leaving and may then be NULL_RTX. */
+
+static void
+nvptx_output_softstack_switch (FILE *file, bool entering,
+ rtx ptr, rtx size, rtx align)
+{
+ gcc_assert (REG_P (ptr) && !HARD_REGISTER_P (ptr));
+ if (crtl->is_leaf && !cfun->machine->simt_stack_size)
+ return; /* Leaf function with no per-lane storage: nothing to emit. */
+ int bits = POINTER_SIZE, regno = REGNO (ptr);
+ fprintf (file, "\t{\n");
+ if (entering)
+ { /* PTR = %simtstack_ar + simt_stack_size - SIZE, masked with -ALIGN when needed. */
+ fprintf (file, "\t\tcvta.local.u%d %%r%d, %%simtstack_ar + "
+ HOST_WIDE_INT_PRINT_DEC ";\n", bits, regno,
+ cfun->machine->simt_stack_size);
+ fprintf (file, "\t\tsub.u%d %%r%d, %%r%d, ", bits, regno, regno);
+ if (CONST_INT_P (size)) /* Constant sizes are rounded up to DImode size. */
+ fprintf (file, HOST_WIDE_INT_PRINT_DEC,
+ ROUND_UP (UINTVAL (size), GET_MODE_SIZE (DImode)));
+ else
+ output_reg (file, REGNO (size), VOIDmode);
+ fputs (";\n", file);
+ if (!CONST_INT_P (size) || UINTVAL (align) > GET_MODE_SIZE (DImode)) /* PTX 'and' takes only .b types. */
+ fprintf (file, "\t\tand.b%d %%r%d, %%r%d, -" HOST_WIDE_INT_PRINT_DEC
+ ";\n", bits, regno, regno, UINTVAL (align));
+ }
+ if (cfun->machine->has_softstack)
+ {
+ const char *reg_stack = reg_names[STACK_POINTER_REGNUM];
+ if (entering)
+ { /* Save the stack register just below PTR, then point it at that slot. */
+ fprintf (file, "\t\tst.u%d [%%r%d + -%d], %s;\n",
+ bits, regno, bits / 8, reg_stack);
+ fprintf (file, "\t\tsub.u%d %s, %%r%d, %d;\n",
+ bits, reg_stack, regno, bits / 8);
+ }
+ else
+ { /* Reload the stack register from the slot just below PTR. */
+ fprintf (file, "\t\tld.u%d %s, [%%r%d + -%d];\n",
+ bits, reg_stack, regno, bits / 8);
+ }
+ nvptx_output_set_softstack (REGNO (stack_pointer_rtx));
+ }
+ fprintf (file, "\t}\n");
+}
+
+/* Output code to enter non-uniform execution region. DEST is a register
+ to hold a per-lane allocation given by SIZE and ALIGN. Returns "". */
+
+const char *
+nvptx_output_simt_enter (rtx dest, rtx size, rtx align)
+{
+ nvptx_output_unisimt_switch (asm_out_file, true);
+ nvptx_output_softstack_switch (asm_out_file, true, dest, size, align);
+ return ""; /* Both helpers were handed asm_out_file to write to. */
+}
+
+/* Output code to leave non-uniform execution region. SRC is the register
+ holding per-lane storage previously allocated by omp_simt_enter insn.
+ Returns "". */
+const char *
+nvptx_output_simt_exit (rtx src)
+{
+ nvptx_output_unisimt_switch (asm_out_file, false);
+ nvptx_output_softstack_switch (asm_out_file, false, src, NULL_RTX, NULL_RTX); /* SIZE/ALIGN are unused when leaving. */
+ return ""; /* Both helpers were handed asm_out_file to write to. */
+}
+
/* Output instruction that sets soft stack pointer in shared memory to the
value in register given by SRC_REGNO. */
diff --git a/gcc/config/nvptx/nvptx.h b/gcc/config/nvptx/nvptx.h
index 1702178..2d4fe7d 100644
--- a/gcc/config/nvptx/nvptx.h
+++ b/gcc/config/nvptx/nvptx.h
@@ -213,12 +213,18 @@ struct GTY(()) machine_function
bool has_varadic; /* Current function has a varadic call. */
bool has_chain; /* Current function has outgoing static chain. */
bool has_softstack; /* Current function has a soft stack frame. */
+ bool has_simtreg; /* Current function has an OpenMP SIMD region. */
int num_args; /* Number of args of current call. */
int return_mode; /* Return mode of current fn.
(machine_mode not defined yet.) */
rtx axis_predicate[2]; /* Neutering predicates. */
rtx unisimt_master; /* 'Master lane index' for -muniform-simt. */
rtx unisimt_predicate; /* Predicate for -muniform-simt. */
+ rtx unisimt_location; /* Mask location for -muniform-simt. */
+ /* The following two fields hold the maximum size resp. alignment required
+ for per-lane storage in OpenMP SIMD regions. */
+ unsigned HOST_WIDE_INT simt_stack_size;
+ unsigned HOST_WIDE_INT simt_stack_align;
};
#endif
diff --git a/gcc/config/nvptx/nvptx.md b/gcc/config/nvptx/nvptx.md
index 91d1129..2f6050c 100644
--- a/gcc/config/nvptx/nvptx.md
+++ b/gcc/config/nvptx/nvptx.md
@@ -63,6 +63,9 @@ (define_c_enum "unspecv" [
UNSPECV_JOIN
UNSPECV_NOUNROLL
+
+ UNSPECV_SIMT_ENTER
+ UNSPECV_SIMT_EXIT
])
(define_attr "subregs_ok" "false,true"
@@ -1184,6 +1187,42 @@ (define_insn "nvptx_vote_ballot"
;; Patterns for OpenMP SIMD-via-SIMT lowering
+(define_insn "omp_simt_enter_insn" ;; Output is produced by nvptx_output_simt_enter.
+ [(set (match_operand 0 "nvptx_register_operand" "=R") ;; 0: register receiving the per-lane storage pointer
+ (unspec_volatile [(match_operand 1 "nvptx_nonmemory_operand" "Ri") ;; 1: size
+ (match_operand 2 "nvptx_nonmemory_operand" "Ri")] ;; 2: alignment
+ UNSPECV_SIMT_ENTER))]
+ ""
+{
+ return nvptx_output_simt_enter (operands[0], operands[1], operands[2]);
+})
+
+(define_expand "omp_simt_enter" ;; Records per-function SIMT stack size/alignment, then emits omp_simt_enter_insn.
+ [(match_operand 0 "nvptx_register_operand" "=R") ;; 0: destination register
+ (match_operand 1 "nvptx_nonmemory_operand" "Ri") ;; 1: size; a non-constant size is recorded as HOST_WIDE_INT_M1U
+ (match_operand 2 "const_int_operand" "n")] ;; 2: alignment, always a constant
+ ""
+{
+ if (!CONST_INT_P (operands[1]))
+ cfun->machine->simt_stack_size = HOST_WIDE_INT_M1U;
+ else
+ cfun->machine->simt_stack_size = MAX (UINTVAL (operands[1]),
+ cfun->machine->simt_stack_size);
+ cfun->machine->simt_stack_align = MAX (UINTVAL (operands[2]),
+ cfun->machine->simt_stack_align);
+ cfun->machine->has_simtreg = true;
+ emit_insn (gen_omp_simt_enter_insn (operands[0], operands[1], operands[2]));
+ DONE;
+})
+
+(define_insn "omp_simt_exit" ;; Output is produced by nvptx_output_simt_exit.
+ [(unspec_volatile [(match_operand 0 "nvptx_register_operand" "R")] ;; 0: register passed as SRC to nvptx_output_simt_exit
+ UNSPECV_SIMT_EXIT)]
+ ""
+{
+ return nvptx_output_simt_exit (operands[0]);
+})
+
;; Implement IFN_GOMP_SIMT_LANE: set operand 0 to lane index
(define_insn "omp_simt_lane"
[(set (match_operand:SI 0 "nvptx_register_operand" "")
diff --git a/gcc/config/nvptx/nvptx.opt b/gcc/config/nvptx/nvptx.opt
index cb6194d..0c3794b 100644
--- a/gcc/config/nvptx/nvptx.opt
+++ b/gcc/config/nvptx/nvptx.opt
@@ -37,6 +37,10 @@ msoft-stack
Target Report Mask(SOFT_STACK)
Use custom stacks instead of local memory for automatic storage.
+msoft-stack-reserve-local
+Target Report Joined RejectNegative UInteger Var(nvptx_softstack_size) Init(128)
+Specify size of .local memory used for stack when the exact amount is not known.
+
muniform-simt
Target Report Mask(UNIFORM_SIMT)
Generate code that can keep local state uniform across all lanes.
--
1.8.3.1
More information about the Gcc-patches
mailing list