[PATCH 4/5] nvptx: implement SIMT enter/exit insns

Alexander Monakov amonakov@ispras.ru
Tue Jan 17 20:15:00 GMT 2017


This patch adds handling of new omp_simt_enter/omp_simt_exit named insns
in the NVPTX backend.

	* config/nvptx/nvptx-protos.h (nvptx_output_simt_enter): Declare.
        (nvptx_output_simt_exit): Declare.
        * config/nvptx/nvptx.c (nvptx_init_unisimt_predicate): Use
        cfun->machine->unisimt_location.  Handle NULL unisimt_predicate.
        (init_softstack_frame): Move initialization of crtl->is_leaf to...
        (nvptx_declare_function_name): ...here.  Emit declaration of local
        memory space buffer for omp_simt_enter insn.
        (nvptx_output_unisimt_switch): New.
        (nvptx_output_softstack_switch): New.
        (nvptx_output_simt_enter): New.
        (nvptx_output_simt_exit): New.
        * config/nvptx/nvptx.h (struct machine_function): New fields
        has_simtreg, unisimt_location, simt_stack_size, simt_stack_align.
        * config/nvptx/nvptx.md (UNSPECV_SIMT_ENTER): New unspec.
        (UNSPECV_SIMT_EXIT): Ditto.
        (omp_simt_enter_insn): New insn.
        (omp_simt_enter): New expansion.
        (omp_simt_exit): New insn.
        * config/nvptx/nvptx.opt (msoft-stack-reserve-local): New option.


---
 gcc/config/nvptx/nvptx-protos.h |   2 +
 gcc/config/nvptx/nvptx.c        | 163 +++++++++++++++++++++++++++++++++++-----
 gcc/config/nvptx/nvptx.h        |   6 ++
 gcc/config/nvptx/nvptx.md       |  39 ++++++++++
 gcc/config/nvptx/nvptx.opt      |   4 +
 5 files changed, 196 insertions(+), 18 deletions(-)

diff --git a/gcc/config/nvptx/nvptx-protos.h b/gcc/config/nvptx/nvptx-protos.h
index 331ec0a..2f836c1 100644
--- a/gcc/config/nvptx/nvptx-protos.h
+++ b/gcc/config/nvptx/nvptx-protos.h
@@ -53,5 +53,7 @@ extern const char *nvptx_output_mov_insn (rtx, rtx);
 extern const char *nvptx_output_call_insn (rtx_insn *, rtx, rtx);
 extern const char *nvptx_output_return (void);
 extern const char *nvptx_output_set_softstack (unsigned);
+extern const char *nvptx_output_simt_enter (rtx, rtx, rtx);
+extern const char *nvptx_output_simt_exit (rtx);
 #endif
 #endif
diff --git a/gcc/config/nvptx/nvptx.c b/gcc/config/nvptx/nvptx.c
index b3f025f..f132845 100644
--- a/gcc/config/nvptx/nvptx.c
+++ b/gcc/config/nvptx/nvptx.c
@@ -1047,11 +1047,6 @@ init_softstack_frame (FILE *file, unsigned alignment, HOST_WIDE_INT size)
   fprintf (file, "\t\tsub.u%d %s, %s, " HOST_WIDE_INT_PRINT_DEC ";\n",
 	   bits, reg_stack, reg_frame, size);
 
-  /* Usually 'crtl->is_leaf' is computed during register allocator
-     initialization (which is not done on NVPTX) or for pressure-sensitive
-     optimizations.  Initialize it here, except if already set.  */
-  if (!crtl->is_leaf)
-    crtl->is_leaf = leaf_function_p ();
   if (!crtl->is_leaf)
     fprintf (file, "\t\tst.shared.u%d [%s], %s;\n",
 	     bits, reg_sspslot, reg_stack);
@@ -1079,24 +1074,29 @@ nvptx_init_axis_predicate (FILE *file, int regno, const char *name)
 static void
 nvptx_init_unisimt_predicate (FILE *file)
 {
+  cfun->machine->unisimt_location = gen_reg_rtx (Pmode);
+  int loc = REGNO (cfun->machine->unisimt_location);
   int bits = POINTER_SIZE;
-  int master = REGNO (cfun->machine->unisimt_master);
-  int pred = REGNO (cfun->machine->unisimt_predicate);
+  fprintf (file, "\t.reg.u%d %%r%d;\n", bits, loc);
   fprintf (file, "\t{\n");
   fprintf (file, "\t\t.reg.u32 %%ustmp0;\n");
   fprintf (file, "\t\t.reg.u%d %%ustmp1;\n", bits);
-  fprintf (file, "\t\t.reg.u%d %%ustmp2;\n", bits);
   fprintf (file, "\t\tmov.u32 %%ustmp0, %%tid.y;\n");
   fprintf (file, "\t\tmul%s.u32 %%ustmp1, %%ustmp0, 4;\n",
 	   bits == 64 ? ".wide" : ".lo");
-  fprintf (file, "\t\tmov.u%d %%ustmp2, __nvptx_uni;\n", bits);
-  fprintf (file, "\t\tadd.u%d %%ustmp2, %%ustmp2, %%ustmp1;\n", bits);
-  fprintf (file, "\t\tld.shared.u32 %%r%d, [%%ustmp2];\n", master);
-  fprintf (file, "\t\tmov.u32 %%ustmp0, %%tid.x;\n");
-  /* Compute 'master lane index' as 'tid.x & __nvptx_uni[tid.y]'.  */
-  fprintf (file, "\t\tand.b32 %%r%d, %%r%d, %%ustmp0;\n", master, master);
-  /* Compute predicate as 'tid.x == master'.  */
-  fprintf (file, "\t\tsetp.eq.u32 %%r%d, %%r%d, %%ustmp0;\n", pred, master);
+  fprintf (file, "\t\tmov.u%d %%r%d, __nvptx_uni;\n", bits, loc);
+  fprintf (file, "\t\tadd.u%d %%r%d, %%r%d, %%ustmp1;\n", bits, loc, loc);
+  if (cfun->machine->unisimt_predicate)
+    {
+      int master = REGNO (cfun->machine->unisimt_master);
+      int pred = REGNO (cfun->machine->unisimt_predicate);
+      fprintf (file, "\t\tld.shared.u32 %%r%d, [%%r%d];\n", master, loc);
+      fprintf (file, "\t\tmov.u32 %%ustmp0, %%laneid;\n");
+      /* Compute 'master lane index' as 'laneid & __nvptx_uni[tid.y]'.  */
+      fprintf (file, "\t\tand.b32 %%r%d, %%r%d, %%ustmp0;\n", master, master);
+      /* Compute predicate as 'laneid == master'.  */
+      fprintf (file, "\t\tsetp.eq.u32 %%r%d, %%r%d, %%ustmp0;\n", pred, master);
+    }
   fprintf (file, "\t}\n");
   need_unisimt_decl = true;
 }
@@ -1220,6 +1220,12 @@ nvptx_declare_function_name (FILE *file, const char *name, const_tree decl)
 
   fprintf (file, "%s", s.str().c_str());
 
+  /* Usually 'crtl->is_leaf' is computed during register allocator
+     initialization (which is not done on NVPTX) or for pressure-sensitive
+     optimizations.  Initialize it here, except if already set.  */
+  if (!crtl->is_leaf)
+    crtl->is_leaf = leaf_function_p ();
+
   HOST_WIDE_INT sz = get_frame_size ();
   bool need_frameptr = sz || cfun->machine->has_chain;
   int alignment = crtl->stack_alignment_needed / BITS_PER_UNIT;
@@ -1236,9 +1242,28 @@ nvptx_declare_function_name (FILE *file, const char *name, const_tree decl)
 	init_frame (file, FRAME_POINTER_REGNUM, alignment,
 		    ROUND_UP (sz, GET_MODE_SIZE (DImode)));
     }
-  else if (need_frameptr || cfun->machine->has_varadic || cfun->calls_alloca)
+  else if (need_frameptr || cfun->machine->has_varadic || cfun->calls_alloca
+	   || (cfun->machine->has_simtreg && !crtl->is_leaf))
     init_softstack_frame (file, alignment, sz);
 
+  if (cfun->machine->has_simtreg)
+    {
+      unsigned HOST_WIDE_INT &simtsz = cfun->machine->simt_stack_size;
+      unsigned HOST_WIDE_INT &align = cfun->machine->simt_stack_align;
+      align = MAX (align, GET_MODE_SIZE (DImode));
+      if (!crtl->is_leaf || cfun->calls_alloca)
+	simtsz = HOST_WIDE_INT_M1U;
+      if (simtsz == HOST_WIDE_INT_M1U)
+	simtsz = nvptx_softstack_size;
+      if (cfun->machine->has_softstack)
+	simtsz += POINTER_SIZE / 8;
+      simtsz = ROUND_UP (simtsz, GET_MODE_SIZE (DImode));
+      if (align > GET_MODE_SIZE (DImode))
+	simtsz += align - GET_MODE_SIZE (DImode);
+      if (simtsz)
+	fprintf (file, "\t.local.align 8 .b8 %%simtstack_ar["
+		HOST_WIDE_INT_PRINT_DEC "];\n", simtsz);
+    }
   /* Declare the pseudos we have as ptx registers.  */
   int maxregs = max_reg_num ();
   for (int i = LAST_VIRTUAL_REGISTER + 1; i < maxregs; i++)
@@ -1263,10 +1288,112 @@ nvptx_declare_function_name (FILE *file, const char *name, const_tree decl)
   if (cfun->machine->axis_predicate[1])
     nvptx_init_axis_predicate (file,
 			       REGNO (cfun->machine->axis_predicate[1]), "x");
-  if (cfun->machine->unisimt_predicate)
+  if (cfun->machine->unisimt_predicate
+      || (cfun->machine->has_simtreg && !crtl->is_leaf))
     nvptx_init_unisimt_predicate (file);
 }
 
+/* Emit to FILE the code that switches uniform-simt state.  ENTERING is true
+   when entering a non-uniform execution region, false when leaving it.  */
+
+static void
+nvptx_output_unisimt_switch (FILE *file, bool entering)
+{
+  if (crtl->is_leaf && !cfun->machine->unisimt_predicate)
+    return;
+  fprintf (file, "\t{\n");
+  fprintf (file, "\t\t.reg.u32 %%ustmp2;\n");
+  fprintf (file, "\t\tmov.u32 %%ustmp2, %d;\n", entering ? -1 : 0);
+  if (!crtl->is_leaf)
+    {
+      int loc = REGNO (cfun->machine->unisimt_location);
+      fprintf (file, "\t\tst.shared.u32 [%%r%d], %%ustmp2;\n", loc);
+    }
+  if (cfun->machine->unisimt_predicate)
+    {
+      int master = REGNO (cfun->machine->unisimt_master);
+      int pred = REGNO (cfun->machine->unisimt_predicate);
+      fprintf (file, "\t\tmov.u32 %%ustmp2, %%laneid;\n");
+      fprintf (file, "\t\tmov.u32 %%r%d, %s;\n",
+	       master, entering ? "%ustmp2" : "0");
+      fprintf (file, "\t\tsetp.eq.u32 %%r%d, %%r%d, %%ustmp2;\n", pred, master);
+    }
+  fprintf (file, "\t}\n");
+}
+
+/* Output code for allocating per-lane storage and switching soft-stack pointer.
+   ENTERING indicates whether we are entering or leaving non-uniform execution.
+   PTR is the register pointing to allocated storage; it is assigned on entering
+   and used to restore state on leaving.  SIZE and ALIGN are used only on
+   entering (callers pass NULL_RTX for both when leaving).  */
+
+static void
+nvptx_output_softstack_switch (FILE *file, bool entering,
+			       rtx ptr, rtx size, rtx align)
+{
+  gcc_assert (REG_P (ptr) && !HARD_REGISTER_P (ptr));
+  if (crtl->is_leaf && !cfun->machine->simt_stack_size)
+    return;
+  int bits = POINTER_SIZE, regno = REGNO (ptr);
+  fprintf (file, "\t{\n");
+  if (entering)
+    {
+      fprintf (file, "\t\tcvta.local.u%d %%r%d, %%simtstack_ar + "
+	       HOST_WIDE_INT_PRINT_DEC ";\n", bits, regno,
+	       cfun->machine->simt_stack_size);
+      fprintf (file, "\t\tsub.u%d %%r%d, %%r%d, ", bits, regno, regno);
+      if (CONST_INT_P (size))
+	fprintf (file, HOST_WIDE_INT_PRINT_DEC,
+		 ROUND_UP (UINTVAL (size), GET_MODE_SIZE (DImode)));
+      else
+	output_reg (file, REGNO (size), VOIDmode);
+      fputs (";\n", file);
+      /* PTX 'and' takes only untyped .b operands, and UINTVAL yields a
+	 HOST_WIDE_INT-sized value that '%d' would misprint.  */
+      if (!CONST_INT_P (size) || UINTVAL (align) > GET_MODE_SIZE (DImode))
+	fprintf (file, "\t\tand.b%d %%r%d, %%r%d, -" HOST_WIDE_INT_PRINT_DEC
+		 ";\n", bits, regno, regno, UINTVAL (align));
+    }
+  if (cfun->machine->has_softstack)
+    {
+      const char *reg_stack = reg_names[STACK_POINTER_REGNUM];
+      if (entering)
+	{
+	  fprintf (file, "\t\tst.u%d [%%r%d + -%d], %s;\n",
+		   bits, regno, bits / 8, reg_stack);
+	  fprintf (file, "\t\tsub.u%d %s, %%r%d, %d;\n",
+		   bits, reg_stack, regno, bits / 8);
+	}
+      else
+	fprintf (file, "\t\tld.u%d %s, [%%r%d + -%d];\n",
+		 bits, reg_stack, regno, bits / 8);
+      nvptx_output_set_softstack (REGNO (stack_pointer_rtx));
+    }
+  fprintf (file, "\t}\n");
+}
+
+/* Output code to enter a non-uniform execution region; DEST is a register to
+   hold a per-lane allocation given by SIZE and ALIGN.  Always returns "".  */
+
+const char *
+nvptx_output_simt_enter (rtx dest, rtx size, rtx align)
+{
+  nvptx_output_unisimt_switch (asm_out_file, true);
+  nvptx_output_softstack_switch (asm_out_file, true, dest, size, align);
+  return "";
+}
+
+/* Output code to leave a non-uniform execution region.  SRC is the register
+   holding per-lane storage allocated by omp_simt_enter.  Always returns "".  */
+
+const char *
+nvptx_output_simt_exit (rtx src)
+{
+  nvptx_output_unisimt_switch (asm_out_file, false);
+  nvptx_output_softstack_switch (asm_out_file, false, src, NULL_RTX, NULL_RTX);
+  return "";
+}
+
 /* Output instruction that sets soft stack pointer in shared memory to the
    value in register given by SRC_REGNO.  */
 
diff --git a/gcc/config/nvptx/nvptx.h b/gcc/config/nvptx/nvptx.h
index 1702178..2d4fe7d 100644
--- a/gcc/config/nvptx/nvptx.h
+++ b/gcc/config/nvptx/nvptx.h
@@ -213,12 +213,18 @@ struct GTY(()) machine_function
   bool has_varadic;  /* Current function has a varadic call.  */
   bool has_chain; /* Current function has outgoing static chain.  */
   bool has_softstack; /* Current function has a soft stack frame.  */
+  bool has_simtreg; /* Current function has an OpenMP SIMD region.  */
   int num_args;	/* Number of args of current call.  */
   int return_mode; /* Return mode of current fn.
 		      (machine_mode not defined yet.) */
   rtx axis_predicate[2]; /* Neutering predicates.  */
   rtx unisimt_master; /* 'Master lane index' for -muniform-simt.  */
   rtx unisimt_predicate; /* Predicate for -muniform-simt.  */
+  rtx unisimt_location; /* Mask location for -muniform-simt.  */
+  /* The following two fields hold the maximum size and alignment,
+     respectively, required for per-lane storage in OpenMP SIMD regions.  */
+  unsigned HOST_WIDE_INT simt_stack_size;
+  unsigned HOST_WIDE_INT simt_stack_align;
 };
 #endif
 
diff --git a/gcc/config/nvptx/nvptx.md b/gcc/config/nvptx/nvptx.md
index 91d1129..2f6050c 100644
--- a/gcc/config/nvptx/nvptx.md
+++ b/gcc/config/nvptx/nvptx.md
@@ -63,6 +63,9 @@ (define_c_enum "unspecv" [
    UNSPECV_JOIN
 
    UNSPECV_NOUNROLL
+
+   UNSPECV_SIMT_ENTER
+   UNSPECV_SIMT_EXIT
 ])
 
 (define_attr "subregs_ok" "false,true"
@@ -1184,6 +1187,42 @@ (define_insn "nvptx_vote_ballot"
 
 ;; Patterns for OpenMP SIMD-via-SIMT lowering
 
+(define_insn "omp_simt_enter_insn"
+  [(set (match_operand 0 "nvptx_register_operand" "=R")
+	(unspec_volatile [(match_operand 1 "nvptx_nonmemory_operand" "Ri")
+			    (match_operand 2 "nvptx_nonmemory_operand" "Ri")]
+			   UNSPECV_SIMT_ENTER))]
+  ""
+{
+  return nvptx_output_simt_enter (operands[0], operands[1], operands[2]);
+})
+
+(define_expand "omp_simt_enter"
+  [(match_operand 0 "nvptx_register_operand" "=R")
+   (match_operand 1 "nvptx_nonmemory_operand" "Ri")
+   (match_operand 2 "const_int_operand" "n")]
+  ""
+{
+  if (!CONST_INT_P (operands[1]))
+    cfun->machine->simt_stack_size = HOST_WIDE_INT_M1U;
+  else
+    cfun->machine->simt_stack_size = MAX (UINTVAL (operands[1]),
+					  cfun->machine->simt_stack_size);
+  cfun->machine->simt_stack_align = MAX (UINTVAL (operands[2]),
+					 cfun->machine->simt_stack_align);
+  cfun->machine->has_simtreg = true;
+  emit_insn (gen_omp_simt_enter_insn (operands[0], operands[1], operands[2]));
+  DONE;
+})
+
+(define_insn "omp_simt_exit"
+  [(unspec_volatile [(match_operand 0 "nvptx_register_operand" "R")]
+		    UNSPECV_SIMT_EXIT)]
+  ""
+{
+  return nvptx_output_simt_exit (operands[0]);
+})
+
 ;; Implement IFN_GOMP_SIMT_LANE: set operand 0 to lane index
 (define_insn "omp_simt_lane"
   [(set (match_operand:SI 0 "nvptx_register_operand" "")
diff --git a/gcc/config/nvptx/nvptx.opt b/gcc/config/nvptx/nvptx.opt
index cb6194d..0c3794b 100644
--- a/gcc/config/nvptx/nvptx.opt
+++ b/gcc/config/nvptx/nvptx.opt
@@ -37,6 +37,10 @@ msoft-stack
 Target Report Mask(SOFT_STACK)
 Use custom stacks instead of local memory for automatic storage.
 
+msoft-stack-reserve-local
+Target Report Joined RejectNegative UInteger Var(nvptx_softstack_size) Init(128)
+Specify size of .local memory used for stack when the exact amount is not known.
+
 muniform-simt
 Target Report Mask(UNIFORM_SIMT)
 Generate code that can keep local state uniform across all lanes.
-- 
1.8.3.1



More information about the Gcc-patches mailing list