This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

[PATCH, 2/2][nvptx, PR83589] Workaround for branch-around-nothing JIT bug


Hi,

this patch adds a workaround for the nvptx target JIT bug PR83589 - "[nvptx] mode-transitions.c and private-variables.{c,f90} execution FAILs at GOMP_NVPTX_JIT=-O0".


When compiling a branch-around-nothing (where the branch is warp neutering, so it's a divergent branch):
...
  .reg .pred %r36;
  {
    .reg .u32 %x;
    mov.u32 %x,%tid.x;
    setp.ne.u32 %r36,%x,0;
  }

  @ %r36 bra $L5;
  $L5:
...

The JIT fails to generate a convergence point here:
...
         /*0128*/               @P0 BRA `(.L_1);
.L_1:
...

Consequently, we execute subsequent code in divergent mode, and when executing a shfl.idx a bit later we run into the undefined behaviour that shfl.idx has when executing in divergent mode.

The workaround detects branch-around-nothing, and inserts a ptx operation that does nothing (I'm calling it a fake nop, I haven't been able to come up with a better term yet):
...
  @ %r36 bra $L5;
    {
      .reg .u32 %nop_src;
      .reg .u32 %nop_dst;
      mov.u32 %nop_dst, %nop_src;
    }
  $L5:
...
which makes the test pass, because then we generate a convergence point here at .L1:
...
        /*0128*/                   SSY `(.L_1);
        /*0130*/               @P0 SYNC (*"TARGET= .L_1 "*);
        /*0138*/                   SYNC (*"TARGET= .L_1 "*);
.L_1:
...

The workaround is not minimal given that it inserts the fake nop in all branch-around-nothings it detects, not just the warp neutering ones, but I think this is more robust than trying to identify the warp neutering branches. Furthermore, I'm not going for optimality here anyway. The optimal way to fix this is making sure we don't generate branch-around-nothing, but that's for stage1.

Build and reg-tested on x86_64 with nvptx accelerator.

I'd like to commit in stage4, but I'd appreciate a review of the code. Does the patch look OK?

Thanks,
- Tom
[nvptx, PR83589] Workaround for branch-around-nothing JIT bug

2018-01-23  Tom de Vries  <tom@codesourcery.com>

	PR target/83589
	* config/nvptx/nvptx.c (WORKAROUND_PTXJIT_BUG_2): Define to 1.
	(nvptx_pc_set, nvptx_condjump_label): New function. Copy from jump.c.
	Add strict parameter.
	(prevent_branch_around_nothing): Insert dummy insn between branch to
	label and label with no ptx insn inbetween.
	* config/nvptx/nvptx.md (define_insn "fake_nop"): New insn.

	* testsuite/libgomp.oacc-c-c++-common/pr83589.c: New test.

---
 gcc/config/nvptx/nvptx.c                           | 92 ++++++++++++++++++++++
 gcc/config/nvptx/nvptx.md                          |  9 +++
 .../testsuite/libgomp.oacc-c-c++-common/pr83589.c  | 21 +++++
 3 files changed, 122 insertions(+)

diff --git a/gcc/config/nvptx/nvptx.c b/gcc/config/nvptx/nvptx.c
index 3516740..e55b426 100644
--- a/gcc/config/nvptx/nvptx.c
+++ b/gcc/config/nvptx/nvptx.c
@@ -78,6 +78,7 @@
 #include "target-def.h"
 
 #define WORKAROUND_PTXJIT_BUG 1
+#define WORKAROUND_PTXJIT_BUG_2 1
 
 /* The various PTX memory areas an object might reside in.  */
 enum nvptx_data_area
@@ -4363,6 +4364,93 @@ nvptx_neuter_pars (parallel *par, unsigned modes, unsigned outer)
     nvptx_neuter_pars (par->next, modes, outer);
 }
 
+#if WORKAROUND_PTXJIT_BUG_2
+/* Variant of pc_set that only requires JUMP_P (INSN) if STRICT.  This variant
+   is needed in the nvptx target because the branches generated for
+   parititioning are NONJUMP_INSN_P, not JUMP_P.  */
+
+static rtx
+nvptx_pc_set (const rtx_insn *insn, bool strict = true)
+{
+  rtx pat;
+  if ((strict && !JUMP_P (insn))
+      || (!strict && !INSN_P (insn)))
+    return NULL_RTX;
+  pat = PATTERN (insn);
+
+  /* The set is allowed to appear either as the insn pattern or
+     the first set in a PARALLEL.  */
+  if (GET_CODE (pat) == PARALLEL)
+    pat = XVECEXP (pat, 0, 0);
+  if (GET_CODE (pat) == SET && GET_CODE (SET_DEST (pat)) == PC)
+    return pat;
+
+  return NULL_RTX;
+}
+
+/* Variant of condjump_label that only requires JUMP_P (INSN) if STRICT.  */
+
+static rtx
+nvptx_condjump_label (const rtx_insn *insn, bool strict = true)
+{
+  rtx x = nvptx_pc_set (insn, strict);
+
+  if (!x)
+    return NULL_RTX;
+  x = SET_SRC (x);
+  if (GET_CODE (x) == LABEL_REF)
+    return x;
+  if (GET_CODE (x) != IF_THEN_ELSE)
+    return NULL_RTX;
+  if (XEXP (x, 2) == pc_rtx && GET_CODE (XEXP (x, 1)) == LABEL_REF)
+    return XEXP (x, 1);
+  if (XEXP (x, 1) == pc_rtx && GET_CODE (XEXP (x, 2)) == LABEL_REF)
+    return XEXP (x, 2);
+  return NULL_RTX;
+}
+
+/* Insert a dummy ptx insn when encountering a branch to a label with no ptx
+   insn inbetween the branch and the label.  This works around a JIT bug
+   observed at driver version 384.111, at -O0 for sm_50.  */
+
+static void
+prevent_branch_around_nothing (void)
+{
+  rtx_insn *seen_label = 0;
+    for (rtx_insn *insn = get_insns (); insn; insn = NEXT_INSN (insn))
+      {
+	if (seen_label == 0)
+	  {
+	    if (INSN_P (insn) && condjump_p (insn))
+	      seen_label = label_ref_label (nvptx_condjump_label (insn, false));
+
+	    continue;
+	  }
+
+	if (NOTE_P (insn))
+	  continue;
+
+	if (INSN_P (insn))
+	  switch (recog_memoized (insn))
+	    {
+	    case CODE_FOR_nvptx_fork:
+	    case CODE_FOR_nvptx_forked:
+	    case CODE_FOR_nvptx_joining:
+	    case CODE_FOR_nvptx_join:
+	      continue;
+	    default:
+	      seen_label = 0;
+	      continue;
+	    }
+
+	if (LABEL_P (insn) && insn == seen_label)
+	  emit_insn_before (gen_fake_nop (), insn);
+
+	seen_label = 0;
+      }
+  }
+#endif
+
 /* PTX-specific reorganization
    - Split blocks at fork and join instructions
    - Compute live registers
@@ -4442,6 +4530,10 @@ nvptx_reorg (void)
   if (TARGET_UNIFORM_SIMT)
     nvptx_reorg_uniform_simt ();
 
+#if WORKAROUND_PTXJIT_BUG_2
+  prevent_branch_around_nothing ();
+#endif
+
   regstat_free_n_sets_and_refs ();
 
   df_finish_pass (true);
diff --git a/gcc/config/nvptx/nvptx.md b/gcc/config/nvptx/nvptx.md
index 135479b..4f4453d 100644
--- a/gcc/config/nvptx/nvptx.md
+++ b/gcc/config/nvptx/nvptx.md
@@ -999,6 +999,15 @@
   ""
   "exit;")
 
+(define_insn "fake_nop"
+  [(const_int 2)]
+  ""
+  "{
+     .reg .u32 %%nop_src;
+     .reg .u32 %%nop_dst;
+     mov.u32 %%nop_dst, %%nop_src;
+   }")
+
 (define_insn "return"
   [(return)]
   ""
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/pr83589.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/pr83589.c
new file mode 100644
index 0000000..a6ed5cf
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/pr83589.c
@@ -0,0 +1,21 @@
+/* { dg-do run } */
+/* { dg-set-target-env-var GOMP_NVPTX_JIT "-O0" } */
+
+#define n 32
+
+int
+main (void)
+{
+  int arr_a[n];
+
+#pragma acc parallel copyout(arr_a) num_gangs(1) num_workers(1) vector_length(32)
+  {
+    #pragma acc loop vector
+    for (int m = 0; m < 32; m++)
+      ;
+
+    #pragma acc loop vector
+    for (int m = 0; m < 32; m++)
+      arr_a[m] = 0;
+  }
+}

Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]