This is the mail archive of the gcc-bugs@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

[Bug target/84952] New: [nvptx] bar.sync generated in divergent code


https://gcc.gnu.org/bugzilla/show_bug.cgi?id=84952

            Bug ID: 84952
           Summary: [nvptx] bar.sync generated in divergent code
           Product: gcc
           Version: 8.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: target
          Assignee: unassigned at gcc dot gnu.org
          Reporter: vries at gcc dot gnu.org
  Target Milestone: ---

[ As discussed here: https://gcc.gnu.org/ml/gcc-patches/2018-03/msg00408.html ]

Consider testcase workers.c:
...
int
main (void)
{
  int a[10];
#pragma acc parallel loop worker
  for (int i = 0; i < 10; i++)
    a[i] = i;

  return 0;
}
...

At -O2, we generate (edited for readability):
...
// BEGIN PREAMBLE
.version 3.1
.target sm_30
.address_size 64
// END PREAMBLE

// BEGIN FUNCTION DECL: main$_omp_fn$0
.entry main$_omp_fn$0 (.param .u64 %in_ar0);

//:FUNC_MAP "main$_omp_fn$0", 0x1, 0x20, 0x20

// BEGIN VAR DEF: __worker_bcast
.shared .align 8 .u8 __worker_bcast[8];

// BEGIN FUNCTION DEF: main$_omp_fn$0
.entry main$_omp_fn$0 (.param .u64 %in_ar0)
{
  .reg .u64 %ar0;
  ld.param.u64 %ar0,[%in_ar0];
  .reg .u32 %r24;
  .reg .u64 %r25;
  .reg .pred %r26;
  .reg .u64 %r27;
  .reg .u64 %r28;
  .reg .u64 %r29;
  .reg .u64 %r30;
  .reg .u64 %r31;
  .reg .u64 %r32;
  .reg .pred %r33;
  .reg .pred %r34;

  {
    .reg .u32 %y;
    mov.u32 %y,%tid.y;
    setp.ne.u32 %r34,%y,0;
  }

  {
    .reg .u32 %x;
    mov.u32 %x,%tid.x;
    setp.ne.u32 %r33,%x,0;
  }

  @ %r34 bra.uni $L6;
  @ %r33 bra $L7;
  mov.u64 %r25,%ar0;
  // fork 2;
  cvta.shared.u64 %r32,__worker_bcast;
  st.u64 [%r32],%r25;
 $L7:
 $L6:

  @ %r33 bra $L5;
  // forked 2;
  bar.sync 0;
  cvta.shared.u64 %r31,__worker_bcast;
  ld.u64 %r25,[%r31];
  mov.u32 %r24,%tid.y;
  setp.le.s32 %r26,%r24,9;
  @ %r26 bra $L2;
  bra $L3;
 $L2:
  ld.u64 %r27,[%r25];
  cvt.s64.s32 %r28,%r24;
  shl.b64 %r29,%r28,2;
  add.u64 %r30,%r27,%r29;
  st.u32 [%r30],%r24;
 $L3:
  bar.sync 1;
  // joining 2;
 $L5:

  @ %r34 bra.uni $L8;
  @ %r33 bra $L9;
  // join 2;
 $L9:
 $L8:

  ret;
}
...

[ Note the neutering of "join 2", that's PR84025 - [nvptx] Don't generate
branch-around-nothing ].

The problem is the positioning of bar.sync, inside the vector-neutering branch.

The documentation for bar.sync says:
...
Barriers are executed on a per-warp basis as if all the threads in a warp are
active. Thus, if any thread in a warp executes a bar instruction, it is as if
all the threads in the warp have executed the bar instruction. All threads in
the warp are stalled until the barrier completes, and the arrival count for the
barrier is incremented by the warp size (not the number of active threads in
the warp). In conditionally executed code, a bar instruction should only be
used if it is known that all threads evaluate the condition identically (the
warp does not diverge).
...

The documentation is somewhat contradictory, in that it first explains that
the barrier is executed on a per-warp basis (implying that only one thread
executing it should be fine), but then goes on to state that it should not be
executed in divergent mode (implying that all threads should execute it).

Either way, the safest form of usage is: don't execute in divergent mode.

As is evident from the example above, we do generate bar.sync in divergent
mode, and that should be fixed.

Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]