[gomp] reinstate open-coded static loop scheduling

Richard Henderson <rth@redhat.com>
Mon Sep 26 04:13:00 GMT 2005


Reimplemented the open-coded static schedules in terms of the libgomp
algorithms, which appear to work correctly on all the edge conditions.
And unlike the previous code, this one is type correct.  ;-)


r~


        * gimplify.c (gimplify_omp_for_generic): Tidy commentary.
        (gimplify_omp_for_static_nochunk, gimplify_omp_for_static_chunk): New.
        (gimplify_omp_for): Use them.

Index: gimplify.c
===================================================================
RCS file: /cvs/gcc/gcc/gcc/gimplify.c,v
retrieving revision 2.135.4.18
diff -u -p -d -r2.135.4.18 gimplify.c
--- gimplify.c	25 Sep 2005 22:39:25 -0000	2.135.4.18
+++ gimplify.c	26 Sep 2005 04:07:44 -0000
@@ -3912,14 +3912,11 @@ gimplify_to_stmt_list (tree *stmt_p)
 }
 
 /* A subroutine of gimplify_omp_for.  Generate code for a parallel
-   loop with any schedule.
-
-   The general form is:
+   loop with any schedule.  Given parameters:
 
-   OMP_FOR <clause(s), V = N1, V {<, >, >=, <=} N2, V {+=, -=} STEP, BODY>
+	for (V = N1; V cond N2; V += STEP) BODY;
 
-   For loops of the form V = N1; V < N2; V += STEP, it generates the
-   following pseudo-code (gimplified accordingly):
+   where COND is "<" or ">", we generate pseudocode
 
 	more = GOMP_loop_foo_start (N1, N2, STEP, CHUNK, &istart0, &iend0);
 	if (more) goto L0; else goto L3;
@@ -3929,18 +3926,12 @@ gimplify_to_stmt_list (tree *stmt_p)
     L1:
 	BODY;
 	V += STEP;
-	if (V < iend) goto L1; else goto L2;
+	if (V cond iend) goto L1; else goto L2;
     L2:
 	more = GOMP_loop_foo_next (&istart0, &iend0);
 	if (more) goto L0; else goto L3;
     L3:
-
-   For '>' the transformation is the same, except for (V > iend).
-   For '<=' and '>=', we first adjust N2 by 1 to transform to a
-   strict inequality.  */
-
-/* ???  For static schedules with no chunk size and no ORDERED requirement,
-   we can omit the call to GOMP_loop_static_next.  */
+*/
 
 static void
 gimplify_omp_for_generic (tree v, tree n1, tree n2, tree step,
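
(For reference, and not part of the patch: the pseudocode above,
hand-expanded into plain C against the libgomp interface.  The
GOMP_loop_* prototypes here are a sketch from memory and should be
treated as an assumption; "dynamic" stands in for whichever schedule
"foo" names.)

/* Assumed libgomp entry points.  */
extern _Bool GOMP_loop_dynamic_start (long start, long end, long incr,
                                      long chunk, long *istart, long *iend);
extern _Bool GOMP_loop_dynamic_next (long *istart, long *iend);

static void
generic_schedule_sketch (long n1, long n2, long step, long chunk)
{
  long istart0, iend0;

  /* more = GOMP_loop_foo_start (...); if (more) goto L0; else goto L3;  */
  if (GOMP_loop_dynamic_start (n1, n2, step, chunk, &istart0, &iend0))
    do
      {
        /* L0: V = istart0; iend = iend0;  */
        long v = istart0, iend = iend0;
        do
          {
            /* L1: BODY goes here, then V += STEP.  */
            v += step;
          }
        while (v < iend);       /* if (V cond iend) goto L1; else goto L2;  */
      }
    while (GOMP_loop_dynamic_next (&istart0, &iend0));  /* L2 */
  /* L3:  */
}
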
@@ -4035,6 +4026,281 @@ gimplify_omp_for_generic (tree v, tree n
   gimplify_and_add (t, pre_p);
 }
 
+/* A subroutine of gimplify_omp_for.  Generate code for a parallel
+   loop with static schedule and no specified chunk size.  Given parameters:
+
+	for (V = N1; V cond N2; V += STEP) BODY;
+
+   where COND is "<" or ">", we generate pseudocode
+
+	if (cond is <)
+	  adj = STEP - 1;
+	else
+	  adj = STEP + 1;
+	n = (adj + N2 - N1) / STEP;
+	q = n / nthreads;
+	q += (q * nthreads != n);
+	s0 = q * threadid;
+	e0 = min(s0 + q, n);
+	if (s0 >= e0) goto L2; else goto L0;
+    L0:
+	V = s0 * STEP + N1;
+	e = e0 * STEP + N1;
+    L1:
+	BODY;
+	V += STEP;
+	if (V cond e) goto L1; else goto L2;
+    L2:
+*/
+
+static void
+gimplify_omp_for_static_nochunk (tree v, tree n1, tree n2, tree step,
+				 tree body, enum tree_code cond_code,
+				 tree *pre_p)
+{
+  tree l0, l1, l2, n, q, s0, e0, e, t, nthreads, threadid;
+  tree type, utype;
+
+  l0 = create_artificial_label ();
+  l1 = create_artificial_label ();
+  l2 = create_artificial_label ();
+  
+  type = TREE_TYPE (v);
+  utype = lang_hooks.types.unsigned_type (type);
+
+  t = built_in_decls[BUILT_IN_OMP_GET_NUM_THREADS];
+  t = build_function_call_expr (t, NULL);
+  t = fold_convert (utype, t);
+  nthreads = get_formal_tmp_var (t, pre_p);
+  
+  t = built_in_decls[BUILT_IN_OMP_GET_THREAD_NUM];
+  t = build_function_call_expr (t, NULL);
+  t = fold_convert (utype, t);
+  threadid = get_formal_tmp_var (t, pre_p);
+
+  n1 = fold_convert (type, n1);
+  if (!is_gimple_val (n1))
+    n1 = get_formal_tmp_var (n1, pre_p);
+
+  n2 = fold_convert (type, n2);
+  if (!is_gimple_val (n2))
+    n2 = get_formal_tmp_var (n2, pre_p);
+
+  step = fold_convert (type, step);
+  if (!is_gimple_val (step))
+    step = get_formal_tmp_var (step, pre_p);
+
+  t = build_int_cst (type, (cond_code == LT_EXPR ? -1 : 1));
+  t = fold_build2 (PLUS_EXPR, type, step, t);
+  t = fold_build2 (PLUS_EXPR, type, t, n2);
+  t = fold_build2 (MINUS_EXPR, type, t, n1);
+  t = fold_build2 (TRUNC_DIV_EXPR, type, t, step);
+  t = fold_convert (utype, t);
+  if (is_gimple_val (t))
+    n = t;
+  else
+    n = get_formal_tmp_var (t, pre_p);
+
+  t = build2 (TRUNC_DIV_EXPR, utype, n, nthreads);
+  q = get_formal_tmp_var (t, pre_p);
+
+  t = build2 (MULT_EXPR, utype, q, nthreads);
+  t = build2 (NE_EXPR, utype, t, n);
+  t = build2 (PLUS_EXPR, utype, q, t);
+  q = get_formal_tmp_var (t, pre_p);
+
+  t = build2 (MULT_EXPR, utype, q, threadid);
+  s0 = get_formal_tmp_var (t, pre_p);
+
+  t = build2 (PLUS_EXPR, utype, s0, q);
+  t = build2 (MIN_EXPR, utype, t, n);
+  e0 = get_formal_tmp_var (t, pre_p);
+
+  t = build2 (GE_EXPR, boolean_type_node, s0, e0);
+  t = build3 (COND_EXPR, void_type_node, t,
+	      build_and_jump (&l2), build_and_jump (&l0));
+  gimplify_and_add (t, pre_p);
+
+  t = build1 (LABEL_EXPR, void_type_node, l0);
+  gimplify_and_add (t, pre_p);
+
+  t = fold_convert (type, s0);
+  t = build2 (MULT_EXPR, type, t, step);
+  t = build2 (PLUS_EXPR, type, t, n1);
+  t = build2 (MODIFY_EXPR, void_type_node, v, t);
+  gimplify_and_add (t, pre_p);
+
+  t = fold_convert (type, e0);
+  t = build2 (MULT_EXPR, type, t, step);
+  t = build2 (PLUS_EXPR, type, t, n1);
+  e = get_formal_tmp_var (t, pre_p);
+
+  t = build1 (LABEL_EXPR, void_type_node, l1);
+  gimplify_and_add (t, pre_p);
+
+  gimplify_and_add (body, pre_p);
+
+  t = build2 (PLUS_EXPR, type, v, step);
+  t = build2 (MODIFY_EXPR, void_type_node, v, t);
+  gimplify_and_add (t, pre_p);
+
+  t = build2 (cond_code, boolean_type_node, v, e);
+  t = build3 (COND_EXPR, void_type_node, t,
+	      build_and_jump (&l1), build_and_jump (&l2));
+  gimplify_and_add (t, pre_p);
+
+  t = build1 (LABEL_EXPR, void_type_node, l2);
+  gimplify_and_add (t, pre_p);
+}
+
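
(Again for reference only, not part of the patch: the no-chunk
partition above as stand-alone C, for the "<" case.  Each thread gets
one contiguous block of ceil(n / nthreads) iterations; a thread whose
block starts past the end does nothing.  All names are illustrative.)

static void
static_nochunk_sketch (long n1, long n2, long step,
                       unsigned long nthreads, unsigned long threadid)
{
  /* Iteration count, divided in the signed type and then converted to
     the unsigned type, as in the patch; for "<" the adjustment is
     STEP - 1 so the division rounds up.  */
  unsigned long n = (step - 1 + n2 - n1) / step;
  unsigned long q = n / nthreads;
  unsigned long s0, e0;

  q += (q * nthreads != n);      /* round the block size up */
  s0 = q * threadid;
  e0 = s0 + q < n ? s0 + q : n;  /* e0 = min (s0 + q, n) */

  if (s0 < e0)                   /* if (s0 >= e0) goto L2;  */
    {
      long v = (long) s0 * step + n1;
      long e = (long) e0 * step + n1;
      do
        {
          /* L1: BODY goes here, then V += STEP.  */
          v += step;
        }
      while (v < e);
    }
  /* L2:  */
}
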
+/* A subroutine of gimplify_omp_for.  Generate code for a parallel
+   loop with static schedule and a specified chunk size.  Given parameters:
+
+	for (V = N1; V cond N2; V += STEP) BODY;
+
+   where COND is "<" or ">", we generate pseudocode
+
+	if (cond is <)
+	  adj = STEP - 1;
+	else
+	  adj = STEP + 1;
+	n = (adj + N2 - N1) / STEP;
+	trip = 0;
+    L0:
+	s0 = (trip * nthreads + threadid) * CHUNK;
+	e0 = min(s0 + CHUNK, n);
+	if (s0 < n) goto L1; else goto L4;
+    L1:
+	V = s0 * STEP + N1;
+	e = e0 * STEP + N1;
+    L2:
+	BODY;
+	V += STEP;
+	if (V cond e) goto L2; else goto L3;
+    L3:
+	trip += 1;
+	goto L0;
+    L4:
+*/
+
+static void
+gimplify_omp_for_static_chunk (tree v, tree n1, tree n2, tree step,
+			       tree body, enum tree_code cond_code,
+			       tree chunk, tree *pre_p)
+{
+  tree l0, l1, l2, l3, l4, n, s0, e0, e, t;
+  tree trip, nthreads, threadid;
+  tree type, utype;
+
+  l0 = create_artificial_label ();
+  l1 = create_artificial_label ();
+  l2 = create_artificial_label ();
+  l3 = create_artificial_label ();
+  l4 = create_artificial_label ();
+  
+  type = TREE_TYPE (v);
+  utype = lang_hooks.types.unsigned_type (type);
+
+  t = built_in_decls[BUILT_IN_OMP_GET_NUM_THREADS];
+  t = build_function_call_expr (t, NULL);
+  t = fold_convert (utype, t);
+  nthreads = get_formal_tmp_var (t, pre_p);
+  
+  t = built_in_decls[BUILT_IN_OMP_GET_THREAD_NUM];
+  t = build_function_call_expr (t, NULL);
+  t = fold_convert (utype, t);
+  threadid = get_formal_tmp_var (t, pre_p);
+
+  n1 = fold_convert (type, n1);
+  if (!is_gimple_val (n1))
+    n1 = get_formal_tmp_var (n1, pre_p);
+
+  n2 = fold_convert (type, n2);
+  if (!is_gimple_val (n2))
+    n2 = get_formal_tmp_var (n2, pre_p);
+
+  step = fold_convert (type, step);
+  if (!is_gimple_val (step))
+    step = get_formal_tmp_var (step, pre_p);
+
+  chunk = fold_convert (utype, chunk);
+  if (!is_gimple_val (chunk))
+    chunk = get_formal_tmp_var (chunk, pre_p);
+
+  t = build_int_cst (type, (cond_code == LT_EXPR ? -1 : 1));
+  t = fold_build2 (PLUS_EXPR, type, step, t);
+  t = fold_build2 (PLUS_EXPR, type, t, n2);
+  t = fold_build2 (MINUS_EXPR, type, t, n1);
+  t = fold_build2 (TRUNC_DIV_EXPR, type, t, step);
+  t = fold_convert (utype, t);
+  if (is_gimple_val (t))
+    n = t;
+  else
+    n = get_formal_tmp_var (t, pre_p);
+
+  t = build_int_cst (utype, 0);
+  trip = get_initialized_tmp_var (t, pre_p, NULL);
+
+  t = build1 (LABEL_EXPR, void_type_node, l0);
+  gimplify_and_add (t, pre_p);
+
+  t = build2 (MULT_EXPR, utype, trip, nthreads);
+  t = build2 (PLUS_EXPR, utype, t, threadid);
+  t = build2 (MULT_EXPR, utype, t, chunk);
+  s0 = get_formal_tmp_var (t, pre_p);
+
+  t = build2 (PLUS_EXPR, utype, s0, chunk);
+  t = build2 (MIN_EXPR, utype, t, n);
+  e0 = get_formal_tmp_var (t, pre_p);
+
+  t = build2 (LT_EXPR, boolean_type_node, s0, n);
+  t = build3 (COND_EXPR, void_type_node, t,
+	      build_and_jump (&l1), build_and_jump (&l4));
+  gimplify_and_add (t, pre_p);
+
+  t = build1 (LABEL_EXPR, void_type_node, l1);
+  gimplify_and_add (t, pre_p);
+
+  t = fold_convert (type, s0);
+  t = build2 (MULT_EXPR, type, t, step);
+  t = build2 (PLUS_EXPR, type, t, n1);
+  t = build2 (MODIFY_EXPR, void_type_node, v, t);
+  gimplify_and_add (t, pre_p);
+
+  t = fold_convert (type, e0);
+  t = build2 (MULT_EXPR, type, t, step);
+  t = build2 (PLUS_EXPR, type, t, n1);
+  e = get_formal_tmp_var (t, pre_p);
+
+  t = build1 (LABEL_EXPR, void_type_node, l2);
+  gimplify_and_add (t, pre_p);
+
+  gimplify_and_add (body, pre_p);
+
+  t = build2 (PLUS_EXPR, type, v, step);
+  t = build2 (MODIFY_EXPR, void_type_node, v, t);
+  gimplify_and_add (t, pre_p);
+
+  t = build2 (cond_code, boolean_type_node, v, e);
+  t = build3 (COND_EXPR, void_type_node, t,
+	      build_and_jump (&l2), build_and_jump (&l3));
+  gimplify_and_add (t, pre_p);
+
+  t = build1 (LABEL_EXPR, void_type_node, l3);
+  gimplify_and_add (t, pre_p);
+
+  t = build_int_cst (utype, 1);
+  t = build2 (PLUS_EXPR, utype, trip, t);
+  t = build2 (MODIFY_EXPR, void_type_node, trip, t);
+  gimplify_and_add (t, pre_p);
+
+  t = build1 (GOTO_EXPR, void_type_node, l0);
+  gimplify_and_add (t, pre_p);
+
+  t = build1 (LABEL_EXPR, void_type_node, l4);
+  gimplify_and_add (t, pre_p);
+}
+
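
(And the chunked variant above as stand-alone C, again for the "<"
case and not part of the patch.  Chunks are dealt round-robin: on trip
T, thread I owns iterations [(T*nthreads + I)*CHUNK, ... + CHUNK).
Names are illustrative.)

static void
static_chunk_sketch (long n1, long n2, long step,
                     unsigned long nthreads, unsigned long threadid,
                     unsigned long chunk)
{
  unsigned long n = (step - 1 + n2 - n1) / step;
  unsigned long trip;

  for (trip = 0; ; trip++)       /* L0 ... trip += 1; goto L0;  */
    {
      unsigned long s0 = (trip * nthreads + threadid) * chunk;
      unsigned long e0 = s0 + chunk < n ? s0 + chunk : n;

      if (s0 >= n)               /* if (s0 < n) goto L1; else goto L4;  */
        break;

      {
        long v = (long) s0 * step + n1;
        long e = (long) e0 * step + n1;
        do
          {
            /* L2: BODY goes here, then V += STEP.  */
            v += step;
          }
        while (v < e);
      }
    }
  /* L4:  */
}
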
 /* Gimplify a OMP_FOR statement.  */
 
 static enum gimplify_status
@@ -4045,7 +4311,6 @@ gimplify_omp_for (tree *expr_p, tree *pr
   enum tree_code cond_code;
   bool have_nowait, have_ordered;
   enum omp_clause_schedule_kind sched_kind;
-  int fn_index;
 
   for_stmt = *expr_p;
 
@@ -4135,17 +4400,33 @@ gimplify_omp_for (tree *expr_p, tree *pr
 	break;
       }
 
-  if (sched_kind == OMP_CLAUSE_SCHEDULE_RUNTIME)
-    gcc_assert (chunk_size == NULL);
-  else if (chunk_size == NULL)
-    chunk_size = integer_zero_node;
+  if (sched_kind == OMP_CLAUSE_SCHEDULE_STATIC && !have_ordered)
+    {
+      if (chunk_size == NULL)
+	gimplify_omp_for_static_nochunk (v, n1, n2, step,
+					 OMP_FOR_BODY (for_stmt),
+					 cond_code, pre_p);
+      else
+	gimplify_omp_for_static_chunk (v, n1, n2, step,
+				       OMP_FOR_BODY (for_stmt), cond_code,
+				       chunk_size, pre_p);
+    }
+  else
+    {
+      int fn_index;
 
-  fn_index = sched_kind + have_ordered * 4;
+      if (sched_kind == OMP_CLAUSE_SCHEDULE_RUNTIME)
+	gcc_assert (chunk_size == NULL);
+      else if (chunk_size == NULL)
+	chunk_size = integer_zero_node;
 
-  gimplify_omp_for_generic (v, n1, n2, step, chunk_size,
-			    OMP_FOR_BODY (for_stmt), cond_code, pre_p,
-			    BUILT_IN_GOMP_LOOP_STATIC_START + fn_index,
-			    BUILT_IN_GOMP_LOOP_STATIC_NEXT + fn_index);
+      fn_index = sched_kind + have_ordered * 4;
+
+      gimplify_omp_for_generic (v, n1, n2, step, chunk_size,
+				OMP_FOR_BODY (for_stmt), cond_code, pre_p,
+				BUILT_IN_GOMP_LOOP_STATIC_START + fn_index,
+				BUILT_IN_GOMP_LOOP_STATIC_NEXT + fn_index);
+    }
 
   if (have_nowait)
     {
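
(To make the new dispatch concrete, and again not part of the patch:
which expansion each schedule now gets.)

/* Loops and the path the new dispatch sends them down.  */
void
dispatch_examples (int *a, int m)
{
  int i;

  /* static, no chunk -> gimplify_omp_for_static_nochunk.  */
#pragma omp for schedule(static)
  for (i = 0; i < m; i++)
    a[i] = i;

  /* static with a chunk -> gimplify_omp_for_static_chunk.  */
#pragma omp for schedule(static, 4)
  for (i = 0; i < m; i++)
    a[i] = i;

  /* everything else, including ordered static -> gimplify_omp_for_generic
     and the GOMP_loop_* library calls.  */
#pragma omp for schedule(static) ordered
  for (i = 0; i < m; i++)
    a[i] = i;
}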


