This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

[GRAPHITE, PATCH] Loop unroll and jam optimization


Hello, 

This is the patch for unroll and jam optimizations. It is based on the
code written by Tobias from graphite-optimize-isl.c (the code was
unreachable till now) extended and enabled it via a new option -floop-unroll-jam.
The patch takes advantage of the new isl based code generator introduced recently 
in GCC (in fact of the possible options for building the AST).

The code generated for this optimization in the case of non-constant loop bounds
initially looks as below. This is not very useful because the standard GCC 
unrolling don't succeed to unroll the most inner loop.

ISL AST generated by ISL: 
for (int c0 = 0; c0 < HEIGHT; c0 += 4)
  for (int c1 = 0; c1 < LENGTH - 3; c1 += 1)
    for (int c2 = c0; c2 <= min(HEIGHT - 1, c0 + 3); c2 += 1)
      S_4(c2, c1);

Now, the "separating class" option (set for unroll and jam) produces this nice loop 
structure: 
ISL AST generated by ISL: 
for (int c0 = 0; c0 < HEIGHT; c0 += 4)
  for (int c1 = 0; c1 < LENGTH - 3; c1 += 1)
    if (HEIGHT >= c0 + 4) {
      for (int c2 = c0; c2 <= c0 + 3; c2 += 1)
        S_4(c2, c1);
    } else
      for (int c2 = c0; c2 < HEIGHT; c2 += 1)
        S_4(c2, c1);

The "unroll" option (set for unroll and jam) produces: 
ISL AST generated by ISL: 
for (int c0 = 0; c0 < HEIGHT; c0 += 4)
  for (int c1 = 0; c1 < LENGTH - 3; c1 += 1)
    if (HEIGHT >= c0 + 4) {
      S_4(c0, c1);
      S_4(c0 + 1, c1);
      S_4(c0 + 2, c1);
      S_4(c0 + 3, c1);
    } else {
      S_4(c0, c1);
      if (HEIGHT >= c0 + 2) {
        S_4(c0 + 1, c1);
        if (4 * floord(HEIGHT - 3, 4) + 3 == HEIGHT && c0 + 3 == HEIGHT)
          S_4(HEIGHT - 1, c1);
      }
    }

The "separate" option (set by default for all dimensions for the new isl based code generator) 
don't succeed to remove the ifs from the loops and generate two loop structures (this would
have been highly desirable).   

As the stage 1 is going to close soon, quick feedback to this patch is greatly appreciated.
Many thanks, Mircea Namolaru

Attachment: Change.txt
Description: Text document

Index: gcc/graphite.c
===================================================================
--- gcc/graphite.c	(revision 217013)
+++ gcc/graphite.c	(working copy)
@@ -383,7 +383,8 @@
       || flag_loop_strip_mine
       || flag_graphite_identity
       || flag_loop_parallelize_all
-      || flag_loop_optimize_isl)
+      || flag_loop_optimize_isl
+      || flag_loop_unroll_jam)
     flag_graphite = 1;
 
   return flag_graphite != 0;
Index: gcc/common.opt
===================================================================
--- gcc/common.opt	(revision 217013)
+++ gcc/common.opt	(working copy)
@@ -1328,6 +1328,10 @@
 Common Report Var(flag_loop_block) Optimization
 Enable Loop Blocking transformation
 
+floop-unroll-jam
+Common Report Var(flag_loop_unroll_jam) Optimization
+Enable Loop Unroll Jam transformation
+ 
 fgnu-tm
 Common Report Var(flag_tm)
 Enable support for GNU transactional memory
Index: gcc/toplev.c
===================================================================
--- gcc/toplev.c	(revision 217013)
+++ gcc/toplev.c	(working copy)
@@ -1302,11 +1302,12 @@
       || flag_loop_block
       || flag_loop_interchange
       || flag_loop_strip_mine
-      || flag_loop_parallelize_all)
+      || flag_loop_parallelize_all
+      || flag_loop_unroll_jam)
     sorry ("Graphite loop optimizations cannot be used (ISL is not available)" 
 	   "(-fgraphite, -fgraphite-identity, -floop-block, "
 	   "-floop-interchange, -floop-strip-mine, -floop-parallelize-all, "
-	   "and -ftree-loop-linear)");
+	   "-floop-unroll-jam, and -ftree-loop-linear)");
 #endif
 
   /* One region RA really helps to decrease the code size.  */
Index: gcc/params.def
===================================================================
--- gcc/params.def	(revision 217013)
+++ gcc/params.def	(working copy)
@@ -847,6 +847,21 @@
 	  "size of tiles for loop blocking",
 	  51, 0, 0)
 
+/* Size of unrolling factor for unroll-and-jam.  */
+ 
+DEFPARAM (PARAM_LOOP_UNROLL_JAM_SIZE,
+ 	  "loop-unroll-jam-size",
+ 	  "size of unrolling factor for unroll-and-jam",
+ 	  4, 0, 0)
+
+/* Size of the band formed by the strip mined dimension and the most inner one for unroll-and-jam.  */
+ 
+DEFPARAM (PARAM_LOOP_UNROLL_JAM_DEPTH,
+ 	  "loop-unroll-jam-depth",
+ 	  "depth of unrolled loop for unroll-and-jam",
+ 	  2, 0, 0)
+
+
 /* Maximal number of parameters that we allow in a SCoP.  */
 
 DEFPARAM (PARAM_GRAPHITE_MAX_NB_SCOP_PARAMS,
Index: gcc/graphite-isl-ast-to-gimple.c
===================================================================
--- gcc/graphite-isl-ast-to-gimple.c	(revision 217013)
+++ gcc/graphite-isl-ast-to-gimple.c	(working copy)
@@ -831,6 +831,159 @@
   return schedule;
 }
 
+/* Auxiliary data structure to separate the maps. */
+
+struct sep_map 
+{
+  isl_map *map_arr[2];
+  int n;
+}; 
+
+/* Auxiliary function used to separate the maps. */
+
+static int 
+separate_map (isl_basic_map *bmap, void *user)
+{
+  struct sep_map *smap = (struct sep_map *) user;
+
+  smap->map_arr[smap->n] = isl_map_from_basic_map(bmap);
+  smap->n += 1;
+
+  return 0;
+}
+
+/* Set the unroll AST built option.  */
+
+static __isl_give isl_union_map *
+generate_isl_options_2 (scop_p scop, __isl_take isl_union_set *domain, 
+			int dim, int cl)
+{
+  isl_map  *map;
+  isl_space *space, *space_sep;
+  isl_ctx *ctx;
+  isl_union_map *mapu;
+  int nsched = get_max_schedule_dimensions (scop);
+ 
+  ctx = scop->ctx;
+  space_sep = isl_space_alloc(ctx, 0, 1, 1);
+  space_sep = isl_space_wrap(space_sep);
+  space_sep = isl_space_set_tuple_name(space_sep, isl_dim_set,
+				       "separation_class");
+  space = isl_set_get_space (scop->context);
+  space_sep = isl_space_align_params(space_sep, isl_space_copy(space));
+  space = isl_space_map_from_domain_and_range(space, space_sep);
+  space = isl_space_add_dims (space,isl_dim_in, nsched);
+  map = isl_map_universe (space);
+  isl_map_fix_si (map,isl_dim_out,0,dim);
+  isl_map_fix_si (map,isl_dim_out,1,cl);
+
+  mapu = isl_union_map_intersect_domain (isl_union_map_from_map (map), 
+					 domain);
+  return (mapu);
+}
+
+/* Set the separation_class AST built option. */
+
+static __isl_give isl_union_map *
+generate_isl_options_1 (scop_p scop, const char *str, int dim)
+{
+  isl_ctx *ctx;
+  isl_map *map;
+  isl_space *space_sep;
+  isl_space *space;
+  isl_union_map *options_isl;
+  int nsched = get_max_schedule_dimensions (scop);
+
+  ctx = scop->ctx;
+  space_sep = isl_space_alloc(ctx, 0, 1, 1);
+  space = isl_set_get_space (scop->context);
+  space_sep = isl_space_align_params(space_sep, space);
+  space_sep = isl_space_add_dims (space_sep,isl_dim_in, nsched - 1);
+  space_sep = isl_space_set_tuple_name(space_sep, isl_dim_out, str);
+  map = isl_map_universe (space_sep);
+  isl_map_fix_si (map,isl_dim_out,0,dim);
+
+  options_isl = isl_union_map_from_map (map);
+ 
+  return options_isl;
+}
+
+/* Compute the separation class. */
+
+static __isl_give isl_union_set *
+generate_isl_domain_n (scop_p scop)
+{
+  int i;
+  poly_bb_p pbb;
+  isl_union_set *domain_isl;
+
+  domain_isl = isl_union_set_empty (isl_set_get_space (scop->context));
+
+  FOR_EACH_VEC_ELT (SCOP_BBS (scop), i, pbb)
+    {
+      isl_set *bb_domain;
+      isl_set *bb_domain_s;
+      isl_set *bb_domain_r;
+      int res;
+
+      struct sep_map smap, *psmap;
+
+      /* Dead code elimination: when the domain of a PBB is empty,
+	 don't generate code for the PBB.  */
+      if (isl_set_is_empty (pbb->domain))
+	continue;
+
+      bb_domain = isl_set_copy (pbb->domain);
+
+      /* Extract the original and auxiliar maps from pbb->transformed.
+         Set pbb->transformed to the original map. */
+      psmap = &smap;
+      psmap->n = 0;
+      res = isl_map_foreach_basic_map (pbb->transformed, separate_map, (void *)psmap);
+      gcc_assert (res == 0);
+
+      isl_map_free(pbb->transformed);
+      pbb->transformed = isl_map_copy(psmap->map_arr[0]);
+
+      bb_domain_s = isl_set_apply (isl_set_copy (bb_domain),
+				   psmap->map_arr[1]);
+      bb_domain_r = isl_set_apply (bb_domain, psmap->map_arr[0]);      
+
+      bb_domain_s = isl_set_complement (bb_domain_s);
+      bb_domain_r = isl_set_subtract(bb_domain_r,bb_domain_s);
+
+      domain_isl =
+        isl_union_set_union (domain_isl,
+			     isl_union_set_from_set (bb_domain_r));
+    }
+
+  return domain_isl;
+}
+
+/* Set the AST built options for loop unroll and jam. */
+ 
+static __isl_give isl_union_map *
+generate_isl_options (scop_p scop)
+{
+  isl_union_set *domain_isl;
+  isl_union_map *options_isl_ss;
+  isl_union_map *options_isl =
+    isl_union_map_empty (isl_set_get_space (scop->context));
+  int dim = get_max_schedule_dimensions (scop) - 1;
+
+  if (!flag_loop_unroll_jam)
+    return options_isl;
+
+  domain_isl = generate_isl_domain_n (scop);
+ 
+  options_isl_ss = generate_isl_options_1 (scop, "unroll", dim);
+  options_isl = isl_union_map_union (options_isl, options_isl_ss);
+  options_isl_ss = generate_isl_options_2 (scop, domain_isl, dim, 0);
+  options_isl = isl_union_map_union (options_isl, options_isl_ss);
+
+  return options_isl;
+}
+
 /* Generates a schedule, which specifies an order used to
    visit elements in a domain.  */
 
@@ -879,11 +1032,13 @@
 }
 
 /* Set the separate option for all dimensions.
-   This helps to reduce control overhead.  */
+   This helps to reduce control overhead.
+   Set the options for unroll and jam.  */
 
 static __isl_give isl_ast_build *
 set_options (__isl_take isl_ast_build *control,
-	     __isl_keep isl_union_map *schedule)
+	     __isl_keep isl_union_map *schedule,
+	     __isl_take isl_union_map *opt_luj)
 {
   isl_ctx *ctx = isl_union_map_get_ctx (schedule);
   isl_space *range_space = isl_space_set_alloc (ctx, 0, 1);
@@ -894,6 +1049,9 @@
   isl_union_set *domain = isl_union_map_range (isl_union_map_copy (schedule));
   domain = isl_union_set_universe (domain);
   isl_union_map *options = isl_union_map_from_domain_and_range (domain, range);
+
+  options = isl_union_map_union (options, opt_luj);
+
   return isl_ast_build_set_options (control, options);
 }
 
@@ -907,9 +1065,14 @@
   isl_options_set_ast_build_atomic_upper_bound (scop->ctx, true);
 
   add_parameters_to_ivs_params (scop, ip);
+
+  isl_union_map *options_isl = generate_isl_options (scop);
+
   isl_union_map *schedule_isl = generate_isl_schedule (scop);
   isl_ast_build *context_isl = generate_isl_context (scop);
-  context_isl = set_options (context_isl, schedule_isl);
+
+  context_isl = set_options (context_isl, schedule_isl, options_isl);
+
   isl_union_map *dependences = NULL;
   if (flag_loop_parallelize_all)
   {
Index: gcc/graphite-optimize-isl.c
===================================================================
--- gcc/graphite-optimize-isl.c	(revision 217013)
+++ gcc/graphite-optimize-isl.c	(working copy)
@@ -186,7 +186,7 @@
   PartialSchedule = isl_band_get_partial_schedule (Band);
   *Dimensions = isl_band_n_member (Band);
 
-  if (DisableTiling)
+  if (DisableTiling || flag_loop_unroll_jam)
     return PartialSchedule;
 
   /* It does not make any sense to tile a band with just one dimension.  */
@@ -305,8 +305,83 @@
   isl_constraint_set_constant_si (c, VectorWidth - 1);
   TilingMap = isl_map_add_constraint (TilingMap, c);
 
-  isl_map_dump (TilingMap);
+  return TilingMap;
+}
 
+/* Create an auxiliary map to getPrevectorMap whose image provides the
+    separation class for full/partial tile separation of the strip-mined
+    dimension. It is used in graphite_isl_ast_to_gimple.c to set the 
+    corresponding option for AST build. */ 
+static isl_map *
+getPrevectorMap_f (isl_ctx *ctx, int DimToVectorize,
+		 int ScheduleDimensions,
+		 int VectorWidth)
+{
+  isl_space *Space;
+  isl_local_space *LocalSpace, *LocalSpaceRange;
+  isl_set *Modulo;
+  isl_map *TilingMap;
+  isl_constraint *c;
+  isl_aff *Aff;
+  int PointDimension; /* ip */
+  int TileDimension;  /* it */
+  isl_val *VectorWidthMP;
+  int i;
+
+  /* assert (0 <= DimToVectorize && DimToVectorize < ScheduleDimensions);*/
+
+  Space = isl_space_alloc (ctx, 0, ScheduleDimensions, ScheduleDimensions + 1);
+  TilingMap = isl_map_universe (isl_space_copy (Space));
+  LocalSpace = isl_local_space_from_space (Space);
+  PointDimension = ScheduleDimensions;
+  TileDimension = DimToVectorize;
+
+  /* Create an identity map for everything except DimToVectorize and the 
+     point loop. */
+  for (i = 0; i < ScheduleDimensions; i++)
+    {
+      if (i == DimToVectorize)
+        continue;
+
+      c = isl_equality_alloc (isl_local_space_copy (LocalSpace));
+
+      isl_constraint_set_coefficient_si (c, isl_dim_in, i, -1);
+      isl_constraint_set_coefficient_si (c, isl_dim_out, i, 1);
+
+      TilingMap = isl_map_add_constraint (TilingMap, c);
+    }
+
+  /* it % 'VectorWidth' = 0  */
+  LocalSpaceRange = isl_local_space_range (isl_local_space_copy (LocalSpace));
+  Aff = isl_aff_zero_on_domain (LocalSpaceRange);
+  Aff = isl_aff_set_constant_si (Aff, VectorWidth);
+  Aff = isl_aff_set_coefficient_si (Aff, isl_dim_in, TileDimension, 1);
+
+  VectorWidthMP = isl_val_int_from_si (ctx, VectorWidth);
+  Aff = isl_aff_mod_val (Aff, VectorWidthMP);
+  Modulo = isl_pw_aff_zero_set (isl_pw_aff_from_aff (Aff));
+  TilingMap = isl_map_intersect_range (TilingMap, Modulo);
+
+  /* it + ('VectorWidth' - 1) <= i0  */
+  c = isl_inequality_alloc (isl_local_space_copy(LocalSpace));
+  isl_constraint_set_coefficient_si (c, isl_dim_out, TileDimension,-1);
+  isl_constraint_set_coefficient_si (c, isl_dim_in, TileDimension, 1);
+  isl_constraint_set_constant_si (c, -VectorWidth + 1);
+  TilingMap = isl_map_add_constraint (TilingMap, c);
+
+  /* it <= ip */
+  c = isl_inequality_alloc (isl_local_space_copy (LocalSpace));
+  isl_constraint_set_coefficient_si (c, isl_dim_out, TileDimension, -1);
+  isl_constraint_set_coefficient_si (c, isl_dim_out, PointDimension, 1);
+  TilingMap = isl_map_add_constraint (TilingMap, c);
+
+  /* ip <= it + ('VectorWidth' - 1) */
+  c = isl_inequality_alloc (LocalSpace);
+  isl_constraint_set_coefficient_si (c, isl_dim_out, TileDimension, 1);
+  isl_constraint_set_coefficient_si (c, isl_dim_out, PointDimension, -1);
+  isl_constraint_set_constant_si (c, VectorWidth - 1);
+  TilingMap = isl_map_add_constraint (TilingMap, c);
+
   return TilingMap;
 }
 
@@ -350,17 +425,31 @@
 							      SuffixSchedule);
 	  isl_band_list_free (Children);
 	}
-      else if (EnablePollyVector)
+      else if (EnablePollyVector || flag_loop_unroll_jam)
 	{
+           int i;
+           int depth;
+ 
+ 	  depth = PARAM_VALUE (PARAM_LOOP_UNROLL_JAM_DEPTH);
+  
 	  for (i = ScheduleDimensions - 1 ;  i >= 0 ; i--)
 	    {
+	      if (flag_loop_unroll_jam && (i != (ScheduleDimensions - depth)))
+		  continue;
+
 	      if (isl_band_member_is_zero_distance (Band, i))
 		{
 		  isl_map *TileMap;
 		  isl_union_map *TileUMap;
+		  int stride;
 
-		  TileMap = getPrevectorMap (ctx, i, ScheduleDimensions, 4);
+                  stride = PARAM_VALUE (PARAM_LOOP_UNROLL_JAM_SIZE);    
+
+                  TileMap = getPrevectorMap (ctx, i, ScheduleDimensions, stride);
 		  TileUMap = isl_union_map_from_map (TileMap);
+		  TileMap = getPrevectorMap_f (ctx, i, ScheduleDimensions, stride);
+ 		  TileUMap = isl_union_map_add_map (TileUMap, TileMap);
+
 		  TileUMap = isl_union_map_align_params
 		    (TileUMap, isl_space_copy (Space));
 		  PartialSchedule = isl_union_map_apply_range
Index: gcc/graphite-poly.c
===================================================================
--- gcc/graphite-poly.c	(revision 217013)
+++ gcc/graphite-poly.c	(working copy)
@@ -276,7 +276,7 @@
 
   /* This pass needs to be run at the final stage, as it does not
      update the lst.  */
-  if (flag_loop_optimize_isl)
+  if (flag_loop_optimize_isl || flag_loop_unroll_jam)
     transform_done |= optimize_isl (scop);
 
   return transform_done;

Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]