This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

[rfc] transfering number of iterations from trees to rtl (PR 32283)


Hi,

a problem with having loop optimizer split to two parts (tree and rtl)
is that it is difficult to share the information between these two
parts.  One particular example is the following loop:

char a[100000];

for (i = 0; i < n; i++)
  a[16*i] = 0;

This is a nice loop for that we have no problems to determine that
n is the number of iterations.  Ivopts perform strength reduction
and induction variable elimination, transforming this to

k = a + 16 * n;
for (p = a; p != k; p += 16)
  *p = 0;

Now, on rtl we determine that the number of iterations is
(k-a)/16, under assumption that k-a is divisible by 16.  Although
all the information necessary to determine that the assumption
is always satisfied and that (k-a)/16 = n is in the code, this
becomes difficult to prove (especially since the expression
a + 16 * n can be further optimized by pre/cse), and we cannot
do this without complicating the analysis significantly.

In other cases, the number of iterations could be determined on trees
using say the fact that signed operations do not overflow, but this
information is lost on rtl, so again we fail to determine the number of
iterations.  This affects several optimizations on rtl (unrolling,
doloop, and in turn sms).

The obvious solution is to store the number of iterations of the loops,
and reuse it on rtl instead of trying to recompute it.  Ideally, this
would be achieved by keeping track of the loops between tree and rtl
optimizer.  Unfortunately, that is fairly difficult to implement --
the major obstacle being tree->rtl expansion, that may affect cfg more
or less arbitrarily, but even after solving this somehow, it would still
require at least a few weeks of work -- and in any case, such a change
would definitely not be suitable for stage 3.

A quicker, safer and dirtier way is to somehow record the information in
the IR on trees, and pick it up on RTL.  For example, for the loop above
we could emit the following code:

k = a + 16 * n;
builtin_loop_start(1, n);
for (p = a; p != k; p += 16)
  {
    builtin_loop_step(1);
    *p = 0;
  }

where the builtins are expanded to some similar form of "fake" insn on
rtl, and later gathered in the rtl loop optimizer.  This is easy to
implement, and safe:  all the transformations performed by the compiler
neccesarily preserve the fact that builtin_loop_step(loop_id) is
executed exactly n times after builtin_loop_start(loop_id, n).
Therefore, if there is exactly one builtin_loop_start and
builtin_loop_step for a given loop_id, builtin_loop_start is in a
preheader of a loop and builtin_loop_step is in a header of the same
loop, then we can be sure that the loop iterates exactly n times.

The proof of the concept, more or less untested, implementation of the
idea is below; before I spend more time on this, I would like to know
whether this would be acceptable (in addition to all the nice things
that I can say about it, it is also exceptionally ugly), or preferably,
if someone has a better idea how to solve the problem?

Zdenek

Index: tree-pass.h
===================================================================
*** tree-pass.h	(revision 130990)
--- tree-pass.h	(working copy)
*************** extern struct tree_opt_pass pass_loop2;
*** 381,386 ****
--- 381,387 ----
  extern struct tree_opt_pass pass_rtl_loop_init;
  extern struct tree_opt_pass pass_rtl_move_loop_invariants;
  extern struct tree_opt_pass pass_rtl_unswitch;
+ extern struct tree_opt_pass pass_rtl_gather_loop_niter_builtins;
  extern struct tree_opt_pass pass_rtl_unroll_and_peel_loops;
  extern struct tree_opt_pass pass_rtl_doloop;
  extern struct tree_opt_pass pass_rtl_loop_done;
Index: builtins.c
===================================================================
*** builtins.c	(revision 130990)
--- builtins.c	(working copy)
*************** expand_builtin_prefetch (tree exp)
*** 1059,1064 ****
--- 1059,1088 ----
      emit_insn (op0);
  }
  
+ /* Expand EXP to an insn used to represent LOOP_START builtin.  */
+ 
+ static void
+ expand_builtin_loop_start (tree exp)
+ {
+   tree loop = CALL_EXPR_ARG (exp, 0), niter = CALL_EXPR_ARG (exp, 1);
+   rtx nit = expand_expr (niter, NULL_RTX, SImode, EXPAND_NORMAL);
+   int loop_id = int_cst_value (loop);
+ 
+   emit_insn (gen_rtx_LOOP_NITER_BUILTIN (VOIDmode, loop_id, nit));
+ }
+ 
+ /* Expand EXP to an insn used to represent LOOP_ITERATION builtin.  */
+ 
+ static void
+ expand_builtin_loop_iteration (tree exp)
+ {
+   tree loop = CALL_EXPR_ARG (exp, 0);
+   int loop_id = int_cst_value (loop);
+ 
+   emit_insn (gen_rtx_LOOP_NITER_BUILTIN (VOIDmode, loop_id,
+ 					 gen_rtx_SCRATCH (VOIDmode)));
+ }
+ 
  /* Get a MEM rtx for expression EXP which is the address of an operand
     to be used in a string instruction (cmpstrsi, movmemsi, ..).  LEN is
     the maximum length of the block of memory that might be accessed or
*************** expand_builtin (tree exp, rtx target, rt
*** 6712,6717 ****
--- 6736,6747 ----
      case BUILT_IN_PREFETCH:
        expand_builtin_prefetch (exp);
        return const0_rtx;
+     case BUILT_IN_LOOP_START:
+       expand_builtin_loop_start (exp);
+       return const0_rtx;
+     case BUILT_IN_LOOP_ITERATION:
+       expand_builtin_loop_iteration (exp);
+       return const0_rtx;
  
      case BUILT_IN_PROFILE_FUNC_ENTER:
        return expand_builtin_profile_func (false);
Index: df-scan.c
===================================================================
*** df-scan.c	(revision 130990)
--- df-scan.c	(working copy)
*************** df_uses_record (struct df_collection_rec
*** 2858,2863 ****
--- 2858,2869 ----
        /* If we're clobbering a REG then we have a def so ignore.  */
        return;
  
+     case LOOP_NITER_BUILTIN:
+       df_uses_record (collection_rec,
+ 		      &XEXP (x, 1), DF_REF_REG_USE, 
+ 		      bb, insn, flags);
+       return;
+ 
      case MEM:
        df_uses_record (collection_rec,
  		      &XEXP (x, 0), DF_REF_REG_MEM_LOAD, 
Index: builtin-types.def
===================================================================
*** builtin-types.def	(revision 130990)
--- builtin-types.def	(working copy)
*************** DEF_FUNCTION_TYPE_2 (BT_FN_I8_VPTR_I8, B
*** 308,313 ****
--- 308,314 ----
  DEF_FUNCTION_TYPE_2 (BT_FN_I16_VPTR_I16, BT_I16, BT_VOLATILE_PTR, BT_I16)
  DEF_FUNCTION_TYPE_2 (BT_FN_BOOL_LONGPTR_LONGPTR,
  		     BT_BOOL, BT_PTR_LONG, BT_PTR_LONG)
+ DEF_FUNCTION_TYPE_2 (BT_FN_VOID_INT_UINT, BT_VOID, BT_INT, BT_UINT)
  
  DEF_FUNCTION_TYPE_3 (BT_FN_STRING_STRING_CONST_STRING_SIZE,
  		     BT_STRING, BT_STRING, BT_CONST_STRING, BT_SIZE)
Index: builtins.def
===================================================================
*** builtins.def	(revision 130990)
--- builtins.def	(working copy)
*************** DEF_GCC_BUILTIN        (BUILT_IN_VA_ARG_
*** 707,712 ****
--- 707,715 ----
  DEF_EXT_LIB_BUILTIN    (BUILT_IN__EXIT, "_exit", BT_FN_VOID_INT, ATTR_NORETURN_NOTHROW_LIST)
  DEF_C99_BUILTIN        (BUILT_IN__EXIT2, "_Exit", BT_FN_VOID_INT, ATTR_NORETURN_NOTHROW_LIST)
  
+ DEF_GCC_BUILTIN        (BUILT_IN_LOOP_START, "loop_start", BT_FN_VOID_INT_UINT, ATTR_NOVOPS_LIST)
+ DEF_GCC_BUILTIN        (BUILT_IN_LOOP_ITERATION, "loop_iteration", BT_FN_VOID_INT, ATTR_NOVOPS_LIST)
+ 
  /* Implementing nested functions.  */
  DEF_BUILTIN_STUB (BUILT_IN_INIT_TRAMPOLINE, "__builtin_init_trampoline")
  DEF_BUILTIN_STUB (BUILT_IN_ADJUST_TRAMPOLINE, "__builtin_adjust_trampoline")
Index: tree-ssa-loop-ivopts.c
===================================================================
*** tree-ssa-loop-ivopts.c	(revision 130990)
--- tree-ssa-loop-ivopts.c	(working copy)
*************** tree_ssa_iv_optimize_finalize (struct iv
*** 5287,5292 ****
--- 5287,5333 ----
    VEC_free (iv_cand_p, heap, data->iv_candidates);
  }
  
+ /* Changes made in ivopts may make it impossible (or at least very difficult)
+    to determine number of iterations of the loop.  We record the number of
+    iterations for the following passes in this form:  in the loop preheader,
+    we emit a builtin_loop_start (loop number, number of latch executions) call,
+    and in the loop header, we emit builtin_loop_iteration (loop number).
+    The following passes then can test whether the structure of these builtins
+    stayed unchanged (there are no duplicates, ...), and if so, use the number
+    of iterations recorded in builtin_loop_start.  */
+ 
+ static void
+ emit_number_of_iterations_statement (struct loop *loop)
+ {
+   tree niter = number_of_latch_executions (loop);
+   edge pe = loop_preheader_edge (loop);
+   tree init, step;
+   tree intt, unsignedt, stmts;
+   block_stmt_iterator bsi;
+ 
+   if (niter == chrec_dont_know
+       || TYPE_PRECISION (TREE_TYPE (niter)) > INT_TYPE_SIZE)
+     return;
+ 
+   intt = lang_hooks.types.type_for_size (INT_TYPE_SIZE, false);
+   unsignedt = lang_hooks.types.type_for_size (INT_TYPE_SIZE, true);
+   niter = fold_convert (unsignedt, niter);
+ 
+   niter = force_gimple_operand (niter, &stmts, true, NULL_TREE);
+   init = build_call_expr (built_in_decls[BUILT_IN_LOOP_START], 2,
+ 			  build_int_cst (intt, loop->num),
+ 			  niter);
+   step = build_call_expr (built_in_decls[BUILT_IN_LOOP_ITERATION], 1,
+ 			  build_int_cst (intt, loop->num));
+ 
+   if (stmts)
+     bsi_insert_on_edge_immediate (pe, stmts);
+   bsi_insert_on_edge_immediate (pe, init);
+ 
+   bsi = bsi_after_labels (loop->header);
+   bsi_insert_before (&bsi, step, BSI_NEW_STMT);
+ }
+ 
  /* Optimizes the LOOP.  Returns true if anything changed.  */
  
  static bool
*************** tree_ssa_iv_optimize_loop (struct ivopts
*** 5339,5344 ****
--- 5380,5390 ----
      goto finish;
    changed = true;
  
+   /* Changing the induction variables may make the number of iterations
+      harder to analyse.  Record the number of iterations for further
+      passes.  */
+   emit_number_of_iterations_statement (loop);
+ 
    /* Create the new induction variables (item 4, part 1).  */
    create_new_ivs (data, iv_ca);
    iv_ca_free (&iv_ca);
Index: loop-init.c
===================================================================
*** loop-init.c	(revision 130990)
--- loop-init.c	(working copy)
*************** rtl_loop_init (void)
*** 168,174 ****
    if (dump_file)
      dump_flow_info (dump_file, dump_flags);
  
!   loop_optimizer_init (LOOPS_NORMAL);
    return 0;
  }
  
--- 168,174 ----
    if (dump_file)
      dump_flow_info (dump_file, dump_flags);
  
!   loop_optimizer_init (LOOPS_NORMAL | LOOPS_HAVE_RECORDED_EXITS);
    return 0;
  }
  
*************** struct tree_opt_pass pass_rtl_doloop =
*** 375,377 ****
--- 375,402 ----
    'L'                                   /* letter */
  };
  
+ /* Pass to gather information about # of iterations passed from trees.  */
+ 
+ static bool
+ gate_rtl_gather_loop_niter_builtins (void)
+ {
+   return true;
+ }
+ 
+ struct tree_opt_pass pass_rtl_gather_loop_niter_builtins =
+ {
+   "loop2_gather",                       /* name */
+   gate_rtl_gather_loop_niter_builtins,  /* gate */
+   gather_loop_niter_builtins,           /* execute */
+   NULL,                                 /* sub */
+   NULL,                                 /* next */
+   0,                                    /* static_pass_number */
+   TV_LOOP,                              /* tv_id */
+   0,                                    /* properties_required */
+   0,                                    /* properties_provided */
+   0,                                    /* properties_destroyed */
+   0,                                    /* todo_flags_start */
+   TODO_dump_func | TODO_verify_rtl_sharing, /* todo_flags_finish */
+   'L'                                   /* letter */
+ };
+ 
Index: rtl.def
===================================================================
*** rtl.def	(revision 130990)
--- rtl.def	(working copy)
*************** DEF_RTL_EXPR(US_TRUNCATE, "us_truncate",
*** 732,737 ****
--- 732,742 ----
     initialization status of variables.  */
  DEF_RTL_EXPR(VAR_LOCATION, "var_location", "tei", RTX_EXTRA)
  
+ /* Rtl code to represent LOOP_START and LOOP_ITERATION builtins, used to pass
+    the information about the number of iterations from trees to rtl.
+    For LOOP_ITERATION, it is used directly as a pattern of an insn.  */
+ DEF_RTL_EXPR(LOOP_NITER_BUILTIN, "loop_niter_builtin", "ie", RTX_EXTRA)
+ 
  /* All expressions from this point forward appear only in machine
     descriptions.  */
  #ifdef GENERATOR_FILE
Index: recog.c
===================================================================
*** recog.c	(revision 130990)
--- recog.c	(working copy)
*************** extract_insn (rtx insn)
*** 1940,1945 ****
--- 1940,1946 ----
      case ASM_INPUT:
      case ADDR_VEC:
      case ADDR_DIFF_VEC:
+     case LOOP_NITER_BUILTIN:
        return;
  
      case SET:
Index: function.c
===================================================================
*** function.c	(revision 130990)
--- function.c	(working copy)
*************** instantiate_virtual_regs (void)
*** 1696,1702 ****
  	    || GET_CODE (PATTERN (insn)) == CLOBBER
  	    || GET_CODE (PATTERN (insn)) == ADDR_VEC
  	    || GET_CODE (PATTERN (insn)) == ADDR_DIFF_VEC
! 	    || GET_CODE (PATTERN (insn)) == ASM_INPUT)
  	  continue;
  
  	instantiate_virtual_regs_in_insn (insn);
--- 1696,1703 ----
  	    || GET_CODE (PATTERN (insn)) == CLOBBER
  	    || GET_CODE (PATTERN (insn)) == ADDR_VEC
  	    || GET_CODE (PATTERN (insn)) == ADDR_DIFF_VEC
! 	    || GET_CODE (PATTERN (insn)) == ASM_INPUT
! 	    || GET_CODE (PATTERN (insn)) == LOOP_NITER_BUILTIN)
  	  continue;
  
  	instantiate_virtual_regs_in_insn (insn);
Index: loop-iv.c
===================================================================
*** loop-iv.c	(revision 130990)
--- loop-iv.c	(working copy)
*************** get_simple_loop_desc (struct loop *loop)
*** 2828,2833 ****
--- 2828,2994 ----
    return desc;
  }
  
+ /* Structure used to record information about LOOP_NITER_BUILTIN insns.  */
+ 
+ typedef struct
+ {
+   bool broken;			/* True if there is more than one
+ 				   LOOP_NITER_BUILTIN start or
+ 				   LOOP_NITER_BUILTIN step insn.  */
+   basic_block start, step;	/* The locations of the start and step
+ 				   insns.  */
+   rtx niter;			/* Number of iterations recorded in the
+ 				   start insn.  */
+ } loop_niter_builtin_info;
+ 
+ DEF_VEC_O (loop_niter_builtin_info);
+ DEF_VEC_ALLOC_O (loop_niter_builtin_info, heap);
+ 
+ /* Records the information about LOOP_NITER_BUILTIN insn for loop with
+    id LOOP_ID that appears in BB to BUILTINS.  If NITER is not NULL,
+    this corresponds to LOOP_ITERATION builtin, if NITER is not NULL,
+    the instruction corresponds to LOOP_START builtin, and NITER is the
+    number of iterations of the loop.  */
+ 
+ static void
+ record_loop_niter_builtin (VEC (loop_niter_builtin_info, heap) **builtins,
+ 			   unsigned loop_id, basic_block bb, rtx niter)
+ {
+   loop_niter_builtin_info *binfo;
+ 
+   if (VEC_length (loop_niter_builtin_info, *builtins) <= loop_id)
+     VEC_safe_grow_cleared (loop_niter_builtin_info, heap, *builtins,
+ 			   loop_id + 1);
+ 
+   binfo = VEC_index (loop_niter_builtin_info, *builtins, loop_id);
+   if (GET_CODE (niter) != SCRATCH)
+     {
+       if (dump_file)
+ 	fprintf (dump_file, "Found loop start builtin for loop %u\n", loop_id);
+ 
+       if (binfo->start)
+ 	binfo->broken = true;
+       else
+ 	{
+ 	  binfo->start = bb;
+ 	  binfo->niter = niter;
+ 	}
+     }
+   else
+     {
+       if (dump_file)
+ 	fprintf (dump_file, "Found loop step builtin for loop %u\n", loop_id);
+ 
+       if (binfo->step)
+ 	binfo->broken = true;
+       else
+ 	binfo->step = bb;
+     }
+ }
+ 
+ /* Sets number of iterations of a loop according to the information stored
+    in BINFO.  */
+ 
+ static void
+ set_niter_by_loop_niter_builtin (loop_niter_builtin_info *binfo)
+ {
+   struct loop *loop;
+   edge exit, ein;
+   basic_block exit_bb;
+   struct niter_desc *desc;
+ 
+   if (binfo->broken || !binfo->start || !binfo->step)
+     return;
+ 
+   loop = binfo->step->loop_father;
+   if (loop->header != binfo->step
+       || loop_preheader_edge (loop)->src != binfo->start)
+     return;
+ 
+   exit = single_dom_exit (loop);
+   if (!exit)
+     return;
+ 
+   exit_bb = exit->src;
+   if (EDGE_COUNT (exit_bb->succs) != 2)
+     return;
+ 
+   if (dump_file)
+     {
+       fprintf (dump_file, "Set number of iterations of loop %d to ", loop->num);
+       print_rtl (dump_file, binfo->niter);
+       fprintf (dump_file, "\n");
+     }
+ 
+   desc = XNEW (struct niter_desc);
+   ein = EDGE_SUCC (exit_bb, 0);
+   if (ein == exit)
+     ein = EDGE_SUCC (exit_bb, 1);
+ 
+   desc->out_edge = exit;
+   desc->in_edge = ein;
+   desc->simple_p = true;
+   desc->assumptions = NULL_RTX;
+   desc->noloop_assumptions = NULL_RTX;
+   desc->infinite = NULL_RTX;
+   desc->signed_p = false;
+   desc->mode = SImode;
+   desc->niter_expr = binfo->niter;
+ 
+   if (GET_CODE (desc->niter_expr) == CONST_INT)
+     {
+       unsigned HOST_WIDEST_INT val = INTVAL (desc->niter_expr);
+ 
+       desc->const_iter = true;
+       desc->niter_max = desc->niter = val & GET_MODE_MASK (desc->mode);
+     }
+   else
+     {
+       desc->const_iter = false;
+       desc->niter = 0;
+       desc->niter_max = determine_max_iter (loop, desc);
+     }
+ 
+   loop->aux = desc;
+ }
+ 
+ /* Removes insns used to transfer information about number of iterations from
+    trees to rtl, and records it at the loops.  */
+ 
+ unsigned
+ gather_loop_niter_builtins (void)
+ {
+   basic_block bb;
+   rtx insn, curr, pattern;
+   unsigned i;
+   VEC (loop_niter_builtin_info, heap) *builtins = NULL;
+   loop_niter_builtin_info *binfo;
+ 
+   FOR_EACH_BB (bb)
+     {
+       FOR_BB_INSNS_SAFE (bb, insn, curr)
+ 	{
+ 	  if (!INSN_P (insn))
+ 	    continue;
+ 
+ 	  pattern = PATTERN (insn);
+ 	  if (GET_CODE (pattern) != LOOP_NITER_BUILTIN)
+ 	    continue;
+ 
+ 	  record_loop_niter_builtin (&builtins,
+ 				     XINT (pattern, 0), bb, 
+ 				     XEXP (pattern, 1));
+ 	  delete_insn (insn);
+ 	}
+     }
+ 
+   for (i = 0; VEC_iterate (loop_niter_builtin_info, builtins, i, binfo); i++)
+     set_niter_by_loop_niter_builtin (binfo);
+ 
+   VEC_free (loop_niter_builtin_info, heap, builtins);
+   return 0;
+ }
+ 
  /* Releases simple loop description for LOOP.  */
  
  void
Index: cfgloop.h
===================================================================
*** cfgloop.h	(revision 130990)
--- cfgloop.h	(working copy)
*************** extern basic_block *get_loop_body_in_dom
*** 237,242 ****
--- 237,243 ----
  extern basic_block *get_loop_body_in_bfs_order (const struct loop *);
  extern VEC (edge, heap) *get_loop_exit_edges (const struct loop *);
  edge single_exit (const struct loop *);
+ edge single_dom_exit (struct loop *);
  extern unsigned num_loop_branches (const struct loop *);
  
  extern edge loop_preheader_edge (const struct loop *);
*************** extern void iv_analysis_done (void);
*** 389,394 ****
--- 390,396 ----
  
  extern struct niter_desc *get_simple_loop_desc (struct loop *loop);
  extern void free_simple_loop_desc (struct loop *loop);
+ extern unsigned gather_loop_niter_builtins (void);
  
  static inline struct niter_desc *
  simple_loop_desc (struct loop *loop)
Index: tree-flow.h
===================================================================
*** tree-flow.h	(revision 130990)
--- tree-flow.h	(working copy)
*************** struct loop *tree_ssa_loop_version (stru
*** 1026,1032 ****
  				    basic_block *);
  tree expand_simple_operations (tree);
  void substitute_in_loop_info (struct loop *, tree, tree);
- edge single_dom_exit (struct loop *);
  bool can_unroll_loop_p (struct loop *loop, unsigned factor,
  			struct tree_niter_desc *niter);
  void tree_unroll_loop (struct loop *, unsigned,
--- 1026,1031 ----
Index: passes.c
===================================================================
*** passes.c	(revision 130990)
--- passes.c	(working copy)
*************** init_optimization_passes (void)
*** 704,709 ****
--- 704,710 ----
  	  NEXT_PASS (pass_rtl_loop_init);
  	  NEXT_PASS (pass_rtl_move_loop_invariants);
  	  NEXT_PASS (pass_rtl_unswitch);
+ 	  NEXT_PASS (pass_rtl_gather_loop_niter_builtins);
  	  NEXT_PASS (pass_rtl_unroll_and_peel_loops);
  	  NEXT_PASS (pass_rtl_doloop);
  	  NEXT_PASS (pass_rtl_loop_done);
Index: dce.c
===================================================================
*** dce.c	(revision 130990)
--- dce.c	(working copy)
*************** deletable_insn_p (rtx insn, bool fast)
*** 106,111 ****
--- 106,112 ----
    switch (GET_CODE (body))
      {
      case USE:
+     case LOOP_NITER_BUILTIN:
        return false;
  
      case CLOBBER:


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]