This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

Re: [PATCHv2] new target hook for loop unrolling adjustment


> > +static unsigned
> > +s390_loop_unroll_adjust (unsigned nunroll, struct loop *loop)
> 
> Description missing of this function.
[...]

> This description is not matching the use.  You use it where
> the loop unroller passes you the number of times a loop is
> to be unrolled, not the number of instructions that would result
> from that.
> 
> Thus it should read
> 
> ... for the number of times @var{loop} should be unrolled.
> ... is the number of times the loop is to be unrolled.


Right you are. Both comments fixed.


> The middle-end changes are ok, but a target maintainer needs to ack
> the s390 part and needs to make sense of what you do.

Andreas, can you ack and consider for 4.6? 

Christian


Patch with update comments below.
[PATCHv3] new target hook for loop unrolling adjustment

The patch below introduces a new target hook that allows back ends to adjust
the amount of loop unrolling, and an implementation of such an adjustment
that improves performance for IBM System z with a z10 processor.

The story behind this improvement is that z10 employs a data prefetch unit
that monitors instructions performing memory accesses with the intention to
detect regular access patterns occurring within a loop, i.e. something like
"always reads the next 8 byte". That unit has 32 slots for monitoring
instructions which are identified by part of their address. If a loop body
is too long, different instructions may happen to fill the same
slot, causing the unit to actually serve neither of the instructions.

The purpose of this patch is to limit loop unrolling depending on the number
of memory accesses performed within the loop body in a way that avoids
collisions in the slots of the prefetch unit.

For loops with nesting level 1, the normal cache management is often 
sufficient, but there are cases where the prefetch unit still improves
performancs. We can therefore allow many memory references, but still
try to avoid too many collisions.
For loops with a nesting level of 3 and deeper we assume that the memory
accesses follow a more irregular pattern that prevents the normal cache
management from achieving a good hit rate. Here, the prefetch unit is 
much more important.  As a performance optimization, we limit the number
of unrolls such that the unrolled body does not have too many memory
references that could collide. 
Two-dimensional loops are in between these two cases.
The used constants have been found by comparing several measurements.

This patch is a joint work of Wolfgang Gellerich and me. Wolfgang did the
core work in s390.c as well as the performance analyses. I finalized the
patch and wrote the target hook.

The patch bootstraps and the regression test was ok.

2010-02-01  Christian Borntraeger  <borntraeger@de.ibm.com>
            Wolfgang Gellerich  <gellerich@de.ibm.com>

        Implement target hook for loop unrolling
        * target.h (loop_unroll_adjust): Add a new target hook function.
        * target-def.h (TARGET_LOOP_UNROLL_ADJUST): Likewise.
        * doc/tm.texi (TARGET_LOOP_UNROLL_ADJUST): Document it.
        * config/s390/s390.c (TARGET_LOOP_UNROLL_ADJUST): Define it.
        (s390_loop_unroll_adjust): Implement the new target hook for s390.
        * loop-unroll.c (decide_unroll_runtime_iterations): Call loop unroll target hook
        (decide_unroll_stupid): Likewise.
Index: gcc/config/s390/s390.c
===================================================================
--- gcc/config/s390/s390.c.orig
+++ gcc/config/s390/s390.c
@@ -53,6 +53,7 @@ along with GCC; see the file COPYING3.  
 #include "gimple.h"
 #include "df.h"
 #include "params.h"
+#include "cfgloop.h"
 
 
 /* Define the specific costs for a given cpu.  */
@@ -10160,6 +10161,62 @@ s390_sched_variable_issue (FILE *file AT
     return more;
 }
 
+/* This function checks the whole of insn X for memory references. The
+   function always returns zero because the framework it is called
+   from would stop recursively analyzing the insn upon a return value
+   other than zero. The real result of this function is updating
+   counter variable MEM_COUNT.  */
+static int
+check_dpu (rtx *x, unsigned *mem_count)
+{
+  if (*x != NULL_RTX && MEM_P (*x))
+    (*mem_count)++;
+  return 0;
+}
+
+/* This target hook implementation for TARGET_LOOP_UNROLL_ADJUST calculates
+   a new number struct loop *loop should be unrolled if tuned for the z10
+   cpu. The loop is analyzed for memory accesses by calling check_dpu for
+   each rtx of the loop. Depending on the loop_depth and the amount of
+   memory accesses a new number <=nunroll is returned to improve the
+   behaviour of the hardware prefetch unit.  */
+static unsigned
+s390_loop_unroll_adjust (unsigned nunroll, struct loop *loop)
+{
+  basic_block *bbs;
+  rtx insn;
+  unsigned i;
+  unsigned mem_count = 0;
+
+  /* Only z10 needs special handling.  */
+  if (s390_tune != PROCESSOR_2097_Z10)
+    return nunroll;
+
+  /* Count the number of memory references within the loop body.  */
+  bbs = get_loop_body (loop);
+  for (i = 0; i < loop->num_nodes; i++)
+    {
+      for (insn = BB_HEAD (bbs[i]); insn != BB_END (bbs[i]); insn = NEXT_INSN (insn))
+	if (INSN_P (insn) && INSN_CODE (insn) != -1)
+            for_each_rtx (&insn, (rtx_function) check_dpu, &mem_count);
+    }
+  free (bbs);
+
+  /* Prevent division by zero, and we do not need to adjust nunroll in this case.  */
+  if (mem_count == 0)
+    return nunroll;
+
+  switch (loop_depth(loop))
+    {
+    case 1:
+      return MIN (nunroll, 28 / mem_count);
+    case 2:
+      return MIN (nunroll, 22 / mem_count);
+    default:
+      return MIN (nunroll, 16 / mem_count);
+    }
+}
+
 /* Initialize GCC target structure.  */
 
 #undef  TARGET_ASM_ALIGNED_HI_OP
@@ -10286,6 +10343,9 @@ s390_sched_variable_issue (FILE *file AT
 #undef TARGET_CAN_ELIMINATE
 #define TARGET_CAN_ELIMINATE s390_can_eliminate
 
+#undef TARGET_LOOP_UNROLL_ADJUST
+#define TARGET_LOOP_UNROLL_ADJUST s390_loop_unroll_adjust
+
 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
 #define TARGET_ASM_TRAMPOLINE_TEMPLATE s390_asm_trampoline_template
 #undef TARGET_TRAMPOLINE_INIT
Index: gcc/doc/tm.texi
===================================================================
--- gcc/doc/tm.texi.orig
+++ gcc/doc/tm.texi
@@ -10835,6 +10835,15 @@ This target hook is required only when t
 modes and they have different conditional execution capability, such as ARM.
 @end deftypefn
 
+@deftypefn {Target Hook} unsigned TARGET_LOOP_UNROLL_ADJUST (unsigned @var{nunroll}, struct loop *@var{loop})
+This target hook returns a new value for the number of times @var{loop}
+should be unrolled. The parameter @var{nunroll} is the number of times
+the loop is to be unrolled. The parameter @var{loop} is a pointer to
+the loop, which is going to be checked for unrolling. This target hook
+is required only when the target has special constraints like maximum
+number of memory accesses.
+@end deftypefn
+
 @defmac POWI_MAX_MULTS
 If defined, this macro is interpreted as a signed integer C expression
 that specifies the maximum number of floating point multiplications
Index: gcc/loop-unroll.c
===================================================================
--- gcc/loop-unroll.c.orig
+++ gcc/loop-unroll.c
@@ -33,6 +33,7 @@ along with GCC; see the file COPYING3.  
 #include "expr.h"
 #include "hashtab.h"
 #include "recog.h"
+#include "target.h"
 
 /* This pass performs loop unrolling and peeling.  We only perform these
    optimizations on innermost loops (with single exception) because
@@ -826,6 +827,9 @@ decide_unroll_runtime_iterations (struct
   if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
     nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);
 
+  if (targetm.loop_unroll_adjust)
+    nunroll = targetm.loop_unroll_adjust (nunroll, loop);
+
   /* Skip big loops.  */
   if (nunroll <= 1)
     {
@@ -1366,6 +1370,9 @@ decide_unroll_stupid (struct loop *loop,
   if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
     nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);
 
+  if (targetm.loop_unroll_adjust)
+    nunroll = targetm.loop_unroll_adjust (nunroll, loop);
+
   /* Skip big loops.  */
   if (nunroll <= 1)
     {
Index: gcc/target-def.h
===================================================================
--- gcc/target-def.h.orig
+++ gcc/target-def.h
@@ -540,6 +540,7 @@
   default_branch_target_register_class
 #define TARGET_BRANCH_TARGET_REGISTER_CALLEE_SAVED hook_bool_bool_false
 #define TARGET_HAVE_CONDITIONAL_EXECUTION default_have_conditional_execution
+#define TARGET_LOOP_UNROLL_ADJUST NULL
 #define TARGET_CANNOT_FORCE_CONST_MEM hook_bool_rtx_false
 #define TARGET_CANNOT_COPY_INSN_P NULL
 #define TARGET_COMMUTATIVE_P hook_bool_const_rtx_commutative_p
@@ -942,6 +943,7 @@
   TARGET_BRANCH_TARGET_REGISTER_CLASS,		\
   TARGET_BRANCH_TARGET_REGISTER_CALLEE_SAVED,	\
   TARGET_HAVE_CONDITIONAL_EXECUTION,		\
+  TARGET_LOOP_UNROLL_ADJUST,			\
   TARGET_CANNOT_FORCE_CONST_MEM,		\
   TARGET_CANNOT_COPY_INSN_P,			\
   TARGET_COMMUTATIVE_P,				\
Index: gcc/target.h
===================================================================
--- gcc/target.h.orig
+++ gcc/target.h
@@ -97,6 +97,9 @@ struct _dep;
 /* This is defined in ddg.h .  */
 struct ddg;
 
+/* This is defined in cfgloop.h .  */
+struct loop;
+
 /* Assembler instructions for creating various kinds of integer object.  */
 
 struct asm_int_op
@@ -633,6 +636,9 @@ struct gcc_target
   /* Return true if the target supports conditional execution.  */
   bool (* have_conditional_execution) (void);
 
+  /* Return a new value for loop unroll size.  */
+  unsigned (* loop_unroll_adjust) (unsigned nunroll, struct loop *loop);
+
   /* True if the constant X cannot be placed in the constant pool.  */
   bool (* cannot_force_const_mem) (rtx);
 


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]