This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

[PR 80689] Copy small aggregates element-wise


Hi,

I'd like to propose again a new variant of a fix that I sent here in
November (https://gcc.gnu.org/ml/gcc-patches/2017-10/msg00881.html) that
avoids store-to-load forwarding stalls in the ImageMagick benchmark by
expanding copies of very small simple aggregates element-wise rather
than "by pieces."

I have adjusted the patch only a little, most notably there is only one
controlling parameter and that is the maximum number of element-copies
that are necessary to copy an aggregate to use this method, rather than
a size constraint.  On x86_64, that parameter is set 4, on other
architectures I leave it at zero but it could help them too.

I have benchmarked the patch on top of a recent trunk on an AMD Ryzen
and an Intel Skylake machine using SPEC 2006 and SPEC 2017 CPU suites.
The only non-noise difference was 538.imagick_r, which on Ryzen and -O2
improved by 13% (generich march/mtune) and 20% (native march/mtune) and
on Skylake by 7% and 9% with the same switches.

I have bootstrapped the patch on x86_64-linux and (after changing the
parameter default) also on ppc64-linux and aarch64-linux.  I have not
done any benchmarking on non-x86_64 machines.

I'll be grateful for any comments, eventually I'd like to get approval
to commit it to trunk.

Thanks,

Martin



2018-07-10  Martin Jambor  <mjambor@suse.cz>

	PR target/80689
	* tree-sra.h: New file.
	* ipa-prop.h: Moved declaration of build_ref_for_offset to
	tree-sra.h.
	* expr.c: Include params.h and tree-sra.h.
	(emit_move_elementwise): New function.
	(store_expr_with_bounds): Optionally use it.
	* ipa-cp.c: Include tree-sra.h.
	* params.def (PARAM_MAX_ELEMS_FOR_ELEMENTWISE_COPY): New.
	* doc/invoke.texi: Document max-elems-for-elementwise-copy.
	* config/i386/i386.c (ix86_option_override_internal): Set
	PARAM_MAX_ELEMS_FOR_ELEMENTWISE_COPY to 4.
	* tree-sra.c: Include tree-sra.h.
	(scalarizable_type_p): Renamed to
	simple_mix_of_records_and_arrays_p, made public, renamed the
	second parameter to allow_char_arrays, added count_p parameter.
	(extract_min_max_idx_from_array): New function.
	(completely_scalarize): Moved bits of the function to
	extract_min_max_idx_from_array.

	testsuite/
	* gcc.target/i386/pr80689-1.c: New test.
---
 gcc/config/i386/i386.c                    |   4 +
 gcc/doc/invoke.texi                       |   4 +
 gcc/expr.c                                | 102 +++++++++++++++-
 gcc/ipa-cp.c                              |   1 +
 gcc/ipa-prop.h                            |   4 -
 gcc/params.def                            |   6 +
 gcc/testsuite/gcc.target/i386/pr80689-1.c |  38 ++++++
 gcc/tree-sra.c                            | 134 ++++++++++++++++------
 gcc/tree-sra.h                            |  34 ++++++
 9 files changed, 282 insertions(+), 45 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr80689-1.c
 create mode 100644 gcc/tree-sra.h

diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index ee409cfe7e4..d90d1e1ade0 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -4698,6 +4698,10 @@ ix86_option_override_internal (bool main_args_p,
 			 ix86_tune_cost->l2_cache_size,
 			 opts->x_param_values,
 			 opts_set->x_param_values);
+  maybe_set_param_value (PARAM_MAX_ELEMS_FOR_ELEMENTWISE_COPY,
+			 4,
+			 opts->x_param_values,
+			 opts_set->x_param_values);
 
   /* Enable sw prefetching at -O3 for CPUS that prefetching is helpful.  */
   if (opts->x_flag_prefetch_loop_arrays < 0
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 1dcdfb51c47..240934b07d8 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -11309,6 +11309,10 @@ away for the unroll-and-jam transformation to be considered profitable.
 @item unroll-jam-max-unroll
 The maximum number of times the outer loop should be unrolled by
 the unroll-and-jam transformation.
+
+@item max-elems-for-elementwise-copy
+The maximum number of elements required to copy a structure and/or
+array element-wise (as opposed to a bulk memory move).
 @end table
 @end table
 
diff --git a/gcc/expr.c b/gcc/expr.c
index f665e187ebb..657043e43fe 100644
--- a/gcc/expr.c
+++ b/gcc/expr.c
@@ -62,7 +62,8 @@ along with GCC; see the file COPYING3.  If not see
 #include "ccmp.h"
 #include "gimple-fold.h"
 #include "rtx-vector-builder.h"
-
+#include "params.h"
+#include "tree-sra.h"
 
 /* If this is nonzero, we do not bother generating VOLATILE
    around volatile memory references, and we are willing to
@@ -5419,6 +5420,77 @@ emit_storent_insn (rtx to, rtx from)
   return maybe_expand_insn (code, 2, ops);
 }
 
+/* Generate code for copying data of type TYPE at SOURCE plus OFFSET to TARGET
+   plus OFFSET, but do so element-wise and/or field-wise for each record and
+   array within TYPE.  TYPE must either be a register type or an aggregate
+   complying with scalarizable_type_p.
+
+   The function is used to copy structures with size bounded by a (small)
+   integer constant, therefore it does not use poly_ints to work with offsets.
+
+   If CALL_PARAM_P is nonzero, this is a store into a call param on the
+   stack, and block moves may need to be treated specially.  */
+
+static void
+emit_move_elementwise (tree type, rtx target, rtx source, HOST_WIDE_INT offset,
+		       int call_param_p)
+{
+  switch (TREE_CODE (type))
+    {
+    case RECORD_TYPE:
+      for (tree fld = TYPE_FIELDS (type); fld; fld = DECL_CHAIN (fld))
+	if (TREE_CODE (fld) == FIELD_DECL)
+	  {
+	    HOST_WIDE_INT fld_offset = offset + int_bit_position (fld);
+	    tree ft = TREE_TYPE (fld);
+	    emit_move_elementwise (ft, target, source, fld_offset,
+				   call_param_p);
+	  }
+      break;
+
+    case ARRAY_TYPE:
+      {
+	tree elem_type = TREE_TYPE (type);
+	HOST_WIDE_INT el_size = tree_to_shwi (TYPE_SIZE (elem_type));
+	gcc_assert (el_size > 0);
+
+	offset_int idx, max;
+	/* Skip (some) zero-length arrays; others have MAXIDX == MINIDX - 1.  */
+	if (extract_min_max_idx_from_array (type, &idx, &max))
+	  {
+	    HOST_WIDE_INT el_offset = offset;
+	    for (; idx <= max; ++idx)
+	      {
+		emit_move_elementwise (elem_type, target, source, el_offset,
+				       call_param_p);
+		el_offset += el_size;
+	      }
+	  }
+      }
+      break;
+    default:
+      machine_mode mode = TYPE_MODE (type);
+
+      rtx ntgt = adjust_address (target, mode, offset / BITS_PER_UNIT);
+      rtx nsrc = adjust_address (source, mode, offset / BITS_PER_UNIT);
+
+      gcc_assert (mode != VOIDmode);
+      if (mode != BLKmode)
+	emit_move_insn (ntgt, nsrc);
+      else
+	{
+	  /* For example vector gimple registers can end up here.  */
+	  rtx size = expand_expr (TYPE_SIZE_UNIT (type), NULL_RTX,
+				  TYPE_MODE (sizetype), EXPAND_NORMAL);
+	  emit_block_move (ntgt, nsrc, size,
+			   (call_param_p
+			    ? BLOCK_OP_CALL_PARM : BLOCK_OP_NORMAL));
+	}
+      break;
+    }
+  return;
+}
+
 /* Generate code for computing expression EXP,
    and storing the value into TARGET.
 
@@ -5775,9 +5847,31 @@ store_expr (tree exp, rtx target, int call_param_p,
 	emit_group_store (target, temp, TREE_TYPE (exp),
 			  int_size_in_bytes (TREE_TYPE (exp)));
       else if (GET_MODE (temp) == BLKmode)
-	emit_block_move (target, temp, expr_size (exp),
-			 (call_param_p
-			  ? BLOCK_OP_CALL_PARM : BLOCK_OP_NORMAL));
+	{
+	  /* Copying smallish BLKmode structures with emit_block_move and thus
+	     by-pieces can result in store-to-load stalls.  So copy some simple
+	     small aggregates element or field-wise.  */
+	  int count = 0;
+	  if (GET_MODE (target) == BLKmode
+	      && AGGREGATE_TYPE_P (TREE_TYPE (exp))
+	      && !TREE_ADDRESSABLE (TREE_TYPE (exp))
+	      && tree_fits_shwi_p (TYPE_SIZE (TREE_TYPE (exp)))
+	      && (tree_to_shwi (TYPE_SIZE (TREE_TYPE (exp)))
+		  <= (PARAM_VALUE (PARAM_MAX_ELEMS_FOR_ELEMENTWISE_COPY)
+		      * MOVE_MAX_PIECES * BITS_PER_UNIT))
+	      && simple_mix_of_records_and_arrays_p (TREE_TYPE (exp), false,
+						     &count)
+	      && (count <= PARAM_VALUE (PARAM_MAX_ELEMS_FOR_ELEMENTWISE_COPY)))
+	    {
+	      gcc_assert (!reverse);
+	      emit_move_elementwise (TREE_TYPE (exp), target, temp, 0,
+				     call_param_p);
+	    }
+	  else
+	    emit_block_move (target, temp, expr_size (exp),
+			     (call_param_p
+			      ? BLOCK_OP_CALL_PARM : BLOCK_OP_NORMAL));
+	}
       /* If we emit a nontemporal store, there is nothing else to do.  */
       else if (nontemporal && emit_storent_insn (target, temp))
 	;
diff --git a/gcc/ipa-cp.c b/gcc/ipa-cp.c
index 42dd4cc2904..8a84bd66478 100644
--- a/gcc/ipa-cp.c
+++ b/gcc/ipa-cp.c
@@ -124,6 +124,7 @@ along with GCC; see the file COPYING3.  If not see
 #include "tree-ssa-ccp.h"
 #include "stringpool.h"
 #include "attribs.h"
+#include "tree-sra.h"
 
 template <typename valtype> class ipcp_value;
 
diff --git a/gcc/ipa-prop.h b/gcc/ipa-prop.h
index 55e10cf0f27..2e660824ad1 100644
--- a/gcc/ipa-prop.h
+++ b/gcc/ipa-prop.h
@@ -803,10 +803,6 @@ void ipa_dump_param (FILE *, struct ipa_node_params *info, int i);
 void ipa_release_body_info (struct ipa_func_body_info *);
 tree ipa_get_callee_param_type (struct cgraph_edge *e, int i);
 
-/* From tree-sra.c:  */
-tree build_ref_for_offset (location_t, tree, poly_int64, bool, tree,
-			   gimple_stmt_iterator *, bool);
-
 /* In ipa-cp.c  */
 void ipa_cp_c_finalize (void);
 
diff --git a/gcc/params.def b/gcc/params.def
index df3a06b5d9d..daeb207951f 100644
--- a/gcc/params.def
+++ b/gcc/params.def
@@ -1352,6 +1352,12 @@ DEFPARAM(PARAM_AVOID_FMA_MAX_BITS,
 	 "Maximum number of bits for which we avoid creating FMAs.",
 	 0, 0, 512)
 
+DEFPARAM (PARAM_MAX_ELEMS_FOR_ELEMENTWISE_COPY,
+	  "max-elems-for-elementwise-copy",
+	  "Maximum number of elements required to consider copying"
+          "a structure or an array by its individual fields or elements",
+	  0, 0, 64)
+
 /*
 
 Local variables:
diff --git a/gcc/testsuite/gcc.target/i386/pr80689-1.c b/gcc/testsuite/gcc.target/i386/pr80689-1.c
new file mode 100644
index 00000000000..4156d4fba45
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr80689-1.c
@@ -0,0 +1,38 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+typedef struct st1
+{
+        long unsigned int a,b;
+        long int c,d;
+}R;
+
+typedef struct st2
+{
+        int  t;
+        R  reg;
+}N;
+
+void Set (const R *region,  N *n_info );
+
+void test(N  *n_obj ,const long unsigned int a, const long unsigned int b,  const long int c,const long int d)
+{
+        R reg;
+
+        reg.a=a;
+        reg.b=b;
+        reg.c=c;
+        reg.d=d;
+        Set (&reg, n_obj);
+
+}
+
+void Set (const R *reg,  N *n_obj )
+{
+        n_obj->reg=(*reg);
+}
+
+
+/* { dg-final { scan-assembler-not "%(x|y|z)mm\[0-9\]+" } } */
+/* { dg-final { scan-assembler-not "movdqu" } } */
+/* { dg-final { scan-assembler-not "movups" } } */
diff --git a/gcc/tree-sra.c b/gcc/tree-sra.c
index 3e30f6bc3d4..4d4bcd2b3b4 100644
--- a/gcc/tree-sra.c
+++ b/gcc/tree-sra.c
@@ -105,6 +105,7 @@ along with GCC; see the file COPYING3.  If not see
 #include "ipa-fnsummary.h"
 #include "ipa-utils.h"
 #include "builtins.h"
+#include "tree-sra.h"
 
 /* Enumeration of all aggregate reductions we can do.  */
 enum sra_mode { SRA_MODE_EARLY_IPA,   /* early call regularization */
@@ -962,14 +963,15 @@ create_access (tree expr, gimple *stmt, bool write)
 }
 
 
-/* Return true iff TYPE is scalarizable - i.e. a RECORD_TYPE or fixed-length
-   ARRAY_TYPE with fields that are either of gimple register types (excluding
-   bit-fields) or (recursively) scalarizable types.  CONST_DECL must be true if
-   we are considering a decl from constant pool.  If it is false, char arrays
-   will be refused.  */
+/* Return true if TYPE consists of RECORD_TYPE or fixed-length ARRAY_TYPE with
+   fields/elements that are not bit-fields and are either register types or
+   recursively comply with simple_mix_of_records_and_arrays_p.  Furthermore, if
+   ALLOW_CHAR_ARRAYS is false, the function will return false also if TYPE
+   contains an array of elements that only have one byte.  */
 
-static bool
-scalarizable_type_p (tree type, bool const_decl)
+bool
+simple_mix_of_records_and_arrays_p (tree type, bool allow_char_arrays,
+				    int *count_p)
 {
   gcc_assert (!is_gimple_reg_type (type));
   if (type_contains_placeholder_p (type))
@@ -986,8 +988,13 @@ scalarizable_type_p (tree type, bool const_decl)
 	  if (DECL_BIT_FIELD (fld))
 	    return false;
 
-	  if (!is_gimple_reg_type (ft)
-	      && !scalarizable_type_p (ft, const_decl))
+	  if (is_gimple_reg_type (ft))
+	    {
+	      if (count_p)
+		(*count_p)++;
+	    }
+	  else if (!simple_mix_of_records_and_arrays_p (ft, allow_char_arrays,
+						       count_p))
 	    return false;
 	}
 
@@ -996,7 +1003,7 @@ scalarizable_type_p (tree type, bool const_decl)
   case ARRAY_TYPE:
     {
       HOST_WIDE_INT min_elem_size;
-      if (const_decl)
+      if (allow_char_arrays)
 	min_elem_size = 0;
       else
 	min_elem_size = BITS_PER_UNIT;
@@ -1017,9 +1024,45 @@ scalarizable_type_p (tree type, bool const_decl)
 	return false;
 
       tree elem = TREE_TYPE (type);
-      if (!is_gimple_reg_type (elem)
-	  && !scalarizable_type_p (elem, const_decl))
-	return false;
+      if (!count_p)
+	{
+	  if (is_gimple_reg_type (elem)
+	      || simple_mix_of_records_and_arrays_p (elem, allow_char_arrays,
+						      NULL))
+	    return true;
+	  else
+	    return false;
+	}
+
+      offset_int min, max;
+      HOST_WIDE_INT ds;
+      bool nonzero = extract_min_max_idx_from_array (type, &min, &max);
+
+      if (nonzero && (min <= max))
+	{
+	  offset_int d = max - min + 1;
+	  if (!wi::fits_shwi_p (d))
+	    return false;
+	  ds = d.to_shwi ();
+	  if (ds > INT_MAX)
+	    return false;
+	}
+      else
+	ds = 0;
+
+      if (is_gimple_reg_type (elem))
+	*count_p += (int) ds;
+      else
+	{
+	  int elc = 0;
+	  if (!simple_mix_of_records_and_arrays_p (elem, allow_char_arrays,
+						   &elc))
+	    return false;
+	  ds *= elc;
+	  if (ds > INT_MAX)
+	    return false;
+	  *count_p += (unsigned) ds;
+	}
       return true;
     }
   default:
@@ -1027,10 +1070,38 @@ scalarizable_type_p (tree type, bool const_decl)
   }
 }
 
-static void scalarize_elem (tree, HOST_WIDE_INT, HOST_WIDE_INT, bool, tree, tree);
+static void scalarize_elem (tree, HOST_WIDE_INT, HOST_WIDE_INT, bool, tree,
+			    tree);
+
+/* For a given array TYPE, return false if its domain does not have any maximum
+   value.  Otherwise calculate MIN and MAX indices of the first and the last
+   element.  */
+
+bool
+extract_min_max_idx_from_array (tree type, offset_int *min, offset_int *max)
+{
+  tree domain = TYPE_DOMAIN (type);
+  tree minidx = TYPE_MIN_VALUE (domain);
+  gcc_assert (TREE_CODE (minidx) == INTEGER_CST);
+  tree maxidx = TYPE_MAX_VALUE (domain);
+  if (!maxidx)
+    return false;
+  gcc_assert (TREE_CODE (maxidx) == INTEGER_CST);
+
+  /* MINIDX and MAXIDX are inclusive, and must be interpreted in
+     DOMAIN (e.g. signed int, whereas min/max may be size_int).  */
+  *min = wi::to_offset (minidx);
+  *max = wi::to_offset (maxidx);
+  if (!TYPE_UNSIGNED (domain))
+    {
+      *min = wi::sext (*min, TYPE_PRECISION (domain));
+      *max = wi::sext (*max, TYPE_PRECISION (domain));
+    }
+  return true;
+}
 
 /* Create total_scalarization accesses for all scalar fields of a member
-   of type DECL_TYPE conforming to scalarizable_type_p.  BASE
+   of type DECL_TYPE conforming to simple_mix_of_records_and_arrays_p.  BASE
    must be the top-most VAR_DECL representing the variable; within that,
    OFFSET locates the member and REF must be the memory reference expression for
    the member.  */
@@ -1057,27 +1128,14 @@ completely_scalarize (tree base, tree decl_type, HOST_WIDE_INT offset, tree ref)
       {
 	tree elemtype = TREE_TYPE (decl_type);
 	tree elem_size = TYPE_SIZE (elemtype);
-	gcc_assert (elem_size && tree_fits_shwi_p (elem_size));
 	HOST_WIDE_INT el_size = tree_to_shwi (elem_size);
 	gcc_assert (el_size > 0);
 
-	tree minidx = TYPE_MIN_VALUE (TYPE_DOMAIN (decl_type));
-	gcc_assert (TREE_CODE (minidx) == INTEGER_CST);
-	tree maxidx = TYPE_MAX_VALUE (TYPE_DOMAIN (decl_type));
+	offset_int idx, max;
 	/* Skip (some) zero-length arrays; others have MAXIDX == MINIDX - 1.  */
-	if (maxidx)
+	if (extract_min_max_idx_from_array (decl_type, &idx, &max))
 	  {
-	    gcc_assert (TREE_CODE (maxidx) == INTEGER_CST);
 	    tree domain = TYPE_DOMAIN (decl_type);
-	    /* MINIDX and MAXIDX are inclusive, and must be interpreted in
-	       DOMAIN (e.g. signed int, whereas min/max may be size_int).  */
-	    offset_int idx = wi::to_offset (minidx);
-	    offset_int max = wi::to_offset (maxidx);
-	    if (!TYPE_UNSIGNED (domain))
-	      {
-		idx = wi::sext (idx, TYPE_PRECISION (domain));
-		max = wi::sext (max, TYPE_PRECISION (domain));
-	      }
 	    for (int el_off = offset; idx <= max; ++idx)
 	      {
 		tree nref = build4 (ARRAY_REF, elemtype,
@@ -1098,10 +1156,10 @@ completely_scalarize (tree base, tree decl_type, HOST_WIDE_INT offset, tree ref)
 }
 
 /* Create total_scalarization accesses for a member of type TYPE, which must
-   satisfy either is_gimple_reg_type or scalarizable_type_p.  BASE must be the
-   top-most VAR_DECL representing the variable; within that, POS and SIZE locate
-   the member, REVERSE gives its torage order. and REF must be the reference
-   expression for it.  */
+   satisfy either is_gimple_reg_type or simple_mix_of_records_and_arrays_p.
+   BASE must be the top-most VAR_DECL representing the variable; within that,
+   POS and SIZE locate the member, REVERSE gives its torage order. and REF must
+   be the reference expression for it.  */
 
 static void
 scalarize_elem (tree base, HOST_WIDE_INT pos, HOST_WIDE_INT size, bool reverse,
@@ -1121,7 +1179,8 @@ scalarize_elem (tree base, HOST_WIDE_INT pos, HOST_WIDE_INT size, bool reverse,
 }
 
 /* Create a total_scalarization access for VAR as a whole.  VAR must be of a
-   RECORD_TYPE or ARRAY_TYPE conforming to scalarizable_type_p.  */
+   RECORD_TYPE or ARRAY_TYPE conforming to
+   simple_mix_of_records_and_arrays_p.  */
 
 static void
 create_total_scalarization_access (tree var)
@@ -2846,8 +2905,9 @@ analyze_all_variable_accesses (void)
       {
 	tree var = candidate (i);
 
-	if (VAR_P (var) && scalarizable_type_p (TREE_TYPE (var),
-						constant_decl_p (var)))
+	if (VAR_P (var)
+	    && simple_mix_of_records_and_arrays_p (TREE_TYPE (var),
+						   constant_decl_p (var), NULL))
 	  {
 	    if (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (var)))
 		<= max_scalarization_size)
diff --git a/gcc/tree-sra.h b/gcc/tree-sra.h
new file mode 100644
index 00000000000..0d8b2431ec5
--- /dev/null
+++ b/gcc/tree-sra.h
@@ -0,0 +1,34 @@
+/* tree-sra.h - Run-time parameters.
+   Copyright (C) 2017 Free Software Foundation, Inc.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free
+Software Foundation; either version 3, or (at your option) any later
+version.
+
+GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received a copy of the GNU General Public License
+along with GCC; see the file COPYING3.  If not see
+<http://www.gnu.org/licenses/>.  */
+
+#ifndef TREE_SRA_H
+#define TREE_SRA_H
+
+
+bool simple_mix_of_records_and_arrays_p (tree type, bool allow_char_arrays,
+					 int *count_pg);
+bool extract_min_max_idx_from_array (tree type, offset_int *idx,
+				     offset_int *max);
+tree build_ref_for_offset (location_t loc, tree base, poly_int64 offset,
+			   bool reverse, tree exp_type,
+			   gimple_stmt_iterator *gsi, bool insert_after);
+
+
+
+#endif	/* TREE_SRA_H */
-- 
2.18.0


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]