[RFC] light expander sra for parameters and returns

Jiufu Guo guojiufu@linux.ibm.com
Mon May 29 03:52:17 GMT 2023


Hi,

Previously, I was investigating some struct parameters and returns related
PRs 69143/65421/108073.

Investigating the issues case by case, and drafting patches for each of
them one by one. This would help us to enhance code incrementally.
While, this way, patches would interact with each other and implement
different codes for similar issues (because of the different paths in
gimple/rtl).  We may have a common fix for those issues.

We know a few other related PRs(such as meta-bug PR101926) exist. For those
PRs in different targets with different symptoms (and also different root
cause), I would expect a method could help some of them, but it may
be hard to handle all of them in one fix.

With investigation and check discussion for the issues, I remember a
suggestion from Richard: it would be nice to perform some SRA-like analysis
for the accesses on the structs (parameter/returns).
https://gcc.gnu.org/pipermail/gcc-patches/2022-November/605117.html
This may be a 'fairly common method' for those issues. With this idea,
I drafted a patch as below in this mail.

I also thought about directly using tree-sra.cc, e.g. enhance it and rerun it
at the end of GIMPLE passes. While since some issues are introduced inside
the expander, so below patch also co-works with other parts of the expander.
And since we already have tree-sra in gimple pass, we only need to take more
care on parameter and return in this patch: other decls could be handled
well in tree-sra.

The steps of this patch are:
1. Collect struct type parameters and returns, and then scan the function to
get the accesses on them. And figure out the accesses which would be profitable
to be scalarized (using registers of the parameter/return ). Now, reading on
parameter and writing on returns are checked in the current patch.
2. When/after the scalar registers are determined/expanded for the return or
parameters, compute the corresponding scalar register(s) for each accesses of
the return/parameter, and prepare the scalar RTLs for those accesses.
3. When using/expanding the accesses expression, leverage the computed/prepared
scalars directly.

This patch is tested on ppc64 both LE and BE.
To continue, I would ask for comments and suggestions first. And then I would
update/enhance accordingly.  Thanks in advance!
   

BR,
Jeff (Jiufu)


---
 gcc/cfgexpand.cc                             | 567 ++++++++++++++++++-
 gcc/expr.cc                                  |  15 +-
 gcc/function.cc                              |  26 +-
 gcc/opts.cc                                  |   8 +-
 gcc/testsuite/g++.target/powerpc/pr102024.C  |   2 +-
 gcc/testsuite/gcc.target/powerpc/pr108073.c  |  29 +
 gcc/testsuite/gcc.target/powerpc/pr65421-1.c |   6 +
 gcc/testsuite/gcc.target/powerpc/pr65421-2.c |  32 ++
 8 files changed, 675 insertions(+), 10 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/powerpc/pr108073.c
 create mode 100644 gcc/testsuite/gcc.target/powerpc/pr65421-1.c
 create mode 100644 gcc/testsuite/gcc.target/powerpc/pr65421-2.c

diff --git a/gcc/cfgexpand.cc b/gcc/cfgexpand.cc
index 85a93a547c0..95c29b6b6fe 100644
--- a/gcc/cfgexpand.cc
+++ b/gcc/cfgexpand.cc
@@ -97,6 +97,564 @@ static bool defer_stack_allocation (tree, bool);
 
 static void record_alignment_for_reg_var (unsigned int);
 
+/* For light SRA in expander about paramaters and returns.  */
+namespace {
+
+struct access
+{
+  /* Each accessing on the aggragate is about OFFSET/SIZE and BASE.  */
+  HOST_WIDE_INT offset;
+  HOST_WIDE_INT size;
+  tree base;
+  bool writing;
+
+  /* The context expression of this access.  */
+  tree expr;
+
+  /* The rtx for the access: link to incoming/returning register(s).  */
+  rtx rtx_val;
+};
+
+typedef struct access *access_p;
+
+/* Expr (tree) -> Acess (access_p) map.  */
+static hash_map<tree, access_p> *expr_access_vec;
+
+/* Base (tree) -> Vector (vec<access_p> *) map.  */
+static hash_map<tree, auto_vec<access_p> > *base_access_vec;
+
+/* Return a vector of pointers to accesses for the variable given in BASE or
+ NULL if there is none.  */
+
+static vec<access_p> *
+get_base_access_vector (tree base)
+{
+  return base_access_vec->get (base);
+}
+
+/* Remove DECL from candidates for SRA.  */
+static void
+disqualify_candidate (tree decl)
+{
+  decl = get_base_address (decl);
+  base_access_vec->remove (decl);
+}
+
+/* Create and insert access for EXPR. Return created access, or NULL if it is
+   not possible.  */
+static struct access *
+create_access (tree expr, bool write)
+{
+  poly_int64 poffset, psize, pmax_size;
+  bool reverse;
+
+  tree base
+    = get_ref_base_and_extent (expr, &poffset, &psize, &pmax_size, &reverse);
+
+  if (!DECL_P (base))
+    return NULL;
+
+  vec<access_p> *access_vec = get_base_access_vector (base);
+  if (!access_vec)
+    return NULL;
+
+  /* TODO: support reverse. */
+  if (reverse)
+    {
+      disqualify_candidate (expr);
+      return NULL;
+    }
+
+  HOST_WIDE_INT offset, size, max_size;
+  if (!poffset.is_constant (&offset) || !psize.is_constant (&size)
+      || !pmax_size.is_constant (&max_size))
+    return NULL;
+
+  if (size != max_size || size == 0 || offset < 0 || size < 0
+      || offset + size > tree_to_shwi (DECL_SIZE (base)))
+    return NULL;
+
+  struct access *access = XNEWVEC (struct access, 1);
+
+  memset (access, 0, sizeof (struct access));
+  access->base = base;
+  access->offset = offset;
+  access->size = size;
+  access->expr = expr;
+  access->writing = write;
+  access->rtx_val = NULL_RTX;
+
+  access_vec->safe_push (access);
+
+  return access;
+}
+
+/* Return true if VAR is a candidate for SRA.  */
+static bool
+add_sra_candidate (tree var)
+{
+  tree type = TREE_TYPE (var);
+
+  if (!AGGREGATE_TYPE_P (type) || TREE_THIS_VOLATILE (var)
+      || !COMPLETE_TYPE_P (type) || !tree_fits_shwi_p (TYPE_SIZE (type))
+      || tree_to_shwi (TYPE_SIZE (type)) == 0
+      || TYPE_MAIN_VARIANT (type) == TYPE_MAIN_VARIANT (va_list_type_node))
+    return false;
+
+  base_access_vec->get_or_insert (var);
+
+  return true;
+}
+
+/* Callback of walk_stmt_load_store_addr_ops visit_addr used to remove
+   operands with address taken.  */
+static tree
+visit_addr (tree *tp, int *, void *)
+{
+  tree op = *tp;
+  if (op && DECL_P (op))
+    disqualify_candidate (op);
+
+  return NULL;
+}
+
+/* Scan expression EXPR and create access structures for all accesses to
+   candidates for scalarization.  Return the created access or NULL if none is
+   created.  */
+static struct access *
+build_access_from_expr (tree expr, bool write)
+{
+  if (TREE_CODE (expr) == VIEW_CONVERT_EXPR)
+    expr = TREE_OPERAND (expr, 0);
+
+  if (TREE_CODE (expr) == BIT_FIELD_REF || storage_order_barrier_p (expr)
+      || TREE_THIS_VOLATILE (expr))
+    {
+      disqualify_candidate (expr);
+      return NULL;
+    }
+
+  switch (TREE_CODE (expr))
+    {
+      case MEM_REF: {
+	tree op = TREE_OPERAND (expr, 0);
+	if (TREE_CODE (op) == ADDR_EXPR)
+	  disqualify_candidate (TREE_OPERAND (op, 0));
+	break;
+      }
+    case ADDR_EXPR:
+    case IMAGPART_EXPR:
+    case REALPART_EXPR:
+      disqualify_candidate (TREE_OPERAND (expr, 0));
+      break;
+    case VAR_DECL:
+    case PARM_DECL:
+    case RESULT_DECL:
+    case COMPONENT_REF:
+    case ARRAY_REF:
+    case ARRAY_RANGE_REF:
+      return create_access (expr, write);
+      break;
+    default:
+      break;
+    }
+
+  return NULL;
+}
+
+/* Scan function and look for interesting expressions and create access
+   structures for them.  */
+static void
+scan_function (void)
+{
+  basic_block bb;
+
+  FOR_EACH_BB_FN (bb, cfun)
+    {
+      for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
+	   gsi_next (&gsi))
+	{
+	  gphi *phi = gsi.phi ();
+	  for (size_t i = 0; i < gimple_phi_num_args (phi); i++)
+	    {
+	      tree t = gimple_phi_arg_def (phi, i);
+	      walk_tree (&t, visit_addr, NULL, NULL);
+	    }
+	}
+
+      for (gimple_stmt_iterator gsi = gsi_start_nondebug_after_labels_bb (bb);
+	   !gsi_end_p (gsi); gsi_next_nondebug (&gsi))
+	{
+	  gimple *stmt = gsi_stmt (gsi);
+	  switch (gimple_code (stmt))
+	    {
+	      case GIMPLE_RETURN: {
+		tree r = gimple_return_retval (as_a<greturn *> (stmt));
+		if (r && VAR_P (r) && r != DECL_RESULT (current_function_decl))
+		  build_access_from_expr (r, true);
+	      }
+	      break;
+	    case GIMPLE_ASSIGN:
+	      if (gimple_assign_single_p (stmt) && !gimple_clobber_p (stmt))
+		{
+		  tree lhs = gimple_assign_lhs (stmt);
+		  tree rhs = gimple_assign_rhs1 (stmt);
+		  if (TREE_CODE (rhs) == CONSTRUCTOR)
+		    disqualify_candidate (lhs);
+		  else
+		    {
+		      build_access_from_expr (rhs, false);
+		      build_access_from_expr (lhs, true);
+		    }
+		}
+	      break;
+	    default:
+	      walk_gimple_op (stmt, visit_addr, NULL);
+	      break;
+	    }
+	}
+    }
+}
+
+/* Collect the parameter and returns with type which is suitable for
+ * scalarization.  */
+static bool
+collect_light_sra_candidates (void)
+{
+  bool ret = false;
+
+  /* Collect parameters.  */
+  for (tree parm = DECL_ARGUMENTS (current_function_decl); parm;
+       parm = DECL_CHAIN (parm))
+    ret |= add_sra_candidate (parm);
+
+  /* Collect VARs on returns.  */
+  if (DECL_RESULT (current_function_decl))
+    {
+      edge_iterator ei;
+      edge e;
+      FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds)
+	if (greturn *r = safe_dyn_cast<greturn *> (*gsi_last_bb (e->src)))
+	  {
+	    tree val = gimple_return_retval (r);
+	    if (val && VAR_P (val))
+	      ret |= add_sra_candidate (val);
+	  }
+    }
+
+  return ret;
+}
+
+/* Now, only scalarize the parms only with reading
+   or returns only with writing.  */
+bool
+check_access_vec (tree const &base, auto_vec<access_p> const &access_vec,
+		  auto_vec<tree> *unqualify_vec)
+{
+  bool read = false;
+  bool write = false;
+  for (unsigned int j = 0; j < access_vec.length (); j++)
+    {
+      struct access *access = access_vec[j];
+      if (access->writing)
+	write = true;
+      else
+	read = true;
+
+      if (write && read)
+	break;
+    }
+  if ((write && read) || (!write && !read))
+    unqualify_vec->safe_push (base);
+
+  return true;
+}
+
+/* Analyze all the accesses, remove those inprofitable candidates.
+   And build the expr->access map.  */
+static void
+analyze_accesses ()
+{
+  auto_vec<tree> unqualify_vec;
+  base_access_vec->traverse<auto_vec<tree> *, check_access_vec> (
+    &unqualify_vec);
+
+  tree base;
+  unsigned i;
+  FOR_EACH_VEC_ELT (unqualify_vec, i, base)
+    disqualify_candidate (base);
+}
+
+static void
+prepare_expander_sra ()
+{
+  if (optimize <= 0)
+    return;
+
+  base_access_vec = new hash_map<tree, auto_vec<access_p> >;
+  expr_access_vec = new hash_map<tree, access_p>;
+
+  if (collect_light_sra_candidates ())
+    {
+      scan_function ();
+      analyze_accesses ();
+    }
+}
+
+static void
+free_expander_sra ()
+{
+  if (optimize <= 0 || !expr_access_vec)
+    return;
+  delete expr_access_vec;
+  expr_access_vec = 0;
+  delete base_access_vec;
+  base_access_vec = 0;
+}
+} /* namespace */
+
+/* Check If there is an sra access for the expr.
+   Return the correspond scalar sym for the access. */
+rtx
+get_scalar_rtx_for_aggregate_expr (tree expr)
+{
+  if (!expr_access_vec)
+    return NULL_RTX;
+  access_p *access = expr_access_vec->get (expr);
+  return access ? (*access)->rtx_val : NULL_RTX;
+}
+
+extern rtx
+expand_shift (enum tree_code, machine_mode, rtx, poly_int64, rtx, int);
+
+/* Compute/Set RTX registers for those accesses on BASE.  */
+void
+set_scalar_rtx_for_aggregate_access (tree base, rtx regs)
+{
+  if (!base_access_vec)
+    return;
+  vec<access_p> *access_vec = get_base_access_vector (base);
+  if (!access_vec)
+    return;
+
+  /* Go through each access, compute corresponding rtx(regs or subregs)
+     for the expression.  */
+  int n = access_vec->length ();
+  int cur_access_index = 0;
+  for (; cur_access_index < n; cur_access_index++)
+    {
+      access_p acc = (*access_vec)[cur_access_index];
+      machine_mode expr_mode = TYPE_MODE (TREE_TYPE (acc->expr));
+      /* non BLK in mult registers*/
+      if (expr_mode != BLKmode
+	  && known_gt (acc->size, GET_MODE_BITSIZE (word_mode)))
+	break;
+
+      int start_index = -1;
+      int end_index = -1;
+      HOST_WIDE_INT left_margin_bits = 0;
+      HOST_WIDE_INT right_margin_bits = 0;
+      int cur_index = XEXP (XVECEXP (regs, 0, 0), 0) ? 0 : 1;
+      for (; cur_index < XVECLEN (regs, 0); cur_index++)
+	{
+	  rtx slot = XVECEXP (regs, 0, cur_index);
+	  HOST_WIDE_INT off = UINTVAL (XEXP (slot, 1)) * BITS_PER_UNIT;
+	  HOST_WIDE_INT size
+	    = GET_MODE_BITSIZE (GET_MODE (XEXP (slot, 0))).to_constant ();
+	  if (off <= acc->offset && off + size > acc->offset)
+	    {
+	      start_index = cur_index;
+	      left_margin_bits = acc->offset - off;
+	    }
+	  if (off + size >= acc->offset + acc->size)
+	    {
+	      end_index = cur_index;
+	      right_margin_bits = off + size - (acc->offset + acc->size);
+	      break;
+	    }
+	}
+      /* accessing pading and outof bound.  */
+      if (start_index < 0 || end_index < 0)
+	break;
+
+      /* Need a parallel for possible multi-registers. */
+      if (expr_mode == BLKmode || end_index > start_index)
+	{
+	  /* Can not support start from middle of a register.  */
+	  if (left_margin_bits != 0)
+	    break;
+
+	  int len = end_index - start_index + 1;
+	  const int margin = 3; /* more space for SI, HI, QI.  */
+	  rtx *tmps = XALLOCAVEC (rtx, len + (right_margin_bits ? margin : 0));
+
+	  HOST_WIDE_INT start_off
+	    = UINTVAL (XEXP (XVECEXP (regs, 0, start_index), 1));
+	  int pos = 0;
+	  for (; pos < len - (right_margin_bits ? 1 : 0); pos++)
+	    {
+	      int index = start_index + pos;
+	      rtx orig_reg = XEXP (XVECEXP (regs, 0, index), 0);
+	      machine_mode mode = GET_MODE (orig_reg);
+	      rtx reg = NULL_RTX;
+	      if (HARD_REGISTER_P (orig_reg))
+		{
+		  /* Reading from param hard reg need to be moved to a temp.  */
+		  gcc_assert (!acc->writing);
+		  reg = gen_reg_rtx (mode);
+		  emit_move_insn (reg, orig_reg);
+		}
+	      else
+		reg = orig_reg;
+
+	      HOST_WIDE_INT off = UINTVAL (XEXP (XVECEXP (regs, 0, index), 1));
+	      tmps[pos]
+		= gen_rtx_EXPR_LIST (mode, reg, GEN_INT (off - start_off));
+	    }
+
+	  /* There are some fields are in part of registers.   */
+	  if (right_margin_bits != 0)
+	    {
+	      if (acc->writing)
+		break;
+
+	      gcc_assert ((right_margin_bits % BITS_PER_UNIT) == 0);
+	      HOST_WIDE_INT off_byte
+		= UINTVAL (XEXP (XVECEXP (regs, 0, end_index), 1)) - start_off;
+	      rtx orig_reg = XEXP (XVECEXP (regs, 0, end_index), 0);
+	      machine_mode orig_mode = GET_MODE (orig_reg);
+	      gcc_assert (GET_MODE_CLASS (orig_mode) == MODE_INT);
+
+	      machine_mode mode_aux[] = {SImode, HImode, QImode};
+	      HOST_WIDE_INT reg_size
+		= GET_MODE_BITSIZE (orig_mode).to_constant ();
+	      HOST_WIDE_INT off_bits = 0;
+	      for (unsigned long j = 0;
+		   j < sizeof (mode_aux) / sizeof (mode_aux[0]); j++)
+		{
+		  HOST_WIDE_INT submode_bitsize
+		    = GET_MODE_BITSIZE (mode_aux[j]).to_constant ();
+		  if (reg_size - right_margin_bits - off_bits
+		      >= submode_bitsize)
+		    {
+		      rtx reg = gen_reg_rtx (orig_mode);
+		      emit_move_insn (reg, orig_reg);
+
+		      poly_uint64 lowpart_off
+			= subreg_lowpart_offset (mode_aux[j], orig_mode);
+		      int lowpart_off_bits
+			= lowpart_off.to_constant () * BITS_PER_UNIT;
+		      int shift_bits = lowpart_off_bits >= off_bits
+					 ? (lowpart_off_bits - off_bits)
+					 : (off_bits - lowpart_off_bits);
+		      if (shift_bits > 0)
+			reg = expand_shift (RSHIFT_EXPR, orig_mode, reg,
+					    shift_bits, NULL, 1);
+		      rtx subreg = gen_lowpart (mode_aux[j], reg);
+		      rtx off = GEN_INT (off_byte);
+		      tmps[pos++]
+			= gen_rtx_EXPR_LIST (mode_aux[j], subreg, off);
+		      off_byte += submode_bitsize / BITS_PER_UNIT;
+		      off_bits += submode_bitsize;
+		    }
+		}
+	    }
+
+	  /* Currently, PARALLELs with register elements for param/returns
+	     are using BLKmode.  */
+	  acc->rtx_val = gen_rtx_PARALLEL (TYPE_MODE (TREE_TYPE (acc->expr)),
+					   gen_rtvec_v (pos, tmps));
+	  continue;
+	}
+
+      /* The access corresponds to one reg.  */
+      if (end_index == start_index && left_margin_bits == 0
+	  && right_margin_bits == 0)
+	{
+	  rtx orig_reg = XEXP (XVECEXP (regs, 0, start_index), 0);
+	  rtx reg = NULL_RTX;
+	  if (HARD_REGISTER_P (orig_reg))
+	    {
+	      /* Reading from param hard reg need to be moved to a temp.  */
+	      gcc_assert (!acc->writing);
+	      reg = gen_reg_rtx (GET_MODE (orig_reg));
+	      emit_move_insn (reg, orig_reg);
+	    }
+	  else
+	    reg = orig_reg;
+	  if (GET_MODE (orig_reg) != expr_mode)
+	    reg = gen_lowpart (expr_mode, reg);
+
+	  acc->rtx_val = reg;
+	  continue;
+	}
+
+      /* It is accessing a filed which is part of a register.  */
+      scalar_int_mode imode;
+      if (!acc->writing && end_index == start_index
+	  && int_mode_for_size (acc->size, 1).exists (&imode))
+	{
+	  /* get and copy original register inside the param.  */
+	  rtx orig_reg = XEXP (XVECEXP (regs, 0, start_index), 0);
+	  machine_mode mode = GET_MODE (orig_reg);
+	  gcc_assert (GET_MODE_CLASS (mode) == MODE_INT);
+	  rtx reg = gen_reg_rtx (mode);
+	  emit_move_insn (reg, orig_reg);
+
+	  /* shift to expect part. */
+	  poly_uint64 lowpart_off = subreg_lowpart_offset (imode, mode);
+	  int lowpart_off_bits = lowpart_off.to_constant () * BITS_PER_UNIT;
+	  int shift_bits = lowpart_off_bits >= left_margin_bits
+			     ? (lowpart_off_bits - left_margin_bits)
+			     : (left_margin_bits - lowpart_off_bits);
+	  if (shift_bits > 0)
+	    reg = expand_shift (RSHIFT_EXPR, mode, reg, shift_bits, NULL, 1);
+
+	  /* move corresond part subreg to result.  */
+	  rtx subreg = gen_lowpart (imode, reg);
+	  rtx result = gen_reg_rtx (imode);
+	  emit_move_insn (result, subreg);
+
+	  if (expr_mode != imode)
+	    result = gen_lowpart (expr_mode, result);
+
+	  acc->rtx_val = result;
+	  continue;
+	}
+
+      break;
+    }
+
+  /* Some access expr(s) are not scalarized.  */
+  if (cur_access_index != n)
+    disqualify_candidate (base);
+  else
+    {
+      /* Add elements to expr->access map.  */
+      for (int j = 0; j < n; j++)
+	{
+	  access_p access = (*access_vec)[j];
+	  expr_access_vec->put (access->expr, access);
+	}
+    }
+}
+
+void
+set_scalar_rtx_for_returns ()
+{
+  tree res = DECL_RESULT (current_function_decl);
+  gcc_assert (res);
+  edge_iterator ei;
+  edge e;
+  FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds)
+    if (greturn *r = safe_dyn_cast<greturn *> (*gsi_last_bb (e->src)))
+      {
+	tree val = gimple_return_retval (r);
+	if (val && VAR_P (val))
+	  set_scalar_rtx_for_aggregate_access (val, DECL_RTL (res));
+      }
+}
+
 /* Return an expression tree corresponding to the RHS of GIMPLE
    statement STMT.  */
 
@@ -3778,7 +4336,8 @@ expand_return (tree retval)
 
   /* If we are returning the RESULT_DECL, then the value has already
      been stored into it, so we don't have to do anything special.  */
-  if (TREE_CODE (retval_rhs) == RESULT_DECL)
+  if (TREE_CODE (retval_rhs) == RESULT_DECL
+      || get_scalar_rtx_for_aggregate_expr (retval_rhs))
     expand_value_return (result_rtl);
 
   /* If the result is an aggregate that is being returned in one (or more)
@@ -4422,6 +4981,9 @@ expand_debug_expr (tree exp)
   int unsignedp = TYPE_UNSIGNED (TREE_TYPE (exp));
   addr_space_t as;
   scalar_int_mode op0_mode, op1_mode, addr_mode;
+  rtx x = get_scalar_rtx_for_aggregate_expr (exp);
+  if (x)
+    return NULL_RTX;/* optimized out.  */
 
   switch (TREE_CODE_CLASS (TREE_CODE (exp)))
     {
@@ -6630,6 +7192,8 @@ pass_expand::execute (function *fun)
 	    avoid_deep_ter_for_debug (gsi_stmt (gsi), 0);
     }
 
+  prepare_expander_sra ();
+
   /* Mark arrays indexed with non-constant indices with TREE_ADDRESSABLE.  */
   auto_bitmap forced_stack_vars;
   discover_nonconstant_array_refs (forced_stack_vars);
@@ -7062,6 +7626,7 @@ pass_expand::execute (function *fun)
       loop_optimizer_finalize ();
     }
 
+  free_expander_sra ();
   timevar_pop (TV_POST_EXPAND);
 
   return 0;
diff --git a/gcc/expr.cc b/gcc/expr.cc
index 56b51876f80..b970f98e689 100644
--- a/gcc/expr.cc
+++ b/gcc/expr.cc
@@ -100,6 +100,7 @@ static void do_tablejump (rtx, machine_mode, rtx, rtx, rtx,
 static rtx const_vector_from_tree (tree);
 static tree tree_expr_size (const_tree);
 static void convert_mode_scalar (rtx, rtx, int);
+rtx get_scalar_rtx_for_aggregate_expr (tree);
 
 
 /* This is run to set up which modes can be used
@@ -5623,11 +5624,12 @@ expand_assignment (tree to, tree from, bool nontemporal)
      Assignment of an array element at a constant index, and assignment of
      an array element in an unaligned packed structure field, has the same
      problem.  Same for (partially) storing into a non-memory object.  */
-  if (handled_component_p (to)
-      || (TREE_CODE (to) == MEM_REF
-	  && (REF_REVERSE_STORAGE_ORDER (to)
-	      || mem_ref_refers_to_non_mem_p (to)))
-      || TREE_CODE (TREE_TYPE (to)) == ARRAY_TYPE)
+  if (!get_scalar_rtx_for_aggregate_expr (to)
+      && (handled_component_p (to)
+	  || (TREE_CODE (to) == MEM_REF
+	      && (REF_REVERSE_STORAGE_ORDER (to)
+		  || mem_ref_refers_to_non_mem_p (to)))
+	  || TREE_CODE (TREE_TYPE (to)) == ARRAY_TYPE))
     {
       machine_mode mode1;
       poly_int64 bitsize, bitpos;
@@ -8995,6 +8997,9 @@ expand_expr_real (tree exp, rtx target, machine_mode tmode,
       ret = CONST0_RTX (tmode);
       return ret ? ret : const0_rtx;
     }
+  rtx x = get_scalar_rtx_for_aggregate_expr (exp);
+  if (x)
+    return x;
 
   ret = expand_expr_real_1 (exp, target, tmode, modifier, alt_rtl,
 			    inner_reference_p);
diff --git a/gcc/function.cc b/gcc/function.cc
index 82102ed78d7..262d3f17e72 100644
--- a/gcc/function.cc
+++ b/gcc/function.cc
@@ -2742,6 +2742,9 @@ assign_parm_find_stack_rtl (tree parm, struct assign_parm_data_one *data)
   data->stack_parm = stack_parm;
 }
 
+extern void
+set_scalar_rtx_for_aggregate_access (tree, rtx);
+
 /* A subroutine of assign_parms.  Adjust DATA->ENTRY_RTL such that it's
    always valid and contiguous.  */
 
@@ -3117,8 +3120,21 @@ assign_parm_setup_block (struct assign_parm_data_all *all,
 	  emit_move_insn (mem, entry_parm);
 	}
       else
-	move_block_from_reg (REGNO (entry_parm), mem,
-			     size_stored / UNITS_PER_WORD);
+	{
+	  int regno = REGNO (entry_parm);
+	  int nregs = size_stored / UNITS_PER_WORD;
+	  move_block_from_reg (regno, mem, nregs);
+
+	  rtx *tmps = XALLOCAVEC (rtx, nregs);
+	  machine_mode mode = word_mode;
+	  for (int i = 0; i < nregs; i++)
+	    tmps[i] = gen_rtx_EXPR_LIST (
+	      VOIDmode, gen_rtx_REG (mode, regno + i),
+	      GEN_INT (GET_MODE_SIZE (mode).to_constant () * i));
+
+	  rtx regs = gen_rtx_PARALLEL (BLKmode, gen_rtvec_v (nregs, tmps));
+	  set_scalar_rtx_for_aggregate_access (parm, regs);
+	}
     }
   else if (data->stack_parm == 0 && !TYPE_EMPTY_P (data->arg.type))
     {
@@ -3718,6 +3734,10 @@ assign_parms (tree fndecl)
       else
 	set_decl_incoming_rtl (parm, data.entry_parm, false);
 
+      rtx incoming = DECL_INCOMING_RTL (parm);
+      if (GET_CODE (incoming) == PARALLEL)
+	set_scalar_rtx_for_aggregate_access (parm, incoming);
+
       assign_parm_adjust_stack_rtl (&data);
 
       if (assign_parm_setup_block_p (&data))
@@ -5037,6 +5057,7 @@ stack_protect_epilogue (void)
    the function's parameters, which must be run at any return statement.  */
 
 bool currently_expanding_function_start;
+extern void set_scalar_rtx_for_returns ();
 void
 expand_function_start (tree subr)
 {
@@ -5138,6 +5159,7 @@ expand_function_start (tree subr)
 	    {
 	      gcc_assert (GET_CODE (hard_reg) == PARALLEL);
 	      set_parm_rtl (res, gen_group_rtx (hard_reg));
+	      set_scalar_rtx_for_returns ();
 	    }
 	}
 
diff --git a/gcc/opts.cc b/gcc/opts.cc
index 86b94d62b58..5e129a1cc49 100644
--- a/gcc/opts.cc
+++ b/gcc/opts.cc
@@ -1559,6 +1559,10 @@ public:
   vec<const char *> m_values;
 };
 
+#ifdef __GNUC__
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wformat-truncation"
+#endif
 /* Print help for a specific front-end, etc.  */
 static void
 print_filtered_help (unsigned int include_flags,
@@ -1913,7 +1917,9 @@ print_filtered_help (unsigned int include_flags,
       printf ("\n\n");
     }
 }
-
+#ifdef __GNUC__
+#pragma GCC diagnostic pop
+#endif
 /* Display help for a specified type of option.
    The options must have ALL of the INCLUDE_FLAGS set
    ANY of the flags in the ANY_FLAGS set
diff --git a/gcc/testsuite/g++.target/powerpc/pr102024.C b/gcc/testsuite/g++.target/powerpc/pr102024.C
index 769585052b5..c8995cae707 100644
--- a/gcc/testsuite/g++.target/powerpc/pr102024.C
+++ b/gcc/testsuite/g++.target/powerpc/pr102024.C
@@ -5,7 +5,7 @@
 // Test that a zero-width bit field in an otherwise homogeneous aggregate
 // generates a psabi warning and passes arguments in GPRs.
 
-// { dg-final { scan-assembler-times {\mstd\M} 4 } }
+// { dg-final { scan-assembler-times {\mmtvsrd\M} 4 } }
 
 struct a_thing
 {
diff --git a/gcc/testsuite/gcc.target/powerpc/pr108073.c b/gcc/testsuite/gcc.target/powerpc/pr108073.c
new file mode 100644
index 00000000000..7dd1a4a326a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr108073.c
@@ -0,0 +1,29 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -save-temps" } */
+
+typedef struct DF {double a[4]; short s1; short s2; short s3; short s4; } DF;
+typedef struct SF {float a[4]; int i1; int i2; } SF;
+
+/* { dg-final { scan-assembler-times {\mmtvsrd\M} 3 {target { has_arch_ppc64 && has_arch_pwr8 } } } } */
+/* { dg-final { scan-assembler-not {\mlwz\M} {target { has_arch_ppc64 && has_arch_pwr8 } } } } */
+/* { dg-final { scan-assembler-not {\mlhz\M} {target { has_arch_ppc64 && has_arch_pwr8 } } } } */
+short  __attribute__ ((noipa)) foo_hi (DF a, int flag){if (flag == 2)return a.s2+a.s3;return 0;}
+int  __attribute__ ((noipa)) foo_si (SF a, int flag){if (flag == 2)return a.i2+a.i1;return 0;}
+double __attribute__ ((noipa)) foo_df (DF arg, int flag){if (flag == 2)return arg.a[3];else return 0.0;}
+float  __attribute__ ((noipa)) foo_sf (SF arg, int flag){if (flag == 2)return arg.a[2]; return 0;}
+float  __attribute__ ((noipa)) foo_sf1 (SF arg, int flag){if (flag == 2)return arg.a[1];return 0;}
+
+DF gdf = {{1.0,2.0,3.0,4.0}, 1, 2, 3, 4};
+SF gsf = {{1.0f,2.0f,3.0f,4.0f}, 1, 2};
+
+int main()
+{
+  if (!(foo_hi (gdf, 2) == 5 && foo_si (gsf, 2) == 3 && foo_df (gdf, 2) == 4.0
+	&& foo_sf (gsf, 2) == 3.0 && foo_sf1 (gsf, 2) == 2.0))
+    __builtin_abort ();
+  if (!(foo_hi (gdf, 1) == 0 && foo_si (gsf, 1) == 0 && foo_df (gdf, 1) == 0
+	&& foo_sf (gsf, 1) == 0 && foo_sf1 (gsf, 1) == 0))
+    __builtin_abort ();
+  return 0;
+}
+
diff --git a/gcc/testsuite/gcc.target/powerpc/pr65421-1.c b/gcc/testsuite/gcc.target/powerpc/pr65421-1.c
new file mode 100644
index 00000000000..4e1f87f7939
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr65421-1.c
@@ -0,0 +1,6 @@
+/* PR target/65421 */
+/* { dg-options "-O2" } */
+
+typedef struct LARGE {double a[4]; int arr[32];} LARGE;
+LARGE foo (LARGE a){return a;}
+/* { dg-final { scan-assembler-times {\mmemcpy\M} 1 } } */
diff --git a/gcc/testsuite/gcc.target/powerpc/pr65421-2.c b/gcc/testsuite/gcc.target/powerpc/pr65421-2.c
new file mode 100644
index 00000000000..8a8e1a0e996
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr65421-2.c
@@ -0,0 +1,32 @@
+/* PR target/65421 */
+/* { dg-options "-O2" } */
+/* { dg-require-effective-target powerpc_elfv2 } */
+/* { dg-require-effective-target has_arch_ppc64 } */
+
+typedef struct FLOATS
+{
+  double a[3];
+} FLOATS;
+
+/* 3 lfd after returns also optimized */
+/* FLOATS ret_arg_pt (FLOATS *a){return *a;} */
+
+/* 3 stfd */
+void st_arg (FLOATS a, FLOATS *p) {*p = a;}
+/* { dg-final { scan-assembler-times {\mstfd\M} 3 } } */
+
+/* blr */
+FLOATS ret_arg (FLOATS a) {return a;}
+
+typedef struct MIX
+{
+  double a[2];
+  long l;
+} MIX;
+
+/* std 3 param regs to return slot */
+MIX ret_arg1 (MIX a) {return a;}
+/* { dg-final { scan-assembler-times {\mstd\M} 3 } } */
+
+/* count insns */
+/* { dg-final { scan-assembler-times {(?n)^\s+[a-z]} 9 } } */
-- 
2.39.1



More information about the Gcc-patches mailing list