[PATCH, rs6000] Folding of vector loads in GIMPLE

Will Schmidt will_schmidt@vnet.ibm.com
Tue Sep 12 17:45:00 GMT 2017


On Tue, 2017-09-12 at 10:22 -0500, Bill Schmidt wrote:
> > On Sep 12, 2017, at 9:41 AM, Will Schmidt <will_schmidt@vnet.ibm.com> wrote:
> > 
> > Hi
> > 
> > [PATCH, rs6000] Folding of vector loads in GIMPLE
> > 
> > - Add code to handle gimple folding for the vec_ld builtins.
> > - Remove the now-obsolete folding code for vec_ld from rs6000-c.c.
> > Surrounding comments have been adjusted slightly so they still read
> > correctly for the vec_st code that remains.
> > 
> > The resulting code is specifically verified by the powerpc/fold-vec-ld-*.c
> > tests, which were posted separately a few minutes ago.
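> > 
> > A minimal example of the kind of call this folding targets (illustrative
> > only, not taken from the fold-vec-ld tests themselves):
> > 
> >   #include <altivec.h>
> > 
> >   vector signed int
> >   load_vsi (int offset, vector signed int *p)
> >   {
> >     /* vec_ld computes (p + offset), masks the address down to a
> >        16-byte boundary, and performs an aligned vector load.  With
> >        this patch that expansion happens during GIMPLE folding instead
> >        of in the C front end (rs6000-c.c).  */
> >     return vec_ld (offset, p);
> >   }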
> > 
> > Regtest completed successfully on power6 and newer (p6, p7, p8le, p8be, p9).
> > 
> > OK for trunk?
> > 
> > Thanks,
> > -Will
> > 
> > [gcc]
> > 
> >        2017-09-12  Will Schmidt  <will_schmidt@vnet.ibm.com>
> > 
> > 	* config/rs6000/rs6000.c (rs6000_gimple_fold_builtin): Add handling
> > 	  for early folding of vector loads (ALTIVEC_BUILTIN_LVX_*).
> > 	* config/rs6000/rs6000-c.c (altivec_resolve_overloaded_builtin):
> > 	  Remove obsoleted code for handling ALTIVEC_BUILTIN_VEC_LD.
> > 
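> > For context, rs6000_gimple_fold_builtin is the rs6000 implementation of
> > the TARGET_GIMPLE_FOLD_BUILTIN hook, which is already wired up on trunk
> > (shown here for orientation only, not part of this patch):
> > 
> >   /* In rs6000.c, predating this patch:  */
> >   #undef TARGET_GIMPLE_FOLD_BUILTIN
> >   #define TARGET_GIMPLE_FOLD_BUILTIN rs6000_gimple_fold_builtin
> > 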
> > diff --git a/gcc/config/rs6000/rs6000-c.c b/gcc/config/rs6000/rs6000-c.c
> > index 897306c..73e14d9 100644
> > --- a/gcc/config/rs6000/rs6000-c.c
> > +++ b/gcc/config/rs6000/rs6000-c.c
> > @@ -6459,92 +6459,19 @@ altivec_resolve_overloaded_builtin (location_t loc, tree fndecl,
> > 		     convert (TREE_TYPE (stmt), arg0));
> >       stmt = build2 (COMPOUND_EXPR, arg1_type, stmt, decl);
> >       return stmt;
> >     }
> > 
> > -  /* Expand vec_ld into an expression that masks the address and
> > -     performs the load.  We need to expand this early to allow
> > +  /* Expand vec_st into an expression that masks the address and
> > +     performs the store.  We need to expand this early to allow
> >      the best aliasing, as by the time we get into RTL we no longer
> >      are able to honor __restrict__, for example.  We may want to
> >      consider this for all memory access built-ins.
> > 
> >      When -maltivec=be is specified, or the wrong number of arguments
> >      is provided, simply punt to existing built-in processing.  */
> > -  if (fcode == ALTIVEC_BUILTIN_VEC_LD
> > -      && (BYTES_BIG_ENDIAN || !VECTOR_ELT_ORDER_BIG)
> > -      && nargs == 2)
> > -    {
> > -      tree arg0 = (*arglist)[0];
> > -      tree arg1 = (*arglist)[1];
> > -
> > -      /* Strip qualifiers like "const" from the pointer arg.  */
> > -      tree arg1_type = TREE_TYPE (arg1);
> > -      if (!POINTER_TYPE_P (arg1_type) && TREE_CODE (arg1_type) != ARRAY_TYPE)
> > -	goto bad;
> > -
> > -      tree inner_type = TREE_TYPE (arg1_type);
> > -      if (TYPE_QUALS (TREE_TYPE (arg1_type)) != 0)
> > -	{
> > -	  arg1_type = build_pointer_type (build_qualified_type (inner_type,
> > -								0));
> > -	  arg1 = fold_convert (arg1_type, arg1);
> > -	}
> > -
> > -      /* Construct the masked address.  Let existing error handling take
> > -	 over if we don't have a constant offset.  */
> > -      arg0 = fold (arg0);
> > -
> > -      if (TREE_CODE (arg0) == INTEGER_CST)
> > -	{
> > -	  if (!ptrofftype_p (TREE_TYPE (arg0)))
> > -	    arg0 = build1 (NOP_EXPR, sizetype, arg0);
> > -
> > -	  tree arg1_type = TREE_TYPE (arg1);
> > -	  if (TREE_CODE (arg1_type) == ARRAY_TYPE)
> > -	    {
> > -	      arg1_type = TYPE_POINTER_TO (TREE_TYPE (arg1_type));
> > -	      tree const0 = build_int_cstu (sizetype, 0);
> > -	      tree arg1_elt0 = build_array_ref (loc, arg1, const0);
> > -	      arg1 = build1 (ADDR_EXPR, arg1_type, arg1_elt0);
> > -	    }
> > -
> > -	  tree addr = fold_build2_loc (loc, POINTER_PLUS_EXPR, arg1_type,
> > -				       arg1, arg0);
> > -	  tree aligned = fold_build2_loc (loc, BIT_AND_EXPR, arg1_type, addr,
> > -					  build_int_cst (arg1_type, -16));
> > -
> > -	  /* Find the built-in to get the return type so we can convert
> > -	     the result properly (or fall back to default handling if the
> > -	     arguments aren't compatible).  */
> > -	  for (desc = altivec_overloaded_builtins;
> > -	       desc->code && desc->code != fcode; desc++)
> > -	    continue;
> > -
> > -	  for (; desc->code == fcode; desc++)
> > -	    if (rs6000_builtin_type_compatible (TREE_TYPE (arg0), desc->op1)
> > -		&& (rs6000_builtin_type_compatible (TREE_TYPE (arg1),
> > -						    desc->op2)))
> > -	      {
> > -		tree ret_type = rs6000_builtin_type (desc->ret_type);
> > -		if (TYPE_MODE (ret_type) == V2DImode)
> > -		  /* Type-based aliasing analysis thinks vector long
> > -		     and vector long long are different and will put them
> > -		     in distinct alias classes.  Force our return type
> > -		     to be a may-alias type to avoid this.  */
> > -		  ret_type
> > -		    = build_pointer_type_for_mode (ret_type, Pmode,
> > -						   true/*can_alias_all*/);
> > -		else
> > -		  ret_type = build_pointer_type (ret_type);
> > -		aligned = build1 (NOP_EXPR, ret_type, aligned);
> > -		tree ret_val = build_indirect_ref (loc, aligned, RO_NULL);
> > -		return ret_val;
> > -	      }
> > -	}
> > -    }
> > 
> > -  /* Similarly for stvx.  */
> >   if (fcode == ALTIVEC_BUILTIN_VEC_ST
> >       && (BYTES_BIG_ENDIAN || !VECTOR_ELT_ORDER_BIG)
> >       && nargs == 3)
> >     {
> >       tree arg0 = (*arglist)[0];
> > diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c
> > index cf744d8..5b14789 100644
> > --- a/gcc/config/rs6000/rs6000.c
> > +++ b/gcc/config/rs6000/rs6000.c
> > @@ -16473,10 +16473,65 @@ rs6000_gimple_fold_builtin (gimple_stmt_iterator *gsi)
> > 	res = gimple_build (&stmts, VIEW_CONVERT_EXPR, TREE_TYPE (lhs), res);
> > 	gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
> > 	update_call_from_tree (gsi, res);
> > 	return true;
> >       }
> > +    /* Vector loads.  */
> > +    case ALTIVEC_BUILTIN_LVX_V16QI:
> > +    case ALTIVEC_BUILTIN_LVX_V8HI:
> > +    case ALTIVEC_BUILTIN_LVX_V4SI:
> > +    case ALTIVEC_BUILTIN_LVX_V4SF:
> > +    case ALTIVEC_BUILTIN_LVX_V2DI:
> > +    case ALTIVEC_BUILTIN_LVX_V2DF:
> > +      {
> > +	 gimple *g;
> > +	 arg0 = gimple_call_arg (stmt, 0);  // offset
> > +	 arg1 = gimple_call_arg (stmt, 1);  // address
> > +
> > +	 /* Limit folding of loads to LE targets.  */
> > +	 if (BYTES_BIG_ENDIAN || VECTOR_ELT_ORDER_BIG)
> > +	   return false;
> 
> Why?  This transformation shouldn't be endian-dependent.

I was seeing failures in some of the existing tests specific to BE:
FAIL: gcc.dg/vmx/ld-be-order.c   -Os  execution test
FAIL: gcc.dg/vmx/ld-vsx-be-order.c   -O0  execution test

I'll give this another attempt without that exclusion and verify.  I'll
admit it is possible the ld*be-order tests were failing for other
reasons.
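
For reference, the folded sequence is just address arithmetic plus an
aligned load, none of which looks byte-order dependent.  A C-level sketch
of what the new GIMPLE amounts to (illustrative only, not an actual dump):

  #include <stdint.h>
  #include <altivec.h>

  vector signed int
  vec_ld_folded (int offset, vector signed int *p)
  {
    /* POINTER_PLUS_EXPR: add the byte offset to the base address.  */
    uintptr_t addr = (uintptr_t) p + (uintptr_t) offset;
    /* BIT_AND_EXPR with -16: clear the low four bits of the address.  */
    uintptr_t aligned = addr & ~(uintptr_t) 15;
    /* MEM_REF: aligned vector load from the masked address.  */
    return *(vector signed int *) aligned;
  }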

Thanks
-Will

> 
> Thanks,
> Bill
> 
> > +
> > +	 lhs = gimple_call_lhs (stmt);
> > +	 location_t loc = gimple_location (stmt);
> > +
> > +	 tree arg1_type = TREE_TYPE (arg1);
> > +	 tree lhs_type = TREE_TYPE (lhs);
> > +
> > +	 /* POINTER_PLUS_EXPR wants the offset to be of type 'sizetype'.  Create
> > +	    the tree using the value from arg0.  The resulting type will match
> > +	    the type of arg1.  */
> > +	 tree temp_offset = create_tmp_reg_or_ssa_name (sizetype);
> > +	 g = gimple_build_assign (temp_offset, NOP_EXPR, arg0);
> > +	 gimple_set_location (g, loc);
> > +	 gsi_insert_before (gsi, g, GSI_SAME_STMT);
> > +	 tree temp_addr = create_tmp_reg_or_ssa_name (arg1_type);
> > +	 g = gimple_build_assign (temp_addr, POINTER_PLUS_EXPR, arg1,
> > +				  temp_offset);
> > +	 gimple_set_location (g, loc);
> > +	 gsi_insert_before (gsi, g, GSI_SAME_STMT);
> > +
> > +	 /* Mask off any lower bits from the address.  */
> > +	 tree alignment_mask = build_int_cst (arg1_type, -16);
> > +	 tree aligned_addr = create_tmp_reg_or_ssa_name (arg1_type);
> > +	 g = gimple_build_assign (aligned_addr, BIT_AND_EXPR,
> > +				 temp_addr, alignment_mask);
> > +	 gimple_set_location (g, loc);
> > +	 gsi_insert_before (gsi, g, GSI_SAME_STMT);
> > +
> > +	 /* Use the build2 helper to set up the mem_ref.  The MEM_REF could also
> > +	    take an offset, but since we've already incorporated the offset
> > +	    above, here we just pass in a zero.  */
> > +	 g = gimple_build_assign (lhs, build2 (MEM_REF, lhs_type, aligned_addr,
> > +						build_int_cst (arg1_type, 0)));
> > +	 gimple_set_location (g, loc);
> > +	 gsi_replace (gsi, g, true);
> > +
> > +	 return true;
> > +
> > +      }
> > +
> >     default:
> > 	if (TARGET_DEBUG_BUILTIN)
> > 	   fprintf (stderr, "gimple builtin intrinsic not matched:%d %s %s\n",
> > 		    fn_code, fn_name1, fn_name2);
> >       break;
> > 
> > 
> 