[PATCH] vect: Add bias parameter for partial vectorization

Kewen.Lin linkw@linux.ibm.com
Thu Nov 4 03:59:18 GMT 2021


Hi Robin,

on 2021/11/3 上午4:16, Robin Dapp wrote:
> Hi,
> 
> thanks for the helpful comments. The attached v2 addresses the following
> points from them:
> 
>  - Save the bias in loop_vinfo and set it once in vect_verify_loop_lens.
>  - Add code to handle the bias in vect_set_loop_controls_directly.
>  - Adjust costing.
>  - Add comments for the new len_load parameter as well as change wording.
>  - Include the rs6000 change directly.
> 
> I'm not entirely satisfied with the bias code for the loop controls,
> mainly because of side effects I might not have considered.  The test
> suites show no new regressions and bootstrap succeeded, though.
> 
> Regards
>  Robin
> 
> 
> vll-v2.patch
> 
> diff --git a/gcc/config/rs6000/vsx.md b/gcc/config/rs6000/vsx.md
> index bf033e31c1c..dc2756f83e9 100644
> --- a/gcc/config/rs6000/vsx.md
> +++ b/gcc/config/rs6000/vsx.md
> @@ -5637,7 +5637,8 @@
>  (define_expand "len_load_v16qi"
>    [(match_operand:V16QI 0 "vlogical_operand")
>     (match_operand:V16QI 1 "memory_operand")
> -   (match_operand:QI 2 "gpc_reg_operand")]
> +   (match_operand:QI 2 "gpc_reg_operand")
> +   (match_operand:QI 3 "zero_constant")]
>    "TARGET_P9_VECTOR && TARGET_64BIT"
>  {
>    rtx mem = XEXP (operands[1], 0);
> @@ -5651,6 +5652,7 @@
>    [(match_operand:V16QI 0 "memory_operand")
>     (match_operand:V16QI 1 "vlogical_operand")
>     (match_operand:QI 2 "gpc_reg_operand")
> +   (match_operand:QI 3 "zero_constant")
>    ]
>    "TARGET_P9_VECTOR && TARGET_64BIT"
>  {

Nice, thanks!

> diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi
> index 2b41cb7fb7b..265c76f1609 100644
> --- a/gcc/doc/md.texi
> +++ b/gcc/doc/md.texi
> @@ -5213,7 +5213,10 @@ which must be a vector mode.  Operand 2 has whichever integer mode the
>  target prefers.  If operand 2 exceeds the number of elements in mode
>  @var{m}, the behavior is undefined.  If the target prefers the length
>  to be measured in bytes rather than elements, it should only implement
> -this pattern for vectors of @code{QI} elements.
> +this pattern for vectors of @code{QI} elements.  Operand 3 specifies
> +a bias predicate that determines whether a length of zero is permitted
> +or not.  If permitted, the predicate should only allow a zero immediate,
> +otherwhise it should only allow an immediate value of -1.
>  
>  This pattern is not allowed to @code{FAIL}.
>  
> @@ -5226,7 +5229,10 @@ a vector mode.  Operand 2 has whichever integer mode the target prefers.
>  If operand 2 exceeds the number of elements in mode @var{m}, the behavior
>  is undefined.  If the target prefers the length to be measured in bytes
>  rather than elements, it should only implement this pattern for vectors
> -of @code{QI} elements.
> +of @code{QI} elements.  Operand 3 specifies a bias predicate that
> +determines whether a length of zero is permitted or not.  If permitted,
> +the predicate should only allow a zero constant, otherwhise it should
> +only allow an immediate value of -1.
>  

Nit: s/otherwhise/otherwise/ (same for len_load).

Since these optabs also cover lengths measured in elements (although there is
no such usage currently), I guess the current bias -1 support would work well
for lengths in elements too?  Nice!  :)

>  This pattern is not allowed to @code{FAIL}.
>  
> diff --git a/gcc/internal-fn.c b/gcc/internal-fn.c
> index 8312d08aab2..90fdc440248 100644
> --- a/gcc/internal-fn.c
> +++ b/gcc/internal-fn.c
> @@ -2696,9 +2696,9 @@ expand_call_mem_ref (tree type, gcall *stmt, int index)
>  static void
>  expand_partial_load_optab_fn (internal_fn, gcall *stmt, convert_optab optab)
>  {
> -  class expand_operand ops[3];
> -  tree type, lhs, rhs, maskt;
> -  rtx mem, target, mask;
> +  class expand_operand ops[4];
> +  tree type, lhs, rhs, maskt, biast;
> +  rtx mem, target, mask, bias;
>    insn_code icode;
>  
>    maskt = gimple_call_arg (stmt, 2);
> @@ -2723,11 +2723,20 @@ expand_partial_load_optab_fn (internal_fn, gcall *stmt, convert_optab optab)
>    create_output_operand (&ops[0], target, TYPE_MODE (type));
>    create_fixed_operand (&ops[1], mem);
>    if (optab == len_load_optab)
> -    create_convert_operand_from (&ops[2], mask, TYPE_MODE (TREE_TYPE (maskt)),
> -				 TYPE_UNSIGNED (TREE_TYPE (maskt)));
> +    {
> +      create_convert_operand_from (&ops[2], mask, TYPE_MODE (TREE_TYPE (maskt)),
> +				   TYPE_UNSIGNED (TREE_TYPE (maskt)));
> +      biast = gimple_call_arg (stmt, 3);
> +      bias = expand_normal (biast);
> +      create_input_operand (&ops[3], bias, QImode);
> +      expand_insn (icode, 4, ops);
> +    }
>    else
> +    {
>      create_input_operand (&ops[2], mask, TYPE_MODE (TREE_TYPE (maskt)));
> -  expand_insn (icode, 3, ops);
> +    expand_insn (icode, 3, ops);
> +    }
> +
>    if (!rtx_equal_p (target, ops[0].value))
>      emit_move_insn (target, ops[0].value);
>  }
> @@ -2741,9 +2750,9 @@ expand_partial_load_optab_fn (internal_fn, gcall *stmt, convert_optab optab)
>  static void
>  expand_partial_store_optab_fn (internal_fn, gcall *stmt, convert_optab optab)
>  {
> -  class expand_operand ops[3];
> -  tree type, lhs, rhs, maskt;
> -  rtx mem, reg, mask;
> +  class expand_operand ops[4];
> +  tree type, lhs, rhs, maskt, biast;
> +  rtx mem, reg, mask, bias;
>    insn_code icode;
>  
>    maskt = gimple_call_arg (stmt, 2);
> @@ -2766,11 +2775,19 @@ expand_partial_store_optab_fn (internal_fn, gcall *stmt, convert_optab optab)
>    create_fixed_operand (&ops[0], mem);
>    create_input_operand (&ops[1], reg, TYPE_MODE (type));
>    if (optab == len_store_optab)
> -    create_convert_operand_from (&ops[2], mask, TYPE_MODE (TREE_TYPE (maskt)),
> -				 TYPE_UNSIGNED (TREE_TYPE (maskt)));
> +    {
> +      create_convert_operand_from (&ops[2], mask, TYPE_MODE (TREE_TYPE (maskt)),
> +				   TYPE_UNSIGNED (TREE_TYPE (maskt)));
> +      biast = gimple_call_arg (stmt, 4);
> +      bias = expand_normal (biast);
> +      create_input_operand (&ops[3], bias, QImode);
> +      expand_insn (icode, 4, ops);
> +    }
>    else
> -    create_input_operand (&ops[2], mask, TYPE_MODE (TREE_TYPE (maskt)));
> -  expand_insn (icode, 3, ops);
> +    {
> +      create_input_operand (&ops[2], mask, TYPE_MODE (TREE_TYPE (maskt)));
> +      expand_insn (icode, 3, ops);
> +    }
>  }
>  
>  #define expand_mask_store_optab_fn expand_partial_store_optab_fn
> @@ -4172,6 +4189,29 @@ internal_check_ptrs_fn_supported_p (internal_fn ifn, tree type,
>  	  && insn_operand_matches (icode, 4, GEN_INT (align)));
>  }
>  
> +/* Return the supported bias for the len_load IFN.  For now we only support
> +   the biases of 0 and -1 (in case 0 is not an allowable length for len_load).
> +   If none of the biases match what the backend provides, return
> +   VECT_PARTIAL_BIAS_UNSUPPORTED.  */
> +
> +signed char
> +internal_len_load_bias (internal_fn ifn, machine_mode mode)

Nit: this function is used for both len_load and len_store.  We already have a
similarly named function get_len_load_store_mode, so maybe it's better to align
with that and use a name like internal_len_load_store_bias?

If that sounds good, the same renaming would also apply to
LOOP_VINFO_PARTIAL_LOAD_BIAS (becoming LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS).

> +{
> +  optab optab = direct_internal_fn_optab (ifn);
> +  insn_code icode = direct_optab_handler (optab, mode);
> +
> +  if (icode != CODE_FOR_nothing)
> +    {
> +      /* For now we only support biases of 0 or -1.  Try both of them.  */
> +      if (insn_operand_matches (icode, 3, GEN_INT (0)))
> +	return 0;
> +      if (insn_operand_matches (icode, 3, GEN_INT (-1)))
> +	return -1;
> +    }
> +
> +  return VECT_PARTIAL_BIAS_UNSUPPORTED;
> +}
> +
>  /* Expand STMT as though it were a call to internal function FN.  */
>  
>  void
> diff --git a/gcc/internal-fn.h b/gcc/internal-fn.h
> index 19d0f849a5a..c057df1327a 100644
> --- a/gcc/internal-fn.h
> +++ b/gcc/internal-fn.h
> @@ -227,6 +227,10 @@ extern bool internal_gather_scatter_fn_supported_p (internal_fn, tree,
>  						    tree, tree, int);
>  extern bool internal_check_ptrs_fn_supported_p (internal_fn, tree,
>  						poly_uint64, unsigned int);
> +#define VECT_PARTIAL_BIAS_UNSUPPORTED 127
> +
> +extern signed char internal_len_load_bias (internal_fn ifn,
> +					   machine_mode);
>  
>  extern void expand_addsub_overflow (location_t, tree_code, tree, tree, tree,
>  				    bool, bool, bool, bool, tree *);
> diff --git a/gcc/tree-vect-loop-manip.c b/gcc/tree-vect-loop-manip.c
> index 4988c93fdb6..79cf5029faf 100644
> --- a/gcc/tree-vect-loop-manip.c
> +++ b/gcc/tree-vect-loop-manip.c
> @@ -436,7 +436,14 @@ vect_set_loop_controls_directly (class loop *loop, loop_vec_info loop_vinfo,
>    tree length_limit = NULL_TREE;
>    /* For length, we need length_limit to ensure length in range.  */
>    if (!use_masks_p)
> -    length_limit = build_int_cst (compare_type, nitems_per_ctrl);
> +    {
> +      int partial_load_bias = LOOP_VINFO_PARTIAL_LOAD_BIAS (loop_vinfo);
> +      if (partial_load_bias == 0)
> +	length_limit = build_int_cst (compare_type, nitems_per_ctrl);
> +      else
> +	length_limit = build_int_cst (compare_type,
> +				      nitems_per_ctrl + partial_load_bias);
> +    }

Nit: this could be simplified as

  length_limit = build_int_cst (compare_type, nitems_per_ctrl + partial_load_bias);
 
>  
>    /* Calculate the maximum number of item values that the rgroup
>       handles in total, the number that it handles for each iteration
> @@ -560,8 +567,12 @@ vect_set_loop_controls_directly (class loop *loop, loop_vec_info loop_vinfo,
>      {
>        /* Previous controls will cover BIAS items.  This control covers the
>  	 next batch.  */
> +      tree bias_tree;
>        poly_uint64 bias = nitems_per_ctrl * i;
> -      tree bias_tree = build_int_cst (compare_type, bias);
> +      int partial_load_bias = LOOP_VINFO_PARTIAL_LOAD_BIAS (loop_vinfo);
> +      if (partial_load_bias != 0)
> +	bias -= partial_load_bias;
> +      bias_tree = build_int_cst (compare_type, bias);
>  
>        /* See whether the first iteration of the vector loop is known
>  	 to have a full control.  */
> @@ -574,7 +585,7 @@ vect_set_loop_controls_directly (class loop *loop, loop_vec_info loop_vinfo,
>  	 TEST_LIMIT, prefer to use the same 0-based IV for each control
>  	 and adjust the bound down by BIAS.  */
>        tree this_test_limit = test_limit;
> -      if (i != 0)
> +      if (i != 0 || (i == 0 && partial_load_bias != 0))
>  	{
>  	  this_test_limit = gimple_build (preheader_seq, MAX_EXPR,
>  					  compare_type, this_test_limit,

The changes in gcc/tree-vect-loop-manip.c don't seem to work for all cases,
especially the length_limit boundary case, such as a char array with length 17:
  
  for (int i = 0; i < 17; i++)
    c[i] = a[i] + b[i];

The next_len is computed as zero; it's intended to be used as the new length in
the next loop iteration, but it's also used for the loop exit check, so the
loop would end unexpectedly.

Could you double-check?

IIUC, a change equivalent to v1 would be to just update the loop_len at the
head of the loop with bias -1, which ensures all of its following use sites
adopt the same biased length.

> diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
> index e94356d76e9..705439cb380 100644
> --- a/gcc/tree-vect-loop.c
> +++ b/gcc/tree-vect-loop.c
> @@ -1163,6 +1163,21 @@ vect_verify_loop_lens (loop_vec_info loop_vinfo)
>    if (LOOP_VINFO_LENS (loop_vinfo).is_empty ())
>      return false;
>  
> +  machine_mode len_load_mode = get_len_load_store_mode
> +    (loop_vinfo->vector_mode, false).require ();
> +
> +  signed char partial_load_bias = internal_len_load_bias
> +    (IFN_LEN_LOAD, len_load_mode);
> +
> +  LOOP_VINFO_PARTIAL_LOAD_BIAS (loop_vinfo) = partial_load_bias;
> +

It seems better to add
 
  if (partial_load_bias == VECT_PARTIAL_BIAS_UNSUPPORTED)
     return false;

to align with the documentation that we only support bias 0 and -1.


> +  /* If the backend requires a bias of -1 for LEN_LOAD, we must not emit
> +     len_loads with a length of zero.  In order to avoid that we prohibit
> +     more than one loop length here.  */
> +  if (partial_load_bias == -1
> +      && LOOP_VINFO_LENS (loop_vinfo).length () > 1)
> +      return false;
> +

Maybe it's better to move the LOOP_VINFO_PARTIAL_LOAD_BIAS setting after
these checks and to assert that we have the same bias for len_store, such as:

  signed char partial_store_bias = internal_len_load_store_bias (IFN_LEN_STORE, len_load_mode);
  gcc_assert (partial_load_bias == partial_store_bias);

  LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) = partial_load_bias;


>    unsigned int max_nitems_per_iter = 1;
>    unsigned int i;
>    rgroup_controls *rgl;
> @@ -4125,6 +4140,7 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
>  	 here.  */
>  
>        bool niters_known_p = LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo);
> +      signed char partial_load_bias = LOOP_VINFO_PARTIAL_LOAD_BIAS (loop_vinfo);
>        bool need_iterate_p
>  	= (!LOOP_VINFO_EPILOGUE_P (loop_vinfo)
>  	   && !vect_known_niters_smaller_than_vf (loop_vinfo));
> @@ -4157,6 +4173,11 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
>  	       for each since start index is zero.  */
>  	    prologue_stmts += num_vectors;
>  
> +	    /* If we have a non-zero partial load bias, we need one MINUS
> +	       and a MAX to adjust the load length.  */
> +	    if (partial_load_bias != 0)
> +	      prologue_stmts += 2;
> +
>  	    /* Each may need two MINs and one MINUS to update lengths in body
>  	       for next iteration.  */
>  	    if (need_iterate_p)
> diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
> index 17849b575b7..57ecafb7965 100644
> --- a/gcc/tree-vect-stmts.c
> +++ b/gcc/tree-vect-stmts.c
> @@ -8289,9 +8289,16 @@ vectorizable_store (vec_info *vinfo,
>  						   gsi);
>  		      vec_oprnd = var;
>  		    }
> +
> +		  /* Check which bias value to use.  */
> +		  char biasval = internal_len_load_bias
> +		    (IFN_LEN_STORE, new_vmode);

Nit: seems better to align with other places, s/char/signed char/,
same for below.

> +
> +		  tree bias = build_int_cst (intQI_type_node, biasval);
>  		  gcall *call
> -		    = gimple_build_call_internal (IFN_LEN_STORE, 4, dataref_ptr,
> -						  ptr, final_len, vec_oprnd);
> +		    = gimple_build_call_internal (IFN_LEN_STORE, 5, dataref_ptr,
> +						  ptr, final_len, vec_oprnd,
> +						  bias);
>  		  gimple_call_set_nothrow (call, true);
>  		  vect_finish_stmt_generation (vinfo, stmt_info, call, gsi);
>  		  new_stmt = call;
> @@ -9588,24 +9595,32 @@ vectorizable_load (vec_info *vinfo,
>  					       vec_num * j + i);
>  			tree ptr = build_int_cst (ref_type,
>  						  align * BITS_PER_UNIT);
> +
> +			machine_mode vmode = TYPE_MODE (vectype);
> +			opt_machine_mode new_ovmode
> +			  = get_len_load_store_mode (vmode, true);
> +			machine_mode new_vmode = new_ovmode.require ();
> +			tree qi_type = unsigned_intQI_type_node;
> +			tree new_vtype
> +			  = build_vector_type_for_mode (qi_type, new_vmode);

Nit: this new_vtype assignment could still stay in the below hunk ...

> +
> +			/* Check which bias value to use.  */
> +			char biasval = internal_len_load_bias
> +			  (IFN_LEN_LOAD, new_vmode);
> +
> +			tree bias = build_int_cst (intQI_type_node, biasval);
> +
>  			gcall *call
> -			  = gimple_build_call_internal (IFN_LEN_LOAD, 3,
> +			  = gimple_build_call_internal (IFN_LEN_LOAD, 4,
>  							dataref_ptr, ptr,
> -							final_len);
> +							final_len, bias);
>  			gimple_call_set_nothrow (call, true);
>  			new_stmt = call;
>  			data_ref = NULL_TREE;
>  
>  			/* Need conversion if it's wrapped with VnQI.  */
> -			machine_mode vmode = TYPE_MODE (vectype);
> -			opt_machine_mode new_ovmode
> -			  = get_len_load_store_mode (vmode, true);
> -			machine_mode new_vmode = new_ovmode.require ();
>  			if (vmode != new_vmode)
>  			  {
> -			    tree qi_type = unsigned_intQI_type_node;
> -			    tree new_vtype
> -			      = build_vector_type_for_mode (qi_type, new_vmode);

here.

BR,
Kewen

>  			    tree var = vect_get_new_ssa_name (new_vtype,
>  							      vect_simple_var);
>  			    gimple_set_lhs (call, var);
> diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
> index c4c5678e7f1..b59f5f6643c 100644
> --- a/gcc/tree-vectorizer.h
> +++ b/gcc/tree-vectorizer.h
> @@ -749,6 +749,11 @@ public:
>       epilogue of loop.  */
>    bool epil_using_partial_vectors_p;
>  
> +  /* The bias for len_load and len_store.  For now, only 0 and -1 are
> +     supported.  -1 must be used when a backend does not support
> +     len_load/len_store with a length of zero.  */
> +  signed char partial_load_bias;
> +
>    /* When we have grouped data accesses with gaps, we may introduce invalid
>       memory accesses.  We peel the last iteration of the loop to prevent
>       this.  */
> @@ -814,6 +819,7 @@ public:
>  #define LOOP_VINFO_USING_PARTIAL_VECTORS_P(L) (L)->using_partial_vectors_p
>  #define LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P(L)                             \
>    (L)->epil_using_partial_vectors_p
> +#define LOOP_VINFO_PARTIAL_LOAD_BIAS(L)    (L)->partial_load_bias
>  #define LOOP_VINFO_VECT_FACTOR(L)          (L)->vectorization_factor
>  #define LOOP_VINFO_MAX_VECT_FACTOR(L)      (L)->max_vectorization_factor
>  #define LOOP_VINFO_MASKS(L)                (L)->masks



More information about the Gcc-patches mailing list