[PATCH] Fix PR80846, change vectorizer reduction epilogue (on x86)

Richard Biener <rguenther@suse.de>
Fri Jan 5 09:01:00 GMT 2018


On Tue, 28 Nov 2017, Richard Biener wrote:

> 
> The following adds a new target hook, targetm.vectorize.split_reduction,
> which allows the target to specify a preferred mode in which to perform
> the final reduction, using either vector shifts or scalar extractions.
> The vector reduction result is first narrowed to that mode by
> recursively combining lowparts and highparts.  This avoids
> lane-crossing operations when doing AVX256 on Zen and Bulldozer and
> also speeds things up on Haswell (I verified a ~20% speedup on
> Broadwell).
> 
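> To make that concrete: for a 512-bit accumulator the epilogue now
> performs the equivalent of the following plain C sketch (illustration
> only, not code from the patch):
> 
>   int sum16 (const int v[16])
>   {
>     int t8[8], t4[4];
>     /* Combine upper and lower 256-bit halves of the 512-bit vector.  */
>     for (int i = 0; i < 8; i++)
>       t8[i] = v[i] + v[i + 8];
>     /* Combine upper and lower 128-bit halves of the 256-bit result.  */
>     for (int i = 0; i < 4; i++)
>       t4[i] = t8[i] + t8[i + 4];
>     /* Finish with the usual shift reduction within one SSE vector.  */
>     return (t4[0] + t4[2]) + (t4[1] + t4[3]);
>   }
> 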
> Thus the patch implements the target hook on x86 to _always_ prefer
> SSE modes for the final reduction.
> 
> For the testcase in the bugzilla
> 
> int sumint(const int arr[]) {
>     arr = __builtin_assume_aligned(arr, 64);
>     int sum=0;
>     for (int i=0 ; i<1024 ; i++)
>       sum+=arr[i];
>     return sum;
> }
> 
> this changes -O3 -mavx512f code from
> 
> sumint:
> .LFB0:
>         .cfi_startproc
>         vpxord  %zmm0, %zmm0, %zmm0
>         leaq    4096(%rdi), %rax
>         .p2align 4,,10
>         .p2align 3
> .L2:
>         vpaddd  (%rdi), %zmm0, %zmm0
>         addq    $64, %rdi
>         cmpq    %rdi, %rax
>         jne     .L2
>         vpxord  %zmm1, %zmm1, %zmm1
>         vshufi32x4      $78, %zmm1, %zmm0, %zmm2
>         vpaddd  %zmm2, %zmm0, %zmm0
>         vmovdqa64       .LC0(%rip), %zmm2
>         vpermi2d        %zmm1, %zmm0, %zmm2
>         vpaddd  %zmm2, %zmm0, %zmm0
>         vmovdqa64       .LC1(%rip), %zmm2
>         vpermi2d        %zmm1, %zmm0, %zmm2
>         vpaddd  %zmm2, %zmm0, %zmm0
>         vmovdqa64       .LC2(%rip), %zmm2
>         vpermi2d        %zmm1, %zmm0, %zmm2
>         vpaddd  %zmm2, %zmm0, %zmm0
>         vmovd   %xmm0, %eax
> 
> to
> 
> sumint:
> .LFB0:
>         .cfi_startproc
>         vpxord  %zmm0, %zmm0, %zmm0
>         leaq    4096(%rdi), %rax
>         .p2align 4,,10
>         .p2align 3
> .L2:
>         vpaddd  (%rdi), %zmm0, %zmm0
>         addq    $64, %rdi
>         cmpq    %rdi, %rax
>         jne     .L2
>         vextracti64x4   $0x1, %zmm0, %ymm1
>         vpaddd  %ymm0, %ymm1, %ymm1
>         vmovdqa %xmm1, %xmm0
>         vextracti128    $1, %ymm1, %xmm1
>         vpaddd  %xmm1, %xmm0, %xmm0
>         vpsrldq $8, %xmm0, %xmm1
>         vpaddd  %xmm1, %xmm0, %xmm0
>         vpsrldq $4, %xmm0, %xmm1
>         vpaddd  %xmm1, %xmm0, %xmm0
>         vmovd   %xmm0, %eax
> 
> and for -O3 -mavx2 from
> 
> sumint:
> .LFB0:
>         .cfi_startproc
>         vpxor   %xmm0, %xmm0, %xmm0
>         leaq    4096(%rdi), %rax
>         .p2align 4,,10
>         .p2align 3
> .L2:
>         vpaddd  (%rdi), %ymm0, %ymm0
>         addq    $32, %rdi
>         cmpq    %rdi, %rax
>         jne     .L2
>         vpxor   %xmm1, %xmm1, %xmm1
>         vperm2i128      $33, %ymm1, %ymm0, %ymm2
>         vpaddd  %ymm2, %ymm0, %ymm0
>         vperm2i128      $33, %ymm1, %ymm0, %ymm2
>         vpalignr        $8, %ymm0, %ymm2, %ymm2
>         vpaddd  %ymm2, %ymm0, %ymm0
>         vperm2i128      $33, %ymm1, %ymm0, %ymm1
>         vpalignr        $4, %ymm0, %ymm1, %ymm1
>         vpaddd  %ymm1, %ymm0, %ymm0
>         vmovd   %xmm0, %eax
> 
> to
> 
> sumint:
> .LFB0:
>         .cfi_startproc
>         vpxor   %xmm0, %xmm0, %xmm0
>         leaq    4096(%rdi), %rax
>         .p2align 4,,10
>         .p2align 3
> .L2:
>         vpaddd  (%rdi), %ymm0, %ymm0
>         addq    $32, %rdi
>         cmpq    %rdi, %rax
>         jne     .L2
>         vmovdqa %xmm0, %xmm1
>         vextracti128    $1, %ymm0, %xmm0
>         vpaddd  %xmm0, %xmm1, %xmm0
>         vpsrldq $8, %xmm0, %xmm1
>         vpaddd  %xmm1, %xmm0, %xmm0
>         vpsrldq $4, %xmm0, %xmm1
>         vpaddd  %xmm1, %xmm0, %xmm0
>         vmovd   %xmm0, %eax
>         vzeroupper
>         ret
> 
> which besides being faster is also smaller (fewer prefixes).
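> 
> Written out with intrinsics, the new -mavx2 epilogue corresponds
> roughly to this (an illustrative sketch, not generated source; the
> function name is made up):
> 
>   #include <immintrin.h>
> 
>   static int
>   reduce_v8si (__m256i acc)
>   {
>     __m128i lo = _mm256_castsi256_si128 (acc);       /* lowpart (vmovdqa) */
>     __m128i hi = _mm256_extracti128_si256 (acc, 1);  /* vextracti128 */
>     __m128i s = _mm_add_epi32 (lo, hi);              /* vpaddd */
>     s = _mm_add_epi32 (s, _mm_srli_si128 (s, 8));    /* vpsrldq $8 + vpaddd */
>     s = _mm_add_epi32 (s, _mm_srli_si128 (s, 4));    /* vpsrldq $4 + vpaddd */
>     return _mm_cvtsi128_si32 (s);                    /* vmovd */
>   }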
> 
> SPEC 2k6 results on Haswell (thus AVX2) are neutral.  As the patch
> merely affects reduction vectorization epilogues I didn't expect big
> effects, except perhaps for loops that do not run many iterations
> (more likely with AVX512).
> 
> Bootstrapped on x86_64-unknown-linux-gnu, testing in progress.
> 
> Ok for trunk?

Ping?

Richard.

> The PR mentions some more tricks to optimize the sequence, but those
> look like backend-only optimizations.
> 
> Thanks,
> Richard.
> 
> 2017-11-28  Richard Biener  <rguenther@suse.de>
> 
> 	PR tree-optimization/80846
> 	* target.def (split_reduction): New target hook.
> 	* targhooks.c (default_split_reduction): New function.
> 	* targhooks.h (default_split_reduction): Declare.
> 	* tree-vect-loop.c (vect_create_epilog_for_reduction): If the
> 	target requests first reduce vectors by combining low and high
> 	parts.
> 	* tree-vect-stmts.c (vect_gen_perm_mask_any): Adjust.
> 	(get_vectype_for_scalar_type_and_size): Export.
> 	* tree-vectorizer.h (get_vectype_for_scalar_type_and_size): Declare.
> 
> 	* doc/tm.texi.in (TARGET_VECTORIZE_SPLIT_REDUCTION): Document.
> 	* doc/tm.texi: Regenerate.
> 
> 	i386/
> 	* config/i386/i386.c (ix86_split_reduction): Implement
> 	TARGET_VECTORIZE_SPLIT_REDUCTION.
> 
> 	* gcc.target/i386/pr80846-1.c: New testcase.
> 	* gcc.target/i386/pr80846-2.c: Likewise.
> 
> Index: gcc/config/i386/i386.c
> ===================================================================
> --- gcc/config/i386/i386.c	(revision 255197)
> +++ gcc/config/i386/i386.c	(working copy)
> @@ -48864,6 +48864,36 @@ ix86_preferred_simd_mode (scalar_mode mo
>      }
>  }
>  
> +/* All CPUs prefer to avoid cross-lane operations, so perform reductions
> +   on upper against lower halves down to SSE reg size.  */
> +
> +static machine_mode
> +ix86_split_reduction (machine_mode mode)
> +{
> +  /* Reduce lowpart against highpart until we reach SSE reg width to
> +     avoid cross-lane operations.  */
> +  switch (mode)
> +    {
> +    case E_V16SImode:
> +    case E_V8SImode:
> +      return V4SImode;
> +    case E_V32HImode:
> +    case E_V16HImode:
> +      return V8HImode;
> +    case E_V64QImode:
> +    case E_V32QImode:
> +      return V16QImode;
> +    case E_V16SFmode:
> +    case E_V8SFmode:
> +      return V4SFmode;
> +    case E_V8DFmode:
> +    case E_V4DFmode:
> +      return V2DFmode;
> +    default:
> +      return mode;
> +    }
> +}
> +
>  /* If AVX is enabled then try vectorizing with both 256bit and 128bit
>     vectors.  If AVX512F is enabled then try vectorizing with 512bit,
>     256bit and 128bit vectors.  */
> @@ -50486,6 +50516,9 @@ ix86_run_selftests (void)
>  #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
>  #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE \
>    ix86_preferred_simd_mode
> +#undef TARGET_VECTORIZE_SPLIT_REDUCTION
> +#define TARGET_VECTORIZE_SPLIT_REDUCTION \
> +  ix86_split_reduction
>  #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
>  #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
>    ix86_autovectorize_vector_sizes
> Index: gcc/doc/tm.texi
> ===================================================================
> --- gcc/doc/tm.texi	(revision 255197)
> +++ gcc/doc/tm.texi	(working copy)
> @@ -5844,6 +5844,13 @@ equal to @code{word_mode}, because the v
>  transformations even in absence of specialized @acronym{SIMD} hardware.
>  @end deftypefn
>  
> +@deftypefn {Target Hook} machine_mode TARGET_VECTORIZE_SPLIT_REDUCTION (machine_mode)
> +This hook should return the preferred mode to split the final reduction
> +step on @var{mode} to.  The reduction is then carried out reducing upper
> +against lower halves of vectors recursively until the specified mode is
> +reached.  The default is @var{mode}, which means no splitting.
> +@end deftypefn
> +
>  @deftypefn {Target Hook} {unsigned int} TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES (void)
>  This hook should return a mask of sizes that should be iterated over
>  after trying to autovectorize using the vector size derived from the
> Index: gcc/doc/tm.texi.in
> ===================================================================
> --- gcc/doc/tm.texi.in	(revision 255197)
> +++ gcc/doc/tm.texi.in	(working copy)
> @@ -4091,6 +4091,8 @@ address;  but often a machine-dependent
>  
>  @hook TARGET_VECTORIZE_PREFERRED_SIMD_MODE
>  
> +@hook TARGET_VECTORIZE_SPLIT_REDUCTION
> +
>  @hook TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
>  
>  @hook TARGET_VECTORIZE_GET_MASK_MODE
> Index: gcc/target.def
> ===================================================================
> --- gcc/target.def	(revision 255197)
> +++ gcc/target.def	(working copy)
> @@ -1875,6 +1875,17 @@ transformations even in absence of speci
>   (scalar_mode mode),
>   default_preferred_simd_mode)
>  
> +/* Returns the preferred mode for splitting SIMD reductions to.  */
> +DEFHOOK
> +(split_reduction,
> + "This hook should return the preferred mode to split the final reduction\n\
> +step on @var{mode} to.  The reduction is then carried out reducing upper\n\
> +against lower halves of vectors recursively until the specified mode is\n\
> +reached.  The default is @var{mode}, which means no splitting.",
> +  machine_mode,
> +  (machine_mode),
> +  default_split_reduction)
> +
>  /* Returns a mask of vector sizes to iterate over when auto-vectorizing
>     after processing the preferred one derived from preferred_simd_mode.  */
>  DEFHOOK
> Index: gcc/targhooks.c
> ===================================================================
> --- gcc/targhooks.c	(revision 255197)
> +++ gcc/targhooks.c	(working copy)
> @@ -1281,6 +1281,14 @@ default_preferred_simd_mode (scalar_mode
>    return word_mode;
>  }
>  
> +/* By default do not split reductions further.  */
> +
> +machine_mode
> +default_split_reduction (machine_mode mode)
> +{
> +  return mode;
> +}
> +
>  /* By default only the size derived from the preferred vector mode
>     is tried.  */
>  
> Index: gcc/targhooks.h
> ===================================================================
> --- gcc/targhooks.h	(revision 255197)
> +++ gcc/targhooks.h	(working copy)
> @@ -108,6 +108,7 @@ default_builtin_support_vector_misalignm
>  					     const_tree,
>  					     int, bool);
>  extern machine_mode default_preferred_simd_mode (scalar_mode mode);
> +extern machine_mode default_split_reduction (machine_mode);
>  extern unsigned int default_autovectorize_vector_sizes (void);
>  extern opt_machine_mode default_get_mask_mode (unsigned, unsigned);
>  extern void *default_init_cost (struct loop *);
> Index: gcc/testsuite/gcc.target/i386/pr80846-1.c
> ===================================================================
> --- gcc/testsuite/gcc.target/i386/pr80846-1.c	(nonexistent)
> +++ gcc/testsuite/gcc.target/i386/pr80846-1.c	(working copy)
> @@ -0,0 +1,12 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -mavx512f" } */
> +
> +int sumint(const int arr[]) {
> +    arr = __builtin_assume_aligned(arr, 64);
> +    int sum=0;
> +    for (int i=0 ; i<1024 ; i++)
> +      sum+=arr[i];
> +    return sum;
> +}
> +
> +/* { dg-final { scan-assembler-times "vextracti" 2 } } */
> Index: gcc/testsuite/gcc.target/i386/pr80846-2.c
> ===================================================================
> --- gcc/testsuite/gcc.target/i386/pr80846-2.c	(nonexistent)
> +++ gcc/testsuite/gcc.target/i386/pr80846-2.c	(working copy)
> @@ -0,0 +1,12 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -mavx2" } */
> +
> +int sumint(const int arr[]) {
> +    arr = __builtin_assume_aligned(arr, 64);
> +    int sum=0;
> +    for (int i=0 ; i<1024 ; i++)
> +      sum+=arr[i];
> +    return sum;
> +}
> +
> +/* { dg-final { scan-assembler-times "vextracti" 1 } } */
> Index: gcc/tree-vect-loop.c
> ===================================================================
> --- gcc/tree-vect-loop.c	(revision 255197)
> +++ gcc/tree-vect-loop.c	(working copy)
> @@ -4994,38 +4994,126 @@ vect_create_epilog_for_reduction (vec<tr
>      }
>    else
>      {
> -      bool reduce_with_shift = have_whole_vector_shift (mode);
> -      int element_bitsize = tree_to_uhwi (bitsize);
> -      int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
> +      bool reduce_with_shift;
>        tree vec_temp;
> 
>        /* COND reductions all do the final reduction with MAX_EXPR.  */
>        if (code == COND_EXPR)
>  	code = MAX_EXPR;
>  
> -      /* Regardless of whether we have a whole vector shift, if we're
> -         emulating the operation via tree-vect-generic, we don't want
> -         to use it.  Only the first round of the reduction is likely
> -         to still be profitable via emulation.  */
> -      /* ??? It might be better to emit a reduction tree code here, so that
> -         tree-vect-generic can expand the first round via bit tricks.  */
> -      if (!VECTOR_MODE_P (mode))
> -        reduce_with_shift = false;
> +      /* See if the target wants to do the final (shift) reduction
> +	 in a vector mode of smaller size and first reduce upper/lower
> +	 halves against each other.  */
> +      enum machine_mode mode1 = mode;
> +      tree vectype1 = vectype;
> +      unsigned sz = tree_to_uhwi (TYPE_SIZE_UNIT (vectype));
> +      unsigned sz1 = sz;
> +      if (!slp_reduc
> +	  && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
> +	sz1 = GET_MODE_SIZE (mode1);
> +
> +      vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz1);
> +      reduce_with_shift = have_whole_vector_shift (mode1);
> +      if (!VECTOR_MODE_P (mode1))
> +	reduce_with_shift = false;
>        else
> -        {
> -          optab optab = optab_for_tree_code (code, vectype, optab_default);
> -          if (optab_handler (optab, mode) == CODE_FOR_nothing)
> -            reduce_with_shift = false;
> -        }
> +	{
> +	  optab optab = optab_for_tree_code (code, vectype1, optab_default);
> +	  if (optab_handler (optab, mode1) == CODE_FOR_nothing)
> +	    reduce_with_shift = false;
> +	}
> +
> +      /* First reduce the vector to the desired vector size we should
> +	 do shift reduction on by combining upper and lower halves.  */
> +      new_temp = new_phi_result;
> +      while (sz > sz1)
> +	{
> +	  gcc_assert (!slp_reduc);
> +	  sz /= 2;
> +	  vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz);
> +
> +	  /* The target has to make sure we support lowpart/highpart
> +	     extraction, either via direct vector extract or through
> +	     an integer mode punning.  */
> +	  tree dst1, dst2;
> +	  if (convert_optab_handler (vec_extract_optab,
> +				     TYPE_MODE (TREE_TYPE (new_temp)),
> +				     TYPE_MODE (vectype1))
> +	      != CODE_FOR_nothing)
> +	    {
> +	      /* Extract sub-vectors directly once vec_extract becomes
> +		 a conversion optab.  */
> +	      dst1 = make_ssa_name (vectype1);
> +	      epilog_stmt
> +		= gimple_build_assign (dst1, BIT_FIELD_REF,
> +				       build3 (BIT_FIELD_REF, vectype1,
> +					       new_temp, TYPE_SIZE (vectype1),
> +					       bitsize_int (0)));
> +	      gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
> +	      dst2 = make_ssa_name (vectype1);
> +	      epilog_stmt
> +		= gimple_build_assign (dst2, BIT_FIELD_REF,
> +				       build3 (BIT_FIELD_REF, vectype1,
> +					       new_temp, TYPE_SIZE (vectype1),
> +					       bitsize_int (sz * BITS_PER_UNIT)));
> +	      gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
> +	    }
> +	  else
> +	    {
> +	      /* Extract via punning to appropriately sized integer mode
> +		 vector.  */
> +	      tree eltype = build_nonstandard_integer_type (sz * BITS_PER_UNIT,
> +							    1);
> +	      tree etype = build_vector_type (eltype, 2);
> +	      gcc_assert (convert_optab_handler (vec_extract_optab,
> +						 TYPE_MODE (etype),
> +						 TYPE_MODE (eltype))
> +			  != CODE_FOR_nothing);
> +	      tree tem = make_ssa_name (etype);
> +	      epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
> +						 build1 (VIEW_CONVERT_EXPR,
> +							 etype, new_temp));
> +	      gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
> +	      new_temp = tem;
> +	      tem = make_ssa_name (eltype);
> +	      epilog_stmt
> +		= gimple_build_assign (tem, BIT_FIELD_REF,
> +				       build3 (BIT_FIELD_REF, eltype,
> +					       new_temp, TYPE_SIZE (eltype),
> +					       bitsize_int (0)));
> +	      gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
> +	      dst1 = make_ssa_name (vectype1);
> +	      epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
> +						 build1 (VIEW_CONVERT_EXPR,
> +							 vectype1, tem));
> +	      gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
> +	      tem = make_ssa_name (eltype);
> +	      epilog_stmt
> +		= gimple_build_assign (tem, BIT_FIELD_REF,
> +				       build3 (BIT_FIELD_REF, eltype,
> +					       new_temp, TYPE_SIZE (eltype),
> +					       bitsize_int (sz * BITS_PER_UNIT)));
> +	      gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
> +	      dst2 = make_ssa_name (vectype1);
> +	      epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
> +						 build1 (VIEW_CONVERT_EXPR,
> +							 vectype1, tem));
> +	      gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
> +	    }
> +
> +	  new_temp = make_ssa_name (vectype1);
> +	  epilog_stmt = gimple_build_assign (new_temp, code, dst1, dst2);
> +	  gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
> +	}
>  
>        if (reduce_with_shift && !slp_reduc)
>          {
> -          int nelements = vec_size_in_bits / element_bitsize;
> +          int nelements = TYPE_VECTOR_SUBPARTS (vectype1);
>            auto_vec_perm_indices sel (nelements);
>  
>            int elt_offset;
>  
> -          tree zero_vec = build_zero_cst (vectype);
> +          tree zero_vec = build_zero_cst (vectype1);
>            /* Case 2: Create:
>               for (offset = nelements/2; offset >= 1; offset/=2)
>                  {
> @@ -5039,15 +5127,15 @@ vect_create_epilog_for_reduction (vec<tr
>              dump_printf_loc (MSG_NOTE, vect_location,
>  			     "Reduce using vector shifts\n");
>  
> -          vec_dest = vect_create_destination_var (scalar_dest, vectype);
> -          new_temp = new_phi_result;
> +	  mode1 = TYPE_MODE (vectype1);
> +	  vec_dest = vect_create_destination_var (scalar_dest, vectype1);
>            for (elt_offset = nelements / 2;
>                 elt_offset >= 1;
>                 elt_offset /= 2)
>              {
>  	      sel.truncate (0);
>  	      calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
> -	      tree mask = vect_gen_perm_mask_any (vectype, sel);
> +	      tree mask = vect_gen_perm_mask_any (vectype1, sel);
>  	      epilog_stmt = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
>  						 new_temp, zero_vec, mask);
>                new_name = make_ssa_name (vec_dest, epilog_stmt);
> @@ -5092,7 +5180,8 @@ vect_create_epilog_for_reduction (vec<tr
>              dump_printf_loc (MSG_NOTE, vect_location,
>  			     "Reduce using scalar code.\n");
>  
> -          vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
> +	  int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
> +	  int element_bitsize = tree_to_uhwi (bitsize);
>            FOR_EACH_VEC_ELT (new_phis, i, new_phi)
>              {
>                int bit_offset;
> Index: gcc/tree-vect-stmts.c
> ===================================================================
> --- gcc/tree-vect-stmts.c	(revision 255197)
> +++ gcc/tree-vect-stmts.c	(working copy)
> @@ -6514,7 +6514,7 @@ vect_gen_perm_mask_any (tree vectype, ve
>  
>    mask_elt_type = lang_hooks.types.type_for_mode
>      (int_mode_for_mode (TYPE_MODE (TREE_TYPE (vectype))).require (), 1);
> -  mask_type = get_vectype_for_scalar_type (mask_elt_type);
> +  mask_type = get_same_sized_vectype (mask_elt_type, vectype);
>  
>    auto_vec<tree, 32> mask_elts (nunits);
>    for (unsigned int i = 0; i < nunits; ++i)
> @@ -9065,7 +9065,7 @@ free_stmt_vec_info (gimple *stmt)
>     Returns the vector type corresponding to SCALAR_TYPE  and SIZE as supported
>     by the target.  */
>  
> -static tree
> +tree
>  get_vectype_for_scalar_type_and_size (tree scalar_type, unsigned size)
>  {
>    tree orig_scalar_type = scalar_type;
> Index: gcc/tree-vectorizer.h
> ===================================================================
> --- gcc/tree-vectorizer.h	(revision 255197)
> +++ gcc/tree-vectorizer.h	(working copy)
> @@ -1151,6 +1151,7 @@ extern bool vect_can_advance_ivs_p (loop
>  /* In tree-vect-stmts.c.  */
>  extern unsigned int current_vector_size;
>  extern tree get_vectype_for_scalar_type (tree);
> +extern tree get_vectype_for_scalar_type_and_size (tree, unsigned);
>  extern tree get_mask_type_for_scalar_type (tree);
>  extern tree get_same_sized_vectype (tree, tree);
>  extern bool vect_is_simple_use (tree, vec_info *, gimple **,
> 

-- 
Richard Biener <rguenther@suse.de>
SUSE LINUX GmbH, GF: Felix Imendoerffer, Jane Smithard, Graham Norton, HRB 21284 (AG Nuernberg)


