[PATCH 3/3] aarch64: Add SVE support for simd clones [PR 96342]

Victor Do Nascimento <victor.donascimento@arm.com>
Wed Oct 23 16:26:34 GMT 2024


On 2/1/24 21:59, Richard Sandiford wrote:
> Andre Vieira <andre.simoesdiasvieira@arm.com> writes:
>> This patch finalizes adding support for the generation of SVE simd clones when
>> no simdlen is provided, following the ABI rules where the widest data type
>> determines the minimum amount of elements in a length agnostic vector.
>>
>> gcc/ChangeLog:
>>
>> 	* config/aarch64/aarch64-protos.h (add_sve_type_attribute): Declare.
>> 	* config/aarch64/aarch64-sve-builtins.cc (add_sve_type_attribute): Make
>> 	visibility global and support use for non_acle types.
>> 	* config/aarch64/aarch64.cc
>> 	(aarch64_simd_clone_compute_vecsize_and_simdlen): Create VLA simd clone
>> 	when no simdlen is provided, according to ABI rules.
>> 	(simd_clone_adjust_sve_vector_type): New helper function.
>> 	(aarch64_simd_clone_adjust): Add '+sve' attribute to SVE simd clones
>> 	and modify types to use SVE types.
>> 	* omp-simd-clone.cc (simd_clone_mangle): Print 'x' for VLA simdlen.
>> 	(simd_clone_adjust): Adapt safelen check to be compatible with VLA
>> 	simdlen.
>>
>> gcc/testsuite/ChangeLog:
>>
>> 	* c-c++-common/gomp/declare-variant-14.c: Make i?86 and x86_64 target
>> 	only test.
>> 	* gfortran.dg/gomp/declare-variant-14.f90: Likewise.
>> 	* gcc.target/aarch64/declare-simd-2.c: Add SVE clone scan.
>> 	* gcc.target/aarch64/vect-simd-clone-1.c: New test.
>>
>> diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
>> index a0b142e0b94..207396de0ff 100644
>> --- a/gcc/config/aarch64/aarch64-protos.h
>> +++ b/gcc/config/aarch64/aarch64-protos.h
>> @@ -1031,6 +1031,8 @@ namespace aarch64_sve {
>>   #ifdef GCC_TARGET_H
>>     bool verify_type_context (location_t, type_context_kind, const_tree, bool);
>>   #endif
>> + void add_sve_type_attribute (tree, unsigned int, unsigned int,
>> +			      const char *, const char *);
>>   }
>>   
>>   extern void aarch64_split_combinev16qi (rtx operands[3]);
>> diff --git a/gcc/config/aarch64/aarch64-sve-builtins.cc b/gcc/config/aarch64/aarch64-sve-builtins.cc
>> index 11f5c5c500c..747131e684e 100644
>> --- a/gcc/config/aarch64/aarch64-sve-builtins.cc
>> +++ b/gcc/config/aarch64/aarch64-sve-builtins.cc
>> @@ -953,14 +953,16 @@ static bool reported_missing_registers_p;
>>   /* Record that TYPE is an ABI-defined SVE type that contains NUM_ZR SVE vectors
>>      and NUM_PR SVE predicates.  MANGLED_NAME, if nonnull, is the ABI-defined
>>      mangling of the type.  ACLE_NAME is the <arm_sve.h> name of the type.  */
>> -static void
>> +void
>>   add_sve_type_attribute (tree type, unsigned int num_zr, unsigned int num_pr,
>>   			const char *mangled_name, const char *acle_name)
>>   {
>>     tree mangled_name_tree
>>       = (mangled_name ? get_identifier (mangled_name) : NULL_TREE);
>> +  tree acle_name_tree
>> +    = (acle_name ? get_identifier (acle_name) : NULL_TREE);
>>   
>> -  tree value = tree_cons (NULL_TREE, get_identifier (acle_name), NULL_TREE);
>> +  tree value = tree_cons (NULL_TREE, acle_name_tree, NULL_TREE);
>>     value = tree_cons (NULL_TREE, mangled_name_tree, value);
>>     value = tree_cons (NULL_TREE, size_int (num_pr), value);
>>     value = tree_cons (NULL_TREE, size_int (num_zr), value);
>> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
>> index 31617510160..cba8879ab33 100644
>> --- a/gcc/config/aarch64/aarch64.cc
>> +++ b/gcc/config/aarch64/aarch64.cc
>> @@ -28527,7 +28527,7 @@ aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
>>   					int num, bool explicit_p)
>>   {
>>     tree t, ret_type;
>> -  unsigned int nds_elt_bits;
>> +  unsigned int nds_elt_bits, wds_elt_bits;
>>     unsigned HOST_WIDE_INT const_simdlen;
>>   
>>     if (!TARGET_SIMD)
>> @@ -28572,10 +28572,14 @@ aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
>>     if (TREE_CODE (ret_type) != VOID_TYPE)
>>       {
>>         nds_elt_bits = lane_size (SIMD_CLONE_ARG_TYPE_VECTOR, ret_type);
>> +      wds_elt_bits = nds_elt_bits;
>>         vec_elts.safe_push (std::make_pair (ret_type, nds_elt_bits));
>>       }
>>     else
>> -    nds_elt_bits = POINTER_SIZE;
>> +    {
>> +      nds_elt_bits = POINTER_SIZE;
>> +      wds_elt_bits = 0;
>> +    }
>>   
>>     int i;
>>     tree type_arg_types = TYPE_ARG_TYPES (TREE_TYPE (node->decl));
>> @@ -28583,44 +28587,72 @@ aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
>>     for (t = (decl_arg_p ? DECL_ARGUMENTS (node->decl) : type_arg_types), i = 0;
>>          t && t != void_list_node; t = TREE_CHAIN (t), i++)
>>       {
>> -      tree arg_type = decl_arg_p ? TREE_TYPE (t) : TREE_VALUE (t);
>> +      tree type = decl_arg_p ? TREE_TYPE (t) : TREE_VALUE (t);
>>         if (clonei->args[i].arg_type != SIMD_CLONE_ARG_TYPE_UNIFORM
>> -	  && !supported_simd_type (arg_type))
>> +	  && !supported_simd_type (type))
>>   	{
>>   	  if (!explicit_p)
>>   	    ;
>> -	  else if (COMPLEX_FLOAT_TYPE_P (ret_type))
>> +	  else if (COMPLEX_FLOAT_TYPE_P (type))
>>   	    warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
>>   			"GCC does not currently support argument type %qT "
>> -			"for simd", arg_type);
>> +			"for simd", type);
>>   	  else
>>   	    warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
>>   			"unsupported argument type %qT for simd",
>> -			arg_type);
>> +			type);
>>   	  return 0;
>>   	}
>> -      unsigned lane_bits = lane_size (clonei->args[i].arg_type, arg_type);
>> +      unsigned lane_bits = lane_size (clonei->args[i].arg_type, type);
>>         if (clonei->args[i].arg_type == SIMD_CLONE_ARG_TYPE_VECTOR)
>> -	vec_elts.safe_push (std::make_pair (arg_type, lane_bits));
>> +	vec_elts.safe_push (std::make_pair (type, lane_bits));
>>         if (nds_elt_bits > lane_bits)
>>   	nds_elt_bits = lane_bits;
>> +      if (wds_elt_bits < lane_bits)
>> +	wds_elt_bits = lane_bits;
>>       }
>>   
>> -  clonei->vecsize_mangle = 'n';
>> +  /* If we could not determine the WDS type from available parameters/return,
>> +     then fallback to using uintptr_t.  */
>> +  if (wds_elt_bits == 0)
>> +    wds_elt_bits = POINTER_SIZE;
>> +
>>     clonei->mask_mode = VOIDmode;
>>     poly_uint64 simdlen;
>> -  auto_vec<poly_uint64> simdlens (2);
>> +  auto_vec<poly_uint64> simdlens (3);
>> +  auto_vec<char> simdmangle (3);
> 
> Minor, but I think it'd be neater to use an ad-hoc structure that
> contains the mangling prefix and simdlen together, so that only one
> vector is needed.  Brace initialization should make it a bit shorter too.
> 
>>     /* Keep track of the possible simdlens the clones of this function can have,
>>        and check them later to see if we support them.  */
>>     if (known_eq (clonei->simdlen, 0U))
>>       {
>>         simdlen = exact_div (poly_uint64 (64), nds_elt_bits);
>>         if (maybe_ne (simdlen, 1U))
>> -	simdlens.safe_push (simdlen);
>> +	{
>> +	  simdlens.safe_push (simdlen);
>> +	  simdmangle.safe_push ('n');
>> +	}
>>         simdlens.safe_push (simdlen * 2);
>> +      simdmangle.safe_push ('n');
>> +      /* Only create a SVE simd clone if we aren't dealing with an unprototyped
>> +	 function.
>> +	We have also disabled support for creating SVE simdclones for functions
>> +	with function bodies and any simdclones when -msve-vector-bits is used.
>> +	TODO: add support for these.  */
>> +      if ((DECL_ARGUMENTS (node->decl) != 0
>> +	   || type_arg_types != 0)
> 
> I think my comment from the previous review still stands:
> 
>    This check feels a bit indirect.  Does it work to use:
> 
>      if (prototype_p (TREE_TYPE (node->decl)))
> 
>    instead?
> 
> Or does that not work?
> 
>> +	  && !node->definition
>> +	  && !aarch64_sve_vg.is_constant ())
>> +	{
>> +	  poly_uint64 sve_simdlen = aarch64_sve_vg * 64;
>> +	  simdlens.safe_push (exact_div (sve_simdlen, wds_elt_bits));
> 
> Simpler as:
> 
> 	  simdlens.safe_push (exact_div (BITS_PER_SVE_VECTOR, wds_elt_bits));
> 
>> +	  simdmangle.safe_push ('s');
>> +	}
>>       }
>>     else
>> -    simdlens.safe_push (clonei->simdlen);
>> +    {
>> +      simdlens.safe_push (clonei->simdlen);
>> +      simdmangle.safe_push ('n');
>> +    }
>>   
>>     clonei->vecsize_int = 0;
>>     clonei->vecsize_float = 0;
>> @@ -28638,7 +28670,8 @@ aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
>>       {
>>         bool remove_simdlen = false;
>>         for (auto elt : vec_elts)
>> -	if (known_gt (simdlens[j] * elt.second, 128U))
>> +	if (simdmangle[j] == 'n'
>> +	    && known_gt (simdlens[j] * elt.second, 128U))
>>   	  {
>>   	    /* Don't issue a warning for every simdclone when there is no
>>   	       specific simdlen clause.  */
>> @@ -28651,12 +28684,14 @@ aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
>>   	    break;
>>   	  }
>>         if (remove_simdlen)
>> -	simdlens.ordered_remove (j);
>> +	{
>> +	  simdlens.ordered_remove (j);
>> +	  simdmangle.ordered_remove (j);
>> +	}
>>         else
>>   	j++;
>>       }
>>   
>> -
>>     int count = simdlens.length ();
>>     if (count == 0)
>>       {
>> @@ -28675,20 +28710,107 @@ aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
>>   
>>     gcc_assert (num < count);
>>     clonei->simdlen = simdlens[num];
>> +  clonei->vecsize_mangle = simdmangle[num];
>> +  /* SVE simdclones always have a Mask, so set inbranch to 1.  */
>> +  if (clonei->vecsize_mangle == 's')
>> +    clonei->inbranch = 1;
>>     return count;
>>   }
>>   
>> +static tree
>> +simd_clone_adjust_sve_vector_type (tree type, bool is_mask, poly_uint64 simdlen)
>> +{
>> +    unsigned int num_zr = 0;
> 
>  From the previous review:
> 
>    Nits: missing function comment.  The body is indented by too many columns.
> 
>> +    unsigned int num_pr = 0;
>> +    machine_mode vector_mode;
>> +    type = TREE_TYPE (type);
>> +    scalar_mode scalar_m = as_a <scalar_mode> (TYPE_MODE (type));
> 
> SCALAR_TYPE_MODE
> 
>> +    gcc_assert (aarch64_sve_data_mode (scalar_m,
>> +				       simdlen).exists (&vector_mode));
> 
> Better to use require () instead, since gcc_asserts can be compiled out.
> 
>> +    type = build_vector_type_for_mode (type, vector_mode);
>> +    if (is_mask)
>> +      {
>> +	type = truth_type_for (type);
>> +	num_pr = 1;
>> +      }
>> +    else
>> +      num_zr = 1;
>> +
>> +    aarch64_sve::add_sve_type_attribute (type, num_zr, num_pr, NULL,
>> +					 NULL);
> 
> The comment from my previous review still stands:
> 
>    Before adding the atttribute, I think we should call:
> 
>      type = build_distinct_type_copy (type);
> 
>    so that we don't change a type that is already in use, or associate
>    any new types with this one.
> 
> I think it'd also be worth adding a comment to say why we take this
> approach instead of reusing ACLE types.  (The reason being that we need
> to handle unpacked vectors as well, which the ACLE doesn't provide.)
> 
>> +    return type;
>> +}
>> +
>>   /* Implement TARGET_SIMD_CLONE_ADJUST.  */
>>   
>>   static void
>>   aarch64_simd_clone_adjust (struct cgraph_node *node)
>>   {
>> -  /* Add aarch64_vector_pcs target attribute to SIMD clones so they
>> -     use the correct ABI.  */
>> -
>>     tree t = TREE_TYPE (node->decl);
>> -  TYPE_ATTRIBUTES (t) = make_attribute ("aarch64_vector_pcs", "default",
>> -					TYPE_ATTRIBUTES (t));
>> +  cl_target_option cur_target;
>> +  bool m_old_have_regs_of_mode[MAX_MACHINE_MODE];
>> +
>> +  if (node->simdclone->vecsize_mangle == 's')
>> +    {
>> +      tree target = build_string (strlen ("+sve"), "+sve");
> 
> Probably worth adding a comment here to say (as you noted in the reply
> to the last review) that this is additive and has no effect if SVE (or
> higher) is already enabled.
> 
>> +      aarch64_option_valid_attribute_p (node->decl, NULL_TREE, target, 0);
> 
> I still think it'd be better to assert that this succeeds (via
> a gcc_unreachable).  It looks weird to call a _p function and not test
> the result.
> 
>> +      cl_target_option_save (&cur_target, &global_options, &global_options_set);
>> +      tree new_target = DECL_FUNCTION_SPECIFIC_TARGET (node->decl);
>> +      cl_target_option_restore (&global_options, &global_options_set,
>> +				TREE_TARGET_OPTION (new_target));
>> +      aarch64_override_options_internal (&global_options);
>> +      memcpy (m_old_have_regs_of_mode, have_regs_of_mode,
>> +	      sizeof (have_regs_of_mode));
>> +      for (int i = 0; i < NUM_MACHINE_MODES; ++i)
>> +	if (aarch64_sve_mode_p ((machine_mode) i))
>> +	  have_regs_of_mode[i] = true;
> 
> Sorry, just realised I never replied to your question about the
> push_cfun/pop_cfun suggestion.  I think the function we'd push is
> node->decl, i.e. the one that received the +sve target attribute.
> 
> I.e. could we do:
> 
>      push_cfun (node->decl);
> 
> after aarch64_option_valid_attribute_p and skip the rest?  Then do
> pop_cfun as the restoration step.

On this matter, I will just quote Andre verbatim, as he's correct:

push_cfun expects a function structure and DECL_STRUCT_FUNCTION
(node->decl) returns 0 for functions without function bodies, so I can't
use it here.
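For reference, a minimal sketch of the shape that suggestion takes and
of the guard that rules it out here (the comment and control flow are
hypothetical; push_cfun, pop_cfun and DECL_STRUCT_FUNCTION are the
existing interfaces):

   /* DECL_STRUCT_FUNCTION is only non-null for decls with bodies, so
      for declaration-only simd clones this branch is never taken and
      the explicit cl_target_option_save/restore in the patch is still
      needed.  */
   struct function *fn = DECL_STRUCT_FUNCTION (node->decl);
   if (fn)
     {
       push_cfun (fn);
       /* ... adjust the clone's types under the new target here ... */
       pop_cfun ();
     }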

> 
> Does the above work with the:
> 
>    /* If what we're processing is the current pragma string then the
>       target option node is already stored in target_option_current_node
>       by aarch64_pragma_target_parse in aarch64-c.cc.  Use that to avoid
>       having to re-parse the string.  This is especially useful to keep
>       arm_neon.h compile times down since that header contains a lot
>       of intrinsics enclosed in pragmas.  */
>    if (!existing_target && args == current_target_pragma)
> 
> shortcut in aarch64_override_options_internal?  I have no particular
> reason to believe that it wouldn't, just wanted to check...

I'm afraid this won't work for a few reasons.

The first is that args is a STRING_CST while current_target_pragma is a
TREE_LIST, so the equality test can never succeed as written.

We could, in principle, fix the issue as follows:

   else if (TREE_CODE (args) == STRING_CST
            && list_length (current_target_pragma) == 1)
     {
       if (TREE_STRING_LENGTH (TREE_VALUE (current_target_pragma))
             == TREE_STRING_LENGTH (args)
           && !strcmp (TREE_STRING_POINTER (args),
                       TREE_STRING_POINTER (TREE_VALUE (current_target_pragma))))
         return true;
     }

But this exposes another issue. `current_target_pragma' gets updated as
we parse the source code from top to bottom.  Its state coming out of
the frontend therefore reflects the last `#pragma GCC target' seen and
doesn't appear to change from that point onward in the compilation
process.

Now suppose that the last target pragma in the file is "+simd".

`pass_omp_simd_clone' processes our functions in reverse order
(effectively from the bottom of the source file upwards).  When it runs
the above check it will always see "+simd", which becomes wrong as soon
as the bottom-up walk moves above the `#pragma GCC target ("+simd")'
statement.  So it is just as well that the `if (!existing_target &&
args == current_target_pragma)' test always fails.
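
A contrived sketch of that scenario (hypothetical source; only the
ordering of the pragmas matters):

   #pragma omp declare simd
   extern int f (int);    /* sits above the pragma, processed later  */

   #pragma GCC target ("+simd")
   extern int g (int);    /* last target pragma the frontend sees  */

By the time pass_omp_simd_clone reaches f in its bottom-up walk,
current_target_pragma still holds "+simd", even though that pragma
never covered f.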

Regards,
Victor

>> +    }
>> +  else
>> +    {
>> +	/* Add aarch64_vector_pcs target attribute to SIMD clones so they
>> +	   use the correct ABI.  */
>> +	TYPE_ATTRIBUTES (t) = make_attribute ("aarch64_vector_pcs", "default",
>> +					      TYPE_ATTRIBUTES (t));
>> +    }
>> +  cgraph_simd_clone *sc = node->simdclone;
>> +
>> +  for (unsigned i = 0; i < sc->nargs; ++i)
>> +    {
>> +      bool is_mask = false;
>> +      tree type;
>> +      switch (sc->args[i].arg_type)
>> +	{
>> +	case SIMD_CLONE_ARG_TYPE_MASK:
>> +	  is_mask = true;
>> +	  gcc_fallthrough ();
>> +	case SIMD_CLONE_ARG_TYPE_VECTOR:
>> +	case SIMD_CLONE_ARG_TYPE_LINEAR_VAL_CONSTANT_STEP:
>> +	case SIMD_CLONE_ARG_TYPE_LINEAR_VAL_VARIABLE_STEP:
>> +	  type = sc->args[i].vector_type;
>> +	  gcc_assert (VECTOR_TYPE_P (type));
>> +	  if (node->simdclone->vecsize_mangle == 's')
>> +	    type = simd_clone_adjust_sve_vector_type (type, is_mask,
>> +						      sc->simdlen);
>> +	  else if (is_mask)
>> +	    type = truth_type_for (type);
>> +	  sc->args[i].vector_type = type;
> 
> Probably best to add a break here (or a fall-through if you prefer).
> 
>> +	default:
>> +	    continue;
> 
> Nit: over-indented continue.  But it might as well be a break.
> 
>> +	}
>> +    }
>> +  if (node->simdclone->vecsize_mangle == 's')
>> +    {
>> +      tree ret_type = TREE_TYPE (t);
>> +      if (VECTOR_TYPE_P (ret_type))
>> +	TREE_TYPE (t)
>> +	  = simd_clone_adjust_sve_vector_type (ret_type, false,
>> +					       node->simdclone->simdlen);
>> +      /* Restore current options.  */
>> +      cl_target_option_restore (&global_options, &global_options_set, &cur_target);
>> +      aarch64_override_options_internal (&global_options);
>> +      memcpy (have_regs_of_mode, m_old_have_regs_of_mode,
>> +	      sizeof (have_regs_of_mode));
>> +    }
>>   }
>>   
>>   /* Implement TARGET_SIMD_CLONE_USABLE.  */
>> @@ -28705,6 +28827,10 @@ aarch64_simd_clone_usable (struct cgraph_node *node, stmt_vec_info stmt_vinfo)
>>   	  && aarch64_sve_mode_p (TYPE_MODE (STMT_VINFO_VECTYPE (stmt_vinfo))))
>>   	return -1;
>>         return 0;
>> +    case 's':
>> +      if (!TARGET_SVE)
>> +	return -1;
>> +      return 0;
>>       default:
>>         gcc_unreachable ();
>>       }
>> diff --git a/gcc/omp-simd-clone.cc b/gcc/omp-simd-clone.cc
>> index 864586207ee..066b6217253 100644
>> --- a/gcc/omp-simd-clone.cc
>> +++ b/gcc/omp-simd-clone.cc
>> @@ -541,9 +541,12 @@ simd_clone_mangle (struct cgraph_node *node,
>>     pp_string (&pp, "_ZGV");
>>     pp_character (&pp, vecsize_mangle);
>>     pp_character (&pp, mask);
>> -  /* For now, simdlen is always constant, while variable simdlen pp 'n'.  */
>> -  unsigned int len = simdlen.to_constant ();
>> -  pp_decimal_int (&pp, (len));
>> +
>> +  unsigned long long len = 0;
> 
> unsigned HOST_WIDE_INT
> 
>> +  if (simdlen.is_constant (&len))
>> +    pp_decimal_int (&pp, (int) (len));
>> +  else
>> +    pp_character (&pp, 'x');
>>   
>>     for (n = 0; n < clone_info->nargs; ++n)
>>       {
>> @@ -1533,8 +1536,8 @@ simd_clone_adjust (struct cgraph_node *node)
>>   	 below).  */
>>         loop = alloc_loop ();
>>         cfun->has_force_vectorize_loops = true;
>> -      /* For now, simlen is always constant.  */
>> -      loop->safelen = node->simdclone->simdlen.to_constant ();
>> +      /* We can assert that safelen is the 'minimum' simdlen.  */
>> +      loop->safelen = constant_lower_bound (node->simdclone->simdlen);
>>         loop->force_vectorize = true;
>>         loop->header = body_bb;
>>       }
>> diff --git a/gcc/testsuite/c-c++-common/gomp/declare-variant-14.c b/gcc/testsuite/c-c++-common/gomp/declare-variant-14.c
>> index e3668893afe..2b71869787e 100644
>> --- a/gcc/testsuite/c-c++-common/gomp/declare-variant-14.c
>> +++ b/gcc/testsuite/c-c++-common/gomp/declare-variant-14.c
>> @@ -1,6 +1,6 @@
>> -/* { dg-do compile { target vect_simd_clones } } */
>> +/* { dg-do compile { target { { i?86-*-* x86_64-*-* } && vect_simd_clones } } } */
>>   /* { dg-additional-options "-fdump-tree-gimple -fdump-tree-optimized" } */
>> -/* { dg-additional-options "-mno-sse3" { target { i?86-*-* x86_64-*-* } } } */
>> +/* { dg-additional-options "-mno-sse3" } */
> 
> Please get Jakub's OK for this part.  Similarly for the Fortran test.
> 
>>   
>>   int f01 (int);
>>   int f02 (int);
>> @@ -15,15 +15,13 @@ int
>>   test1 (int x)
>>   {
>>     /* At gimplification time, we can't decide yet which function to call.  */
>> -  /* { dg-final { scan-tree-dump-times "f04 \\\(x" 2 "gimple" { target { !aarch64*-*-* } } } } */
>> +  /* { dg-final { scan-tree-dump-times "f04 \\\(x" 2 "gimple" } } */
>>     /* After simd clones are created, the original non-clone test1 shall
>>        call f03 (score 6), the sse2/avx/avx2 clones too, but avx512f clones
>>        shall call f01 with score 8.  */
>>     /* { dg-final { scan-tree-dump-not "f04 \\\(x" "optimized" } } */
>> -  /* { dg-final { scan-tree-dump-times "f03 \\\(x" 14 "optimized" { target { !aarch64*-*-* } } } } */
>> -  /* { dg-final { scan-tree-dump-times "f03 \\\(x" 10 "optimized" { target { aarch64*-*-* } } } } */
>> -  /* { dg-final { scan-tree-dump-times "f01 \\\(x" 4 "optimized" { target { !aarch64*-*-* } } } } */
>> -  /* { dg-final { scan-tree-dump-times "f01 \\\(x" 0 "optimized" { target { aarch64*-*-* } } } } */
>> +  /* { dg-final { scan-tree-dump-times "f03 \\\(x" 14 "optimized" } } */
>> +  /* { dg-final { scan-tree-dump-times "f01 \\\(x" 4 "optimized" } } */
>>     int a = f04 (x);
>>     int b = f04 (x);
>>     return a + b;
> 
> This part I feel safer with :)
> 
>> diff --git a/gcc/testsuite/gcc.target/aarch64/declare-simd-2.c b/gcc/testsuite/gcc.target/aarch64/declare-simd-2.c
>> index e2e80f0c663..2f4d3a866e5 100644
>> --- a/gcc/testsuite/gcc.target/aarch64/declare-simd-2.c
>> +++ b/gcc/testsuite/gcc.target/aarch64/declare-simd-2.c
>> @@ -43,6 +43,7 @@ float f04 (double a)
>>   }
>>   /* { dg-final { scan-assembler {_ZGVnN2v_f04:} } } */
>>   /* { dg-final { scan-assembler {_ZGVnM2v_f04:} } } */
>> +/* { dg-final { scan-assembler-not {_ZGVs[0-9a-z]*_f04:} } } */
>>   
>>   #pragma omp declare simd uniform(a) linear (b)
>>   void f05 (short a, short *b, short c)
>> diff --git a/gcc/testsuite/gcc.target/aarch64/vect-simd-clone-1.c b/gcc/testsuite/gcc.target/aarch64/vect-simd-clone-1.c
>> new file mode 100644
>> index 00000000000..71fd361acec
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.target/aarch64/vect-simd-clone-1.c
>> @@ -0,0 +1,52 @@
>> +/* { dg-do compile }  */
>> +/* { dg-options "-std=c99" } */
>> +/* { dg-additional-options "-O3 -march=armv8-a+sve -mcpu=neoverse-n2" } */
>> +extern int __attribute__ ((simd, const)) fn0 (int);
>> +
>> +void test_fn0 (int *a, int *b, int n)
>> +{
>> +  for (int i = 0; i < n; ++i)
>> +    a[i] += fn0 (b[i]);
>> +}
>> +
>> +/* { dg-final { scan-assembler {_ZGVsMxv_fn0} } } */
>> +
>> +extern int __attribute__ ((simd, const)) fn1 (short, int);
>> +
>> +void test_fn1 (int *a, int *b, short *c, int n)
>> +{
>> +  for (int i = 0; i < n; ++i)
>> +    a[i] = fn1 (c[i], b[i]);
>> +}
>> +
>> +/* { dg-final { scan-assembler {_ZGVsMxvv_fn1} } } */
>> +
>> +extern short __attribute__ ((simd, const)) fn2 (short, int);
>> +
>> +void test_fn2 (short *a, int *b, short *c, int n)
>> +{
>> +  for (int i = 0; i < n; ++i)
>> +    a[i] = fn2 (c[i], b[i]);
>> +}
>> +
>> +/* { dg-final { scan-assembler {_ZGVsMxvv_fn2} } } */
>> +
>> +extern char __attribute__ ((simd, const)) fn3 (int, char);
>> +
>> +void test_fn3 (int *a, int *b, char *c, int n)
>> +{
>> +  for (int i = 0; i < n; ++i)
>> +    a[i] = (int) (fn3 (b[i], c[i]) + c[i]);
>> +}
>> +
>> +/* { dg-final { scan-assembler {_ZGVsMxvv_fn3} } } */
>> +
>> +extern short __attribute__ ((simd, const)) fn4 (int, short);
>> +
>> +void test_fn4 (int *a, int *b, short *c, int n)
>> +{
>> +  for (int i = 0; i < n; ++i)
>> +    a[i] = (int) (fn4 (b[i], c[i]) + c[i]);
>> +}
>> +
>> +/* { dg-final { scan-assembler {_ZGVsMxvv_fn4} } } */
> 
> It'd be nice to have some more specific testing here.  Although there
> are 5 different signatures, the last 4 are interchangeable as far as
> the test goes.  E.g. maybe it would be possible to have some partial
> check-function-bodies tests that match the inner loop.  Do we
> use extending loads for the unpacked vectors?  (Hope so.)
> 
> Thanks,
> Richard
> 
>> diff --git a/gcc/testsuite/gfortran.dg/gomp/declare-variant-14.f90 b/gcc/testsuite/gfortran.dg/gomp/declare-variant-14.f90
>> index 6319df0558f..3c7d093c5c6 100644
>> --- a/gcc/testsuite/gfortran.dg/gomp/declare-variant-14.f90
>> +++ b/gcc/testsuite/gfortran.dg/gomp/declare-variant-14.f90
>> @@ -1,6 +1,6 @@
>> -! { dg-do compile { target vect_simd_clones } }
>> +! { dg-do compile { target { { i?86-*-* x86_64-*-* } && vect_simd_clones } } } */
>>   ! { dg-additional-options "-O0 -fdump-tree-gimple -fdump-tree-optimized" }
>> -! { dg-additional-options "-mno-sse3" { target { i?86-*-* x86_64-*-* } } }
>> +! { dg-additional-options "-mno-sse3" }
>>   
>>   module main
>>     implicit none
>> @@ -41,7 +41,7 @@ contains
>>       ! shall call f01 with score 8.
>>       ! { dg-final { scan-tree-dump-not "f04 \\\(x" "optimized" } }
>>       ! { dg-final { scan-tree-dump-times "f03 \\\(x" 14 "optimized" { target { !aarch64*-*-* } } } }
>> -    ! { dg-final { scan-tree-dump-times "f03 \\\(x" 6 "optimized" { target { aarch64*-*-* } } } }
>> +    ! { dg-final { scan-tree-dump-times "f03 \\\(x" 8 "optimized" { target { aarch64*-*-* } } } }
>>       ! { dg-final { scan-tree-dump-times "f01 \\\(x" 4 "optimized" { target { !aarch64*-*-* } } } }
>>       ! { dg-final { scan-tree-dump-times "f01 \\\(x" 0 "optimized" { target { aarch64*-*-* } } } }
>>       a = f04 (x)
