[PATCH] [AVX512] [PR87767] Optimize memory broadcast for constant vector under AVX512

Richard Sandiford richard.sandiford@arm.com
Fri Aug 28 16:07:11 GMT 2020


Thanks for doing this.  I don't feel qualified to review the full
patch, but one thing:

Jakub Jelinek via Gcc-patches <gcc-patches@gcc.gnu.org> writes:
> +  auto_vec<target_unit, 128> buffer;
> +  auto_vec<constant_descriptor_rtx_data *, 128> vec;
> +  object_allocator<constant_descriptor_rtx_data>
> +    data_pool ("constant_descriptor_rtx_data_pool");
> +  int idx = 0;
> +  size_t size = 0;
> +  for (constant_descriptor_rtx *desc = pool->first; desc; desc = desc->next)
> +    if (desc->mark > 0
> +	&& ! (SYMBOL_REF_HAS_BLOCK_INFO_P (desc->sym)
> +	      && SYMBOL_REF_BLOCK (desc->sym)))
> +      {
> +	buffer.truncate (0);

The 128-element inline capacity of "buffer" isn't big enough for all
targets (e.g. aarch64 with -msve-vector-bits=2048, where a vector
constant is 256 bytes), so I think we still need a reserve call here.

Thanks,
Richard

> +	if (native_encode_rtx (desc->mode, desc->constant, buffer, 0,
> +			       GET_MODE_SIZE (desc->mode)))
> +	  {
> +	    constant_descriptor_rtx_data *data = data_pool.allocate ();
> +	    data->desc = desc;
> +	    data->bytes = NULL;
> +	    data->size = GET_MODE_SIZE (desc->mode);
> +	    data->offset = 0;
> +	    data->hash = idx++;
> +	    size += data->size;
> +	    vec.safe_push (data);
> +	  }
> +      }
> +  if (idx)
> +    {
> +      vec.qsort (constant_descriptor_rtx_data_cmp);
> +      unsigned min_size = vec.last ()->size;
> +      target_unit *bytes = XNEWVEC (target_unit, size);
> +      unsigned int i;
> +      constant_descriptor_rtx_data *data;
> +      hash_table<const_rtx_data_hasher> * htab
> +	= new hash_table<const_rtx_data_hasher> (31);
> +      size = 0;
> +      FOR_EACH_VEC_ELT (vec, i, data)
> +	{
> +	  buffer.truncate (0);
> +	  native_encode_rtx (data->desc->mode, data->desc->constant,
> +			     buffer, 0, data->size);
> +	  memcpy (bytes + size, buffer.address (), data->size);
> +	  data->bytes = bytes + size;
> +	  data->hash = iterative_hash (data->bytes,
> +				       data->size * sizeof (target_unit), 0);
> +	  size += data->size;
> +	  constant_descriptor_rtx_data **slot
> +	    = htab->find_slot_with_hash (data, data->hash, INSERT);
> +	  if (*slot)
> +	    {
> +	      data->desc->mark = ~(*slot)->desc->labelno;
> +	      data->desc->offset = (*slot)->offset;
> +	    }
> +	  else
> +	    {
> +	      unsigned int sz = 1 << floor_log2 (data->size);
> +
> +	      *slot = data;
> +	      for (sz >>= 1; sz >= min_size; sz >>= 1)
> +		for (unsigned off = 0; off + sz <= data->size; off += sz)
> +		  {
> +		    constant_descriptor_rtx_data tmp;
> +		    tmp.desc = data->desc;
> +		    tmp.bytes = data->bytes + off;
> +		    tmp.size = sz;
> +		    tmp.offset = off;
> +		    tmp.hash = iterative_hash (tmp.bytes,
> +					       sz * sizeof (target_unit), 0);
> +		    slot = htab->find_slot_with_hash (&tmp, tmp.hash, INSERT);
> +		    if (*slot == NULL)
> +		      {
> +			*slot = data_pool.allocate ();
> +			**slot = tmp;
> +		      }
> +		  }
> +	    }
> +	}
> +      delete htab;
> +      XDELETE (bytes);
> +    }
> +  data_pool.release ();
> +}
> +
>  /* Mark all constants that are used in the current function, then write
>     out the function's private constant pool.  */
>  
> @@ -4251,6 +4425,10 @@ output_constant_pool (const char *fnname
>  void
>  output_shared_constant_pool (void)
>  {
> +  if (optimize
> +      && TARGET_SUPPORTS_ALIASES)
> +    optimize_constant_pool (shared_constant_pool);
> +
>    output_constant_pool_contents (shared_constant_pool);
>  }
>  
>
>
> 	Jakub


More information about the Gcc-patches mailing list