[PATCH] [AVX512] [PR87767] Optimize memory broadcast for constant vector under AVX512

Richard Biener richard.guenther@gmail.com
Mon Aug 31 08:18:06 GMT 2020


On Sun, Aug 30, 2020 at 11:24 AM Jakub Jelinek <jakub@redhat.com> wrote:
>
> On Fri, Aug 28, 2020 at 06:25:46PM +0200, Jakub Jelinek via Gcc-patches wrote:
> > You're right, thanks for spotting it; I had missed that native_encode_rtx
> > will do quick_push rather than safe_push.
> >
> > Updated patch below.  The buffer.reserve call shouldn't be needed in the
> > second loop, because the first loop should already have grown the buffer
> > to the largest size.
>
> Besides a bug in i386.md, testing also revealed that I had lost a cast to
> long, which is needed to avoid breaking 32-bit bootstrap.
>
> This is the version that passed bootstrap/regtest on both x86_64-linux and
> i686-linux.  In both bootstraps/regtests together, it saved (from the
> statistics I've gathered) 63104 .rodata bytes (before constant merging),
> in 6814 hits of the data->desc->mark = ~(*slot)->desc->labelno;.
>
> Ok for trunk?

OK.

Thanks,
Richard.

> 2020-08-30  Jakub Jelinek  <jakub@redhat.com>
>
>         PR middle-end/54201
>         * varasm.c: Include alloc-pool.h.
>         (output_constant_pool_contents): Emit desc->mark < 0 entries as
>         aliases.
>         (struct constant_descriptor_rtx_data): New type.
>         (constant_descriptor_rtx_data_cmp): New function.
>         (struct const_rtx_data_hasher): New type.
>         (const_rtx_data_hasher::hash, const_rtx_data_hasher::equal): New
>         methods.
>         (optimize_constant_pool): New function.
>         (output_shared_constant_pool): Call it if TARGET_SUPPORTS_ALIASES.
>
> --- gcc/varasm.c.jj     2020-07-28 15:39:10.091755086 +0200
> +++ gcc/varasm.c        2020-08-28 18:21:58.943759578 +0200
> @@ -57,6 +57,7 @@ along with GCC; see the file COPYING3.
>  #include "asan.h"
>  #include "rtl-iter.h"
>  #include "file-prefix-map.h" /* remap_debug_filename()  */
> +#include "alloc-pool.h"
>
>  #ifdef XCOFF_DEBUGGING_INFO
>  #include "xcoffout.h"          /* Needed for external data declarations.  */
> @@ -4198,7 +4199,27 @@ output_constant_pool_contents (struct rt
>    class constant_descriptor_rtx *desc;
>
>    for (desc = pool->first; desc ; desc = desc->next)
> -    if (desc->mark)
> +    if (desc->mark < 0)
> +      {
> +#ifdef ASM_OUTPUT_DEF
> +       const char *name = targetm.strip_name_encoding (XSTR (desc->sym, 0));
> +       char label[256];
> +       char buffer[256 + 32];
> +       const char *p;
> +
> +       ASM_GENERATE_INTERNAL_LABEL (label, "LC", ~desc->mark);
> +       p = targetm.strip_name_encoding (label);
> +       if (desc->offset)
> +         {
> +           sprintf (buffer, "%s+%ld", p, (long) (desc->offset));
> +           p = buffer;
> +         }
> +       ASM_OUTPUT_DEF (asm_out_file, name, p);
> +#else
> +       gcc_unreachable ();
> +#endif
> +      }
> +    else if (desc->mark)
>        {
>         /* If the constant is part of an object_block, make sure that
>            the constant has been positioned within its block, but do not
> @@ -4216,6 +4237,160 @@ output_constant_pool_contents (struct rt
>        }
>  }
>
> +struct constant_descriptor_rtx_data {
> +  constant_descriptor_rtx *desc;
> +  target_unit *bytes;
> +  unsigned short size;
> +  unsigned short offset;
> +  unsigned int hash;
> +};
> +
> +/* qsort callback to sort constant_descriptor_rtx_data * vector by
> +   decreasing size.  */
> +
> +static int
> +constant_descriptor_rtx_data_cmp (const void *p1, const void *p2)
> +{
> +  constant_descriptor_rtx_data *const data1
> +    = *(constant_descriptor_rtx_data * const *) p1;
> +  constant_descriptor_rtx_data *const data2
> +    = *(constant_descriptor_rtx_data * const *) p2;
> +  if (data1->size > data2->size)
> +    return -1;
> +  if (data1->size < data2->size)
> +    return 1;
> +  if (data1->hash < data2->hash)
> +    return -1;
> +  gcc_assert (data1->hash > data2->hash);
> +  return 1;
> +}
> +
> +struct const_rtx_data_hasher : nofree_ptr_hash<constant_descriptor_rtx_data>
> +{
> +  static hashval_t hash (constant_descriptor_rtx_data *);
> +  static bool equal (constant_descriptor_rtx_data *,
> +                    constant_descriptor_rtx_data *);
> +};
> +
> +/* Hash and compare functions for const_rtx_data_htab.  */
> +
> +hashval_t
> +const_rtx_data_hasher::hash (constant_descriptor_rtx_data *data)
> +{
> +  return data->hash;
> +}
> +
> +bool
> +const_rtx_data_hasher::equal (constant_descriptor_rtx_data *x,
> +                             constant_descriptor_rtx_data *y)
> +{
> +  if (x->hash != y->hash || x->size != y->size)
> +    return 0;
> +  unsigned int align1 = x->desc->align;
> +  unsigned int align2 = y->desc->align;
> +  unsigned int offset1 = (x->offset * BITS_PER_UNIT) & (align1 - 1);
> +  unsigned int offset2 = (y->offset * BITS_PER_UNIT) & (align2 - 1);
> +  if (offset1)
> +    align1 = least_bit_hwi (offset1);
> +  if (offset2)
> +    align2 = least_bit_hwi (offset2);
> +  if (align2 > align1)
> +    return 0;
> +  if (memcmp (x->bytes, y->bytes, x->size * sizeof (target_unit)) != 0)
> +    return 0;
> +  return 1;
> +}
> +
> +/* Attempt to optimize constant pool POOL.  If it contains both CONST_VECTOR
> +   constants and scalar constants with the values of CONST_VECTOR elements,
> +   try to alias the scalar constants with the CONST_VECTOR elements.  */
> +
> +static void
> +optimize_constant_pool (struct rtx_constant_pool *pool)
> +{
> +  auto_vec<target_unit, 128> buffer;
> +  auto_vec<constant_descriptor_rtx_data *, 128> vec;
> +  object_allocator<constant_descriptor_rtx_data>
> +    data_pool ("constant_descriptor_rtx_data_pool");
> +  int idx = 0;
> +  size_t size = 0;
> +  for (constant_descriptor_rtx *desc = pool->first; desc; desc = desc->next)
> +    if (desc->mark > 0
> +       && ! (SYMBOL_REF_HAS_BLOCK_INFO_P (desc->sym)
> +             && SYMBOL_REF_BLOCK (desc->sym)))
> +      {
> +       buffer.truncate (0);
> +       buffer.reserve (GET_MODE_SIZE (desc->mode));
> +       if (native_encode_rtx (desc->mode, desc->constant, buffer, 0,
> +                              GET_MODE_SIZE (desc->mode)))
> +         {
> +           constant_descriptor_rtx_data *data = data_pool.allocate ();
> +           data->desc = desc;
> +           data->bytes = NULL;
> +           data->size = GET_MODE_SIZE (desc->mode);
> +           data->offset = 0;
> +           data->hash = idx++;
> +           size += data->size;
> +           vec.safe_push (data);
> +         }
> +      }
> +  if (idx)
> +    {
> +      vec.qsort (constant_descriptor_rtx_data_cmp);
> +      unsigned min_size = vec.last ()->size;
> +      target_unit *bytes = XNEWVEC (target_unit, size);
> +      unsigned int i;
> +      constant_descriptor_rtx_data *data;
> +      hash_table<const_rtx_data_hasher> * htab
> +       = new hash_table<const_rtx_data_hasher> (31);
> +      size = 0;
> +      FOR_EACH_VEC_ELT (vec, i, data)
> +       {
> +         buffer.truncate (0);
> +         native_encode_rtx (data->desc->mode, data->desc->constant,
> +                            buffer, 0, data->size);
> +         memcpy (bytes + size, buffer.address (), data->size);
> +         data->bytes = bytes + size;
> +         data->hash = iterative_hash (data->bytes,
> +                                      data->size * sizeof (target_unit), 0);
> +         size += data->size;
> +         constant_descriptor_rtx_data **slot
> +           = htab->find_slot_with_hash (data, data->hash, INSERT);
> +         if (*slot)
> +           {
> +             data->desc->mark = ~(*slot)->desc->labelno;
> +             data->desc->offset = (*slot)->offset;
> +           }
> +         else
> +           {
> +             unsigned int sz = 1 << floor_log2 (data->size);
> +
> +             *slot = data;
> +             for (sz >>= 1; sz >= min_size; sz >>= 1)
> +               for (unsigned off = 0; off + sz <= data->size; off += sz)
> +                 {
> +                   constant_descriptor_rtx_data tmp;
> +                   tmp.desc = data->desc;
> +                   tmp.bytes = data->bytes + off;
> +                   tmp.size = sz;
> +                   tmp.offset = off;
> +                   tmp.hash = iterative_hash (tmp.bytes,
> +                                              sz * sizeof (target_unit), 0);
> +                   slot = htab->find_slot_with_hash (&tmp, tmp.hash, INSERT);
> +                   if (*slot == NULL)
> +                     {
> +                       *slot = data_pool.allocate ();
> +                       **slot = tmp;
> +                     }
> +                 }
> +           }
> +       }
> +      delete htab;
> +      XDELETE (bytes);
> +    }
> +  data_pool.release ();
> +}
> +
>  /* Mark all constants that are used in the current function, then write
>     out the function's private constant pool.  */
>
> @@ -4251,6 +4426,10 @@ output_constant_pool (const char *fnname
>  void
>  output_shared_constant_pool (void)
>  {
> +  if (optimize
> +      && TARGET_SUPPORTS_ALIASES)
> +    optimize_constant_pool (shared_constant_pool);
> +
>    output_constant_pool_contents (shared_constant_pool);
>  }
>
>
>
>         Jakub
>


More information about the Gcc-patches mailing list