[PATCH] [AVX512] [PR87767] Optimize memory broadcast for constant vector under AVX512

Jakub Jelinek jakub@redhat.com
Sun Aug 30 09:24:33 GMT 2020


On Fri, Aug 28, 2020 at 06:25:46PM +0200, Jakub Jelinek via Gcc-patches wrote:
> You're right, thanks for spotting it, I've missed that native_encode_rtx will
> do quick_push rather than safe_push.
> 
> Updated patch below, it shouldn't be needed in the second loop, because
> the first loop should already grow it to the largest size.

Besides a bug in i386.md, testing also revealed that I've lost a cast to long
that is needed to avoid breaking 32-bit bootstrap.

This is the version that passed bootstrap/regtest on both x86_64-linux and
i686-linux.  Across both bootstraps/regtests together, it saved (according to
the statistics I've gathered) 63104 .rodata bytes (before constant merging),
in 6814 hits of the data->desc->mark = ~(*slot)->desc->labelno; statement.

Ok for trunk?

2020-08-30  Jakub Jelinek  <jakub@redhat.com>

	PR middle-end/54201
	* varasm.c: Include alloc-pool.h.
	(output_constant_pool_contents): Emit desc->mark < 0 entries as
	aliases.
	(struct constant_descriptor_rtx_data): New type.
	(constant_descriptor_rtx_data_cmp): New function.
	(struct const_rtx_data_hasher): New type.
	(const_rtx_data_hasher::hash, const_rtx_data_hasher::equal): New
	methods.
	(optimize_constant_pool): New function.
	(output_shared_constant_pool): Call it if TARGET_SUPPORTS_ALIASES.

--- gcc/varasm.c.jj	2020-07-28 15:39:10.091755086 +0200
+++ gcc/varasm.c	2020-08-28 18:21:58.943759578 +0200
@@ -57,6 +57,7 @@ along with GCC; see the file COPYING3.
 #include "asan.h"
 #include "rtl-iter.h"
 #include "file-prefix-map.h" /* remap_debug_filename()  */
+#include "alloc-pool.h"
 
 #ifdef XCOFF_DEBUGGING_INFO
 #include "xcoffout.h"		/* Needed for external data declarations.  */
@@ -4198,7 +4199,27 @@ output_constant_pool_contents (struct rt
   class constant_descriptor_rtx *desc;
 
   for (desc = pool->first; desc ; desc = desc->next)
-    if (desc->mark)
+    if (desc->mark < 0)
+      {
+#ifdef ASM_OUTPUT_DEF
+	const char *name = targetm.strip_name_encoding (XSTR (desc->sym, 0));
+	char label[256];
+	char buffer[256 + 32];
+	const char *p;
+
+	ASM_GENERATE_INTERNAL_LABEL (label, "LC", ~desc->mark);
+	p = targetm.strip_name_encoding (label);
+	if (desc->offset)
+	  {
+	    sprintf (buffer, "%s+%ld", p, (long) (desc->offset));
+	    p = buffer;
+	  }
+	ASM_OUTPUT_DEF (asm_out_file, name, p);
+#else
+	gcc_unreachable ();
+#endif
+      }
+    else if (desc->mark)
       {
 	/* If the constant is part of an object_block, make sure that
 	   the constant has been positioned within its block, but do not
@@ -4216,6 +4237,160 @@ output_constant_pool_contents (struct rt
       }
 }
 
+struct constant_descriptor_rtx_data {
+  constant_descriptor_rtx *desc;  /* Pool entry being described.  */
+  target_unit *bytes;  /* Encoded bytes of the (sub)constant.  */
+  unsigned short size;  /* Number of target units at BYTES.  */
+  unsigned short offset;  /* Unit offset of BYTES within DESC's encoding.  */
+  unsigned int hash;  /* Sequence number, later the hash of BYTES.  */
+};
+
+/* qsort callback to sort constant_descriptor_rtx_data * vector by
+   decreasing size, breaking ties by increasing hash.  */
+
+static int
+constant_descriptor_rtx_data_cmp (const void *p1, const void *p2)
+{
+  constant_descriptor_rtx_data *const data1
+    = *(constant_descriptor_rtx_data * const *) p1;
+  constant_descriptor_rtx_data *const data2
+    = *(constant_descriptor_rtx_data * const *) p2;
+  if (data1->size > data2->size)
+    return -1;
+  if (data1->size < data2->size)
+    return 1;
+  if (data1->hash < data2->hash)
+    return -1;
+  gcc_assert (data1->hash > data2->hash);  /* Hashes must be distinct.  */
+  return 1;
+}
+
+struct const_rtx_data_hasher : nofree_ptr_hash<constant_descriptor_rtx_data>
+{
+  static hashval_t hash (constant_descriptor_rtx_data *);
+  static bool equal (constant_descriptor_rtx_data *,
+		     constant_descriptor_rtx_data *);
+};
+
+/* Hash and equality functions for const_rtx_data_hasher hash tables.  */
+
+hashval_t
+const_rtx_data_hasher::hash (constant_descriptor_rtx_data *data)
+{
+  return data->hash;  /* Precomputed by the caller.  */
+}
+
+bool
+const_rtx_data_hasher::equal (constant_descriptor_rtx_data *x,
+			      constant_descriptor_rtx_data *y)
+{
+  if (x->hash != y->hash || x->size != y->size)
+    return 0;
+  unsigned int align1 = x->desc->align;
+  unsigned int align2 = y->desc->align;
+  unsigned int offset1 = (x->offset * BITS_PER_UNIT) & (align1 - 1);
+  unsigned int offset2 = (y->offset * BITS_PER_UNIT) & (align2 - 1);
+  if (offset1)  /* Nonzero offset: derive the alignment from it.  */
+    align1 = least_bit_hwi (offset1);
+  if (offset2)
+    align2 = least_bit_hwi (offset2);
+  if (align2 > align1)  /* Y's alignment exceeds X's: no match.  */
+    return 0;
+  if (memcmp (x->bytes, y->bytes, x->size * sizeof (target_unit)) != 0)
+    return 0;
+  return 1;
+}
+
+/* Attempt to optimize constant pool POOL.  If it contains both CONST_VECTOR
+   constants and scalar constants with the values of CONST_VECTOR elements,
+   try to alias the scalar constants with the CONST_VECTOR elements.  */
+
+static void
+optimize_constant_pool (struct rtx_constant_pool *pool)
+{
+  auto_vec<target_unit, 128> buffer;
+  auto_vec<constant_descriptor_rtx_data *, 128> vec;
+  object_allocator<constant_descriptor_rtx_data>
+    data_pool ("constant_descriptor_rtx_data_pool");
+  int idx = 0;  /* Count of entries collected into VEC.  */
+  size_t size = 0;  /* Sum of the sizes of the collected entries.  */
+  for (constant_descriptor_rtx *desc = pool->first; desc; desc = desc->next)
+    if (desc->mark > 0
+	&& ! (SYMBOL_REF_HAS_BLOCK_INFO_P (desc->sym)
+	      && SYMBOL_REF_BLOCK (desc->sym)))
+      {
+	buffer.truncate (0);
+	buffer.reserve (GET_MODE_SIZE (desc->mode));
+	if (native_encode_rtx (desc->mode, desc->constant, buffer, 0,
+			       GET_MODE_SIZE (desc->mode)))
+	  {
+	    constant_descriptor_rtx_data *data = data_pool.allocate ();
+	    data->desc = desc;
+	    data->bytes = NULL;
+	    data->size = GET_MODE_SIZE (desc->mode);
+	    data->offset = 0;
+	    data->hash = idx++;  /* Unique id; sort tie-breaker for now.  */
+	    size += data->size;
+	    vec.safe_push (data);
+	  }
+      }
+  if (idx)
+    {
+      vec.qsort (constant_descriptor_rtx_data_cmp);  /* Largest first.  */
+      unsigned min_size = vec.last ()->size;  /* Smallest entry size.  */
+      target_unit *bytes = XNEWVEC (target_unit, size);
+      unsigned int i;
+      constant_descriptor_rtx_data *data;
+      hash_table<const_rtx_data_hasher> * htab
+	= new hash_table<const_rtx_data_hasher> (31);
+      size = 0;  /* Now the fill position within BYTES.  */
+      FOR_EACH_VEC_ELT (vec, i, data)
+	{
+	  buffer.truncate (0);
+	  native_encode_rtx (data->desc->mode, data->desc->constant,
+			     buffer, 0, data->size);
+	  memcpy (bytes + size, buffer.address (), data->size);
+	  data->bytes = bytes + size;
+	  data->hash = iterative_hash (data->bytes,
+				       data->size * sizeof (target_unit), 0);
+	  size += data->size;
+	  constant_descriptor_rtx_data **slot
+	    = htab->find_slot_with_hash (data, data->hash, INSERT);
+	  if (*slot)  /* Same bytes already in the table: alias to them.  */
+	    {
+	      data->desc->mark = ~(*slot)->desc->labelno;
+	      data->desc->offset = (*slot)->offset;
+	    }
+	  else
+	    {
+	      unsigned int sz = 1 << floor_log2 (data->size);
+
+	      *slot = data;  /* First occurrence: record it.  */
+	      for (sz >>= 1; sz >= min_size; sz >>= 1)
+		for (unsigned off = 0; off + sz <= data->size; off += sz)
+		  {
+		    constant_descriptor_rtx_data tmp;  /* SZ units at OFF.  */
+		    tmp.desc = data->desc;
+		    tmp.bytes = data->bytes + off;
+		    tmp.size = sz;
+		    tmp.offset = off;
+		    tmp.hash = iterative_hash (tmp.bytes,
+					       sz * sizeof (target_unit), 0);
+		    slot = htab->find_slot_with_hash (&tmp, tmp.hash, INSERT);
+		    if (*slot == NULL)  /* Existing entries win.  */
+		      {
+			*slot = data_pool.allocate ();
+			**slot = tmp;
+		      }
+		  }
+	    }
+	}
+      delete htab;
+      XDELETE (bytes);
+    }
+  data_pool.release ();
+}
+
 /* Mark all constants that are used in the current function, then write
    out the function's private constant pool.  */
 
@@ -4251,6 +4426,10 @@ output_constant_pool (const char *fnname
 void
 output_shared_constant_pool (void)
 {
+  if (optimize
+      && TARGET_SUPPORTS_ALIASES)  /* Duplicates are emitted as aliases.  */
+    optimize_constant_pool (shared_constant_pool);
+
   output_constant_pool_contents (shared_constant_pool);
 }
 


	Jakub



More information about the Gcc-patches mailing list