[PATCH][RFC] Fix PR61473, inline small memcpy/memmove during tree opts

Richard Biener rguenther@suse.de
Thu Jun 12 10:15:00 GMT 2014


This implements the requested inlining of memmove for possibly
overlapping arguments by doing first all loads and then all stores.
The easiest place is to do this in memory op folding where we already
perform inlining of some memcpy cases (but fail to do the equivalent
memmove optimization - though RTL expansion later does it).

The following patch restricts us to max. word-mode size.  Ideally
we'd have a way to check for the number of real instructions needed
to load an (aligned) value of size N.  But maybe we don't care
and are fine with doing multiple loads / stores?

Anyway, the following is conservative (but maybe not enough).

Bootstrap / regtest running on x86_64-unknown-linux-gnu.

These transforms don't really belong to GENERIC folding (they
also run at -O0 ...), similar to most builtin foldings.  But this
patch is not to change that.

Any comments on the size/cost issue?

Thanks,
Richard.

2014-06-12  Richard Biener  <rguenther@suse.de>

	PR middle-end/61473
	* builtins.c (fold_builtin_memory_op): Inline memory moves
	that can be implemented with a single load followed by a
	single store.

	* gcc.dg/memmove-4.c: New testcase.

Index: gcc/builtins.c
===================================================================
--- gcc/builtins.c	(revision 211449)
+++ gcc/builtins.c	(working copy)
@@ -8637,11 +8637,53 @@ fold_builtin_memory_op (location_t loc,
       unsigned int src_align, dest_align;
       tree off0;
 
-      if (endp == 3)
+      /* Build accesses at offset zero with a ref-all character type.  */
+      off0 = build_int_cst (build_pointer_type_for_mode (char_type_node,
+							 ptr_mode, true), 0);
+
+      /* If we can perform the copy efficiently with first doing all loads
+         and then all stores, inline it that way.  Currently efficiently
+	 means that we can load all the memory into a single integer
+	 register and are thus limited to word_mode size.  Ideally we'd have
+	 a way to query the largest mode that we can load/store with
+	 a single instruction.  */
+      src_align = get_pointer_alignment (src);
+      dest_align = get_pointer_alignment (dest);
+      if (tree_fits_uhwi_p (len)
+	  && compare_tree_int (len, BITS_PER_WORD / 8) <= 0)
 	{
-	  src_align = get_pointer_alignment (src);
-	  dest_align = get_pointer_alignment (dest);
+	  unsigned ilen = tree_to_uhwi (len);
+	  if (exact_log2 (ilen) != -1)
+	    {
+	      tree type = lang_hooks.types.type_for_size (ilen * 8, 1);
+	      if (type
+		  && TYPE_MODE (type) != BLKmode
+		  && (GET_MODE_SIZE (TYPE_MODE (type)) * BITS_PER_UNIT
+		      == ilen * 8)
+		  /* If the pointers are not aligned we must be able to
+		     emit an unaligned load.  */
+		  && ((src_align >= GET_MODE_ALIGNMENT (TYPE_MODE (type))
+		       && dest_align >= GET_MODE_ALIGNMENT (TYPE_MODE (type)))
+		      || !SLOW_UNALIGNED_ACCESS (TYPE_MODE (type),
+						 MIN (src_align, dest_align))))
+		{
+		  tree srctype = type;
+		  tree desttype = type;
+		  if (src_align < GET_MODE_ALIGNMENT (TYPE_MODE (type)))
+		    srctype = build_aligned_type (type, src_align);
+		  if (dest_align < GET_MODE_ALIGNMENT (TYPE_MODE (type)))
+		    desttype = build_aligned_type (type, dest_align);
+		  destvar = fold_build2 (MEM_REF, desttype, dest, off0);
+		  expr = build2 (MODIFY_EXPR, type,
+				 fold_build2 (MEM_REF, desttype, dest, off0),
+				 fold_build2 (MEM_REF, srctype, src, off0));
+		  goto done;
+		}
+	    }
+	}
 
+      if (endp == 3)
+	{
 	  /* Both DEST and SRC must be pointer types.
 	     ??? This is what old code did.  Is the testing for pointer types
 	     really mandatory?
@@ -8818,10 +8860,6 @@ fold_builtin_memory_op (location_t loc,
       if (!ignore)
         dest = builtin_save_expr (dest);
 
-      /* Build accesses at offset zero with a ref-all character type.  */
-      off0 = build_int_cst (build_pointer_type_for_mode (char_type_node,
-							 ptr_mode, true), 0);
-
       destvar = dest;
       STRIP_NOPS (destvar);
       if (TREE_CODE (destvar) == ADDR_EXPR
@@ -8888,6 +8926,7 @@ fold_builtin_memory_op (location_t loc,
       expr = build2 (MODIFY_EXPR, TREE_TYPE (destvar), destvar, srcvar);
     }
 
+done:
   if (ignore)
     return expr;
 
Index: gcc/testsuite/gcc.dg/memmove-4.c
===================================================================
--- gcc/testsuite/gcc.dg/memmove-4.c	(revision 0)
+++ gcc/testsuite/gcc.dg/memmove-4.c	(working copy)
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-options "-O -fdump-tree-optimized" } */
+
+typedef int w __attribute__((mode(word)));
+
+void b(char *a, char *b, int i)
+{
+  __builtin_memmove (&a[i], &b[i], sizeof(w));
+}
+
+/* { dg-final { scan-tree-dump-not "memmove" "optimized" { xfail { ! non_strict_align } } } } */
+/* { dg-final { cleanup-tree-dump "optimized" } } */



More information about the Gcc-patches mailing list