This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.



Re: [PATCH] Lift alignment restrictions from inlining register size memcpy


On 12.07.2016 11:38, Richard Biener wrote:

The following patch does $subject which is requested in a comment in
PR50417 as this restriction defeats the purpose of having memcpy
as a portable and efficient way to circumvent strict-aliasing violations
(or even as a portable and efficient way to do unaligned loads).
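
(For reference, a minimal sketch -- not part of the patch or the PR, helper
name made up -- of the idiom being referred to: memcpy used as a portable
unaligned load that also sidesteps strict-aliasing problems.

#include <string.h>
#include <stdint.h>

/* Load a 32-bit value from a possibly unaligned buffer position;
   the memcpy is expected to fold to a single (misaligned) load.  */
static inline uint32_t
load_u32 (const unsigned char *p)
{
  uint32_t v;
  memcpy (&v, p, sizeof v);
  return v;
}
)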

IIUC the original purpose of the PR is a complaint about execution performance in situations like, say, memcpy (int*, int*, 0x1000).

GCC 4.5 used int-wide chunks in movmem even if no alignment could be inferred from symbols, as in

static int a[..], b[..];
memcpy (a, b, 0x1000)

The more I think about it, the more I tend towards not changing the current behaviour: be conservative and treat memcpy like a function.

If the alignment is deduced from the pointer type, then this is clearly not what the standard says, as you already pointed out in the PR.

Hence, if we want such an "optimization" it should be made optional by means of a respective command-line option, so that the user can explicitly request this ABI change for memcpy.
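
(A hypothetical illustration of the objection -- not code from the PR, names
are made up: the int * arguments of a helper carry no usable alignment
guarantee when a caller passes pointers into packed storage, so deriving
alignment from the pointer type alone would miscompile this on a
strict-alignment target.

#include <string.h>

struct __attribute__((packed)) pkt { char tag; int payload; };

void
copy_int (int *dst, const int *src)
{
  /* Must also work when dst/src are not 4-byte aligned.  */
  memcpy (dst, src, sizeof (int));
}

int
read_payload (struct pkt *p)
{
  int v;
  copy_int (&v, &p->payload);  /* src here is only 1-byte aligned */
  return v;
}
)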

Bootstrap / regtest running on x86_64-unknown-linux-gnu (quite pointless
as there is no change on that arch).

I have added a testcase that should exercise most relevant cases so
we can look for fallout on "interesting" targets.

Bootstrap / regtest on strict-alignment platforms welcome.

I do expect that with -Os expanding unaligned-unaligned moves
might be a size pessimization compared to a libcall (or what
the targets block move expander does).  But the whole point is
to remove abstraction penalty so it isn't an easy stmt-local
decision to make.  Comments on this front welcome as well.

Unless I hear some positives I'll let the patch sit here as I
don't really care too much about those pesky targets (and
targets can choose to opt-in by providing movmisalign optabs
anyway so they don't go the store/extract_bit_field path).

As I said, my doubts are increasing...

If movmem sees no alignment (where 4.5 passed a bigger alignment), then it could just FAIL and assume that the libcall provides a fast implementation. This means inquiring the alignment at runtime and adding a noticeable penalty for small numbers of bytes... but well, we have to live with that, I think.
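
(For illustration, a schematic sketch -- an assumption about what a typical
library memcpy does, not any particular implementation -- of the runtime
alignment dispatch that causes that per-call penalty for small copies:

#include <stddef.h>
#include <stdint.h>

/* Schematic only: real implementations use may_alias types or assembly
   for the word-wise inner loop.  */
void *
sketch_memcpy (void *dst, const void *src, size_t n)
{
  unsigned char *d = dst;
  const unsigned char *s = src;

  /* Runtime alignment check -- pure overhead when n is small.  */
  if ((((uintptr_t) d | (uintptr_t) s) % sizeof (long)) == 0)
    for (; n >= sizeof (long); n -= sizeof (long))
      {
        *(long *) d = *(const long *) s;
        d += sizeof (long);
        s += sizeof (long);
      }

  while (n--)
    *d++ = *s++;

  return dst;
}
)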

... and if it were guarded by an option, -fstrict-align[ment] would be really confusing (to gcc people at least) and also to users of back-ends that support -mstrict-align[ment].


Johann


Thanks,
Richard.

2016-07-12  Richard Biener  <rguenther@suse.de>

	* gimple-fold.c (gimple_fold_builtin_memory_op): Lift alignment
	restrictions from inlining register size memcpy.

	* gcc.dg/torture/builtin-memcpy.c: New testcase.

Index: gcc/gimple-fold.c
===================================================================
*** gcc/gimple-fold.c	(revision 238237)
--- gcc/gimple-fold.c	(working copy)
*************** gimple_fold_builtin_memory_op (gimple_st
*** 705,718 ****
  	      if (type
  		  && TYPE_MODE (type) != BLKmode
  		  && (GET_MODE_SIZE (TYPE_MODE (type)) * BITS_PER_UNIT
! 		      == ilen * 8)
! 		  /* If the destination pointer is not aligned we must be able
! 		     to emit an unaligned store.  */
! 		  && (dest_align >= GET_MODE_ALIGNMENT (TYPE_MODE (type))
! 		      || !SLOW_UNALIGNED_ACCESS (TYPE_MODE (type), dest_align)
! 		      || (optab_handler (movmisalign_optab, TYPE_MODE (type))
! 			  != CODE_FOR_nothing)))
  		{
  		  tree srctype = type;
  		  tree desttype = type;
  		  if (src_align < GET_MODE_ALIGNMENT (TYPE_MODE (type)))
--- 705,718 ----
  	      if (type
  		  && TYPE_MODE (type) != BLKmode
  		  && (GET_MODE_SIZE (TYPE_MODE (type)) * BITS_PER_UNIT
! 		      == ilen * 8))
  		{
+ 		  /* RTL expansion handles misaligned destination / source
+ 		     MEM_REFs either via target provided movmisalign or
+ 		     via extract/store_bit_field for targets that set
+ 		     SLOW_UNALIGNED_ACCESS for the move.  For move
+ 		     quantities up to MOVE_MAX this should be always
+ 		     more efficient than a libcall to memcpy.  */
  		  tree srctype = type;
  		  tree desttype = type;
  		  if (src_align < GET_MODE_ALIGNMENT (TYPE_MODE (type)))
*************** gimple_fold_builtin_memory_op (gimple_st
*** 721,767 ****
  		  tree tem = fold_const_aggregate_ref (srcmem);
  		  if (tem)
  		    srcmem = tem;
! 		  else if (src_align < GET_MODE_ALIGNMENT (TYPE_MODE (type))
! 			   && SLOW_UNALIGNED_ACCESS (TYPE_MODE (type),
! 						     src_align)
! 			   && (optab_handler (movmisalign_optab,
! 					      TYPE_MODE (type))
! 			       == CODE_FOR_nothing))
! 		    srcmem = NULL_TREE;
! 		  if (srcmem)
  		    {
! 		      gimple *new_stmt;
! 		      if (is_gimple_reg_type (TREE_TYPE (srcmem)))
! 			{
! 			  new_stmt = gimple_build_assign (NULL_TREE, srcmem);
! 			  if (gimple_in_ssa_p (cfun))
! 			    srcmem = make_ssa_name (TREE_TYPE (srcmem),
! 						    new_stmt);
! 			  else
! 			    srcmem = create_tmp_reg (TREE_TYPE (srcmem));
! 			  gimple_assign_set_lhs (new_stmt, srcmem);
! 			  gimple_set_vuse (new_stmt, gimple_vuse (stmt));
! 			  gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
! 			}
! 		      if (dest_align < GET_MODE_ALIGNMENT (TYPE_MODE (type)))
! 			desttype = build_aligned_type (type, dest_align);
! 		      new_stmt
! 			= gimple_build_assign (fold_build2 (MEM_REF, desttype,
! 							    dest, off0),
! 					       srcmem);
  		      gimple_set_vuse (new_stmt, gimple_vuse (stmt));
- 		      gimple_set_vdef (new_stmt, gimple_vdef (stmt));
- 		      if (gimple_vdef (new_stmt)
- 			  && TREE_CODE (gimple_vdef (new_stmt)) == SSA_NAME)
- 			SSA_NAME_DEF_STMT (gimple_vdef (new_stmt)) = new_stmt;
- 		      if (!lhs)
- 			{
- 			  gsi_replace (gsi, new_stmt, false);
- 			  return true;
- 			}
  		      gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
- 		      goto done;
  		    }
  		}
  	    }
  	}
--- 721,756 ----
  		  tree tem = fold_const_aggregate_ref (srcmem);
  		  if (tem)
  		    srcmem = tem;
! 		  gimple *new_stmt;
! 		  if (is_gimple_reg_type (TREE_TYPE (srcmem)))
  		    {
! 		      new_stmt = gimple_build_assign (NULL_TREE, srcmem);
! 		      if (gimple_in_ssa_p (cfun))
! 			srcmem = make_ssa_name (TREE_TYPE (srcmem),
! 						new_stmt);
! 		      else
! 			srcmem = create_tmp_reg (TREE_TYPE (srcmem));
! 		      gimple_assign_set_lhs (new_stmt, srcmem);
  		      gimple_set_vuse (new_stmt, gimple_vuse (stmt));
  		      gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
  		    }
+ 		  if (dest_align < GET_MODE_ALIGNMENT (TYPE_MODE (type)))
+ 		    desttype = build_aligned_type (type, dest_align);
+ 		  new_stmt
+ 		    = gimple_build_assign (fold_build2 (MEM_REF, desttype,
+ 							dest, off0), srcmem);
+ 		  gimple_set_vuse (new_stmt, gimple_vuse (stmt));
+ 		  gimple_set_vdef (new_stmt, gimple_vdef (stmt));
+ 		  if (gimple_vdef (new_stmt)
+ 		      && TREE_CODE (gimple_vdef (new_stmt)) == SSA_NAME)
+ 		    SSA_NAME_DEF_STMT (gimple_vdef (new_stmt)) = new_stmt;
+ 		  if (!lhs)
+ 		    {
+ 		      gsi_replace (gsi, new_stmt, false);
+ 		      return true;
+ 		    }
+ 		  gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
+ 		  goto done;
  		}
  	    }
  	}
Index: gcc/testsuite/gcc.dg/torture/builtin-memcpy.c
===================================================================
*** gcc/testsuite/gcc.dg/torture/builtin-memcpy.c	(revision 0)
--- gcc/testsuite/gcc.dg/torture/builtin-memcpy.c	(working copy)
***************
*** 0 ****
--- 1,89 ----
+ /* { dg-do run } */
+
+ char src[32], dest[32];
+ long long llsrc, lldest;
+
+ /* Unaligned source/dest variants.  */
+ void __attribute__((noinline,noclone))
+ copyuu16 (long long *src, long long *dest)
+ {
+   __builtin_memcpy (dest, src, 16);
+ }
+ void __attribute__((noinline,noclone))
+ copyuu8 (long *src, long *dest)
+ {
+   __builtin_memcpy (dest, src, 8);
+ }
+ void __attribute__((noinline,noclone))
+ copyuu4 (int *src, int *dest)
+ {
+   __builtin_memcpy (dest, src, 4);
+ }
+ void __attribute__((noinline,noclone))
+ copyuu2 (short *src, short *dest)
+ {
+   __builtin_memcpy (dest, src, 2);
+ }
+
+ /* Aligned source, unaligned dest variants.  */
+ void __attribute__((noinline,noclone))
+ copyau16 (long long *dest)
+ {
+   __builtin_memcpy (dest, &llsrc, 16);
+ }
+ void __attribute__((noinline,noclone))
+ copyau8 (long long *dest)
+ {
+   __builtin_memcpy (dest, &llsrc, 8);
+ }
+ void __attribute__((noinline,noclone))
+ copyau4 (long long *dest)
+ {
+   __builtin_memcpy (dest, &llsrc, 4);
+ }
+ void __attribute__((noinline,noclone))
+ copyau2 (long long *dest)
+ {
+   __builtin_memcpy (dest, &llsrc, 2);
+ }
+
+ /* Unaligned source, aligned dest variants.  */
+ void __attribute__((noinline,noclone))
+ copyua16 (long long *src)
+ {
+   __builtin_memcpy (&lldest, src, 16);
+ }
+ void __attribute__((noinline,noclone))
+ copyua8 (long *src)
+ {
+   __builtin_memcpy (&lldest, src, 8);
+ }
+ void __attribute__((noinline,noclone))
+ copyua4 (int *src)
+ {
+   __builtin_memcpy (&lldest, src, 4);
+ }
+ void __attribute__((noinline,noclone))
+ copyua2 (short *src)
+ {
+   __builtin_memcpy (&lldest, src, 2);
+ }
+
+ int main()
+ {
+   void *usrc = (void *)(((__UINTPTR_TYPE__)src & -16) + 17);
+   void *udest = (void *)(((__UINTPTR_TYPE__)dest & -16) + 17);
+   copyuu16 (udest, usrc);
+   copyuu8 (udest, usrc);
+   copyuu4 (udest, usrc);
+   copyuu2 (udest, usrc);
+   copyau16 (usrc);
+   copyau8 (usrc);
+   copyau4 (usrc);
+   copyau2 (usrc);
+   copyua16 (udest);
+   copyua8 (udest);
+   copyua4 (udest);
+   copyua2 (udest);
+   return 0;
+ }


