This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

[PATCH] Avoid epilogue peeling for x264 vectorization in x264_pixel_sad_x4_8x8


The following improves x264 vectorization by avoiding peeling for gaps
noticing that when the upper half of a vector is unused we can
load the lower part only (and fill the upper half with zeros - this
is what x86 does automatically, GIMPLE doesn't allow us to leave
the upper half undefined as RTL would with using subregs).

The implementation is a little bit awkward as for optimal GIMPLE
code-generation and costing we'd like to go the strided load path
instead.  That proves somewhat difficult though thus the following
is easier but doesn't fill out the re-align paths nor the masked
paths (at least the fully masked path would never need peeling for
gaps).

Bootstrapped and tested on x86_64-unknown-linux-gnu, tested with
SPEC CPU 2006 and 2017 with the expected (~4%) improvement for
625.x264_s.  Didn't see any positive or negative effects elsewhere.

Queued for GCC 10.

Richard.

2018-12-12  Richard Biener  <rguenther@suse.de>

	* tree-vect-stmts.c (get_group_load_store_type): Avoid
	peeling for gaps by loading only lower halves of vectors
	if possible.
	(vectorizable_load): Likewise.

	* gcc.dg/vect/slp-reduc-sad-2.c: New testcase.

Index: gcc/tree-vect-stmts.c
===================================================================
--- gcc/tree-vect-stmts.c	(revision 266744)
+++ gcc/tree-vect-stmts.c	(working copy)
@@ -2194,6 +2194,29 @@ get_group_load_store_type (stmt_vec_info
 	      && gap < (vect_known_alignment_in_bytes (first_dr_info)
 			/ vect_get_scalar_dr_size (first_dr_info)))
 	    overrun_p = false;
+
+	  /* If the gap splits the vector in half and the target
+	     can do half-vector operations avoid the epilogue peeling
+	     by simply loading half of the vector only.  Usually
+	     the construction with an upper zero half will be elided.  */
+	  dr_alignment_support alignment_support_scheme;
+	  scalar_mode elmode = SCALAR_TYPE_MODE (TREE_TYPE (vectype));
+	  machine_mode vmode;
+	  if (overrun_p
+	      && !masked_p
+	      && (((alignment_support_scheme
+		      = vect_supportable_dr_alignment (first_dr_info, false)))
+		   == dr_aligned
+		  || alignment_support_scheme == dr_unaligned_supported)
+	      && known_eq (nunits, (group_size - gap) * 2)
+	      && mode_for_vector (elmode, (group_size - gap)).exists (&vmode)
+	      && VECTOR_MODE_P (vmode)
+	      && targetm.vector_mode_supported_p (vmode)
+	      && (convert_optab_handler (vec_init_optab,
+					 TYPE_MODE (vectype), vmode)
+		  != CODE_FOR_nothing))
+	    overrun_p = false;
+
 	  if (overrun_p && !can_overrun_p)
 	    {
 	      if (dump_enabled_p ())
@@ -8362,8 +8385,24 @@ vectorizable_load (stmt_vec_info stmt_in
 		      }
 		    else
 		      {
+			tree ltype = vectype;
+			/* If there's no peeling for gaps but we have a gap
+			   with slp loads then load the lower half of the
+			   vector only.  See get_group_load_store_type for
+			   when we apply this optimization.  */
+			if (slp
+			    && loop_vinfo
+			    && !LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
+			    && DR_GROUP_GAP (first_stmt_info) != 0
+			    && known_eq (nunits,
+					 (group_size
+					  - DR_GROUP_GAP (first_stmt_info)) * 2))
+			  ltype = build_vector_type (TREE_TYPE (vectype),
+						     (group_size
+						      - DR_GROUP_GAP
+						          (first_stmt_info)));
 			data_ref
-			  = fold_build2 (MEM_REF, vectype, dataref_ptr,
+			  = fold_build2 (MEM_REF, ltype, dataref_ptr,
 					 dataref_offset
 					 ? dataref_offset
 					 : build_int_cst (ref_type, 0));
@@ -8377,6 +8416,23 @@ vectorizable_load (stmt_vec_info stmt_in
 			  TREE_TYPE (data_ref)
 			    = build_aligned_type (TREE_TYPE (data_ref),
 						  TYPE_ALIGN (elem_type));
+			if (ltype != vectype)
+			  {
+			    vect_copy_ref_info (data_ref, DR_REF (first_dr_info->dr));
+			    tree tem = make_ssa_name (ltype);
+			    new_stmt = gimple_build_assign (tem, data_ref);
+			    vect_finish_stmt_generation (stmt_info, new_stmt, gsi);
+			    data_ref = NULL;
+			    vec<constructor_elt, va_gc> *v;
+			    vec_alloc (v, 2);
+			    CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, tem);
+			    CONSTRUCTOR_APPEND_ELT (v, NULL_TREE,
+						    build_zero_cst (ltype));
+			    new_stmt
+			      = gimple_build_assign (vec_dest,
+						     build_constructor
+						       (vectype, v));
+			  }
 		      }
 		    break;
 		  }
Index: gcc/testsuite/gcc.dg/vect/slp-reduc-sad-2.c
===================================================================
--- gcc/testsuite/gcc.dg/vect/slp-reduc-sad-2.c	(nonexistent)
+++ gcc/testsuite/gcc.dg/vect/slp-reduc-sad-2.c	(working copy)
@@ -0,0 +1,29 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target vect_usad_char } */
+/* With AVX256 or more we do not pull off the trick eliding the epilogue.  */
+/* { dg-additional-options "-mprefer-avx128" { target { x86_64-*-* i?86-*-* } } } */
+
+typedef unsigned char uint8_t;
+int x264_pixel_sad_8x8( uint8_t *pix1, uint8_t *pix2, int i_stride_pix2 )
+{
+  int i_sum = 0;
+  for( int y = 0; y < 8; y++ )
+    {
+      i_sum += __builtin_abs( pix1[0] - pix2[0] );
+      i_sum += __builtin_abs( pix1[1] - pix2[1] );
+      i_sum += __builtin_abs( pix1[2] - pix2[2] );
+      i_sum += __builtin_abs( pix1[3] - pix2[3] );
+      i_sum += __builtin_abs( pix1[4] - pix2[4] );
+      i_sum += __builtin_abs( pix1[5] - pix2[5] );
+      i_sum += __builtin_abs( pix1[6] - pix2[6] );
+      i_sum += __builtin_abs( pix1[7] - pix2[7] );
+      pix1 += 16;
+      pix2 += i_stride_pix2;
+    }
+  return i_sum;
+}
+
+/* { dg-final { scan-tree-dump "vect_recog_sad_pattern: detected" "vect" } } */
+/* { dg-final { scan-tree-dump "vectorizing stmts using SLP" "vect" } } */
+/* { dg-final { scan-tree-dump-not "access with gaps requires scalar epilogue loop" "vect" } } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]