middle-end: Implement preferred_div_as_shifts_over_mult [PR108583]

author Tamar Christina <tamar.christina@arm.com>

Sun, 12 Mar 2023 18:42:04 +0000 (18:42 +0000)

committer Tamar Christina <tamar.christina@arm.com>

Sun, 12 Mar 2023 18:42:04 +0000 (18:42 +0000)
author Tamar Christina <tamar.christina@arm.com>
Sun, 12 Mar 2023 18:42:04 +0000 (18:42 +0000)
committer Tamar Christina <tamar.christina@arm.com>
Sun, 12 Mar 2023 18:42:04 +0000 (18:42 +0000)
diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi

index 144862e00c4d6984d4aa93641b2c5d832379e94e..c4a92a5ebee90473dc633db586bba30747d9260e 100644 (file)
--- a/gcc/doc/tm.texi
+++ b/gcc/doc/tm.texi
@@ -6146,6 +6146,12 @@ instruction pattern.  There is no need for the hook to handle these two
  implementation approaches itself.
  @end deftypefn
  
+@deftypefn {Target Hook} bool TARGET_VECTORIZE_PREFERRED_DIV_AS_SHIFTS_OVER_MULT (const_tree @var{type})
+Sometimes it is possible to implement a vector division using a sequence
+of two addition-shift pairs, giving four instructions in total.
+Return true if taking this approach for @var{vectype} is likely
+to be better than using a sequence involving highpart multiplication.
+Default is false if @code{can_mult_highpart_p}, otherwise true.
  @end deftypefn
  
  @deftypefn {Target Hook} tree TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION (unsigned @var{code}, tree @var{vec_type_out}, tree @var{vec_type_in})
diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in

index 146f461ddad247906705d03bb389ac707e5f8cde..4075e71624c04cb8167314efdf27264f4d8c1f3f 100644 (file)
--- a/gcc/doc/tm.texi.in
+++ b/gcc/doc/tm.texi.in
@@ -4175,6 +4175,7 @@ address;  but often a machine-dependent strategy can generate better code.
  
  @hook TARGET_VECTORIZE_VEC_PERM_CONST
  
+@hook TARGET_VECTORIZE_PREFERRED_DIV_AS_SHIFTS_OVER_MULT
  
  @hook TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
  
diff --git a/gcc/target.def b/gcc/target.def

index 049dab64f683c67d1ea9d2ca8784b1efbb347c3d..f401fe148ee2ddf138359fa1b462cfebc9c88691 100644 (file)
--- a/gcc/target.def
+++ b/gcc/target.def
@@ -1868,6 +1868,18 @@ correct for most targets.",
   poly_uint64, (const_tree type),
   default_preferred_vector_alignment)
  
+/* Returns whether the target has a preference for decomposing divisions using
+   shifts rather than multiplies.  */
+DEFHOOK
+(preferred_div_as_shifts_over_mult,
+ "Sometimes it is possible to implement a vector division using a sequence\n\
+of two addition-shift pairs, giving four instructions in total.\n\
+Return true if taking this approach for @var{vectype} is likely\n\
+to be better than using a sequence involving highpart multiplication.\n\
+Default is false if @code{can_mult_highpart_p}, otherwise true.",
+ bool, (const_tree type),
+ default_preferred_div_as_shifts_over_mult)
+
  /* Return true if vector alignment is reachable (by peeling N
     iterations) for the given scalar type.  */
  DEFHOOK
diff --git a/gcc/targhooks.cc b/gcc/targhooks.cc

index 8b7452ce3364d2a1a11e67b35b9327fa9b050308..51bf3fb7a82ff83ca4c16e93f5f2472dfba71511 100644 (file)
--- a/gcc/targhooks.cc
+++ b/gcc/targhooks.cc
@@ -1488,6 +1488,15 @@ default_preferred_vector_alignment (const_tree type)
    return TYPE_ALIGN (type);
  }
  
+/* The default implementation of
+   TARGET_VECTORIZE_PREFERRED_DIV_AS_SHIFTS_OVER_MULT.  */
+
+bool
+default_preferred_div_as_shifts_over_mult (const_tree type)
+{
+  return !can_mult_highpart_p (TYPE_MODE (type), TYPE_UNSIGNED (type));
+}
+
  /* By default assume vectors of element TYPE require a multiple of the natural
     alignment of TYPE.  TYPE is naturally aligned if IS_PACKED is false.  */
  bool
diff --git a/gcc/targhooks.h b/gcc/targhooks.h

index 555160da48abb0fbe426345c7baa5977bcf7972d..cf3d3107a0db0ab79065a92e3af720f17bd78bf1 100644 (file)
--- a/gcc/targhooks.h
+++ b/gcc/targhooks.h
@@ -53,6 +53,8 @@ extern scalar_int_mode default_unwind_word_mode (void);
  extern unsigned HOST_WIDE_INT default_shift_truncation_mask
    (machine_mode);
  extern unsigned int default_min_divisions_for_recip_mul (machine_mode);
+extern bool default_preferred_div_as_shifts_over_mult
+  (const_tree);
  extern int default_mode_rep_extended (scalar_int_mode, scalar_int_mode);
  
  extern tree default_stack_protect_guard (void);
diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-4.c b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-4.c

new file mode 100644 (file)

index 0000000..c81f894
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-4.c
@@ -0,0 +1,25 @@
+/* { dg-require-effective-target vect_int } */
+
+#include <stdint.h>
+#include "tree-vect.h"
+
+typedef unsigned __attribute__((__vector_size__ (16))) V;
+
+static __attribute__((__noinline__)) __attribute__((__noclone__)) V
+foo (V v, unsigned short i)
+{
+  v /= i;
+  return v;
+}
+
+int
+main (void)
+{
+  V v = foo ((V) { 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff }, 0xffff);
+  for (unsigned i = 0; i < sizeof (v) / sizeof (v[0]); i++)
+    if (v[i] != 0x00010001)
+      __builtin_abort ();
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-not "vect_recog_divmod_pattern: detected" "vect" { target aarch64*-*-* } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-5.c b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-5.c

new file mode 100644 (file)

index 0000000..b4eb1a4
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-5.c
@@ -0,0 +1,58 @@
+/* { dg-require-effective-target vect_int } */
+
+#include <stdint.h>
+#include <stdio.h>
+#include "tree-vect.h"
+
+#define N 50
+#define TYPE uint8_t 
+
+#ifndef DEBUG
+#define DEBUG 0
+#endif
+
+#define BASE ((TYPE) -1 < 0 ? -126 : 4)
+
+
+__attribute__((noipa, noinline, optimize("O1")))
+void fun1(TYPE* restrict pixel, TYPE level, int n)
+{
+  for (int i = 0; i < n; i+=1)
+    pixel[i] = (pixel[i] + level) / 0xff;
+}
+
+__attribute__((noipa, noinline, optimize("O3")))
+void fun2(TYPE* restrict pixel, TYPE level, int n)
+{
+  for (int i = 0; i < n; i+=1)
+    pixel[i] = (pixel[i] + level) / 0xff;
+}
+
+int main ()
+{
+  TYPE a[N];
+  TYPE b[N];
+
+  for (int i = 0; i < N; ++i)
+    {
+      a[i] = BASE + i * 13;
+      b[i] = BASE + i * 13;
+      if (DEBUG)
+        printf ("%d: 0x%x\n", i, a[i]);
+    }
+
+  fun1 (a, N / 2, N);
+  fun2 (b, N / 2, N);
+
+  for (int i = 0; i < N; ++i)
+    {
+      if (DEBUG)
+        printf ("%d = 0x%x == 0x%x\n", i, a[i], b[i]);
+
+      if (a[i] != b[i])
+        __builtin_abort ();
+    }
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump "divmod pattern recognized" "vect" { target aarch64*-*-* } } } */
diff --git a/gcc/tree-vect-patterns.cc b/gcc/tree-vect-patterns.cc

index 298fd295a738c111e81093d6c27eceb0d00c461b..887f02bf336da890fd93ec069a4c8d98d9f80b92 100644 (file)
--- a/gcc/tree-vect-patterns.cc
+++ b/gcc/tree-vect-patterns.cc
@@ -3934,6 +3934,83 @@ vect_recog_divmod_pattern (vec_info *vinfo,
        return pattern_stmt;
      }
  
+  if ((cst = uniform_integer_cst_p (oprnd1))
+      && TYPE_UNSIGNED (itype)
+      && rhs_code == TRUNC_DIV_EXPR
+      && vectype
+      && targetm.vectorize.preferred_div_as_shifts_over_mult (vectype))
+    {
+      /* We can use the relationship:
+
+          x // N == ((x+N+2) // (N+1) + x) // (N+1)  for 0 <= x < N(N+3)
+
+        to optimize cases where N+1 is a power of 2, and where // (N+1)
+        is therefore a shift right.  When operating in modes that are
+        multiples of a byte in size, there are two cases:
+
+        (1) N(N+3) is not representable, in which case the question
+            becomes whether the replacement expression overflows.
+            It is enough to test that x+N+2 does not overflow,
+            i.e. that x < MAX-(N+1).
+
+        (2) N(N+3) is representable, in which case it is the (only)
+            bound that we need to check.
+
+        ??? For now we just handle the case where // (N+1) is a shift
+        right by half the precision, since some architectures can
+        optimize the associated addition and shift combinations
+        into single instructions.  */
+
+      auto wcst = wi::to_wide (cst);
+      int pow = wi::exact_log2 (wcst + 1);
+      if (pow == prec / 2)
+       {
+         gimple *stmt = SSA_NAME_DEF_STMT (oprnd0);
+
+         gimple_ranger ranger;
+         int_range_max r;
+
+         /* Check that no overflow will occur.  If we don't have range
+            information we can't perform the optimization.  */
+
+         if (ranger.range_of_expr (r, oprnd0, stmt))
+           {
+             wide_int max = r.upper_bound ();
+             wide_int one = wi::shwi (1, prec);
+             wide_int adder = wi::add (one, wi::lshift (one, pow));
+             wi::overflow_type ovf;
+             wi::add (max, adder, UNSIGNED, &ovf);
+             if (ovf == wi::OVF_NONE)
+               {
+                 *type_out = vectype;
+                 tree tadder = wide_int_to_tree (itype, adder);
+                 tree rshift = wide_int_to_tree (itype, pow);
+
+                 tree new_lhs1 = vect_recog_temp_ssa_var (itype, NULL);
+                 gassign *patt1
+                   = gimple_build_assign (new_lhs1, PLUS_EXPR, oprnd0, tadder);
+                 append_pattern_def_seq (vinfo, stmt_vinfo, patt1, vectype);
+
+                 tree new_lhs2 = vect_recog_temp_ssa_var (itype, NULL);
+                 patt1 = gimple_build_assign (new_lhs2, RSHIFT_EXPR, new_lhs1,
+                                              rshift);
+                 append_pattern_def_seq (vinfo, stmt_vinfo, patt1, vectype);
+
+                 tree new_lhs3 = vect_recog_temp_ssa_var (itype, NULL);
+                 patt1 = gimple_build_assign (new_lhs3, PLUS_EXPR, new_lhs2,
+                                              oprnd0);
+                 append_pattern_def_seq (vinfo, stmt_vinfo, patt1, vectype);
+
+                 tree new_lhs4 = vect_recog_temp_ssa_var (itype, NULL);
+                 pattern_stmt = gimple_build_assign (new_lhs4, RSHIFT_EXPR,
+                                                     new_lhs3, rshift);
+
+                 return pattern_stmt;
+               }
+           }
+       }
+    }
+
    if (prec > HOST_BITS_PER_WIDE_INT
        || integer_zerop (oprnd1))
      return NULL;
author	Tamar Christina <tamar.christina@arm.com>
	Sun, 12 Mar 2023 18:42:04 +0000 (18:42 +0000)
committer	Tamar Christina <tamar.christina@arm.com>
	Sun, 12 Mar 2023 18:42:04 +0000 (18:42 +0000)
gcc/doc/tm.texi		patch \| blob \| blame \| history
gcc/doc/tm.texi.in		patch \| blob \| blame \| history
gcc/target.def		patch \| blob \| blame \| history
gcc/targhooks.cc		patch \| blob \| blame \| history
gcc/targhooks.h		patch \| blob \| blame \| history
gcc/testsuite/gcc.dg/vect/vect-div-bitmask-4.c	[new file with mode: 0644]	patch \| blob
gcc/testsuite/gcc.dg/vect/vect-div-bitmask-5.c	[new file with mode: 0644]	patch \| blob
gcc/tree-vect-patterns.cc		patch \| blob \| blame \| history