implementation approaches itself.
@end deftypefn
+
+@deftypefn {Target Hook} bool TARGET_VECTORIZE_PREFERRED_DIV_AS_SHIFTS_OVER_MULT (const_tree @var{type})
+Sometimes it is possible to implement a vector division using a sequence
+of two addition-shift pairs, giving four instructions in total.
+Return true if taking this approach for @var{type} is likely
+to be better than using a sequence involving highpart multiplication.
+The default is false if @code{can_mult_highpart_p} holds for the mode
+of @var{type}, otherwise true.
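+
+For example, for 16-bit elements and a divisor of 0xff (so that each
+division by 0x100 is a shift right by 8), the four-instruction sequence
+computes
+
+@smallexample
+x / 0xff == (((x + 0x101) >> 8) + x) >> 8
+@end smallexample
+
+@noindent
+provided @code{x + 0x101} does not overflow the element type; two
+addition-shift pairs thus replace the highpart multiplication.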
+@end deftypefn
@deftypefn {Target Hook} tree TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION (unsigned @var{code}, tree @var{vec_type_out}, tree @var{vec_type_in})
@hook TARGET_VECTORIZE_VEC_PERM_CONST
+@hook TARGET_VECTORIZE_PREFERRED_DIV_AS_SHIFTS_OVER_MULT
@hook TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
poly_uint64, (const_tree type),
default_preferred_vector_alignment)
+/* Returns whether the target has a preference for decomposing divisions using
+ shifts rather than multiplies. */
+DEFHOOK
+(preferred_div_as_shifts_over_mult,
+ "Sometimes it is possible to implement a vector division using a sequence\n\
+of two addition-shift pairs, giving four instructions in total.\n\
+Return true if taking this approach for @var{type} is likely\n\
+to be better than using a sequence involving highpart multiplication.\n\
+The default is false if @code{can_mult_highpart_p} holds for the mode\n\
+of @var{type}, otherwise true.",
+ bool, (const_tree type),
+ default_preferred_div_as_shifts_over_mult)
+
/* Return true if vector alignment is reachable (by peeling N
iterations) for the given scalar type. */
DEFHOOK
return TYPE_ALIGN (type);
}
+/* The default implementation of
+ TARGET_VECTORIZE_PREFERRED_DIV_AS_SHIFTS_OVER_MULT. */
+
+bool
+default_preferred_div_as_shifts_over_mult (const_tree type)
+{
+ return !can_mult_highpart_p (TYPE_MODE (type), TYPE_UNSIGNED (type));
+}
+
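As a sketch of how a backend might use this hook: the hypothetical
override below (the target "foo" and the predicate foo_cheap_add_shift_p
are invented for illustration) requests the shift-based expansion only
for modes with a cheap combined add-and-shift form, deferring to the
default heuristic otherwise.

/* Hypothetical implementation of
   TARGET_VECTORIZE_PREFERRED_DIV_AS_SHIFTS_OVER_MULT for a target
   "foo"; foo_cheap_add_shift_p is an invented cost predicate.  */

static bool
foo_preferred_div_as_shifts_over_mult (const_tree type)
{
  /* Use the two addition-shift pairs only when the element mode has a
     cheap combined add-and-shift form; otherwise fall back to the
     generic can_mult_highpart_p-based heuristic.  */
  if (foo_cheap_add_shift_p (TYPE_MODE (type)))
    return true;
  return default_preferred_div_as_shifts_over_mult (type);
}

#undef TARGET_VECTORIZE_PREFERRED_DIV_AS_SHIFTS_OVER_MULT
#define TARGET_VECTORIZE_PREFERRED_DIV_AS_SHIFTS_OVER_MULT \
  foo_preferred_div_as_shifts_over_mult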
/* By default assume vectors of element TYPE require a multiple of the natural
alignment of TYPE. TYPE is naturally aligned if IS_PACKED is false. */
bool
extern unsigned HOST_WIDE_INT default_shift_truncation_mask
(machine_mode);
extern unsigned int default_min_divisions_for_recip_mul (machine_mode);
+extern bool default_preferred_div_as_shifts_over_mult
+ (const_tree);
extern int default_mode_rep_extended (scalar_int_mode, scalar_int_mode);
extern tree default_stack_protect_guard (void);
--- /dev/null
+/* { dg-require-effective-target vect_int } */
+
+#include <stdint.h>
+#include "tree-vect.h"
+
+typedef unsigned __attribute__((__vector_size__ (16))) V;
+
+static __attribute__((__noinline__)) __attribute__((__noclone__)) V
+foo (V v, unsigned short i)
+{
+ v /= i;
+ return v;
+}
+
+int
+main (void)
+{
+  check_vect ();
+
+ V v = foo ((V) { 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff }, 0xffff);
+ for (unsigned i = 0; i < sizeof (v) / sizeof (v[0]); i++)
+ if (v[i] != 0x00010001)
+ __builtin_abort ();
+ return 0;
+}
+
+/* { dg-final { scan-tree-dump-not "vect_recog_divmod_pattern: detected" "vect" { target aarch64*-*-* } } } */
--- /dev/null
+/* { dg-require-effective-target vect_int } */
+
+#include <stdint.h>
+#include <stdio.h>
+#include "tree-vect.h"
+
+#define N 50
+#define TYPE uint8_t
+
+#ifndef DEBUG
+#define DEBUG 0
+#endif
+
+#define BASE ((TYPE) -1 < 0 ? -126 : 4)
+
+__attribute__((noipa, noinline, optimize("O1")))
+void fun1(TYPE* restrict pixel, TYPE level, int n)
+{
+ for (int i = 0; i < n; i+=1)
+ pixel[i] = (pixel[i] + level) / 0xff;
+}
+
+__attribute__((noipa, noinline, optimize("O3")))
+void fun2(TYPE* restrict pixel, TYPE level, int n)
+{
+ for (int i = 0; i < n; i+=1)
+ pixel[i] = (pixel[i] + level) / 0xff;
+}
+
+int main ()
+{
+  check_vect ();
+
+ TYPE a[N];
+ TYPE b[N];
+
+ for (int i = 0; i < N; ++i)
+ {
+ a[i] = BASE + i * 13;
+ b[i] = BASE + i * 13;
+ if (DEBUG)
+ printf ("%d: 0x%x\n", i, a[i]);
+ }
+
+ fun1 (a, N / 2, N);
+ fun2 (b, N / 2, N);
+
+ for (int i = 0; i < N; ++i)
+ {
+ if (DEBUG)
+ printf ("%d = 0x%x == 0x%x\n", i, a[i], b[i]);
+
+ if (a[i] != b[i])
+ __builtin_abort ();
+ }
+ return 0;
+}
+
+/* { dg-final { scan-tree-dump "divmod pattern recognized" "vect" { target aarch64*-*-* } } } */
return pattern_stmt;
}
+ if ((cst = uniform_integer_cst_p (oprnd1))
+ && TYPE_UNSIGNED (itype)
+ && rhs_code == TRUNC_DIV_EXPR
+ && vectype
+ && targetm.vectorize.preferred_div_as_shifts_over_mult (vectype))
+ {
+ /* We can use the relationship:
+
+ x // N == ((x+N+2) // (N+1) + x) // (N+1) for 0 <= x < N(N+3)
+
+ to optimize cases where N+1 is a power of 2, and where // (N+1)
+ is therefore a shift right. When operating in modes that are
+ multiples of a byte in size, there are two cases:
+
+ (1) N(N+3) is not representable, in which case the question
+ becomes whether the replacement expression overflows.
+ It is enough to test that x+N+2 does not overflow,
+ i.e. that x < MAX-(N+1).
+
+ (2) N(N+3) is representable, in which case it is the (only)
+ bound that we need to check.
+
+ ??? For now we just handle the case where // (N+1) is a shift
+ right by half the precision, since some architectures can
+ optimize the associated addition and shift combinations
+ into single instructions. */
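+
+      /* For example, with prec == 16 and N == 0xff: N + 1 == 0x100 and
+         pow == 8, so the statements built below compute
+         (((x + 0x101) >> 8) + x) >> 8, and the range check requires
+         that x + 0x101 does not wrap, i.e. that the maximum of oprnd0
+         is at most 0xffff - 0x101.  */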
+
+ auto wcst = wi::to_wide (cst);
+ int pow = wi::exact_log2 (wcst + 1);
+ if (pow == prec / 2)
+ {
+ gimple *stmt = SSA_NAME_DEF_STMT (oprnd0);
+
+ gimple_ranger ranger;
+ int_range_max r;
+
+ /* Check that no overflow will occur. If we don't have range
+ information we can't perform the optimization. */
+
+ if (ranger.range_of_expr (r, oprnd0, stmt))
+ {
+ wide_int max = r.upper_bound ();
+ wide_int one = wi::shwi (1, prec);
+ wide_int adder = wi::add (one, wi::lshift (one, pow));
+ wi::overflow_type ovf;
+ wi::add (max, adder, UNSIGNED, &ovf);
+ if (ovf == wi::OVF_NONE)
+ {
+ *type_out = vectype;
+ tree tadder = wide_int_to_tree (itype, adder);
+ tree rshift = wide_int_to_tree (itype, pow);
+
+ tree new_lhs1 = vect_recog_temp_ssa_var (itype, NULL);
+ gassign *patt1
+ = gimple_build_assign (new_lhs1, PLUS_EXPR, oprnd0, tadder);
+ append_pattern_def_seq (vinfo, stmt_vinfo, patt1, vectype);
+
+ tree new_lhs2 = vect_recog_temp_ssa_var (itype, NULL);
+ patt1 = gimple_build_assign (new_lhs2, RSHIFT_EXPR, new_lhs1,
+ rshift);
+ append_pattern_def_seq (vinfo, stmt_vinfo, patt1, vectype);
+
+ tree new_lhs3 = vect_recog_temp_ssa_var (itype, NULL);
+ patt1 = gimple_build_assign (new_lhs3, PLUS_EXPR, new_lhs2,
+ oprnd0);
+ append_pattern_def_seq (vinfo, stmt_vinfo, patt1, vectype);
+
+ tree new_lhs4 = vect_recog_temp_ssa_var (itype, NULL);
+ pattern_stmt = gimple_build_assign (new_lhs4, RSHIFT_EXPR,
+ new_lhs3, rshift);
+
+ return pattern_stmt;
+ }
+ }
+ }
+ }
+
if (prec > HOST_BITS_PER_WIDE_INT
|| integer_zerop (oprnd1))
return NULL;