This is the mail archive of the
gcc-patches@gcc.gnu.org
mailing list for the GCC project.
[PATCH] Re: Vectorizer question: DIV to RSHIFT conversion
- From: Jakub Jelinek <jakub at redhat dot com>
- To: Kirill Yukhin <kirill dot yukhin at gmail dot com>
- Cc: Richard Guenther <rguenther at suse dot de>, gcc-patches at gcc dot gnu dot org, Ira Rosen <ira dot rosen at linaro dot org>
- Date: Wed, 14 Dec 2011 13:25:13 +0100
- Subject: [PATCH] Re: Vectorizer question: DIV to RSHIFT conversion
- References: <CAGs3RfvOFgfVQ=PkYM+CtsgBL99k_gF_R-Yet3oFLHU7b-k5jQ@mail.gmail.com> <alpine.LNX.2.00.1112131406430.4527@zhemvz.fhfr.qr> <20111213132128.GZ1957@tyan-ft48-01.lab.bos.redhat.com> <CAGs3Rfucj9C7DqcjjJOzor0X=Yf_DT1av1T6cZdfZkmOsS+VNw@mail.gmail.com> <20111213134741.GA1957@tyan-ft48-01.lab.bos.redhat.com> <CAGs3RfvkZW+k-NmqHkNx7V42zOV4SCL-OXVLUr7M-iHs_HVAYA@mail.gmail.com>
- Reply-to: Jakub Jelinek <jakub at redhat dot com>
On Tue, Dec 13, 2011 at 05:57:40PM +0400, Kirill Yukhin wrote:
> > Let me hack up a quick pattern recognizer for this...
Here it is, untested so far.
On the testcase doing 2000000 f1+f2+f3+f4 calls in the loop with -O3 -mavx
on Sandybridge (so, vectorized just with 16 byte vectors) gives:
vanilla 0m34.571s
the tree-vect* parts of this patch only 0m9.013s
the whole patch 0m8.824s
The i386 parts are just a small optimization, I guess it could be
done in the vectorizer too (but then we'd have to check whether the
arithmetic/logical right shifts are supported and check costs?), or
perhaps in the generic vcond expander (again, we'd need to check some
costs).
Can you see if it gives a measurable speedup on SPEC bzip2?
2011-12-14 Jakub Jelinek <jakub@redhat.com>
* tree-vectorizer.h (NUM_PATTERNS): Bump to 10.
* tree-vect-patterns.c (vect_recog_sdivmod_pow2_pattern): New
function.
(vect_vect_recog_func_ptrs): Add it.
* config/i386/sse.md (vcond<V_256:mode><VI_256:mode>,
vcond<V_128:mode><VI124_128:mode>, vcond<VI8F_128:mode>v2di):
Use general_operand instead of nonimmediate_operand for
operand 5 and no predicate for operands 1 and 2.
* config/i386/i386.c (ix86_expand_int_vcond): Optimize
x < 0 ? -1 : 0 and x < 0 ? 1 : 0 into vector arithmetic
resp. logical shift.
* gcc.dg/vect/vect-sdivmod-1.c: New test.
--- gcc/tree-vectorizer.h.jj 2011-12-14 08:11:03.592010101 +0100
+++ gcc/tree-vectorizer.h 2011-12-14 08:28:57.006693371 +0100
@@ -929,7 +929,7 @@ extern void vect_slp_transform_bb (basic
Additional pattern recognition functions can (and will) be added
in the future. */
typedef gimple (* vect_recog_func_ptr) (VEC (gimple, heap) **, tree *, tree *);
-#define NUM_PATTERNS 9
+#define NUM_PATTERNS 10
void vect_pattern_recog (loop_vec_info);
/* In tree-vectorizer.c. */
--- gcc/tree-vect-patterns.c.jj 2011-12-02 01:52:27.918883940 +0100
+++ gcc/tree-vect-patterns.c 2011-12-14 11:50:10.439763740 +0100
@@ -53,6 +53,8 @@ static gimple vect_recog_widen_shift_pat
tree *, tree *);
static gimple vect_recog_vector_vector_shift_pattern (VEC (gimple, heap) **,
tree *, tree *);
+static gimple vect_recog_sdivmod_pow2_pattern (VEC (gimple, heap) **,
+ tree *, tree *);
static gimple vect_recog_mixed_size_cond_pattern (VEC (gimple, heap) **,
tree *, tree *);
static gimple vect_recog_bool_pattern (VEC (gimple, heap) **, tree *, tree *);
@@ -64,6 +66,7 @@ static vect_recog_func_ptr vect_vect_rec
vect_recog_over_widening_pattern,
vect_recog_widen_shift_pattern,
vect_recog_vector_vector_shift_pattern,
+ vect_recog_sdivmod_pow2_pattern,
vect_recog_mixed_size_cond_pattern,
vect_recog_bool_pattern};
@@ -1573,6 +1576,211 @@ vect_recog_vector_vector_shift_pattern (
return pattern_stmt;
}
+/* Detect a signed division by power of two constant that wouldn't be
+ otherwise vectorized:
+
+ type a_t, b_t;
+
+ S1 a_t = b_t / N;
+
+ where type 'type' is a signed integral type and N is a constant positive
+ power of two.
+
+ Similarly handle signed modulo by power of two constant:
+
+ S4 a_t = b_t % N;
+
+ Input/Output:
+
+ * STMTS: Contains a stmt from which the pattern search begins,
+ i.e. the division stmt. S1 is replaced by:
+ S3 y_t = b_t < 0 ? N - 1 : 0;
+ S2 x_t = b_t + y_t;
+ S1' a_t = x_t >> log2 (N);
+
+ S4 is replaced by (where *_T temporaries have unsigned type):
+ S9 y_T = b_t < 0 ? -1U : 0U;
+ S8 z_T = y_T >> (sizeof (type_t) * CHAR_BIT - log2 (N));
+ S7 z_t = (type) z_T;
+ S6 w_t = b_t + z_t;
+ S5 x_t = w_t & (N - 1);
+ S4' a_t = x_t - z_t;
+
+ Output:
+
+ * TYPE_IN: The type of the input arguments to the pattern.
+
+ * TYPE_OUT: The type of the output of this pattern.
+
+ * Return value: A new stmt that will be used to replace the division
+ S1 or modulo S4 stmt. */
+
+static gimple
+vect_recog_sdivmod_pow2_pattern (VEC (gimple, heap) **stmts,
+ tree *type_in, tree *type_out)
+{
+ gimple last_stmt = VEC_pop (gimple, *stmts);
+ gimple_stmt_iterator gsi;
+ tree oprnd0, oprnd1, vectype, itype, cond;
+ gimple pattern_stmt, def_stmt;
+ enum tree_code rhs_code;
+ stmt_vec_info stmt_vinfo = vinfo_for_stmt (last_stmt);
+ loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
+ optab optab;
+
+ if (!is_gimple_assign (last_stmt))
+ return NULL;
+
+ rhs_code = gimple_assign_rhs_code (last_stmt);
+ switch (rhs_code)
+ {
+ case TRUNC_DIV_EXPR:
+ case TRUNC_MOD_EXPR:
+ break;
+ default:
+ return NULL;
+ }
+
+ if (STMT_VINFO_IN_PATTERN_P (stmt_vinfo))
+ return NULL;
+
+ oprnd0 = gimple_assign_rhs1 (last_stmt);
+ oprnd1 = gimple_assign_rhs2 (last_stmt);
+ itype = TREE_TYPE (oprnd0);
+ if (TREE_CODE (oprnd0) != SSA_NAME
+ || TREE_CODE (oprnd1) != INTEGER_CST
+ || TREE_CODE (itype) != INTEGER_TYPE
+ || TYPE_UNSIGNED (itype)
+ || TYPE_PRECISION (itype) != GET_MODE_PRECISION (TYPE_MODE (itype))
+ || !integer_pow2p (oprnd1)
+ || tree_int_cst_sgn (oprnd1) != 1)
+ return NULL;
+
+ vectype = get_vectype_for_scalar_type (itype);
+ if (vectype == NULL_TREE)
+ return NULL;
+
+ /* If the target can handle vectorized division or modulo natively,
+ don't attempt to optimize this. */
+ optab = optab_for_tree_code (rhs_code, vectype, optab_default);
+ if (optab != NULL)
+ {
+ enum machine_mode vec_mode = TYPE_MODE (vectype);
+ int icode = (int) optab_handler (optab, vec_mode);
+ if (icode != CODE_FOR_nothing
+ || GET_MODE_SIZE (vec_mode) == UNITS_PER_WORD)
+ return NULL;
+ }
+
+ /* Pattern detected. */
+ if (vect_print_dump_info (REPORT_DETAILS))
+ fprintf (vect_dump, "vect_recog_sdivmod_pow2_pattern: detected: ");
+
+ cond = build2 (LT_EXPR, boolean_type_node, oprnd0, build_int_cst (itype, 0));
+ gsi = gsi_for_stmt (last_stmt);
+ if (rhs_code == TRUNC_DIV_EXPR)
+ {
+ tree var = vect_recog_temp_ssa_var (itype, NULL);
+ def_stmt
+ = gimple_build_assign_with_ops3 (COND_EXPR, var, cond,
+ fold_build2 (MINUS_EXPR, itype,
+ oprnd1,
+ build_int_cst (itype,
+ 1)),
+ build_int_cst (itype, 0));
+ gsi_insert_before (&gsi, def_stmt, GSI_SAME_STMT);
+ set_vinfo_for_stmt (def_stmt, new_stmt_vec_info (def_stmt, loop_vinfo,
+ NULL));
+ var = vect_recog_temp_ssa_var (itype, NULL);
+ def_stmt
+ = gimple_build_assign_with_ops (PLUS_EXPR, var, oprnd0,
+ gimple_assign_lhs (def_stmt));
+ STMT_VINFO_PATTERN_DEF_STMT (stmt_vinfo) = def_stmt;
+
+ pattern_stmt
+ = gimple_build_assign_with_ops (RSHIFT_EXPR,
+ vect_recog_temp_ssa_var (itype, NULL),
+ var,
+ build_int_cst (itype,
+ tree_log2 (oprnd1)));
+ }
+ else
+ {
+ tree signmask;
+ tree utype = build_nonstandard_integer_type (TYPE_PRECISION (itype), 1);
+ tree shift = build_int_cst (utype, GET_MODE_BITSIZE (TYPE_MODE (itype))
+ - tree_log2 (oprnd1));
+ if (compare_tree_int (oprnd1, 2) == 0)
+ {
+ signmask = vect_recog_temp_ssa_var (itype, NULL);
+ def_stmt
+ = gimple_build_assign_with_ops3 (COND_EXPR, signmask, cond,
+ build_int_cst (itype, 1),
+ build_int_cst (itype, 0));
+ gsi_insert_before (&gsi, def_stmt, GSI_SAME_STMT);
+ set_vinfo_for_stmt (def_stmt,
+ new_stmt_vec_info (def_stmt, loop_vinfo, NULL));
+ }
+ else
+ {
+ tree var = vect_recog_temp_ssa_var (utype, NULL);
+ def_stmt
+ = gimple_build_assign_with_ops3 (COND_EXPR, var, cond,
+ build_int_cst (utype, -1),
+ build_int_cst (utype, 0));
+ gsi_insert_before (&gsi, def_stmt, GSI_SAME_STMT);
+ set_vinfo_for_stmt (def_stmt,
+ new_stmt_vec_info (def_stmt, loop_vinfo, NULL));
+ var = vect_recog_temp_ssa_var (utype, NULL);
+ def_stmt
+ = gimple_build_assign_with_ops (RSHIFT_EXPR, var,
+ gimple_assign_lhs (def_stmt),
+ shift);
+ gsi_insert_before (&gsi, def_stmt, GSI_SAME_STMT);
+ set_vinfo_for_stmt (def_stmt,
+ new_stmt_vec_info (def_stmt, loop_vinfo, NULL));
+ signmask = vect_recog_temp_ssa_var (itype, NULL);
+ def_stmt
+ = gimple_build_assign_with_ops (NOP_EXPR, signmask, var,
+ NULL_TREE);
+ gsi_insert_before (&gsi, def_stmt, GSI_SAME_STMT);
+ set_vinfo_for_stmt (def_stmt,
+ new_stmt_vec_info (def_stmt, loop_vinfo, NULL));
+ }
+ def_stmt
+ = gimple_build_assign_with_ops (PLUS_EXPR,
+ vect_recog_temp_ssa_var (itype, NULL),
+ oprnd0, signmask);
+ gsi_insert_before (&gsi, def_stmt, GSI_SAME_STMT);
+ set_vinfo_for_stmt (def_stmt, new_stmt_vec_info (def_stmt, loop_vinfo,
+ NULL));
+ def_stmt
+ = gimple_build_assign_with_ops (BIT_AND_EXPR,
+ vect_recog_temp_ssa_var (itype, NULL),
+ gimple_assign_lhs (def_stmt),
+ fold_build2 (MINUS_EXPR, itype,
+ oprnd1,
+ build_int_cst (itype,
+ 1)));
+ STMT_VINFO_PATTERN_DEF_STMT (stmt_vinfo) = def_stmt;
+
+ pattern_stmt
+ = gimple_build_assign_with_ops (MINUS_EXPR,
+ vect_recog_temp_ssa_var (itype, NULL),
+ gimple_assign_lhs (def_stmt),
+ signmask);
+ }
+
+ if (vect_print_dump_info (REPORT_DETAILS))
+ print_gimple_stmt (vect_dump, pattern_stmt, 0, TDF_SLIM);
+
+ VEC_safe_push (gimple, heap, *stmts, last_stmt);
+
+ *type_in = vectype;
+ *type_out = vectype;
+ return pattern_stmt;
+}
+
/* Function vect_recog_mixed_size_cond_pattern
Try to find the following pattern:
--- gcc/config/i386/sse.md.jj 2011-12-03 12:22:33.000000000 +0100
+++ gcc/config/i386/sse.md 2011-12-14 13:00:47.695788256 +0100
@@ -6340,9 +6340,9 @@ (define_expand "vcond<V_256:mode><VI_256
(if_then_else:V_256
(match_operator 3 ""
[(match_operand:VI_256 4 "nonimmediate_operand" "")
- (match_operand:VI_256 5 "nonimmediate_operand" "")])
- (match_operand:V_256 1 "general_operand" "")
- (match_operand:V_256 2 "general_operand" "")))]
+ (match_operand:VI_256 5 "general_operand" "")])
+ (match_operand:V_256 1 "" "")
+ (match_operand:V_256 2 "" "")))]
"TARGET_AVX2
&& (GET_MODE_NUNITS (<V_256:MODE>mode)
== GET_MODE_NUNITS (<VI_256:MODE>mode))"
@@ -6357,9 +6357,9 @@ (define_expand "vcond<V_128:mode><VI124_
(if_then_else:V_128
(match_operator 3 ""
[(match_operand:VI124_128 4 "nonimmediate_operand" "")
- (match_operand:VI124_128 5 "nonimmediate_operand" "")])
- (match_operand:V_128 1 "general_operand" "")
- (match_operand:V_128 2 "general_operand" "")))]
+ (match_operand:VI124_128 5 "general_operand" "")])
+ (match_operand:V_128 1 "" "")
+ (match_operand:V_128 2 "" "")))]
"TARGET_SSE2
&& (GET_MODE_NUNITS (<V_128:MODE>mode)
== GET_MODE_NUNITS (<VI124_128:MODE>mode))"
@@ -6374,9 +6374,9 @@ (define_expand "vcond<VI8F_128:mode>v2di
(if_then_else:VI8F_128
(match_operator 3 ""
[(match_operand:V2DI 4 "nonimmediate_operand" "")
- (match_operand:V2DI 5 "nonimmediate_operand" "")])
- (match_operand:VI8F_128 1 "general_operand" "")
- (match_operand:VI8F_128 2 "general_operand" "")))]
+ (match_operand:V2DI 5 "general_operand" "")])
+ (match_operand:VI8F_128 1 "" "")
+ (match_operand:VI8F_128 2 "" "")))]
"TARGET_SSE4_2"
{
bool ok = ix86_expand_int_vcond (operands);
--- gcc/config/i386/i386.c.jj 2011-12-14 08:11:01.000000000 +0100
+++ gcc/config/i386/i386.c 2011-12-14 13:01:30.207538465 +0100
@@ -19434,6 +19434,45 @@ ix86_expand_int_vcond (rtx operands[])
cop0 = operands[4];
cop1 = operands[5];
+ /* Try to optimize x < 0 ? -1 : 0 into (signed) x >> 31
+ and x < 0 ? 1 : 0 into (unsigned) x >> 31. */
+ if ((code == LT || code == GE)
+ && data_mode == mode
+ && cop1 == CONST0_RTX (mode)
+ && operands[1 + (code == LT)] == CONST0_RTX (data_mode)
+ && GET_MODE_SIZE (GET_MODE_INNER (data_mode)) > 1
+ && GET_MODE_SIZE (GET_MODE_INNER (data_mode)) <= 8
+ && (GET_MODE_SIZE (data_mode) == 16
+ || (TARGET_AVX2 && GET_MODE_SIZE (data_mode) == 32)))
+ {
+ rtx negop = operands[2 - (code == LT)];
+ int shift = GET_MODE_BITSIZE (GET_MODE_INNER (data_mode)) - 1;
+ if (negop == CONST1_RTX (data_mode))
+ {
+ rtx res = expand_simple_binop (mode, LSHIFTRT, cop0, GEN_INT (shift),
+ operands[0], 1, OPTAB_DIRECT);
+ if (res != operands[0])
+ emit_move_insn (operands[0], res);
+ return true;
+ }
+ else if (GET_MODE_INNER (data_mode) != DImode
+ && vector_all_ones_operand (negop, data_mode))
+ {
+ rtx res = expand_simple_binop (mode, ASHIFTRT, cop0, GEN_INT (shift),
+ operands[0], 0, OPTAB_DIRECT);
+ if (res != operands[0])
+ emit_move_insn (operands[0], res);
+ return true;
+ }
+ }
+
+ if (!nonimmediate_operand (cop1, mode))
+ cop1 = force_reg (mode, cop1);
+ if (!general_operand (operands[1], data_mode))
+ operands[1] = force_reg (data_mode, operands[1]);
+ if (!general_operand (operands[2], data_mode))
+ operands[2] = force_reg (data_mode, operands[2]);
+
/* XOP supports all of the comparisons on all 128-bit vector int types. */
if (TARGET_XOP
&& (mode == V16QImode || mode == V8HImode
--- gcc/testsuite/gcc.dg/vect/vect-sdivmod-1.c.jj 2011-12-14 13:05:36.240133846 +0100
+++ gcc/testsuite/gcc.dg/vect/vect-sdivmod-1.c 2011-12-14 13:08:10.288228745 +0100
@@ -0,0 +1,98 @@
+#include "tree-vect.h"
+
+extern void abort (void);
+int a[4096];
+
+__attribute__((noinline, noclone)) void
+f1 (int x)
+{
+ int i, j;
+ for (i = 1; i <= x; i++)
+ {
+ j = a[i] >> 8;
+ j = 1 + (j / 2);
+ a[i] = j << 8;
+ }
+}
+
+__attribute__((noinline, noclone)) void
+f2 (int x)
+{
+ int i, j;
+ for (i = 1; i <= x; i++)
+ {
+ j = a[i] >> 8;
+ j = 1 + (j / 16);
+ a[i] = j << 8;
+ }
+}
+
+__attribute__((noinline, noclone)) void
+f3 (int x)
+{
+ int i, j;
+ for (i = 1; i <= x; i++)
+ {
+ j = a[i] >> 8;
+ j = 1 + (j % 2);
+ a[i] = j << 8;
+ }
+}
+
+__attribute__((noinline, noclone)) void
+f4 (int x)
+{
+ int i, j;
+ for (i = 1; i <= x; i++)
+ {
+ j = a[i] >> 8;
+ j = 1 + (j % 16);
+ a[i] = j << 8;
+ }
+}
+
+int
+main ()
+{
+ int i;
+ check_vect ();
+ for (i = 0; i < 4096; i++)
+ {
+ asm ("");
+ a[i] = (i - 2048) << 8;
+ }
+ f1 (4095);
+ if (a[0] != (-2048 << 8))
+ abort ();
+ for (i = 1; i < 4096; i++)
+ if (a[i] != ((1 + ((i - 2048) / 2)) << 8))
+ abort ();
+ else
+ a[i] = (i - 2048) << 8;
+ f2 (4095);
+ if (a[0] != (-2048 << 8))
+ abort ();
+ for (i = 1; i < 4096; i++)
+ if (a[i] != ((1 + ((i - 2048) / 16)) << 8))
+ abort ();
+ else
+ a[i] = (i - 2048) << 8;
+ f3 (4095);
+ if (a[0] != (-2048 << 8))
+ abort ();
+ for (i = 1; i < 4096; i++)
+ if (a[i] != ((1 + ((i - 2048) % 2)) << 8))
+ abort ();
+ else
+ a[i] = (i - 2048) << 8;
+ f4 (4095);
+ if (a[0] != (-2048 << 8))
+ abort ();
+ for (i = 1; i < 4096; i++)
+ if (a[i] != ((1 + ((i - 2048) % 16)) << 8))
+ abort ();
+ return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 4 "vect" { target vect_condition } } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */
Jakub