This is the mail archive of the
gcc-patches@gcc.gnu.org
mailing list for the GCC project.
Re: [PATCH] PR90838: Support ctz idioms
- From: Wilco Dijkstra <Wilco dot Dijkstra at arm dot com>
- To: Richard Biener <richard dot guenther at gmail dot com>
- Cc: GCC Patches <gcc-patches at gcc dot gnu dot org>
- Date: Fri, 15 Nov 2019 15:23:56 +0000
- Subject: Re: [PATCH] PR90838: Support ctz idioms
- Arc-authentication-results: i=1; mx.microsoft.com 1; spf=pass smtp.mailfrom=arm.com; dmarc=pass action=none header.from=arm.com; dkim=pass header.d=arm.com; arc=none
- Arc-message-signature: i=1; a=rsa-sha256; c=relaxed/relaxed; d=microsoft.com; s=arcselector9901; h=From:Date:Subject:Message-ID:Content-Type:MIME-Version:X-MS-Exchange-SenderADCheck; bh=TxMVSjczeuuQeJOiMeNbW51mP8g3A8AvOf5mjnH/svo=; b=bqyBwafTlpnx68bojMpP5yQM2AJl62BoymODgztIAeo+0xa7VfYuW0/FNff7jCBurXLSufQQY3VnTKelKBkGZ9BsnGiGJZny/4qzeZOudawtLwP+LeypqlvHK+UpscdpjiwmLBmc8VQmMYCEPpn64/PePR5qZn8Y5O2RDV1sR5asXz39gTt/4NxVORmkPeCpEl21U4K+D35JSJTS7vWCCwRfK+tJoTgbM2KRCHnYXk9A1rl3OM6UIo59QQ5PmAfjbXN9kkvU9jAa3on5vs2ciV+VZcUe5h4IPJhPglUqaUoT4MANVrb2CiRDRqLtAD+dP+/JPmhbi7UVT8nrO/ggzQ==
- Arc-seal: i=1; a=rsa-sha256; s=arcselector9901; d=microsoft.com; cv=none; b=QtYUqkPL30trVeNNqyTHtu5cmUF4sKJTB5DMFsIttyW0aURwupqIRzKYjXSDV1+GrRg0XvY9Ew7mIhVnHlmGuOfU577IScyxkmYya/kd7vwDAfBZRewfV1G4lBSc25q45y2nb6q0QX1Cg468M0VgWJn5sBWMAbfW0bHHdZl7X/QG3efJQYd2h+ir/DXH5jIOA1ZXFq462nw2UbTxbryi9S1uP15jJeNPBW40G7T+9tQ0d2aBbDVE6tz2+88EA8UmfegbdgeDim2Z1aO3nMmbQhRZyrj1x5OBzNkGfetNAjOY7Sjjy8+6BFaa4FulY3v+Pvo2yZHCagPLNsjF64uSuQ==
- Original-authentication-results: spf=none (sender IP is ) smtp.mailfrom=Wilco dot Dijkstra at arm dot com;
- References: <VI1PR0801MB212780CDB6061AD5045E74B283770@VI1PR0801MB2127.eurprd08.prod.outlook.com>,<CAFiYyc1jRXCMeMXf2nVs7xKD-a91rXKzXbCrqjDmukevehC0Zg@mail.gmail.com>
Hi Richard,
> Uh. Well. I think that the gimple-match-head.c hunk isn't something we want. Instead,
> since this optimizes a memory access, the handling should move
> to tree-ssa-forwprop.c where you _may_ use a (match ...)
> match.pd pattern to do the (rshift (mult (bit_and (negate @1) @1)
> matching. It might be the first to use that feature, you need to
> declare the function to use it from tree-ssa-forwprop.c. So
OK, I've moved it to fwprop, and it works just fine there while still
using match.pd to do the idiom matching. Here is the updated version:
[PATCH v2] PR90838: Support ctz idioms
v2: Use fwprop pass rather than match.pd
Support common idioms for count trailing zeroes using an array lookup.
The canonical form is array[((x & -x) * C) >> SHIFT] where C is a magic
constant which when multiplied by a power of 2 contains a unique value
in the top 5 or 6 bits. This is then indexed into a table which maps it
to the number of trailing zeroes. When the table is valid, we emit a
sequence using the target defined value for ctz (0):
int ctz1 (unsigned x)
{
static const char table[32] =
{
0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8,
31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9
};
return table[((unsigned)((x & -x) * 0x077CB531U)) >> 27];
}
Is optimized to:
rbit w0, w0
clz w0, w0
and w0, w0, 31
ret
Bootstrapped on AArch64. OK for commit?
ChangeLog:
2019-11-15 Wilco Dijkstra <wdijkstr@arm.com>
PR tree-optimization/90838
* tree-ssa-forwprop.c (optimize_count_trailing_zeroes):
Add new function.
(simplify_count_trailing_zeroes): Add new function.
(pass_forwprop::execute): Try ctz simplification.
* match.pd: Add matching for ctz idioms.
* testsuite/gcc.target/aarch64/pr90838.c: New test.
---
diff --git a/gcc/match.pd b/gcc/match.pd
index 6edf54b80012d87dbe7330f5ee638cdba2f9c099..479e9076f0d4deccda54425e93ee4567b85409aa 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -6060,3 +6060,11 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
(simplify
(vec_perm vec_same_elem_p@0 @0 @1)
@0)
+
+/* Match count trailing zeroes for simplify_count_trailing_zeroes in fwprop.
+ The canonical form is array[((x & -x) * C) >> SHIFT] where C is a magic
+ constant which when multiplied by a power of 2 contains a unique value
+ in the top 5 or 6 bits. This is then indexed into a table which maps it
+ to the number of trailing zeroes. */
+(match (ctz_table_index @1 @2 @3)
+ (rshift (mult (bit_and (negate @1) @1) INTEGER_CST@2) INTEGER_CST@3))
diff --git a/gcc/testsuite/gcc.target/aarch64/pr90838.c b/gcc/testsuite/gcc.target/aarch64/pr90838.c
new file mode 100644
index 0000000000000000000000000000000000000000..bff3144c0d1b3984016e5a404e986eae785c73ed
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/pr90838.c
@@ -0,0 +1,64 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+int ctz1 (unsigned x)
+{
+ static const char table[32] =
+ {
+ 0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8,
+ 31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9
+ };
+
+ return table[((unsigned)((x & -x) * 0x077CB531U)) >> 27];
+}
+
+int ctz2 (unsigned x)
+{
+ const int u = 0;
+ static short table[64] =
+ {
+ 32, 0, 1,12, 2, 6, u,13, 3, u, 7, u, u, u, u,14,
+ 10, 4, u, u, 8, u, u,25, u, u, u, u, u,21,27,15,
+ 31,11, 5, u, u, u, u, u, 9, u, u,24, u, u,20,26,
+ 30, u, u, u, u,23, u,19,29, u,22,18,28,17,16, u
+ };
+
+ x = (x & -x) * 0x0450FBAF;
+ return table[x >> 26];
+}
+
+int ctz3 (unsigned x)
+{
+ static int table[32] =
+ {
+ 0, 1, 2,24, 3,19, 6,25, 22, 4,20,10,16, 7,12,26,
+ 31,23,18, 5,21, 9,15,11,30,17, 8,14,29,13,28,27
+ };
+
+ if (x == 0) return 32;
+ x = (x & -x) * 0x04D7651F;
+ return table[x >> 27];
+}
+
+static const unsigned long long magic = 0x03f08c5392f756cdULL;
+
+static const char table[64] = {
+ 0, 1, 12, 2, 13, 22, 17, 3,
+ 14, 33, 23, 36, 18, 58, 28, 4,
+ 62, 15, 34, 26, 24, 48, 50, 37,
+ 19, 55, 59, 52, 29, 44, 39, 5,
+ 63, 11, 21, 16, 32, 35, 57, 27,
+ 61, 25, 47, 49, 54, 51, 43, 38,
+ 10, 20, 31, 56, 60, 46, 53, 42,
+ 9, 30, 45, 41, 8, 40, 7, 6,
+};
+
+int ctz4 (unsigned long x)
+{
+ unsigned long lsb = x & -x;
+ return table[(lsb * magic) >> 58];
+}
+
+/* { dg-final { scan-assembler-times "clz\t" 4 } } */
+/* { dg-final { scan-assembler-times "and\t" 2 } } */
+/* { dg-final { scan-assembler-not "cmp\t.*0" } } */
diff --git a/gcc/tree-ssa-forwprop.c b/gcc/tree-ssa-forwprop.c
index fe55ca958b49b986f79a9a710d92b5d906959105..a632d54712be55f8070c9816e3c3702d4a493182 100644
--- a/gcc/tree-ssa-forwprop.c
+++ b/gcc/tree-ssa-forwprop.c
@@ -48,6 +48,7 @@ along with GCC; see the file COPYING3. If not see
#include "optabs-tree.h"
#include "tree-vector-builder.h"
#include "vec-perm-indices.h"
+#include "internal-fn.h"
/* This pass propagates the RHS of assignment statements into use
sites of the LHS of the assignment. It's basically a specialized
@@ -1778,6 +1779,126 @@ simplify_rotate (gimple_stmt_iterator *gsi)
return true;
}
+
+/* Recognize count trailing zeroes idiom.
+ The canonical form is array[((x & -x) * C) >> SHIFT] where C is a magic
+ constant which when multiplied by a power of 2 contains a unique value
+ in the top 5 or 6 bits. This is then indexed into a table which maps it
+ to the number of trailing zeroes. Array[0] is returned so the caller can
+ emit an appropriate sequence depending on whether ctz (0) is defined on
+ the target.
+
+ TYPE is the element type of the array reference, ARRAY is the table being
+ indexed, X is the input value, MULC is the multiplier C and TSHIFT is the
+ shift amount; MULC and TSHIFT must be INTEGER_CSTs. ZERO_VAL receives
+ Array[0] as an integer_type_node constant.
+
+ Return false if IFN_CTZ is not supported for the type of X, if the
+ precision of TYPE exceeds 32, if the size of the type of X is not 32 or
+ 64 bits, if the shift is outside [bits - 7, bits - 5], or if any table
+ element that is needed cannot be folded to a constant. Otherwise the
+ table is validated: for every bit position I of the input type,
+ Array[(C << I) >> SHIFT] (with C << I truncated to 32 bits for a 32-bit
+ input) must equal I. Return true only if all entries match.
+
+ Note ZERO_VAL is written before the table is validated, so it may have
+ been set even when false is returned. */
+static bool
+optimize_count_trailing_zeroes (tree type, tree array, tree x, tree mulc,
+ tree tshift, tree &zero_val)
+{
+ gcc_assert (TREE_CODE (mulc) == INTEGER_CST);
+ gcc_assert (TREE_CODE (tshift) == INTEGER_CST);
+
+ tree input_type = TREE_TYPE (x);
+
+ if (!direct_internal_fn_supported_p (IFN_CTZ, input_type, OPTIMIZE_FOR_BOTH))
+ return false;
+
+ unsigned HOST_WIDE_INT val = tree_to_uhwi (mulc);
+ unsigned shiftval = tree_to_uhwi (tshift);
+ unsigned input_bits = tree_to_shwi (TYPE_SIZE (input_type));
+
+ /* Check the array is not wider than integer type and the input is a 32-bit
+ or 64-bit type. The shift should extract the top 5..7 bits. */
+ if (TYPE_PRECISION (type) > 32)
+ return false;
+ if (input_bits != 32 && input_bits != 64)
+ return false;
+ if (shiftval < input_bits - 7 || shiftval > input_bits - 5)
+ return false;
+
+ tree t = build4 (ARRAY_REF, type, array, size_int (0), NULL_TREE, NULL_TREE);
+ t = fold_const_aggregate_ref (t);
+ if (t == NULL)
+ return false;
+
+ zero_val = build_int_cst (integer_type_node, tree_to_shwi (t));
+
+ for (unsigned i = 0; i < input_bits; i++, val <<= 1)
+ {
+ if (input_bits == 32)
+ val &= 0xffffffff;
+ t = build4 (ARRAY_REF, type, array, size_int ((int)(val >> shiftval)),
+ NULL_TREE, NULL_TREE);
+ t = fold_const_aggregate_ref (t);
+ if (t == NULL || tree_to_shwi (t) != i)
+ return false;
+ }
+
+ return true;
+}
+
+/* Match.pd function to match the ctz expression. */
+extern bool gimple_ctz_table_index (tree, tree *, tree (*)(tree));
+
+/* Try to turn the table lookup in the statement at *GSI into a call to
+   IFN_CTZ.  The first RHS operand of the statement must be an ARRAY_REF;
+   its index has to match the ctz_table_index pattern and its table has to
+   be accepted by optimize_count_trailing_zeroes.  Return true if the
+   statement was replaced, false if it was left untouched.  */
+static bool
+simplify_count_trailing_zeroes (gimple_stmt_iterator *gsi)
+{
+  gimple *stmt = gsi_stmt (*gsi);
+  tree array_ref = gimple_assign_rhs1 (stmt);
+  /* Captures of the match: [0] = x, [1] = multiplier, [2] = shift.  */
+  tree res_ops[3];
+  tree zero_val;
+
+  gcc_checking_assert (TREE_CODE (array_ref) == ARRAY_REF);
+
+  if (!gimple_ctz_table_index (TREE_OPERAND (array_ref, 1), &res_ops[0], NULL))
+    return false;
+
+  if (optimize_count_trailing_zeroes (TREE_TYPE (array_ref),
+                                      TREE_OPERAND (array_ref, 0), res_ops[0],
+                                      res_ops[1], res_ops[2], zero_val))
+    {
+      tree type = TREE_TYPE (res_ops[0]);
+      HOST_WIDE_INT val;
+      HOST_WIDE_INT type_size = tree_to_shwi (TYPE_SIZE (type));
+      tree lhs = gimple_assign_lhs (stmt);
+      /* VAL is only meaningful when ZERO_OK is true.
+         NOTE(review): the macro result is used as a plain boolean here;
+         confirm whether a result of 1 (as opposed to 2) is acceptable for
+         an internal-function call.  */
+      bool zero_ok = CTZ_DEFINED_VALUE_AT_ZERO (TYPE_MODE (type), val);
+      bool need_convert = !useless_type_conversion_p (TREE_TYPE (lhs),
+                                                      integer_type_node);
+
+      gcall *call = gimple_build_call_internal (IFN_CTZ, 1, res_ops[0]);
+      gimple_set_location (call, gimple_location (stmt));
+      gimple_set_lhs (call, make_ssa_name (integer_type_node));
+
+      /* G is the last statement built so far and has not been inserted
+         yet; PREV_LHS is the value it produces.  */
+      gimple *g = call;
+      tree prev_lhs = gimple_call_lhs (call);
+
+      /* Emit ctz (x) & 31 if ctz (0) is 32 but we need to return 0.  */
+      if (zero_ok && val == type_size && integer_zerop (zero_val))
+        {
+          gsi_insert_before (gsi, g, GSI_SAME_STMT);
+          g = gimple_build_assign (make_ssa_name (integer_type_node),
+                                   BIT_AND_EXPR, prev_lhs,
+                                   build_int_cst (integer_type_node,
+                                                  type_size - 1));
+          gimple_set_location (g, gimple_location (stmt));
+          prev_lhs = gimple_assign_lhs (g);
+        }
+      /* Otherwise the table's entry for zero must equal the target's
+         ctz (0) value.  Nothing has been inserted on this path, so it is
+         safe to give up.  */
+      else if (!zero_ok || tree_to_shwi (zero_val) != val)
+        return false;
+
+      if (need_convert)
+        {
+          gsi_insert_before (gsi, g, GSI_SAME_STMT);
+          g = gimple_build_assign (lhs, NOP_EXPR, prev_lhs);
+        }
+      else
+        gimple_set_lhs (g, lhs);
+
+      gsi_replace (gsi, g, false);
+      return true;
+    }
+
+  return false;
+}
+
+
/* Combine an element access with a shuffle. Returns true if there were
any changes made, else it returns false. */
@@ -2759,6 +2880,8 @@ pass_forwprop::execute (function *fun)
else if (code == CONSTRUCTOR
&& TREE_CODE (TREE_TYPE (rhs1)) == VECTOR_TYPE)
changed = simplify_vector_constructor (&gsi);
+ else if (code == ARRAY_REF)
+ changed = simplify_count_trailing_zeroes (&gsi);
break;
}