[PATCH] Optimize manual byte swap implementations v2
Richard Guenther
richard.guenther@gmail.com
Thu Feb 5 21:50:00 GMT 2009
On Thu, Feb 5, 2009 at 9:38 PM, Andreas Krebbel <krebbel1@de.ibm.com> wrote:
> Hello,
>
> this is a new version of the bswap recognizer. The optimizer now
> simulates bit permutation and selection operations on a symbolic
> number in order to recognize handcrafted byte swap implementations.
>
> The optimizer is able to recognize all the different implementations
> which were cited as response to the first version. I've put these
> implementations together into a testcase which is part of the patch.
> The optimizer now will probably also be able to detect a variety of
> implementations beyond those and hopefully nothing which isn't
> actually a bswap ;)
>
> Also the Integer.reverseBytes method would be optimized that way if
> Java would enable the bswap builtins.
>
> Tested on x86_64, s390 and s390x.
>
> Ok for mainline when entering stage 1?
Nice!
How is the compile-time effect on say, cc1files?
Thanks,
Richard.
> Bye,
>
> -Andreas-
>
>
>
> 2009-02-05 Andreas Krebbel <Andreas.Krebbel@de.ibm.com>
>
> * passes.c: Add bswap pass.
> * tree-pass.h: Add pass_optimize_bswap declaration.
> * tree-ssa-math-opts.c: Include diagnostics.h for print_gimple_stmt.
> (struct symbolic_number, pass_optimize_bswap): New definition.
> (BSWAP_RECURSION_LIMIT): Constant value definition added.
> (do_shift_rotate, verify_symbolic_number_p): New functions.
> (find_bswap_1, find_bswap, execute_optimize_bswap): New functions.
> (gate_optimize_bswap): New function.
>
>
> 2009-02-05 Andreas Krebbel <Andreas.Krebbel@de.ibm.com>
>
> * gcc.dg/optimize-bswap-1.c: New testcase.
>
>
> Index: gcc/passes.c
> ===================================================================
> --- gcc/passes.c.orig
> +++ gcc/passes.c
> @@ -673,6 +673,7 @@ init_optimization_passes (void)
> }
> NEXT_PASS (pass_cse_reciprocals);
> NEXT_PASS (pass_convert_to_rsqrt);
> + NEXT_PASS (pass_optimize_bswap);
> NEXT_PASS (pass_reassoc);
> NEXT_PASS (pass_vrp);
> NEXT_PASS (pass_dominator);
> Index: gcc/tree-pass.h
> ===================================================================
> --- gcc/tree-pass.h.orig
> +++ gcc/tree-pass.h
> @@ -363,6 +363,7 @@ extern struct gimple_opt_pass pass_late_
> extern struct gimple_opt_pass pass_cse_reciprocals;
> extern struct gimple_opt_pass pass_cse_sincos;
> extern struct gimple_opt_pass pass_convert_to_rsqrt;
> +extern struct gimple_opt_pass pass_optimize_bswap;
> extern struct gimple_opt_pass pass_warn_function_return;
> extern struct gimple_opt_pass pass_warn_function_noreturn;
> extern struct gimple_opt_pass pass_cselim;
> Index: gcc/tree-ssa-math-opts.c
> ===================================================================
> --- gcc/tree-ssa-math-opts.c.orig
> +++ gcc/tree-ssa-math-opts.c
> @@ -97,6 +97,7 @@ along with GCC; see the file COPYING3.
> #include "alloc-pool.h"
> #include "basic-block.h"
> #include "target.h"
> +#include "diagnostic.h"
>
>
> /* This structure represents one basic block that either computes a
> @@ -879,3 +880,338 @@ struct gimple_opt_pass pass_convert_to_r
> | TODO_verify_stmts /* todo_flags_finish */
> }
> };
> +
> +/* A symbolic number is used to detect byte permutation and selection
> + patterns. Therefore the field N contains an artificial number
> + consisting of byte size markers:
> +
> + 0 - byte has the value 0
> + 1..size - byte contains the content of the byte
> + number indexed with that value minus one */
> +
> +struct symbolic_number {
> + HOST_WIDE_INT n;
> + ssize_t size;
> +};
> +
> +/* In order to limit the number of instructions considered while
> + looking for a bswap implementation this number defines the maximum
> + depth of the search. */
> +static const int BSWAP_RECURSION_LIMIT = 8;
> +
> +/* Perform a SHIFT or ROTATE operation by COUNT bits on symbolic
> + number N. Return false if the requested operation is not permitted
> + on a symbolic number. */
> +
> +static inline bool
> +do_shift_rotate (enum tree_code code,
> + struct symbolic_number *n,
> + HOST_WIDE_INT count)
> +{
> + if (count % 8 != 0)
> + return false;
> +
> + /* Zero out the extra bits of N in order to avoid them being shifted
> + into the significant bits. */
> + if (n->size < (int)sizeof (HOST_WIDE_INT))
> + n->n &= ((HOST_WIDE_INT)1 << (n->size * BITS_PER_UNIT)) - 1;
> +
> + switch (code)
> + {
> + case LSHIFT_EXPR:
> + n->n <<= count;
> + break;
> + case RSHIFT_EXPR:
> + n->n >>= count;
> + break;
> + case LROTATE_EXPR:
> + n->n = (n->n << count) | (n->n >> ((n->size * BITS_PER_UNIT) - count));
> + break;
> + case RROTATE_EXPR:
> + n->n = (n->n >> count) | (n->n << ((n->size * BITS_PER_UNIT) - count));
> + break;
> + default:
> + return false;
> + }
> + return true;
> +}
> +
> +/* Perform sanity checking for the symbolic number N and the gimple
> + statement STMT. */
> +
> +static inline bool
> +verify_symbolic_number_p (struct symbolic_number *n, gimple stmt)
> +{
> + tree lhs_type;
> +
> + lhs_type = TREE_TYPE (gimple_assign_lhs (stmt));
> +
> + if (TREE_CODE (lhs_type) != INTEGER_TYPE)
> + return false;
> +
> + if (int_cst_value (TYPE_SIZE (lhs_type)) !=
> + (HOST_WIDE_INT)n->size * BITS_PER_UNIT)
> + return false;
> +
> + return true;
> +}
> +
> +/* find_bswap_1 invokes itself recursively with N and tries to perform
> + the operation given by the rhs of STMT on the result. If the
> + operation could successfully be executed the function returns the
> + tree expression of the source operand and NULL otherwise. */
> +
> +static tree
> +find_bswap_1 (gimple stmt, struct symbolic_number *n, int limit)
> +{
> + enum tree_code code;
> + tree rhs1, rhs2 = NULL;
> + gimple rhs1_stmt, rhs2_stmt;
> + tree source_expr1;
> +
> + if (!limit || !is_gimple_assign (stmt))
> + return NULL;
> +
> + if (gimple_num_ops (stmt) != 2
> + && gimple_num_ops (stmt) != 3)
> + return NULL;
> +
> + code = gimple_assign_rhs_code (stmt);
> + rhs1 = gimple_assign_rhs1 (stmt);
> +
> + if (TREE_CODE (rhs1) != SSA_NAME)
> + return NULL;
> +
> + rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);
> +
> + if (gimple_num_ops (stmt) == 3)
> + rhs2 = gimple_assign_rhs2 (stmt);
> +
> + /* Handle unary rhs and binary rhs with integer constants as second
> + operand. */
> +
> + if (gimple_num_ops (stmt) == 2
> + || (gimple_num_ops (stmt) == 3
> + && TREE_CODE (rhs2) == INTEGER_CST))
> + {
> + source_expr1 = find_bswap_1 (rhs1_stmt, n, limit - 1);
> +
> + /* If find_bswap_1 returned NULL STMT is a leaf node and we have
> + to initialize the symbolic number. */
> + if (!source_expr1)
> + {
> + int i;
> +
> + /* Set up the symbolic number N by setting each byte to a
> + value between 1 and the byte size of rhs1. The highest
> + order byte is set to 1 and the lowest order byte to
> + n.size. */
> + n->size = (size_t)int_cst_value (TYPE_SIZE (
> + TREE_TYPE (rhs1))) / BITS_PER_UNIT;
> +
> + n->n = 0;
> + for (i = 0; i < n->size; i++)
> + n->n |= ((HOST_WIDE_INT)(n->size - i) << (i * BITS_PER_UNIT));
> +
> + source_expr1 = rhs1;
> + }
> +
> + switch (code)
> + {
> + case BIT_AND_EXPR:
> + {
> + int i;
> + HOST_WIDE_INT tmp = int_cst_value (rhs2);
> +
> + /* Only constants masking full bytes are allowed. */
> + for (i = 0; i < n->size; i++, tmp >>= BITS_PER_UNIT)
> + if ((tmp & 0xff) != 0 && (tmp & 0xff) != 0xff)
> + return false;
> +
> + n->n &= int_cst_value (rhs2);
> + }
> + break;
> + case LSHIFT_EXPR:
> + case RSHIFT_EXPR:
> + case LROTATE_EXPR:
> + case RROTATE_EXPR:
> + if (!do_shift_rotate (code, n, int_cst_value (rhs2)))
> + return NULL;
> + break;
> + CASE_CONVERT:
> + {
> + HOST_WIDE_INT type_size;
> +
> + type_size = int_cst_value (TYPE_SIZE (TREE_TYPE (
> + gimple_assign_lhs (stmt))));
> +
> + if (type_size < (int)(sizeof (HOST_WIDE_INT) * BITS_PER_UNIT))
> + {
> + /* If STMT casts to a smaller type mask out the bits not
> + belonging to the target type. */
> + n->size = type_size / BITS_PER_UNIT;
> + n->n &= ((HOST_WIDE_INT)1 << type_size) - 1;
> + }
> + }
> + break;
> + default:
> + return NULL;
> + };
> + return verify_symbolic_number_p (n, stmt) ? source_expr1 : NULL;
> + }
> +
> + /* Handle binary rhs. */
> +
> + if (gimple_num_ops (stmt) == 3)
> + {
> + struct symbolic_number n1, n2;
> + tree source_expr2;
> +
> + if (TREE_CODE (rhs2) != SSA_NAME)
> + return NULL;
> +
> + rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);
> +
> + switch (code)
> + {
> + case BIT_IOR_EXPR:
> + source_expr1 = find_bswap_1 (rhs1_stmt, &n1, limit - 1);
> + source_expr2 = find_bswap_1 (rhs2_stmt, &n2, limit - 1);
> +
> + if (!source_expr1
> + || source_expr1 != source_expr2
> + || n1.size != n2.size)
> + return NULL;
> +
> + n->size = n1.size;
> + n->n = n1.n | n2.n;
> +
> + if (!verify_symbolic_number_p (n, stmt))
> + return NULL;
> +
> + break;
> + default:
> + return NULL;
> + }
> + return source_expr1;
> + }
> + return NULL;
> +}
> +
> +/* Check if STMT completes a bswap implementation consisting of ORs,
> + SHIFTs and ANDs. Return the source tree expression on which the
> + byte swap is performed and NULL if no bswap was found. */
> +
> +static tree
> +find_bswap (gimple stmt)
> +{
> + struct symbolic_number n;
> + tree source_expr;
> + int i;
> +
> + source_expr = find_bswap_1 (stmt, &n, BSWAP_RECURSION_LIMIT);
> +
> + if (!source_expr)
> + return NULL;
> +
> + /* A complete byte swap should make the symbolic number to start
> + with the largest digit in the highest order byte. */
> + for (i = 0; i < n.size; i++, n.n >>= BITS_PER_UNIT)
> + if ((n.n & 0xff) != i + 1)
> + return NULL;
> +
> + return source_expr;
> +}
> +
> +/* Find manual byte swap implementations and turn them into a bswap
> + builtin invokation. */
> +
> +static unsigned int
> +execute_optimize_bswap (void)
> +{
> + basic_block bb;
> +
> + if (BITS_PER_UNIT != 8)
> + return 0;
> +
> + /* Not all languages might support the bswap builtins. */
> + if (!built_in_decls[BUILT_IN_BSWAP32]
> + && !built_in_decls[BUILT_IN_BSWAP64])
> + return 0;
> +
> + FOR_EACH_BB (bb)
> + {
> + gimple_stmt_iterator gsi;
> +
> + for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi); gsi_next (&gsi))
> + {
> + gimple stmt = gsi_stmt (gsi);
> + tree bswap_src = find_bswap (stmt);
> + tree fndecl;
> + HOST_WIDE_INT type_size;
> + gimple call;
> +
> + if (!bswap_src)
> + continue;
> +
> + type_size = int_cst_value (TYPE_SIZE (TREE_TYPE (
> + gimple_assign_lhs (stmt))));
> +
> + switch (type_size)
> + {
> + case 32:
> + fndecl = built_in_decls[BUILT_IN_BSWAP32];
> + break;
> + case 64:
> + fndecl = built_in_decls[BUILT_IN_BSWAP64];
> + break;
> + default:
> + continue;
> + }
> +
> + if (!fndecl)
> + continue;
> +
> + call = gimple_build_call (fndecl, 1, bswap_src);
> + gimple_call_set_lhs (call, gimple_assign_lhs (stmt));
> +
> + if (dump_file)
> + {
> + fprintf (dump_file, "%d bit bswap implementation found at: ",
> + (int)type_size);
> + print_gimple_stmt (dump_file, stmt, 0, 0);
> + }
> +
> + gsi_insert_after (&gsi, call, GSI_SAME_STMT);
> + gsi_remove (&gsi, true);
> + }
> + }
> +
> + return 0;
> +}
> +
> +static bool
> +gate_optimize_bswap (void)
> +{
> + return flag_expensive_optimizations && optimize;
> +}
> +
> +struct gimple_opt_pass pass_optimize_bswap =
> +{
> + {
> + GIMPLE_PASS,
> + "bswap", /* name */
> + gate_optimize_bswap, /* gate */
> + execute_optimize_bswap, /* execute */
> + NULL, /* sub */
> + NULL, /* next */
> + 0, /* static_pass_number */
> + 0, /* tv_id */
> + PROP_ssa, /* properties_required */
> + 0, /* properties_provided */
> + 0, /* properties_destroyed */
> + 0, /* todo_flags_start */
> + TODO_dump_func | TODO_update_ssa | TODO_verify_ssa
> + | TODO_verify_stmts /* todo_flags_finish */
> + }
> +};
> Index: gcc/testsuite/gcc.dg/optimize-bswap-1.c
> ===================================================================
> --- /dev/null
> +++ gcc/testsuite/gcc.dg/optimize-bswap-1.c
> @@ -0,0 +1,61 @@
> +/* { dg-do compile } */
> +/* { dg-require-effective-target stdint_types } */
> +/* { dg-options "-O2 -fdump-tree-bswap" } */
> +
> +#include <stdint.h>
> +
> +#define __const_swab32(x) ((uint32_t)( \
> + (((uint32_t)(x) & (uint32_t)0x000000ffUL) << 24) | \
> + (((uint32_t)(x) & (uint32_t)0x0000ff00UL) << 8) | \
> + (((uint32_t)(x) & (uint32_t)0x00ff0000UL) >> 8) | \
> + (((uint32_t)(x) & (uint32_t)0xff000000UL) >> 24)))
> +
> +#define __const_swab64(x) ((uint64_t)( \
> + (((uint64_t)(x) & (uint64_t)0x00000000000000ffULL) << 56) | \
> + (((uint64_t)(x) & (uint64_t)0x000000000000ff00ULL) << 40) | \
> + (((uint64_t)(x) & (uint64_t)0x0000000000ff0000ULL) << 24) | \
> + (((uint64_t)(x) & (uint64_t)0x00000000ff000000ULL) << 8) | \
> + (((uint64_t)(x) & (uint64_t)0x000000ff00000000ULL) >> 8) | \
> + (((uint64_t)(x) & (uint64_t)0x0000ff0000000000ULL) >> 24) | \
> + (((uint64_t)(x) & (uint64_t)0x00ff000000000000ULL) >> 40) | \
> + (((uint64_t)(x) & (uint64_t)0xff00000000000000ULL) >> 56)))
> +
> +/* This byte swap implementation is used by the Linux kernel and the
> + GNU C library. */
> +
> +uint32_t
> +swap32_a (uint32_t in)
> +{
> + return __const_swab32 (in);
> +}
> +
> +uint64_t
> +swap64 (uint64_t in)
> +{
> + return __const_swab64 (in);
> +}
> +
> +/* The OpenSSH byte swap implementation. */
> +uint32_t
> +swap32_b (uint32_t in)
> +{
> + uint32_t a;
> +
> + a = (in << 16) | (in >> 16);
> + a = ((a & 0x00ff00ff) << 8) | ((a & 0xff00ff00) >> 8);
> +
> + return a;
> +}
> +
> +uint32_t
> +swap32_c (uint32_t in)
> +{
> + return ((in << 24)
> + | (((uint8_t)(in >> 8) << 16))
> + | (((uint8_t)(in >> 16) << 8))
> + | (in >> 24));
> +}
> +
> +/* { dg-final { scan-tree-dump-times "32 bit bswap implementation found at" 3 "bswap" } } */
> +/* { dg-final { scan-tree-dump-times "64 bit bswap implementation found at" 1 "bswap" } } */
> +/* { dg-final { cleanup-tree-dump "bswap" } } */
>
More information about the Gcc-patches
mailing list