[PATCH][RFC] Make the function vectorizer capable of doing type transformations
Dorit Nuzman
DORIT@il.ibm.com
Wed Jan 31 10:51:00 GMT 2007
>
> This enhances the function vectorizer to handle functions with
> differing result and argument types.
>
> RFC because the code needs a cleanup.
>
> This enables us to vectorize
>
> int a[256];
> float b[256];
> long lrintf (float);
> void foo(void)
> {
> int i;
> for (i=0; i<256; ++i)
> {
> a[i] = lrintf (b[i]);
> }
> }
>
> on 32bit SSE2.
>
Looks good to me. It also addresses the problem I mentioned here
(http://gcc.gnu.org/ml/gcc-patches/2007-01/msg02088.html):
"By the way - this (supporting the case that ncopies>1) is something that is
also missing in vectorizable_call ... I'm travelling next week, but could
provide a patch to add the required support in the following week."
So this patch takes care of that. You may want to add a testcase that checks
that, e.g. something like:
int a[256];
float b[256];
char char_arr[256];
long lrintf (float);
void foo(void)
{
int i;
for (i=0; i<256; ++i)
{
a[i] = lrintf (b[i]);
char_arr[i] = 0;
}
}
A few small questions/comments:
> +
> + nargs++;
> + if (nargs >= 2)
> + return false;
> + }
Is there any inherent problem behind this check, or are you just restricting
it (FORNOW?) to the particular function calls you expect to see? (Which is
fine, I'm just wondering.)
> + case BUILT_IN_LRINT:
> + if (out_mode == SImode && out_n == 2
> + && in_mode == DFmode && in_n == 2)
> + return ix86_builtins[IX86_BUILTIN_CVTPD2PI];
> + return NULL_TREE;
(I assume you'll have a testcase for each of those?)
> + /* Only handle the case of vectors with the same number of elements.
> + FIXME: We need a way to handle for example the SSE2 cvtpd2dq
> + instruction which converts V2DFmode to V4SImode but only
> + using the lower half of the V4SImode result. */
> + if (TYPE_VECTOR_SUBPARTS (vectype_in) != TYPE_VECTOR_SUBPARTS
> (vectype_out))
Yes. This requires functionality similar to the one that vectorizes
v2di->v4si in vectorizable_type_demotion, except that we need a different
idiom instead of the vec_pack/unpack to "convert-and-unpack" 4 doubles
(organized in 2 regs) into 4 ints (some target hook maybe?).
Could you please also add a testcase for this (with xfail?)
thanks,
dorit
> Richard.
>
>
> 2007-01-30 Richard Guenther <rguenther@suse.de>
>
> * tree-vectorizer.h (vectorizable_function): Add argument type
> argument.
> * tree-vect-patterns.c (vect_recog_pow_pattern): Adjust caller.
> * tree-vect-transform.c (vectorizable_function): Handle extra
> argument.
> (build_vectorized_function_call): Likewise.
> (vectorizable_call): Handle calls with result and argument types
> differing. Handle loop vectorization factor correctly.
> * targhooks.c (default_builtin_vectorized_function): Adjust for
> extra argument.
> * targhooks.h (default_builtin_vectorized_function): Likewise.
> * target.h (builtin_vectorized_function): Add argument type
> argument.
> * config/i386/i386.c (ix86_builtin_vectorized_function): Handle
> extra argument, allow vectorizing of lrintf.
>
> Index: tree-vectorizer.h
> ===================================================================
> *** tree-vectorizer.h (revision 121338)
> --- tree-vectorizer.h (working copy)
> *************** extern bool vectorizable_operation (tree
> *** 412,418 ****
> extern bool vectorizable_type_promotion (tree, block_stmt_iterator
> *, tree *);
> extern bool vectorizable_type_demotion (tree, block_stmt_iterator
> *, tree *);
> extern bool vectorizable_assignment (tree, block_stmt_iterator *, tree
*);
> ! extern bool vectorizable_function (tree, tree);
> extern bool vectorizable_call (tree, block_stmt_iterator *, tree *);
> extern bool vectorizable_condition (tree, block_stmt_iterator *, tree
*);
> extern bool vectorizable_live_operation (tree, block_stmt_iterator
> *, tree *);
> --- 412,418 ----
> extern bool vectorizable_type_promotion (tree, block_stmt_iterator
> *, tree *);
> extern bool vectorizable_type_demotion (tree, block_stmt_iterator
> *, tree *);
> extern bool vectorizable_assignment (tree, block_stmt_iterator *, tree
*);
> ! extern bool vectorizable_function (tree, tree, tree);
> extern bool vectorizable_call (tree, block_stmt_iterator *, tree *);
> extern bool vectorizable_condition (tree, block_stmt_iterator *, tree
*);
> extern bool vectorizable_live_operation (tree, block_stmt_iterator
> *, tree *);
> Index: tree-vect-patterns.c
> ===================================================================
> *** tree-vect-patterns.c (revision 121338)
> --- tree-vect-patterns.c (working copy)
> *************** vect_recog_pow_pattern (tree last_stmt,
> *** 488,494 ****
> if (*type_in)
> {
> newfn = build_function_call_expr (newfn, newarglist);
> ! if (vectorizable_function (newfn, *type_in))
> return newfn;
> }
> }
> --- 488,494 ----
> if (*type_in)
> {
> newfn = build_function_call_expr (newfn, newarglist);
> ! if (vectorizable_function (newfn, *type_in, *type_in))
> return newfn;
> }
> }
> Index: tree-vect-transform.c
> ===================================================================
> *** tree-vect-transform.c (revision 121338)
> --- tree-vect-transform.c (working copy)
> *************** vectorizable_reduction (tree stmt, block
> *** 1583,1589 ****
> or false if the function cannot be vectorized. */
>
> bool
> ! vectorizable_function (tree call, tree vectype)
> {
> tree fndecl = get_callee_fndecl (call);
>
> --- 1583,1589 ----
> or false if the function cannot be vectorized. */
>
> bool
> ! vectorizable_function (tree call, tree vectype_out, tree vectype_in)
> {
> tree fndecl = get_callee_fndecl (call);
>
> *************** vectorizable_function (tree call, tree v
> *** 1597,1603 ****
> || !DECL_BUILT_IN (fndecl))
> return false;
>
> ! if (targetm.vectorize.builtin_vectorized_function
> (DECL_FUNCTION_CODE (fndecl), vectype))
> return true;
>
> return false;
> --- 1597,1603 ----
> || !DECL_BUILT_IN (fndecl))
> return false;
>
> ! if (targetm.vectorize.builtin_vectorized_function
> (DECL_FUNCTION_CODE (fndecl), vectype_out, vectype_in))
> return true;
>
> return false;
> *************** vectorizable_function (tree call, tree v
> *** 1610,1622 ****
>
> static tree
> build_vectorized_function_call (tree fndecl,
> ! tree vectype, tree args)
> {
> tree vfndecl;
> enum built_in_function code = DECL_FUNCTION_CODE (fndecl);
>
> /* The target specific builtin should be available. */
> ! vfndecl = targetm.vectorize.builtin_vectorized_function (code,
vectype);
> gcc_assert (vfndecl != NULL_TREE);
>
> return build_function_call_expr (vfndecl, args);
> --- 1610,1622 ----
>
> static tree
> build_vectorized_function_call (tree fndecl,
> ! tree vectype_out, tree vectype_in, tree args)
> {
> tree vfndecl;
> enum built_in_function code = DECL_FUNCTION_CODE (fndecl);
>
> /* The target specific builtin should be available. */
> ! vfndecl = targetm.vectorize.builtin_vectorized_function (code,
> vectype_out, vectype_in);
> gcc_assert (vfndecl != NULL_TREE);
>
> return build_function_call_expr (vfndecl, args);
> *************** vectorizable_call (tree stmt, block_stmt
> *** 1636,1647 ****
> tree scalar_dest;
> tree operation;
> tree op, args, type;
> ! tree vec_oprnd, vargs, *pvargs_end;
> ! stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
> ! tree vectype = STMT_VINFO_VECTYPE (stmt_info);
> loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
> ! tree fndecl, rhs, new_temp, def, def_stmt;
> ! enum vect_def_type dt;
>
> /* Is STMT a vectorizable call? */
> if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
> --- 1636,1648 ----
> tree scalar_dest;
> tree operation;
> tree op, args, type;
> ! tree vargs, *pvargs_end;
> ! stmt_vec_info stmt_info = vinfo_for_stmt (stmt), prev_stmt_info;
> ! tree vectype_out, vectype_in;
> loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
> ! tree fndecl, rhs, new_temp, def, def_stmt, rhs_type, lhs_type;
> ! enum vect_def_type dt[2];
> ! int ncopies, j, nargs;
>
> /* Is STMT a vectorizable call? */
> if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
> *************** vectorizable_call (tree stmt, block_stmt
> *** 1653,1684 ****
> operation = GIMPLE_STMT_OPERAND (stmt, 1);
> if (TREE_CODE (operation) != CALL_EXPR)
> return false;
> -
> - /* For now, we only vectorize functions if a target specific builtin
> - is available. TODO -- in some cases, it might be profitable to
> - insert the calls for pieces of the vector, in order to be able
> - to vectorize other operations in the loop. */
> - if (!vectorizable_function (operation, vectype))
> - {
> - if (vect_print_dump_info (REPORT_DETAILS))
> - fprintf (vect_dump, "function is not vectorizable.");
>
> ! return false;
> ! }
> ! gcc_assert (ZERO_SSA_OPERANDS (stmt, SSA_OP_ALL_VIRTUALS));
>
> for (args = TREE_OPERAND (operation, 1); args; args = TREE_CHAIN
(args))
> {
> op = TREE_VALUE (args);
>
> ! if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt))
> {
> if (vect_print_dump_info (REPORT_DETAILS))
> fprintf (vect_dump, "use not simple.");
> return false;
> }
> }
>
> if (!vec_stmt) /* transformation not required. */
> {
> STMT_VINFO_TYPE (stmt_info) = call_vec_info_type;
> --- 1654,1717 ----
> operation = GIMPLE_STMT_OPERAND (stmt, 1);
> if (TREE_CODE (operation) != CALL_EXPR)
> return false;
>
> ! lhs_type = TREE_TYPE (GIMPLE_STMT_OPERAND (stmt, 0));
> ! vectype_out = get_vectype_for_scalar_type (lhs_type);
>
> + /* We can only handle calls with arguments of the same type. */
> + rhs_type = NULL_TREE;
> + nargs = 0;
> for (args = TREE_OPERAND (operation, 1); args; args = TREE_CHAIN
(args))
> {
> op = TREE_VALUE (args);
> + if (rhs_type
> + && rhs_type != TREE_TYPE (op))
> + {
> + if (vect_print_dump_info (REPORT_DETAILS))
> + fprintf (vect_dump, "argument types differ.");
> + return false;
> + }
> + rhs_type = TREE_TYPE (op);
>
> ! if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def,
&dt[nargs]))
> {
> if (vect_print_dump_info (REPORT_DETAILS))
> fprintf (vect_dump, "use not simple.");
> return false;
> }
> +
> + nargs++;
> + if (nargs >= 2)
> + return false;
> + }
> +
> + /* No arguments is also not good. */
> + if (nargs == 0)
> + return false;
> +
> + vectype_in = get_vectype_for_scalar_type (rhs_type);
> +
> + /* Only handle the case of vectors with the same number of elements.
> + FIXME: We need a way to handle for example the SSE2 cvtpd2dq
> + instruction which converts V2DFmode to V4SImode but only
> + using the lower half of the V4SImode result. */
> + if (TYPE_VECTOR_SUBPARTS (vectype_in) != TYPE_VECTOR_SUBPARTS
> (vectype_out))
> + return false;
> +
> + /* For now, we only vectorize functions if a target specific builtin
> + is available. TODO -- in some cases, it might be profitable to
> + insert the calls for pieces of the vector, in order to be able
> + to vectorize other operations in the loop. */
> + if (!vectorizable_function (operation, vectype_out, vectype_in))
> + {
> + if (vect_print_dump_info (REPORT_DETAILS))
> + fprintf (vect_dump, "function is not vectorizable.");
> +
> + return false;
> }
>
> + gcc_assert (ZERO_SSA_OPERANDS (stmt, SSA_OP_ALL_VIRTUALS));
> +
> if (!vec_stmt) /* transformation not required. */
> {
> STMT_VINFO_TYPE (stmt_info) = call_vec_info_type;
> *************** vectorizable_call (tree stmt, block_stmt
> *** 1690,1718 ****
> if (vect_print_dump_info (REPORT_DETAILS))
> fprintf (vect_dump, "transform operation.");
>
> /* Handle def. */
> scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
> ! vec_dest = vect_create_destination_var (scalar_dest, vectype);
>
> ! /* Handle uses. */
> ! vargs = NULL_TREE;
> ! pvargs_end = &vargs;
> ! for (args = TREE_OPERAND (operation, 1); args; args = TREE_CHAIN
(args))
> {
> ! op = TREE_VALUE (args);
> ! vec_oprnd = vect_get_vec_def_for_operand (op, stmt, NULL);
>
> ! *pvargs_end = tree_cons (NULL_TREE, vec_oprnd, NULL_TREE);
> ! pvargs_end = &TREE_CHAIN (*pvargs_end);
> ! }
>
> ! fndecl = get_callee_fndecl (operation);
> ! rhs = build_vectorized_function_call (fndecl, vectype, vargs);
> ! *vec_stmt = build2 (GIMPLE_MODIFY_STMT, vectype, vec_dest, rhs);
> ! new_temp = make_ssa_name (vec_dest, *vec_stmt);
> ! GIMPLE_STMT_OPERAND (*vec_stmt, 0) = new_temp;
>
> ! vect_finish_stmt_generation (stmt, *vec_stmt, bsi);
>
> /* The call in STMT might prevent it from being removed in dce. We
however
> cannot remove it here, due to the way the ssa name it definesis
mapped
> --- 1723,1776 ----
> if (vect_print_dump_info (REPORT_DETAILS))
> fprintf (vect_dump, "transform operation.");
>
> + ncopies = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
> + / TYPE_VECTOR_SUBPARTS (vectype_out));
> + gcc_assert (ncopies >= 1);
> +
> /* Handle def. */
> scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
> ! vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
>
> ! fndecl = get_callee_fndecl (operation);
> !
> ! prev_stmt_info = NULL;
> ! for (j = 0; j < ncopies; ++j)
> {
> ! tree new_stmt;
> ! tree vec_oprnd[nargs];
> ! int n;
> !
> ! /* Handle uses. */
> ! vargs = NULL_TREE;
> ! pvargs_end = &vargs;
> ! n = 0;
> ! for (args = TREE_OPERAND (operation, 1); args; args =
> TREE_CHAIN (args))
> ! {
> ! op = TREE_VALUE (args);
> ! if (j == 0)
> ! vec_oprnd[n] = vect_get_vec_def_for_operand (op, stmt, NULL);
> ! else
> ! vec_oprnd[n] = vect_get_vec_def_for_stmt_copy (dt[n],
vec_oprnd[n]);
>
> ! *pvargs_end = tree_cons (NULL_TREE, vec_oprnd[n], NULL_TREE);
> ! pvargs_end = &TREE_CHAIN (*pvargs_end);
> ! n++;
> ! }
>
> ! rhs = build_vectorized_function_call (fndecl, vectype_out,
vectype_in,
> ! vargs);
> ! new_stmt = build2 (GIMPLE_MODIFY_STMT, NULL_TREE, vec_dest, rhs);
> ! new_temp = make_ssa_name (vec_dest, new_stmt);
> ! GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
>
> ! vect_finish_stmt_generation (stmt, new_stmt, bsi);
> !
> ! if (j == 0)
> ! STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
> ! else
> ! STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
> ! prev_stmt_info = vinfo_for_stmt (new_stmt);
> ! }
>
> /* The call in STMT might prevent it from being removed in dce. We
however
> cannot remove it here, due to the way the ssa name it definesis
mapped
> Index: targhooks.c
> ===================================================================
> *** targhooks.c (revision 121338)
> --- targhooks.c (working copy)
> *************** default_invalid_within_doloop (rtx insn)
> *** 323,329 ****
>
> tree
> default_builtin_vectorized_function (enum built_in_function fn
> ATTRIBUTE_UNUSED,
> ! tree type ATTRIBUTE_UNUSED)
> {
> return NULL_TREE;
> }
> --- 323,330 ----
>
> tree
> default_builtin_vectorized_function (enum built_in_function fn
> ATTRIBUTE_UNUSED,
> ! tree type_out ATTRIBUTE_UNUSED,
> ! tree type_in ATTRIBUTE_UNUSED)
> {
> return NULL_TREE;
> }
> Index: targhooks.h
> ===================================================================
> *** targhooks.h (revision 121338)
> --- targhooks.h (working copy)
> *************** extern const char * default_invalid_with
> *** 57,63 ****
>
> extern bool default_narrow_bitfield (void);
>
> ! extern tree default_builtin_vectorized_function (enum
> built_in_function, tree);
>
> /* These are here, and not in hooks.[ch], because not all users of
> hooks.h include tm.h, and thus we don't have CUMULATIVE_ARGS. */
> --- 57,63 ----
>
> extern bool default_narrow_bitfield (void);
>
> ! extern tree default_builtin_vectorized_function (enum
> built_in_function, tree, tree);
>
> /* These are here, and not in hooks.[ch], because not all users of
> hooks.h include tm.h, and thus we don't have CUMULATIVE_ARGS. */
> Index: target.h
> ===================================================================
> *** target.h (revision 121338)
> --- target.h (working copy)
> *************** struct gcc_target
> *** 399,405 ****
>
> /* Returns a code for builtin that realizes vectorized version of
> function, or NULL_TREE if not available. */
> ! tree (* builtin_vectorized_function) (unsigned, tree);
>
> /* Target builtin that implements vector widening multiplication.
> builtin_mul_widen_eve computes the element-by-element products
> --- 399,405 ----
>
> /* Returns a code for builtin that realizes vectorized version of
> function, or NULL_TREE if not available. */
> ! tree (* builtin_vectorized_function) (unsigned, tree, tree);
>
> /* Target builtin that implements vector widening multiplication.
> builtin_mul_widen_eve computes the element-by-element products
> Index: config/i386/i386.c
> ===================================================================
> *** config/i386/i386.c (revision 121338)
> --- config/i386/i386.c (working copy)
> *************** static bool ix86_pass_by_reference (CUMU
> *** 1355,1361 ****
> tree, bool);
> static void ix86_init_builtins (void);
> static rtx ix86_expand_builtin (tree, rtx, rtx, enum machine_mode,
int);
> ! static tree ix86_builtin_vectorized_function (enum built_in_function,
tree);
> static const char *ix86_mangle_fundamental_type (tree);
> static tree ix86_stack_protect_fail (void);
> static rtx ix86_internal_arg_pointer (void);
> --- 1355,1361 ----
> tree, bool);
> static void ix86_init_builtins (void);
> static rtx ix86_expand_builtin (tree, rtx, rtx, enum machine_mode,
int);
> ! static tree ix86_builtin_vectorized_function (enum
> built_in_function, tree, tree);
> static const char *ix86_mangle_fundamental_type (tree);
> static tree ix86_stack_protect_fail (void);
> static rtx ix86_internal_arg_pointer (void);
> *************** ix86_expand_builtin (tree exp, rtx targe
> *** 17632,17660 ****
> if it is not available. */
>
> static tree
> ! ix86_builtin_vectorized_function (enum built_in_function fn, tree type)
> {
> ! enum machine_mode el_mode;
> ! int n;
>
> ! if (TREE_CODE (type) != VECTOR_TYPE)
> return NULL_TREE;
>
> ! el_mode = TYPE_MODE (TREE_TYPE (type));
> ! n = TYPE_VECTOR_SUBPARTS (type);
>
> switch (fn)
> {
> case BUILT_IN_SQRT:
> ! if (el_mode == DFmode && n == 2)
> return ix86_builtins[IX86_BUILTIN_SQRTPD];
> return NULL_TREE;
>
> case BUILT_IN_SQRTF:
> ! if (el_mode == SFmode && n == 4)
> return ix86_builtins[IX86_BUILTIN_SQRTPS];
> return NULL_TREE;
>
> default:
> ;
> }
> --- 17632,17681 ----
> if it is not available. */
>
> static tree
> ! ix86_builtin_vectorized_function (enum built_in_function fn, tree
type_out,
> ! tree type_in)
> {
> ! enum machine_mode in_mode, out_mode;
> ! int in_n, out_n;
>
> ! if (TREE_CODE (type_out) != VECTOR_TYPE
> ! || TREE_CODE (type_in) != VECTOR_TYPE)
> return NULL_TREE;
>
> ! out_mode = TYPE_MODE (TREE_TYPE (type_out));
> ! out_n = TYPE_VECTOR_SUBPARTS (type_out);
> ! in_mode = TYPE_MODE (TREE_TYPE (type_in));
> ! in_n = TYPE_VECTOR_SUBPARTS (type_in);
>
> switch (fn)
> {
> case BUILT_IN_SQRT:
> ! if (out_mode == DFmode && out_n == 2
> ! && in_mode == DFmode && in_n == 2)
> return ix86_builtins[IX86_BUILTIN_SQRTPD];
> return NULL_TREE;
>
> case BUILT_IN_SQRTF:
> ! if (out_mode == SFmode && out_n == 4
> ! && in_mode == SFmode && in_n == 4)
> return ix86_builtins[IX86_BUILTIN_SQRTPS];
> return NULL_TREE;
>
> + case BUILT_IN_LRINT:
> + if (out_mode == SImode && out_n == 2
> + && in_mode == DFmode && in_n == 2)
> + return ix86_builtins[IX86_BUILTIN_CVTPD2PI];
> + return NULL_TREE;
> +
> + case BUILT_IN_LRINTF:
> + if (out_mode == SImode && out_n == 2
> + && in_mode == SFmode && in_n == 2)
> + return ix86_builtins[IX86_BUILTIN_CVTPS2PI];
> + else if (out_mode == SImode && out_n == 4
> + && in_mode == SFmode && in_n == 4)
> + return ix86_builtins[IX86_BUILTIN_CVTPS2DQ];
> + return NULL_TREE;
> +
> default:
> ;
> }
More information about the Gcc-patches
mailing list