[PATCH, vect, i386]: Vectorize lrint() and generate cvtpd2dq insn
Uros Bizjak
ubizjak@gmail.com
Fri Jun 29 11:11:00 GMT 2007
Hello!
This patch introduces the same approach using the NARROW and WIDEN
modifiers, as already implemented in vectorizable_conversion(), into
the vectorizable_call() function. Using these modifiers, gcc can vectorize
calls where (nunits_in == nunits_out / 2).
Attached patch uses this infrastructure to vectorize BUILT_IN_LRINT
using the cvtpd2dq SSE insn. Also, this patch re-defines all 2-arg i386
builtins as const builtins (all builtins were checked that none of
them clobbers global memory).
Following testcase:
--cut here--
void foo(void)
{
int i;
for (i=0; i<256; ++i)
b[i] = lrint (a[i]);
}
--cut here--
generates (-O2 -msse3 -ffast-math -ftree-vectorize):
.L7:
cvtpd2dq a(%eax,%eax), %xmm0
cvtpd2dq a+16(%eax,%eax), %xmm1
punpcklqdq %xmm1, %xmm0
movdqa %xmm0, b(%eax)
addl $16, %eax
cmpl $1024, %eax
jne .L7
The patch was bootstrapped on i686-pc-linux-gnu, regression tested for
all default languages. This patch finally closes PR
tree-optimization/24659, as all conversions are now vectorized (on
SSEx targets).
OK for mainline? (The patch needs approval for the vectorizer part.)
2007-06-29 Uros Bizjak <ubizjak@gmail.com>
PR tree-optimization/24659
* tree-vect-transform.c (vectorizable_call): Handle
(nunits_in == nunits_out / 2) and (nunits_out == nunits_in / 2) cases.
* config/i386/sse.md (vec_pack_sfix_v2df): New expander.
* config/i386/i386.c (enum ix86_builtins) [IX86_BUILTIN_VEC_PACK_SFIX]:
New constant.
(struct bdesc_2arg) [__builtin_ia32_vec_pack_sfix]: New builtin
description.
(ix86_init_mmx_sse_builtins): Define all builtins with 2 arguments as
const using def_builtin_const.
(ix86_expand_binop_builtin): Remove bogus assert() that insn wants
input operands in the same modes as the result.
(ix86_builtin_vectorized_function): Handle BUILT_IN_LRINT.
testsuite/ChangeLog:
2007-06-29 Uros Bizjak <ubizjak@gmail.com>
PR tree-optimization/24659
* gcc.target/i386/vectorize2.c: New test.
* gcc.target/i386/sse2-lrint-vec.c: New runtime test.
* gcc.target/i386/sse2-lrintf-vec.c: Ditto.
Uros.
-------------- next part --------------
Index: tree-vect-transform.c
===================================================================
--- tree-vect-transform.c (revision 126107)
+++ tree-vect-transform.c (working copy)
@@ -2253,13 +2253,19 @@ vectorizable_call (tree stmt, block_stmt
tree scalar_dest;
tree operation;
tree op, type;
+ tree vec_oprnd0 = NULL_TREE, vec_oprnd1 = NULL_TREE;
stmt_vec_info stmt_info = vinfo_for_stmt (stmt), prev_stmt_info;
tree vectype_out, vectype_in;
+ int nunits_in;
+ int nunits_out;
loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
tree fndecl, rhs, new_temp, def, def_stmt, rhs_type, lhs_type;
enum vect_def_type dt[2];
+ tree new_stmt;
int ncopies, j, nargs;
call_expr_arg_iterator iter;
+ tree vargs;
+ enum { NARROW, NONE, WIDEN } modifier;
if (!STMT_VINFO_RELEVANT_P (stmt_info))
return false;
@@ -2291,12 +2297,10 @@ vectorizable_call (tree stmt, block_stmt
nargs = 0;
FOR_EACH_CALL_EXPR_ARG (op, iter, operation)
{
- ++nargs;
-
/* Bail out if the function has more than two arguments, we
do not have interesting builtin functions to vectorize with
more than two arguments. */
- if (nargs > 2)
+ if (nargs >= 2)
return false;
/* We can only handle calls with arguments of the same type. */
@@ -2309,12 +2313,14 @@ vectorizable_call (tree stmt, block_stmt
}
rhs_type = TREE_TYPE (op);
- if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt[nargs-1]))
+ if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt[nargs]))
{
if (vect_print_dump_info (REPORT_DETAILS))
fprintf (vect_dump, "use not simple.");
return false;
}
+
+ ++nargs;
}
/* No arguments is also not good. */
@@ -2322,15 +2328,20 @@ vectorizable_call (tree stmt, block_stmt
return false;
vectype_in = get_vectype_for_scalar_type (rhs_type);
+ nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
lhs_type = TREE_TYPE (GIMPLE_STMT_OPERAND (stmt, 0));
vectype_out = get_vectype_for_scalar_type (lhs_type);
+ nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
- /* Only handle the case of vectors with the same number of elements.
- FIXME: We need a way to handle for example the SSE2 cvtpd2dq
- instruction which converts V2DFmode to V4SImode but only
- using the lower half of the V4SImode result. */
- if (TYPE_VECTOR_SUBPARTS (vectype_in) != TYPE_VECTOR_SUBPARTS (vectype_out))
+ /* FORNOW */
+ if (nunits_in == nunits_out / 2)
+ modifier = NARROW;
+ else if (nunits_out == nunits_in)
+ modifier = NONE;
+ else if (nunits_out == nunits_in / 2)
+ modifier = WIDEN;
+ else
return false;
/* For now, we only vectorize functions if a target specific builtin
@@ -2348,8 +2359,14 @@ vectorizable_call (tree stmt, block_stmt
gcc_assert (ZERO_SSA_OPERANDS (stmt, SSA_OP_ALL_VIRTUALS));
- ncopies = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
- / TYPE_VECTOR_SUBPARTS (vectype_out));
+ if (modifier == NARROW)
+ ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_out;
+ else
+ ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;
+
+ /* Sanity check: make sure that at least one copy of the vectorized stmt
+ needs to be generated. */
+ gcc_assert (ncopies >= 1);
if (!vec_stmt) /* transformation not required. */
{
@@ -2365,55 +2382,113 @@ vectorizable_call (tree stmt, block_stmt
if (vect_print_dump_info (REPORT_DETAILS))
fprintf (vect_dump, "transform operation.");
- gcc_assert (ncopies >= 1);
-
/* Handle def. */
scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
prev_stmt_info = NULL;
- for (j = 0; j < ncopies; ++j)
+ switch (modifier)
{
- tree new_stmt, vargs;
- tree vec_oprnd[2];
- int n;
-
- /* Build argument list for the vectorized call. */
- /* FIXME: Rewrite this so that it doesn't construct a temporary
- list. */
- vargs = NULL_TREE;
- n = -1;
- FOR_EACH_CALL_EXPR_ARG (op, iter, operation)
+ case NONE:
+ for (j = 0; j < ncopies; ++j)
{
- ++n;
+ /* Build argument list for the vectorized call. */
+ /* FIXME: Rewrite this so that it doesn't
+ construct a temporary list. */
+ vargs = NULL_TREE;
+ nargs = 0;
+ FOR_EACH_CALL_EXPR_ARG (op, iter, operation)
+ {
+ if (j == 0)
+ vec_oprnd0
+ = vect_get_vec_def_for_operand (op, stmt, NULL);
+ else
+ vec_oprnd0
+ = vect_get_vec_def_for_stmt_copy (dt[nargs], vec_oprnd0);
+
+ vargs = tree_cons (NULL_TREE, vec_oprnd0, vargs);
+
+ ++nargs;
+ }
+ vargs = nreverse (vargs);
+
+ rhs = build_function_call_expr (fndecl, vargs);
+ new_stmt = build_gimple_modify_stmt (vec_dest, rhs);
+ new_temp = make_ssa_name (vec_dest, new_stmt);
+ GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
+
+ vect_finish_stmt_generation (stmt, new_stmt, bsi);
if (j == 0)
- vec_oprnd[n] = vect_get_vec_def_for_operand (op, stmt, NULL);
+ STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
else
- vec_oprnd[n] = vect_get_vec_def_for_stmt_copy (dt[n], vec_oprnd[n]);
+ STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
- vargs = tree_cons (NULL_TREE, vec_oprnd[n], vargs);
+ prev_stmt_info = vinfo_for_stmt (new_stmt);
}
- vargs = nreverse (vargs);
- rhs = build_function_call_expr (fndecl, vargs);
- new_stmt = build_gimple_modify_stmt (vec_dest, rhs);
- new_temp = make_ssa_name (vec_dest, new_stmt);
- GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
+ break;
- vect_finish_stmt_generation (stmt, new_stmt, bsi);
+ case NARROW:
+ for (j = 0; j < ncopies; ++j)
+ {
+ /* Build argument list for the vectorized call. */
+ /* FIXME: Rewrite this so that it doesn't
+ construct a temporary list. */
+ vargs = NULL_TREE;
+ nargs = 0;
+ FOR_EACH_CALL_EXPR_ARG (op, iter, operation)
+ {
+ if (j == 0)
+ {
+ vec_oprnd0
+ = vect_get_vec_def_for_operand (op, stmt, NULL);
+ vec_oprnd1
+ = vect_get_vec_def_for_stmt_copy (dt[nargs], vec_oprnd0);
+ }
+ else
+ {
+ vec_oprnd0
+ = vect_get_vec_def_for_stmt_copy (dt[nargs], vec_oprnd1);
+ vec_oprnd1
+ = vect_get_vec_def_for_stmt_copy (dt[nargs], vec_oprnd0);
+ }
- if (j == 0)
- STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
- else
- STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
- prev_stmt_info = vinfo_for_stmt (new_stmt);
+ vargs = tree_cons (NULL_TREE, vec_oprnd0, vargs);
+ vargs = tree_cons (NULL_TREE, vec_oprnd1, vargs);
+
+ ++nargs;
+ }
+ vargs = nreverse (vargs);
+
+ rhs = build_function_call_expr (fndecl, vargs);
+ new_stmt = build_gimple_modify_stmt (vec_dest, rhs);
+ new_temp = make_ssa_name (vec_dest, new_stmt);
+ GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
+
+ vect_finish_stmt_generation (stmt, new_stmt, bsi);
+
+ if (j == 0)
+ STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
+ else
+ STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
+
+ prev_stmt_info = vinfo_for_stmt (new_stmt);
+ }
+
+ *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
+
+ break;
+
+ case WIDEN:
+ /* No current target implements this case. */
+ return false;
}
- /* The call in STMT might prevent it from being removed in dce. We however
- cannot remove it here, due to the way the ssa name it defines is mapped
- to the new definition. So just replace rhs of the statement with something
- harmless. */
+ /* The call in STMT might prevent it from being removed in dce.
+ We however cannot remove it here, due to the way the ssa name
+ it defines is mapped to the new definition. So just replace
+ rhs of the statement with something harmless. */
type = TREE_TYPE (scalar_dest);
GIMPLE_STMT_OPERAND (stmt, 1) = fold_convert (type, integer_zero_node);
update_stmt (stmt);
Index: config/i386/sse.md
===================================================================
--- config/i386/sse.md (revision 126107)
+++ config/i386/sse.md (working copy)
@@ -2421,6 +2421,26 @@
DONE;
})
+(define_expand "vec_pack_sfix_v2df"
+ [(match_operand:V4SI 0 "register_operand" "")
+ (match_operand:V2DF 1 "nonimmediate_operand" "")
+ (match_operand:V2DF 2 "nonimmediate_operand" "")]
+ "TARGET_SSE2"
+{
+ rtx r1, r2;
+
+ r1 = gen_reg_rtx (V4SImode);
+ r2 = gen_reg_rtx (V4SImode);
+
+ emit_insn (gen_sse2_cvtpd2dq (r1, operands[1]));
+ emit_insn (gen_sse2_cvtpd2dq (r2, operands[2]));
+ emit_insn (gen_sse2_punpcklqdq (gen_lowpart (V2DImode, operands[0]),
+ gen_lowpart (V2DImode, r1),
+ gen_lowpart (V2DImode, r2)));
+ DONE;
+})
+
+
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;
;; Parallel double-precision floating point element swizzling
Index: config/i386/i386.c
===================================================================
--- config/i386/i386.c (revision 126107)
+++ config/i386/i386.c (working copy)
@@ -16820,6 +16820,8 @@ enum ix86_builtins
IX86_BUILTIN_VEC_SET_V4HI,
IX86_BUILTIN_VEC_SET_V16QI,
+ IX86_BUILTIN_VEC_PACK_SFIX,
+
/* SSE4.2. */
IX86_BUILTIN_CRC32QI,
IX86_BUILTIN_CRC32HI,
@@ -17167,6 +17169,8 @@ static const struct builtin_description
{ OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_unpckhpd, "__builtin_ia32_unpckhpd", IX86_BUILTIN_UNPCKHPD, UNKNOWN, 0 },
{ OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_unpcklpd, "__builtin_ia32_unpcklpd", IX86_BUILTIN_UNPCKLPD, UNKNOWN, 0 },
+ { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_pack_sfix_v2df, "__builtin_ia32_vec_pack_sfix", IX86_BUILTIN_VEC_PACK_SFIX, UNKNOWN, 0 },
+
/* SSE2 MMX */
{ OPTION_MASK_ISA_SSE2, CODE_FOR_addv16qi3, "__builtin_ia32_paddb128", IX86_BUILTIN_PADDB128, UNKNOWN, 0 },
{ OPTION_MASK_ISA_SSE2, CODE_FOR_addv8hi3, "__builtin_ia32_paddw128", IX86_BUILTIN_PADDW128, UNKNOWN, 0 },
@@ -17563,6 +17567,9 @@ ix86_init_mmx_sse_builtins (void)
= build_function_type_list (V2DF_type_node, V4SI_type_node, NULL_TREE);
tree v4si_ftype_v2df
= build_function_type_list (V4SI_type_node, V2DF_type_node, NULL_TREE);
+ tree v4si_ftype_v2df_v2df
+ = build_function_type_list (V4SI_type_node,
+ V2DF_type_node, V2DF_type_node, NULL_TREE);
tree v2si_ftype_v2df
= build_function_type_list (V2SI_type_node, V2DF_type_node, NULL_TREE);
tree v4sf_ftype_v2df
@@ -17906,7 +17913,10 @@ ix86_init_mmx_sse_builtins (void)
|| d->icode == CODE_FOR_sse2_vmmaskcmpv2df3)
type = v2di_ftype_v2df_v2df;
- def_builtin (d->mask, d->name, type, d->code);
+ if (d->icode == CODE_FOR_vec_pack_sfix_v2df)
+ type = v4si_ftype_v2df_v2df;
+
+ def_builtin_const (d->mask, d->name, type, d->code);
}
/* Add all builtins that are more or less simple operations on 1 operand. */
@@ -18457,11 +18467,6 @@ ix86_expand_binop_builtin (enum insn_cod
op1 = gen_lowpart (TImode, x);
}
- /* The insn must want input operands in the same modes as the
- result. */
- gcc_assert ((GET_MODE (op0) == mode0 || GET_MODE (op0) == VOIDmode)
- && (GET_MODE (op1) == mode1 || GET_MODE (op1) == VOIDmode));
-
if (!(*insn_data[icode].operand[1].predicate) (op0, mode0))
op0 = copy_to_mode_reg (mode0, op0);
if (!(*insn_data[icode].operand[2].predicate) (op1, mode1))
@@ -19863,6 +19868,12 @@ ix86_builtin_vectorized_function (unsign
return ix86_builtins[IX86_BUILTIN_SQRTPS];
return NULL_TREE;
+ case BUILT_IN_LRINT:
+ if (out_mode == SImode && out_n == 4
+ && in_mode == DFmode && in_n == 2)
+ return ix86_builtins[IX86_BUILTIN_VEC_PACK_SFIX];
+ return NULL_TREE;
+
case BUILT_IN_LRINTF:
if (out_mode == SImode && out_n == 4
&& in_mode == SFmode && in_n == 4)
Index: testsuite/gcc.target/i386/sse2-lrint-vec.c
===================================================================
--- testsuite/gcc.target/i386/sse2-lrint-vec.c (revision 0)
+++ testsuite/gcc.target/i386/sse2-lrint-vec.c (revision 0)
@@ -0,0 +1,48 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -ffast-math -ftree-vectorize -msse2" } */
+
+#include "../../gcc.dg/i386-cpuid.h"
+
+extern long lrint (double);
+extern void abort (void);
+
+#define N 32
+
+int __attribute__((noinline))
+main1 ()
+{
+ double a[N] = {0.4,3.5,6.6,9.4,12.5,15.6,18.4,21.5,24.6,27.4,30.5,33.6,36.4,39.5,42.6,45.4,0.5,3.6,6.4,9.5,12.6,15.4,18.5,21.6,24.4,27.5,30.6,33.4,36.5,39.6,42.4,45.5};
+ long r[N];
+
+ int i;
+
+ for (i = 0; i < N; i++)
+ {
+ r[i] = lrint (a[i]);
+ }
+
+ /* check results: */
+ for (i = 0; i < N; i++)
+ {
+ if (r[i] != lrint (a[i]))
+ abort();
+ }
+
+ return 0;
+}
+
+int
+main ()
+{
+ unsigned long cpu_facilities;
+
+ cpu_facilities = i386_cpuid ();
+
+ if ((cpu_facilities & (bit_MMX | bit_SSE | bit_SSE2 | bit_CMOV))
+ != (bit_MMX | bit_SSE | bit_SSE2 | bit_CMOV))
+ /* If host has no vector support, pass. */
+ return 0;
+
+ main1 ();
+ return 0;
+}
Index: testsuite/gcc.target/i386/vectorize2.c
===================================================================
--- testsuite/gcc.target/i386/vectorize2.c (revision 0)
+++ testsuite/gcc.target/i386/vectorize2.c (revision 0)
@@ -0,0 +1,30 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target ilp32 } */
+/* { dg-options "-O2 -ffast-math -ftree-vectorize -msse2 -mfpmath=sse" } */
+
+double a[256];
+int b[256];
+unsigned short c[256];
+
+extern long lrint (double);
+
+void foo(void)
+{
+ int i;
+
+ for (i=0; i<256; ++i)
+ b[i] = lrint (a[i]);
+}
+
+void bar(void)
+{
+ int i;
+
+ for (i=0; i<256; ++i)
+ {
+ b[i] = lrint (a[i]);
+ c[i] += c[i];
+ }
+}
+
+/* { dg-final { scan-assembler "cvtpd2dq" } } */
Index: testsuite/gcc.target/i386/sse2-lrintf-vec.c
===================================================================
--- testsuite/gcc.target/i386/sse2-lrintf-vec.c (revision 0)
+++ testsuite/gcc.target/i386/sse2-lrintf-vec.c (revision 0)
@@ -0,0 +1,48 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -ffast-math -ftree-vectorize -msse2" } */
+
+#include "../../gcc.dg/i386-cpuid.h"
+
+extern long lrintf (float);
+extern void abort (void);
+
+#define N 32
+
+int __attribute__((noinline))
+main1 ()
+{
+ float a[N] = {0.4,3.5,6.6,9.4,12.5,15.6,18.4,21.5,24.6,27.4,30.5,33.6,36.4,39.5,42.6,45.4,0.5,3.6,6.4,9.5,12.6,15.4,18.5,21.6,24.4,27.5,30.6,33.4,36.5,39.6,42.4,45.5};
+ long r[N];
+
+ int i;
+
+ for (i = 0; i < N; i++)
+ {
+ r[i] = lrintf (a[i]);
+ }
+
+ /* check results: */
+ for (i = 0; i < N; i++)
+ {
+ if (r[i] != lrintf (a[i]))
+ abort();
+ }
+
+ return 0;
+}
+
+int
+main ()
+{
+ unsigned long cpu_facilities;
+
+ cpu_facilities = i386_cpuid ();
+
+ if ((cpu_facilities & (bit_MMX | bit_SSE | bit_SSE2 | bit_CMOV))
+ != (bit_MMX | bit_SSE | bit_SSE2 | bit_CMOV))
+ /* If host has no vector support, pass. */
+ return 0;
+
+ main1 ();
+ return 0;
+}
More information about the Gcc-patches
mailing list