Index: doc/tm.texi =================================================================== *** doc/tm.texi (revision 115817) --- doc/tm.texi (working copy) *************** the argument @var{OFF} to @code{REALIGN_ *** 5272,5277 **** --- 5272,5299 ---- log2(@var{VS})-1 bits of @var{addr} will be considered. @end deftypefn + @deftypefn {Target Hook} tree TARGET_VECTORIZE_BUILTIN_MUL_WIDEN_EVEN (tree @var{x}) + This hook should return the DECL of a function @var{f} that implements + widening multiplication of the even elements of two input vectors of type @var{x}. + + If this hook is defined, the autovectorizer will use it along with the + @code{TARGET_VECTORIZE_BUILTIN_MUL_WIDEN_ODD} target hook when vectorizing + widening multiplication in cases that the order of the results does not have to be + preserved (e.g. used only by a reduction computation). Otherwise, the + @code{widen_mult_hi/lo} idioms will be used. + @end deftypefn + + @deftypefn {Target Hook} tree TARGET_VECTORIZE_BUILTIN_MUL_WIDEN_ODD (tree @var{x}) + This hook should return the DECL of a function @var{f} that implements + widening multiplication of the odd elements of two input vectors of type @var{x}. + + If this hook is defined, the autovectorizer will use it along with the + @code{TARGET_VECTORIZE_BUILTIN_MUL_WIDEN_EVEN} target hook when vectorizing + widening multiplication in cases that the order of the results does not have to be + preserved (e.g. used only by a reduction computation). Otherwise, the + @code{widen_mult_hi/lo} idioms will be used. + @end deftypefn + @node Anchored Addresses @section Anchored Addresses @cindex anchored addresses Index: doc/md.texi =================================================================== *** doc/md.texi (revision 115817) --- doc/md.texi (working copy) *************** Operand 2 is an integer shift amount in *** 3395,3400 **** --- 3395,3430 ---- Operand 0 is where the resulting shifted vector is stored. The output and input vectors should have the same modes. 
+ @cindex @code{vec_pack_mod_@var{m}} instruction pattern + @cindex @code{vec_pack_ssat_@var{m}} instruction pattern + @cindex @code{vec_pack_usat_@var{m}} instruction pattern + @item @samp{vec_pack_mod_@var{m}}, @samp{vec_pack_ssat_@var{m}}, @samp{vec_pack_usat_@var{m}} + Narrow (demote) and merge the elements of two vectors. + Operands 1 and 2 are vectors of the same mode. + Operand 0 is the resulting vector in which the elements of the two input + vectors are concatenated after narrowing them down using modulo arithmetic or + signed/unsigned saturating arithmetic. + + @cindex @code{vec_unpacks_hi_@var{m}} instruction pattern + @cindex @code{vec_unpacks_lo_@var{m}} instruction pattern + @cindex @code{vec_unpacku_hi_@var{m}} instruction pattern + @cindex @code{vec_unpacku_lo_@var{m}} instruction pattern + @item @samp{vec_unpacks_hi_@var{m}}, @samp{vec_unpacks_lo_@var{m}}, @samp{vec_unpacku_hi_@var{m}}, @samp{vec_unpacku_lo_@var{m}} + Extract and widen (promote) the high/low part of a vector of signed/unsigned + elements. The input vector (operand 1) has N signed/unsigned elements of size S. + Using sign/zero extension widen (promote) the high/low elements of the vector, + and place the resulting N/2 values of size 2*S in the output vector (operand 0). + + @cindex @code{vec_widen_umult_hi_@var{m}} instruction pattern + @cindex @code{vec_widen_umult_lo_@var{m}} instruction pattern + @cindex @code{vec_widen_smult_hi_@var{m}} instruction pattern + @cindex @code{vec_widen_smult_lo_@var{m}} instruction pattern + @item @samp{vec_widen_umult_hi_@var{m}}, @samp{vec_widen_umult_lo_@var{m}}, @samp{vec_widen_smult_hi_@var{m}}, @samp{vec_widen_smult_lo_@var{m}} + Signed/Unsigned widening multiplication. + The two inputs (operands 1 and 2) are vectors with N + signed/unsigned elements of size S. Multiply the high/low elements of the two + vectors, and put the N/2 products of size 2*S in the output vector (operand 0). 
+ @cindex @code{mulhisi3} instruction pattern @item @samp{mulhisi3} Multiply operands 1 and 2, which have mode @code{HImode}, and store Index: tree-pretty-print.c =================================================================== *** tree-pretty-print.c (revision 115817) --- tree-pretty-print.c (working copy) *************** dump_generic_node (pretty_printer *buffe *** 1857,1862 **** --- 1857,1906 ---- pp_string (buffer, " > "); break; + case VEC_WIDEN_MULT_HI_EXPR: + pp_string (buffer, " VEC_WIDEN_MULT_HI_EXPR < "); + dump_generic_node (buffer, TREE_OPERAND (node, 0), spc, flags, false); + pp_string (buffer, " , "); + dump_generic_node (buffer, TREE_OPERAND (node, 1), spc, flags, false); + pp_string (buffer, " > "); + break; + + case VEC_WIDEN_MULT_LO_EXPR: + pp_string (buffer, " VEC_WIDEN_MULT_LO_EXPR < "); + dump_generic_node (buffer, TREE_OPERAND (node, 0), spc, flags, false); + pp_string (buffer, " , "); + dump_generic_node (buffer, TREE_OPERAND (node, 1), spc, flags, false); + pp_string (buffer, " > "); + break; + + case VEC_UNPACK_HI_EXPR: + pp_string (buffer, " VEC_UNPACK_HI_EXPR < "); + dump_generic_node (buffer, TREE_OPERAND (node, 0), spc, flags, false); + pp_string (buffer, " > "); + break; + + case VEC_UNPACK_LO_EXPR: + pp_string (buffer, " VEC_UNPACK_LO_EXPR < "); + dump_generic_node (buffer, TREE_OPERAND (node, 0), spc, flags, false); + pp_string (buffer, " > "); + break; + + case VEC_PACK_MOD_EXPR: + pp_string (buffer, " VEC_PACK_MOD_EXPR < "); + dump_generic_node (buffer, TREE_OPERAND (node, 0), spc, flags, false); + pp_string (buffer, " , "); + dump_generic_node (buffer, TREE_OPERAND (node, 1), spc, flags, false); + pp_string (buffer, " > "); + break; + + case VEC_PACK_SAT_EXPR: + pp_string (buffer, " VEC_PACK_SAT_EXPR < "); + dump_generic_node (buffer, TREE_OPERAND (node, 0), spc, flags, false); + pp_string (buffer, " , "); + dump_generic_node (buffer, TREE_OPERAND (node, 1), spc, flags, false); + pp_string (buffer, " > "); + break; + case 
BLOCK: { tree t; *************** op_prio (tree op) *** 2159,2164 **** --- 2203,2210 ---- case MINUS_EXPR: return 12; + case VEC_WIDEN_MULT_HI_EXPR: + case VEC_WIDEN_MULT_LO_EXPR: case WIDEN_MULT_EXPR: case DOT_PROD_EXPR: case MULT_EXPR: *************** op_prio (tree op) *** 2212,2217 **** --- 2258,2267 ---- case REDUC_PLUS_EXPR: case VEC_LSHIFT_EXPR: case VEC_RSHIFT_EXPR: + case VEC_UNPACK_HI_EXPR: + case VEC_UNPACK_LO_EXPR: + case VEC_PACK_MOD_EXPR: + case VEC_PACK_SAT_EXPR: return 16; case SAVE_EXPR: Index: optabs.c =================================================================== *** optabs.c (revision 115817) --- optabs.c (working copy) *************** optab_for_tree_code (enum tree_code code *** 315,320 **** --- 315,342 ---- case VEC_RSHIFT_EXPR: return vec_shr_optab; + case VEC_WIDEN_MULT_HI_EXPR: + return TYPE_UNSIGNED (type) ? + vec_widen_umult_hi_optab : vec_widen_smult_hi_optab; + + case VEC_WIDEN_MULT_LO_EXPR: + return TYPE_UNSIGNED (type) ? + vec_widen_umult_lo_optab : vec_widen_smult_lo_optab; + + case VEC_UNPACK_HI_EXPR: + return TYPE_UNSIGNED (type) ? + vec_unpacku_hi_optab : vec_unpacks_hi_optab; + + case VEC_UNPACK_LO_EXPR: + return TYPE_UNSIGNED (type) ? + vec_unpacku_lo_optab : vec_unpacks_lo_optab; + + case VEC_PACK_MOD_EXPR: + return vec_pack_mod_optab; + + case VEC_PACK_SAT_EXPR: + return TYPE_UNSIGNED (type) ? vec_pack_usat_optab : vec_pack_ssat_optab; + default: break; } *************** expand_binop (enum machine_mode mode, op *** 1276,1281 **** --- 1298,1304 ---- int icode = (int) binoptab->handlers[(int) mode].insn_code; enum machine_mode mode0 = insn_data[icode].operand[1].mode; enum machine_mode mode1 = insn_data[icode].operand[2].mode; + enum machine_mode tmp_mode; rtx pat; rtx xop0 = op0, xop1 = op1; *************** expand_binop (enum machine_mode mode, op *** 1329,1336 **** && mode1 != VOIDmode) xop1 = copy_to_mode_reg (mode1, xop1); ! if (!insn_data[icode].operand[0].predicate (temp, mode)) ! 
temp = gen_reg_rtx (mode); pat = GEN_FCN (icode) (temp, xop0, xop1); if (pat) --- 1352,1371 ---- && mode1 != VOIDmode) xop1 = copy_to_mode_reg (mode1, xop1); ! if (binoptab == vec_pack_mod_optab || binoptab == vec_pack_usat_optab ! || binoptab == vec_pack_ssat_optab) ! { ! /* The mode of the result is different then the mode of the ! arguments. */ ! tmp_mode = insn_data[icode].operand[0].mode; ! if (GET_MODE_NUNITS (tmp_mode) != 2*GET_MODE_NUNITS (mode)) ! return 0; ! } ! else ! tmp_mode = mode; ! ! if (!insn_data[icode].operand[0].predicate (temp, tmp_mode)) ! temp = gen_reg_rtx (tmp_mode); pat = GEN_FCN (icode) (temp, xop0, xop1); if (pat) *************** init_optabs (void) *** 5312,5317 **** --- 5347,5363 ---- vec_shr_optab = init_optab (UNKNOWN); vec_realign_load_optab = init_optab (UNKNOWN); movmisalign_optab = init_optab (UNKNOWN); + vec_widen_umult_hi_optab = init_optab (UNKNOWN); + vec_widen_umult_lo_optab = init_optab (UNKNOWN); + vec_widen_smult_hi_optab = init_optab (UNKNOWN); + vec_widen_smult_lo_optab = init_optab (UNKNOWN); + vec_unpacks_hi_optab = init_optab (UNKNOWN); + vec_unpacks_lo_optab = init_optab (UNKNOWN); + vec_unpacku_hi_optab = init_optab (UNKNOWN); + vec_unpacku_lo_optab = init_optab (UNKNOWN); + vec_pack_mod_optab = init_optab (UNKNOWN); + vec_pack_usat_optab = init_optab (UNKNOWN); + vec_pack_ssat_optab = init_optab (UNKNOWN); powi_optab = init_optab (UNKNOWN); Index: optabs.h =================================================================== *** optabs.h (revision 115817) --- optabs.h (working copy) *************** enum optab_index *** 261,266 **** --- 261,282 ---- OTI_vec_shr, /* Extract specified elements from vectors, for vector load. */ OTI_vec_realign_load, + /* Widening multiplication. + The high/low part of the resulting vector of products is returned. 
*/ + OTI_vec_widen_umult_hi, + OTI_vec_widen_umult_lo, + OTI_vec_widen_smult_hi, + OTI_vec_widen_smult_lo, + /* Extract and widen the high/low part of a vector of signed/unsigned + elements. */ + OTI_vec_unpacks_hi, + OTI_vec_unpacks_lo, + OTI_vec_unpacku_hi, + OTI_vec_unpacku_lo, + /* Narrow (demote) and merge the elements of two vectors. */ + OTI_vec_pack_mod, + OTI_vec_pack_usat, + OTI_vec_pack_ssat, /* Perform a raise to the power of integer. */ OTI_powi, *************** extern GTY(()) optab optab_table[OTI_MAX *** 388,394 **** #define vec_shl_optab (optab_table[OTI_vec_shl]) #define vec_shr_optab (optab_table[OTI_vec_shr]) #define vec_realign_load_optab (optab_table[OTI_vec_realign_load]) ! #define powi_optab (optab_table[OTI_powi]) /* Conversion optabs have their own table and indexes. */ --- 404,421 ---- #define vec_shl_optab (optab_table[OTI_vec_shl]) #define vec_shr_optab (optab_table[OTI_vec_shr]) #define vec_realign_load_optab (optab_table[OTI_vec_realign_load]) ! #define vec_widen_umult_hi_optab (optab_table[OTI_vec_widen_umult_hi]) ! #define vec_widen_umult_lo_optab (optab_table[OTI_vec_widen_umult_lo]) ! #define vec_widen_smult_hi_optab (optab_table[OTI_vec_widen_smult_hi]) ! #define vec_widen_smult_lo_optab (optab_table[OTI_vec_widen_smult_lo]) ! #define vec_unpacks_hi_optab (optab_table[OTI_vec_unpacks_hi]) ! #define vec_unpacku_hi_optab (optab_table[OTI_vec_unpacku_hi]) ! #define vec_unpacks_lo_optab (optab_table[OTI_vec_unpacks_lo]) ! #define vec_unpacku_lo_optab (optab_table[OTI_vec_unpacku_lo]) ! #define vec_pack_mod_optab (optab_table[OTI_vec_pack_mod]) ! #define vec_pack_ssat_optab (optab_table[OTI_vec_pack_ssat]) ! #define vec_pack_usat_optab (optab_table[OTI_vec_pack_usat]) ! #define powi_optab (optab_table[OTI_powi]) /* Conversion optabs have their own table and indexes. 
*/ Index: genopinit.c =================================================================== *** genopinit.c (revision 115817) --- genopinit.c (working copy) *************** static const char * const optabs[] = *** 212,218 **** "reduc_smin_optab->handlers[$A].insn_code = CODE_FOR_$(reduc_smin_$a$)", "reduc_umin_optab->handlers[$A].insn_code = CODE_FOR_$(reduc_umin_$a$)", "reduc_splus_optab->handlers[$A].insn_code = CODE_FOR_$(reduc_splus_$a$)" , ! "reduc_uplus_optab->handlers[$A].insn_code = CODE_FOR_$(reduc_uplus_$a$)" }; static void gen_insn (rtx); --- 212,228 ---- "reduc_smin_optab->handlers[$A].insn_code = CODE_FOR_$(reduc_smin_$a$)", "reduc_umin_optab->handlers[$A].insn_code = CODE_FOR_$(reduc_umin_$a$)", "reduc_splus_optab->handlers[$A].insn_code = CODE_FOR_$(reduc_splus_$a$)" , ! "reduc_uplus_optab->handlers[$A].insn_code = CODE_FOR_$(reduc_uplus_$a$)", ! "vec_widen_umult_hi_optab->handlers[$A].insn_code = CODE_FOR_$(vec_widen_umult_hi_$a$)", ! "vec_widen_umult_lo_optab->handlers[$A].insn_code = CODE_FOR_$(vec_widen_umult_lo_$a$)", ! "vec_widen_smult_hi_optab->handlers[$A].insn_code = CODE_FOR_$(vec_widen_smult_hi_$a$)", ! "vec_widen_smult_lo_optab->handlers[$A].insn_code = CODE_FOR_$(vec_widen_smult_lo_$a$)", ! "vec_unpacks_hi_optab->handlers[$A].insn_code = CODE_FOR_$(vec_unpacks_hi_$a$)", ! "vec_unpacks_lo_optab->handlers[$A].insn_code = CODE_FOR_$(vec_unpacks_lo_$a$)", ! "vec_unpacku_hi_optab->handlers[$A].insn_code = CODE_FOR_$(vec_unpacku_hi_$a$)", ! "vec_unpacku_lo_optab->handlers[$A].insn_code = CODE_FOR_$(vec_unpacku_lo_$a$)", ! "vec_pack_mod_optab->handlers[$A].insn_code = CODE_FOR_$(vec_pack_mod_$a$)", ! 
"vec_pack_ssat_optab->handlers[$A].insn_code = CODE_FOR_$(vec_pack_ssat_$a$)", "vec_pack_usat_optab->handlers[$A].insn_code = CODE_FOR_$(vec_pack_usat_$a$)" }; static void gen_insn (rtx); Index: target.h =================================================================== *** target.h (revision 115817) --- target.h (working copy) *************** struct gcc_target *** 369,374 **** --- 369,381 ---- by the vectorizer, and return the decl of the target builtin function. */ tree (* builtin_mask_for_load) (void); + + /* Target builtin that implements vector widening multiplication. + builtin_mul_widen_eve computes the element-by-element products + for the even elements, and builtin_mul_widen_odd computes the + element-by-element products for the odd elements. */ + tree (* builtin_mul_widen_even) (tree); + tree (* builtin_mul_widen_odd) (tree); } vectorize; /* The initial value of target_flags. */ Index: testsuite/gcc.dg/vect/vect-reduc-dot-u16a.c =================================================================== *** testsuite/gcc.dg/vect/vect-reduc-dot-u16a.c (revision 0) --- testsuite/gcc.dg/vect/vect-reduc-dot-u16a.c (revision 0) *************** *** 0 **** --- 1,52 ---- + /* { dg-require-effective-target vect_int } */ + + #include + #include "tree-vect.h" + + #define N 64 + + #define DOT1 43680 + #define DOT2 43680 + + unsigned short X[N] __attribute__ ((__aligned__(16))); + unsigned short Y[N] __attribute__ ((__aligned__(16))); + + /* short->short->int dot product. + Not detected as a dot-product pattern. + Requires support for non-widneing multiplication and widening-summation. */ + unsigned int + foo1(int len) { + int i; + unsigned int result = 0; + unsigned short prod; + + for (i=0; i #include "tree-vect.h" ! #define N 16 ! int ! 
main1 () { int i; - short sc[N] = {0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45}; - short sb[N] = {0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45}; - short sa[N]; - int ic[N] = {0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45}; - int ib[N] = {0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45}; - int ia[N]; ! /* Two types with different nunits in vector. */ ! for (i = 0; i < N; i++) { ! ia[i] = ib[i] + ic[i]; sa[i] = sb[i] + sc[i]; } ! /* Check results. */ ! for (i = 0; i < N; i++) { ! if (ia[i] != ib[i] + ic[i] || sa[i] != sb[i] + sc[i]) ! abort(); } ! return 0; } ! int main (void) ! { check_vect (); ! return main1 (); } ! /* { dg-final { scan-tree-dump-times "vectorized 0 loops" 1 "vect" } } */ /* { dg-final { cleanup-tree-dump "vect" } } */ --- 3,77 ---- #include #include "tree-vect.h" ! #define N 32 ! short sa[N]; ! short sc[N] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15, ! 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31}; ! short sb[N] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15, ! 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31}; ! int ia[N]; ! int ic[N] = {0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45, ! 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15}; ! int ib[N] = {0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45, ! 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15}; ! ! int main1 (int n) { int i; ! /* Multiple types with different sizes, used in idependent ! copmutations. Vectorizable. */ ! for (i = 0; i < n; i++) ! { ! sa[i+2] = sb[i] + sc[i]; ! ia[i+1] = ib[i] + ic[i]; ! } ! ! /* check results: */ ! for (i = 0; i < n; i++) { ! if (sa[i+2] != sb[i] + sc[i] || ia[i+1] != ib[i] + ic[i]) ! abort (); ! } ! ! return 0; ! } ! ! int main2 (int n) ! { ! int i; ! ! /* Multiple types with different sizes, used in idependent ! copmutations. Vectorizable. */ ! for (i = 0; i < n; i++) ! { ! ia[i+1] = ib[i] + ic[i]; sa[i] = sb[i] + sc[i]; } ! /* check results: */ ! for (i = 0; i < n; i++) { ! if (sa[i] != sb[i] + sc[i] || ia[i+1] != ib[i] + ic[i]) ! abort (); } ! return 0; } ! ! int main (void) ! { check_vect (); ! 
! main1 (N-2); ! main2 (N-1); ! ! return 0; } ! /* { dg-final { scan-tree-dump-times "vectorized 0 loops" 2 "vect" } } */ ! /* { dg-final { scan-tree-dump-times "not vectorized: unsupported unaligned store" 2 "vect" } } */ /* { dg-final { cleanup-tree-dump "vect" } } */ Index: testsuite/gcc.dg/vect/vect-reduc-dot-s8a.c =================================================================== *** testsuite/gcc.dg/vect/vect-reduc-dot-s8a.c (revision 0) --- testsuite/gcc.dg/vect/vect-reduc-dot-s8a.c (revision 0) *************** *** 0 **** --- 1,57 ---- + /* { dg-require-effective-target vect_int } */ + + #include + #include "tree-vect.h" + + #define N 64 + + #define DOT1 43680 + + signed char X[N] __attribute__ ((__aligned__(16))); + signed char Y[N] __attribute__ ((__aligned__(16))); + + /* char->short->int dot product. + The dot-product pattern should be detected. + Vectorizable on vect_sdot_qi targets (targets that support dot-product of + signed chars). + + In the future could also be vectorized as widening-mult + widening-summation, + or with type-conversion support. + */ + int + foo1(int len) { + int i; + int result = 0; + short prod; + + for (i=0; i + #include "tree-vect.h" + + #define N 64 + + #define DOT 43680 + + unsigned char X[N] __attribute__ ((__aligned__(16))); + unsigned char Y[N] __attribute__ ((__aligned__(16))); + + /* char->short->int dot product. + Detected as a dot-product pattern. + Should be vectorized on targets that support dot-product for unsigned chars + (vect_udot_qi), + and on targets that support widening-multiplication and widening-summation + (vect_widen_mult_qi && vec_widen_sum_qi_to_si). + Widening-multiplication can also be supported by type promotion and non-widening + multiplication (vect_unpack && vect_short_mult); + Widening summation can also be supported by type promotion and non-widening + summation (vect_unpack). 
+ */ + unsigned int + foo (int len) { + int i; + unsigned int result = 0; + unsigned short prod; + + for (i=0; i + #include "tree-vect.h" + + #define N 64 + #define SUM 0 + + /* Require widening-mult or data-unpacking (for the type promotion). */ + int + main1 (short *in, int off, short scale, int n) + { + int i; + int sum = 0; + + for (i = 0; i < n; i++) { + sum += ((int) in[i] * (int) in[i+off]) >> scale; + } + + return sum; + } + + int main (void) + { + int i; + int sum; + short X[N]; + + check_vect (); + + for (i=0; i + #include "tree-vect.h" + + #define N 32 + + short sa[N]; + short sb[N] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15, + 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31}; + int ia[N]; + int ib[N] = {0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45, + 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15}; + + /* Current peeling-for-alignment scheme will consider the 'sa[i+7]' + access for peeling, and therefore will examine the option of + using a peeling factor = VF-7%VF. This will result in a peeling factor 1, + which will also align the access to 'ia[i+3]', and the loop could be + vectorized on all targets that support unaligned loads. + */ + + int main1 (int n) + { + int i; + + /* Multiple types with different sizes, used in idependent + copmutations. Vectorizable. */ + for (i = 0; i < n; i++) + { + sa[i+7] = sb[i]; + ia[i+3] = ib[i]; + } + + /* check results: */ + for (i = 0; i < n; i++) + { + if (sa[i+7] != sb[i] || ia[i+3] != ib[i]) + abort (); + } + + return 0; + } + + /* Current peeling-for-alignment scheme will consider the 'ia[i+3]' + access for peeling, and therefore will examine the option of + using a peeling factor = VF-3%VF. This will result in a peeling factor + 5 if VF=8, or 1 if VF=4,2. In either case, this will also align the access + to 'sa[i+3]', and the loop could be vectorized on targets that support + unaligned loads. */ + + int main2 (int n) + { + int i; + + /* Multiple types with different sizes, used in independent + copmutations. Vectorizable. 
*/ + for (i = 0; i < n; i++) + { + ia[i+3] = ib[i]; + sa[i+3] = sb[i]; + } + + /* check results: */ + for (i = 0; i < n; i++) + { + if (sa[i+3] != sb[i] || ia[i+3] != ib[i]) + abort (); + } + + return 0; + } + + int main (void) + { + check_vect (); + + main1 (N-7); + main2 (N-3); + + return 0; + } + + /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 2 "vect" { xfail vect_no_align } } } */ + /* { dg-final { scan-tree-dump-times "Alignment of access forced using peeling" 2 "vect" { xfail vect_no_align } } } */ + /* { dg-final { scan-tree-dump-times "Vectorizing an unaligned access" 4 "vect" { xfail vect_no_align } } } */ + /* { dg-final { cleanup-tree-dump "vect" } } */ + Index: testsuite/gcc.dg/vect/vect-multitypes-5.c =================================================================== *** testsuite/gcc.dg/vect/vect-multitypes-5.c (revision 0) --- testsuite/gcc.dg/vect/vect-multitypes-5.c (revision 0) *************** *** 0 **** --- 1,51 ---- + /* { dg-require-effective-target vect_int } */ + + #include + #include "tree-vect.h" + + #define N 32 + + int main1 () + { + int i; + unsigned int ia[N]; + unsigned int ic[N] = {0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45,0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45}; + unsigned int ib[N] = {0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45,0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45}; + unsigned short sa[N]; + unsigned short sc[N] = {0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45,0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45}; + unsigned short sb[N] = {0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45,0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45}; + unsigned char ca[N]; + unsigned char cc[N] = {0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45,0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45}; + unsigned char cb[N] = {0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45,0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45}; + + /* Multiple types with different sizes, used in independent + computations. Vectorizable. All accesses aligned. 
*/ + for (i = 0; i < N; i++) + { + ia[i] = ib[i] + ic[i]; + sa[i] = sb[i] + sc[i]; + ca[i] = cb[i] + cc[i]; + } + + /* check results: */ + for (i = 0; i < N; i++) + { + if (ia[i] != ib[i] + ic[i] + || sa[i] != sb[i] + sc[i] + || ca[i] != cb[i] + cc[i]) + abort (); + } + + return 0; + } + + int main (void) + { + check_vect (); + + return main1 (); + } + + /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */ + /* { dg-final { cleanup-tree-dump "vect" } } */ + Index: testsuite/gcc.dg/vect/vect-multitypes-9.c =================================================================== *** testsuite/gcc.dg/vect/vect-multitypes-9.c (revision 0) --- testsuite/gcc.dg/vect/vect-multitypes-9.c (revision 0) *************** *** 0 **** --- 1,63 ---- + /* { dg-require-effective-target vect_int } */ + + #include + #include "tree-vect.h" + + #define N 64 + + unsigned char uX[N] __attribute__ ((__aligned__(16))); + unsigned short uresult[N]; + signed char X[N] __attribute__ ((__aligned__(16))); + short result[N]; + + /* Unsigned type promotion (qi->hi) */ + int + foo1(int len) { + int i; + + for (i=0; ihi) */ + int + foo2(int len) { + int i; + + for (i=0; i + #include "tree-vect.h" + + #define N 64 + + #define DOT2 43680 + + unsigned short X[N] __attribute__ ((__aligned__(16))); + unsigned short Y[N] __attribute__ ((__aligned__(16))); + + /* short->int->int dot product. + Currently not detected as a dot-product pattern: the multiplication + promotes the ushorts to int, and then the product is promoted to unsigned + int for the addition. Which results in an int->unsigned int cast, which + since no bits are modified in the cast should be trivially vectorizable. */ + unsigned int + foo2(int len) { + int i; + unsigned int result = 0; + + for (i=0; i - #include "tree-vect.h" - - #define N 16 - - int - main1 (void) - { - int i; - short sb[N] = {0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45}; - int ia[N]; - - /* Type cast. 
*/ - for (i = 0; i < N; i++) - { - ia[i] = (int) sb[i]; - } - - - /* Check results. */ - for (i = 0; i < N; i++) - { - if (ia[i] != (int) sb[i]) - abort(); - } - - return 0; - } - - int main (void) - { - check_vect (); - return main1 (); - } - - /* { dg-final { scan-tree-dump-times "vectorized 0 loops" 1 "vect" } } */ - /* { dg-final { cleanup-tree-dump "vect" } } */ - --- 0 ---- Index: testsuite/gcc.dg/vect/vect-reduc-dot-s8b.c =================================================================== *** testsuite/gcc.dg/vect/vect-reduc-dot-s8b.c (revision 0) --- testsuite/gcc.dg/vect/vect-reduc-dot-s8b.c (revision 0) *************** *** 0 **** --- 1,64 ---- + /* { dg-require-effective-target vect_int } */ + + #include + #include "tree-vect.h" + + #define N 64 + + #define DOT2 -21856 + + signed char X[N] __attribute__ ((__aligned__(16))); + signed char Y[N] __attribute__ ((__aligned__(16))); + + /* char->short->short dot product. + The dot-product pattern should be detected. + The reduction is currently not vectorized becaus of the signed->unsigned->signed + casts, since this patch: + + 2005-12-26 Kazu Hirata + + PR tree-optimization/25125 + + When the dot-product is detected, the loop should be vectorized on vect_sdot_qi + targets (targets that support dot-product of signed char). + This test would currently fail to vectorize on targets that support + dot-product of chars into an int accumulator. + Alternatively, the loop could also be vectorized as widening-mult + summation, + or with type-conversion support. + */ + short + foo2(int len) { + int i; + short result = 0; + + for (i=0; i + #include "tree-vect.h" + + #define N 64 + + #define DOT 43680 + + unsigned char X[N] __attribute__ ((__aligned__(16))); + unsigned char Y[N] __attribute__ ((__aligned__(16))); + + /* char->short->short dot product. + Detected as a dot-product pattern. 
+ Should be vectorized on targets that support dot-product for unsigned chars, + but currently this test cannot be vectorized as a dot-product on targets + that support char->short->int dot-product. + Alternatively, this test can be vectorized using vect_widen_mult_qi (or + vect_unpack and non-widening multplication: vect_unpack && vect_short_mult). + */ + unsigned short + foo (int len) { + int i; + unsigned short result = 0; + + for (i=0; i + #include "tree-vect.h" + + #define N 64 + + short X[N] __attribute__ ((__aligned__(16))); + short Y[N] __attribute__ ((__aligned__(16))); + int result[N]; + + /* short->int widening-mult */ + int + foo1(int len) { + int i; + + for (i=0; i + #include "tree-vect.h" + + #define N 64 + + unsigned short X[N] __attribute__ ((__aligned__(16))); + unsigned short Y[N] __attribute__ ((__aligned__(16))); + unsigned int result[N]; + + /* short->int widening-mult */ + int + foo1(int len) { + int i; + + /* Not vectorized because X[i] and Y[i] are casted to 'int' + so the widening multiplication pattern is not recognized. */ + for (i=0; i - #include "tree-vect.h" - - #define N 64 - - #define DOT1 43680 - #define DOT2 -21856 - #define DOT3 43680 - - signed char X[N] __attribute__ ((__aligned__(16))); - signed char Y[N] __attribute__ ((__aligned__(16))); - - /* char->short->int dot product. - The dot-product pattern should be detected. - Vectorizable on vect_sdot_qi targets (targets that support dot-product of - signed chars). - - In the future could also be vectorized as widening-mult + widening-summation, - or with type-conversion support. - */ - int - foo1(int len) { - int i; - int result = 0; - short prod; - - for (i=0; ishort->short dot product. - The dot-product pattern should be detected. - Should be vectorized on vect_sdot_qi targets (targets that support - dot-product of signed char). - This test currently fails to vectorize on targets that support - dot-product of chars when the accumulator is int. 
- - In the future could also be vectorized as widening-mult + summation, - or with type-conversion support. - */ - short - foo2(int len) { - int i; - short result = 0; - - for (i=0; iint->int dot product. - Not detected as a dot-product pattern. - Currently fails to be vectorized due to presence of type conversions. */ - int - foo3(int len) { - int i; - int result = 0; - - for (i=0; i + #include "tree-vect.h" + + #define N 32 + + int main1 () + { + int i; + int ia[N]; + int ib[N] = {0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45,0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45}; + short sa[N]; + short sb[N] = {0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45,0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45}; + char ca[N]; + char cb[N] = {0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45,0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45}; + + /* Multiple types with different sizes, used in independent + cmputations. Vectorizable. All accesses aligned. */ + for (i = 0; i < N; i++) + { + ia[i] = ib[i]; + sa[i] = sb[i]; + ca[i] = cb[i]; + } + + /* check results: */ + for (i = 0; i < N; i++) + { + if (ia[i] != ib[i] + || sa[i] != sb[i] + || ca[i] != cb[i]) + abort (); + } + + return 0; + } + + int main (void) + { + check_vect (); + + return main1 (); + } + + /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */ + /* { dg-final { cleanup-tree-dump "vect" } } */ + Index: testsuite/gcc.dg/vect/vect-widen-mult-u8.c =================================================================== *** testsuite/gcc.dg/vect/vect-widen-mult-u8.c (revision 0) --- testsuite/gcc.dg/vect/vect-widen-mult-u8.c (revision 0) *************** *** 0 **** --- 1,45 ---- + /* { dg-require-effective-target vect_int } */ + + #include + #include "tree-vect.h" + + #define N 64 + + unsigned char X[N] __attribute__ ((__aligned__(16))); + unsigned char Y[N] __attribute__ ((__aligned__(16))); + unsigned short result[N]; + + /* char->short widening-mult */ + int + foo1(int len) { + int i; + + for (i=0; i - #include 
"tree-vect.h" - - #define N 64 - - #define DOT1 43680 - #define DOT2 43680 - #define DOT3 43680 - - unsigned char X[N] __attribute__ ((__aligned__(16))); - unsigned char Y[N] __attribute__ ((__aligned__(16))); - - /* char->short->int dot product. - Detected as a dot-product pattern. - Should be vectorized on targets that support dot-product for unsigned chars. - */ - unsigned int - foo1(int len) { - int i; - unsigned int result = 0; - unsigned short prod; - - for (i=0; ishort->short dot product. - Detected as a dot-product pattern. - Should be vectorized on targets that support dot-product for unsigned chars. - This test currently fails to vectorize on targets that support dot-product - of chars only when the accumulator is int. - */ - unsigned short - foo2(int len) { - int i; - unsigned short result = 0; - - for (i=0; iint->int dot product. - Not detected as a dot-product. - Doesn't get vectorized due to presence of type converisons. */ - unsigned int - foo3(int len) { - int i; - unsigned int result = 0; - - for (i=0; i + #include "tree-vect.h" + + #define N 32 + + unsigned int ic[N] __attribute__ ((__aligned__(16))) = + {0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45,0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45}; + unsigned int ib[N] __attribute__ ((__aligned__(16))) = + {0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45,0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45}; + unsigned short sc[N] __attribute__ ((__aligned__(16))) = + {0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45,0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45}; + unsigned short sb[N] __attribute__ ((__aligned__(16))) = + {0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45,0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45}; + unsigned char cc[N] __attribute__ ((__aligned__(16))) = + {0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45,0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45}; + unsigned char cb[N] __attribute__ ((__aligned__(16))) = + {0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45,0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45}; + + int 
main1 (int n, + unsigned int * __restrict__ pic, unsigned int * __restrict__ pib, + unsigned short * __restrict__ psc, unsigned short * __restrict__ psb, + unsigned char * __restrict__ pcc, unsigned char * __restrict__ pcb) + { + int i; + unsigned int ia[N]; + unsigned short sa[N]; + unsigned char ca[N]; + + /* Multiple types with different sizes, used in independent + computations. Vectorizable. The loads are misaligned. */ + for (i = 0; i < n; i++) + { + ia[i] = pib[i] + pic[i]; + sa[i] = psb[i] + psc[i]; + ca[i] = pcb[i] + pcc[i]; + } + + /* check results: */ + for (i = 0; i < n; i++) + { + if (ia[i] != pib[i] + pic[i] + || sa[i] != psb[i] + psc[i] + || ca[i] != pcb[i] + pcc[i]) + abort (); + } + + return 0; + } + + int main (void) + { + check_vect (); + + main1 (N, ic, ib, sc, sb, cc, cb); + main1 (N-3, ic, ib, &sc[1], sb, cc, &cb[2]); + return 0; + } + + /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { xfail vect_no_align } } } */ + /* { dg-final { scan-tree-dump-times "Vectorizing an unaligned access" 6 "vect" {xfail vect_no_align } } } */ + /* { dg-final { cleanup-tree-dump "vect" } } */ + Index: testsuite/gcc.dg/vect/vect-reduc-dot-u16.c =================================================================== *** testsuite/gcc.dg/vect/vect-reduc-dot-u16.c (revision 115817) --- testsuite/gcc.dg/vect/vect-reduc-dot-u16.c (working copy) *************** *** 1,77 **** - /* { dg-require-effective-target vect_int } */ - - #include - #include "tree-vect.h" - - #define N 64 - - #define DOT1 43680 - #define DOT2 43680 - - unsigned short X[N] __attribute__ ((__aligned__(16))); - unsigned short Y[N] __attribute__ ((__aligned__(16))); - - /* short->short->int dot product. - Not detected as a dot-product pattern. - Not vectorized due to presence of type-conversions. */ - unsigned int - foo1(int len) { - int i; - unsigned int result = 0; - unsigned short prod; - - for (i=0; iint->int dot product. 
- Currently not detected as a dot-product pattern: the multiplication - promotes the ushorts to int, and then the product is promoted to unsigned - int for the addition. Which results in an int->unsigned int cast, which - since no bits are modified in the cast should be trivially vectorizable. */ - unsigned int - foo2(int len) { - int i; - unsigned int result = 0; - - for (i=0; i + #include "tree-vect.h" + + #define N 64 + + #define DOT3 43680 + + signed char X[N] __attribute__ ((__aligned__(16))); + signed char Y[N] __attribute__ ((__aligned__(16))); + + /* char->int->int dot product. + Not detected as a dot-product pattern. + Currently fails to be vectorized due to presence of type conversions. */ + int + foo3(int len) { + int i; + int result = 0; + + for (i=0; i + #include "tree-vect.h" + + #define N 64 + + #define DOT -21856 + + signed char X[N] __attribute__ ((__aligned__(16))); + signed char Y[N] __attribute__ ((__aligned__(16))); + + /* char->short->short dot product. + The dot-product pattern should be detected. + Should be vectorized on vect_sdot_qi targets (targets that support + dot-product of signed char). + This test currently fails to vectorize on targets that support + dot-product of chars into and int accumulator. + Can also be vectorized as widening-mult + summation, + or with type-conversion support. + */ + short + foo(int len) { + int i; + short result = 0; + + for (i=0; i + #include "tree-vect.h" + + #define N 64 + + signed char X[N] __attribute__ ((__aligned__(16))); + signed char Y[N] __attribute__ ((__aligned__(16))); + short result[N]; + + /* char->short widening-mult */ + int + foo1(int len) { + int i; + + for (i=0; i - #include "tree-vect.h" - - #define N 64 - - #define DOT1 43680 - #define DOT2 -21856 - #define DOT3 43680 - - signed char X[N] __attribute__ ((__aligned__(16))); - signed char Y[N] __attribute__ ((__aligned__(16))); - - /* char->short->int dot product. - The dot-product pattern should be detected. 
- Vectorizable on vect_sdot_qi targets (targets that support dot-product of - signed chars). - - In the future could also be vectorized as widening-mult + widening-summation, - or with type-conversion support. - */ - int - foo1(int len) { - int i; - int result = 0; - short prod; - - for (i=0; ishort->short dot product. - The dot-product pattern should be detected. - The reduction is currently not vectorized becaus of the signed->unsigned->signed - casts, since this patch: - - 2005-12-26 Kazu Hirata - - PR tree-optimization/25125 - - When the dot-product is detected, the loop should be vectorized on vect_sdot_qi - targets (targets that support dot-product of signed char). - This test would currently fail to vectorize on targets that support - dot-product of chars when the accumulator is int. - - In the future could also be vectorized as widening-mult + summation, - or with type-conversion support. - */ - short - foo2(int len) { - int i; - short result = 0; - - for (i=0; iint->int dot product. - Not detected as a dot-product pattern. - Currently fails to be vectorized due to presence of type conversions. */ - int - foo3(int len) { - int i; - int result = 0; - - for (i=0; i + #include "tree-vect.h" + + #define N 32 + + int ib[N] __attribute__ ((__aligned__(16))) = + {0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45,0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45}; + short sb[N] __attribute__ ((__aligned__(16))) = + {0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45,0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45}; + char cb[N] __attribute__ ((__aligned__(16))) = + {0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45,0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45}; + + int main1 (int n, int * __restrict__ pib, + short * __restrict__ psb, + char * __restrict__ pcb) + { + int i; + int ia[N]; + short sa[N]; + char ca[N]; + + /* Multiple types with different sizes, used in independent + computations. Vectorizable. The loads are misaligned. 
*/ + for (i = 0; i < n; i++) + { + ia[i] = pib[i]; + sa[i] = psb[i]; + ca[i] = pcb[i]; + } + + /* check results: */ + for (i = 0; i < n; i++) + { + if (ia[i] != pib[i] + || sa[i] != psb[i] + || ca[i] != pcb[i]) + abort (); + } + + return 0; + } + + int main (void) + { + check_vect (); + + main1 (N, ib, sb, cb); + main1 (N-3, ib, sb, &cb[2]); + return 0; + } + + /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { xfail vect_no_align } } } */ + /* { dg-final { scan-tree-dump-times "Vectorizing an unaligned access" 3 "vect" {xfail vect_no_align } } } */ + /* { dg-final { cleanup-tree-dump "vect" } } */ + Index: testsuite/gcc.dg/vect/vect-multitypes-7.c =================================================================== *** testsuite/gcc.dg/vect/vect-multitypes-7.c (revision 0) --- testsuite/gcc.dg/vect/vect-multitypes-7.c (revision 0) *************** *** 0 **** --- 1,51 ---- + /* { dg-require-effective-target vect_int } */ + + #include + #include "tree-vect.h" + #include + + #define N 64 + + #define DOT1 43680 + #define DOT2 -20832 + + signed short X[N] __attribute__ ((__aligned__(16))); + signed short Y[N] __attribute__ ((__aligned__(16))); + unsigned char CX[N] __attribute__ ((__aligned__(16))); + + void + foo1(int len) { + int i; + int result1 = 0; + short prod; + + for (i=0; i + #include "tree-vect.h" + + #define N 32 + + unsigned short sa[N]; + unsigned short sc[N] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15, + 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31}; + unsigned short sb[N] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15, + 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31}; + unsigned int ia[N]; + unsigned int ic[N] = {0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45, + 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15}; + unsigned int ib[N] = {0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45, + 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15}; + + /* Current peeling-for-alignment scheme will consider the 'sa[i+7]' + access for peeling, and therefore will examine the option of + 
using a peeling factor = VF-7%VF. This will result in a peeling factor 1, + which will also align the access to 'ia[i+3]', and the loop could be + vectorized on all targets that support unaligned loads. + */ + + int main1 (int n) + { + int i; + + /* Multiple types with different sizes, used in independent + computations. Vectorizable. */ + for (i = 0; i < n; i++) + { + sa[i+7] = sb[i] + sc[i]; + ia[i+3] = ib[i] + ic[i]; + } + + /* check results: */ + for (i = 0; i < n; i++) + { + if (sa[i+7] != sb[i] + sc[i] || ia[i+3] != ib[i] + ic[i]) + abort (); + } + + return 0; + } + + /* Current peeling-for-alignment scheme will consider the 'ia[i+3]' + access for peeling, and therefore will examine the option of + using a peeling factor = VF-3%VF. This will result in a peeling factor + 5 if VF=8, or 1 if VF=4,2. In either case, this will also align the access + to 'sa[i+3]', and the loop could be vectorized on targets that support + unaligned loads. */ + + int main2 (int n) + { + int i; + + /* Multiple types with different sizes, used in independent + computations. Vectorizable. 
*/ + for (i = 0; i < n; i++) + { + ia[i+3] = ib[i] + ic[i]; + sa[i+3] = sb[i] + sc[i]; + } + + /* check results: */ + for (i = 0; i < n; i++) + { + if (sa[i+3] != sb[i] + sc[i] || ia[i+3] != ib[i] + ic[i]) + abort (); + } + + return 0; + } + + int main (void) + { + check_vect (); + + main1 (N-7); + main2 (N-3); + + return 0; + } + + /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 2 "vect" { xfail vect_no_align } } } */ + /* { dg-final { scan-tree-dump-times "Alignment of access forced using peeling" 2 "vect" { xfail vect_no_align } } } */ + /* { dg-final { scan-tree-dump-times "Vectorizing an unaligned access" 8 "vect" { xfail vect_no_align } } } */ + /* { dg-final { cleanup-tree-dump "vect" } } */ + Index: testsuite/gcc.dg/vect/vect-multitypes-8.c =================================================================== *** testsuite/gcc.dg/vect/vect-multitypes-8.c (revision 0) --- testsuite/gcc.dg/vect/vect-multitypes-8.c (revision 0) *************** *** 0 **** --- 1,50 ---- + /* { dg-require-effective-target vect_int } */ + + #include + #include "tree-vect.h" + + #define N 64 + + unsigned char uX[N] __attribute__ ((__aligned__(16))); + unsigned char uresultX[N]; + unsigned int uY[N] __attribute__ ((__aligned__(16))); + unsigned short uresultY[N]; + + /* Unsigned type demotion (si->hi) */ + + int + foo1(int len) { + int i; + + for (i=0; i + #include "tree-vect.h" + + #define N 64 + + unsigned char uX[N] __attribute__ ((__aligned__(16))); + unsigned short uY[N] __attribute__ ((__aligned__(16))); + unsigned int uresult[N]; + signed char X[N] __attribute__ ((__aligned__(16))); + signed short Y[N] __attribute__ ((__aligned__(16))); + int result[N]; + + /* Unsigned type promotion (hi->si) */ + int + foo1(int len) { + int i; + + for (i=0; isi) */ + int + foo2(int len) { + int i; + + for (i=0; ihandlers[(int) vec_mode].insn_code) == CODE_FOR_nothing + || insn_data[icode1].operand[0].mode != TYPE_MODE (wide_vectype) + || (icode2 = optab2->handlers[(int) 
vec_mode].insn_code) + == CODE_FOR_nothing + || insn_data[icode2].operand[0].mode != TYPE_MODE (wide_vectype)) + return false; + + return true; + } + + /* Function reduction_code_for_scalar_code Input: Index: tree-vectorizer.h =================================================================== *** tree-vectorizer.h (revision 115817) --- tree-vectorizer.h (working copy) *************** enum stmt_vec_info_type { *** 165,171 **** op_vec_info_type, assignment_vec_info_type, condition_vec_info_type, ! reduc_vec_info_type }; typedef struct data_reference *dr_p; --- 165,179 ---- op_vec_info_type, assignment_vec_info_type, condition_vec_info_type, ! reduc_vec_info_type, ! type_promotion_vec_info_type, ! type_demotion_vec_info_type ! }; ! ! enum vect_relevant { ! vect_unused_in_loop = 0, ! vect_used_by_reduction, ! vect_used_in_loop }; typedef struct data_reference *dr_p; *************** typedef struct _stmt_vec_info { *** 185,191 **** /* Not all stmts in the loop need to be vectorized. e.g, the incrementation of the loop induction variable and computation of array indexes. relevant indicates whether the stmt needs to be vectorized. */ ! bool relevant; /* Indicates whether this stmts is part of a computation whose result is used outside the loop. */ --- 193,199 ---- /* Not all stmts in the loop need to be vectorized. e.g, the incrementation of the loop induction variable and computation of array indexes. relevant indicates whether the stmt needs to be vectorized. */ ! enum vect_relevant relevant; /* Indicates whether this stmts is part of a computation whose result is used outside the loop. */ *************** typedef struct _stmt_vec_info { *** 232,238 **** #define STMT_VINFO_TYPE(S) (S)->type #define STMT_VINFO_STMT(S) (S)->stmt #define STMT_VINFO_LOOP_VINFO(S) (S)->loop_vinfo ! 
#define STMT_VINFO_RELEVANT_P(S) (S)->relevant #define STMT_VINFO_LIVE_P(S) (S)->live #define STMT_VINFO_VECTYPE(S) (S)->vectype #define STMT_VINFO_VEC_STMT(S) (S)->vectorized_stmt --- 240,246 ---- #define STMT_VINFO_TYPE(S) (S)->type #define STMT_VINFO_STMT(S) (S)->stmt #define STMT_VINFO_LOOP_VINFO(S) (S)->loop_vinfo ! #define STMT_VINFO_RELEVANT(S) (S)->relevant #define STMT_VINFO_LIVE_P(S) (S)->live #define STMT_VINFO_VECTYPE(S) (S)->vectype #define STMT_VINFO_VEC_STMT(S) (S)->vectorized_stmt *************** typedef struct _stmt_vec_info { *** 242,247 **** --- 250,257 ---- #define STMT_VINFO_SAME_ALIGN_REFS(S) (S)->same_align_refs #define STMT_VINFO_DEF_TYPE(S) (S)->def_type + #define STMT_VINFO_RELEVANT_P(S) ((S)->relevant != vect_unused_in_loop) + static inline void set_stmt_info (tree_ann_t ann, stmt_vec_info stmt_info); static inline stmt_vec_info vinfo_for_stmt (tree stmt); *************** extern bool vect_can_force_dr_alignment_ *** 328,333 **** --- 338,345 ---- extern enum dr_alignment_support vect_supportable_dr_alignment (struct data_reference *); extern bool reduction_code_for_scalar_code (enum tree_code, enum tree_code *); + extern bool supportable_widening_operation (enum tree_code, tree, tree, + tree *, tree *, enum tree_code *, enum tree_code *); /* Creation and deletion of loop and stmt info structs. 
*/ extern loop_vec_info new_loop_vec_info (struct loop *loop); extern void destroy_loop_vec_info (loop_vec_info); *************** void vect_pattern_recog (loop_vec_info); *** 354,359 **** --- 366,373 ---- extern bool vectorizable_load (tree, block_stmt_iterator *, tree *); extern bool vectorizable_store (tree, block_stmt_iterator *, tree *); extern bool vectorizable_operation (tree, block_stmt_iterator *, tree *); + extern bool vectorizable_type_promotion (tree, block_stmt_iterator *, tree *); + extern bool vectorizable_type_demotion (tree, block_stmt_iterator *, tree *); extern bool vectorizable_assignment (tree, block_stmt_iterator *, tree *); extern bool vectorizable_condition (tree, block_stmt_iterator *, tree *); extern bool vectorizable_live_operation (tree, block_stmt_iterator *, tree *); Index: tree-vect-analyze.c =================================================================== *** tree-vect-analyze.c (revision 115817) --- tree-vect-analyze.c (working copy) *************** static bool vect_determine_vectorization *** 54,61 **** /* Utility functions for the analyses. */ static bool exist_non_indexing_operands_for_use_p (tree, tree); ! static void vect_mark_relevant (VEC(tree,heap) **, tree, bool, bool); ! static bool vect_stmt_relevant_p (tree, loop_vec_info, bool *, bool *); static tree vect_get_loop_niters (struct loop *, tree *); static bool vect_analyze_data_ref_dependence (struct data_dependence_relation *, loop_vec_info); --- 54,63 ---- /* Utility functions for the analyses. */ static bool exist_non_indexing_operands_for_use_p (tree, tree); ! static void vect_mark_relevant ! (VEC(tree,heap) **, tree, enum vect_relevant, bool); ! static bool vect_stmt_relevant_p ! 
(tree, loop_vec_info, enum vect_relevant *, bool *); static tree vect_get_loop_niters (struct loop *, tree *); static bool vect_analyze_data_ref_dependence (struct data_dependence_relation *, loop_vec_info); *************** vect_determine_vectorization_factor (loo *** 187,208 **** if (vect_print_dump_info (REPORT_DETAILS)) fprintf (vect_dump, "nunits = %d", nunits); ! if (vectorization_factor) ! { ! /* FORNOW: don't allow mixed units. ! This restriction will be relaxed in the future. */ ! if (nunits != vectorization_factor) ! { ! if (vect_print_dump_info (REPORT_UNVECTORIZED_LOOPS)) ! fprintf (vect_dump, "not vectorized: mixed data-types"); ! return false; ! } ! } ! else vectorization_factor = nunits; - - gcc_assert (GET_MODE_SIZE (TYPE_MODE (scalar_type)) - * vectorization_factor == UNITS_PER_SIMD_WORD); } } --- 189,197 ---- if (vect_print_dump_info (REPORT_DETAILS)) fprintf (vect_dump, "nunits = %d", nunits); ! if (!vectorization_factor ! || (nunits > vectorization_factor)) vectorization_factor = nunits; } } *************** vect_analyze_operations (loop_vec_info l *** 310,316 **** gcc_assert (!VECTOR_MODE_P (TYPE_MODE (TREE_TYPE (stmt)))); gcc_assert (STMT_VINFO_VECTYPE (stmt_info)); ! ok = (vectorizable_operation (stmt, NULL, NULL) || vectorizable_assignment (stmt, NULL, NULL) || vectorizable_load (stmt, NULL, NULL) || vectorizable_store (stmt, NULL, NULL) --- 299,307 ---- gcc_assert (!VECTOR_MODE_P (TYPE_MODE (TREE_TYPE (stmt)))); gcc_assert (STMT_VINFO_VECTYPE (stmt_info)); ! ok = (vectorizable_type_promotion (stmt, NULL, NULL) ! || vectorizable_type_demotion (stmt, NULL, NULL) ! 
|| vectorizable_operation (stmt, NULL, NULL) || vectorizable_assignment (stmt, NULL, NULL) || vectorizable_load (stmt, NULL, NULL) || vectorizable_store (stmt, NULL, NULL) *************** vect_analyze_data_ref_dependence (struct *** 588,593 **** --- 579,586 ---- struct data_reference *drb = DDR_B (ddr); stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra)); stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb)); + int dra_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dra)))); + int drb_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (drb)))); lambda_vector dist_v; unsigned int loop_depth; *************** vect_analyze_data_ref_dependence (struct *** 628,634 **** fprintf (vect_dump, "dependence distance = %d.", dist); /* Same loop iteration. */ ! if (dist % vectorization_factor == 0) { /* Two references with distance zero have the same alignment. */ VEC_safe_push (dr_p, heap, STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a), drb); --- 621,627 ---- fprintf (vect_dump, "dependence distance = %d.", dist); /* Same loop iteration. */ ! if (dist % vectorization_factor == 0 && dra_size == drb_size) { /* Two references with distance zero have the same alignment. */ VEC_safe_push (dr_p, heap, STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a), drb); *************** vect_update_misalignment_for_peel (struc *** 834,845 **** struct data_reference *dr_peel, int npeel) { unsigned int i; - int drsize; VEC(dr_p,heap) *same_align_drs; struct data_reference *current_dr; if (known_alignment_for_access_p (dr) ! && DR_MISALIGNMENT (dr) == DR_MISALIGNMENT (dr_peel)) { DR_MISALIGNMENT (dr) = 0; return; --- 827,841 ---- struct data_reference *dr_peel, int npeel) { unsigned int i; VEC(dr_p,heap) *same_align_drs; struct data_reference *current_dr; + int dr_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr)))); + int dr_peel_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr_peel)))); if (known_alignment_for_access_p (dr) ! && known_alignment_for_access_p (dr_peel) ! 
&& (DR_MISALIGNMENT (dr)/dr_size == ! DR_MISALIGNMENT (dr_peel)/dr_peel_size)) { DR_MISALIGNMENT (dr) = 0; return; *************** vect_update_misalignment_for_peel (struc *** 853,859 **** { if (current_dr != dr) continue; ! gcc_assert (DR_MISALIGNMENT (dr) == DR_MISALIGNMENT (dr_peel)); DR_MISALIGNMENT (dr) = 0; return; } --- 849,856 ---- { if (current_dr != dr) continue; ! gcc_assert (DR_MISALIGNMENT (dr)/dr_size == ! DR_MISALIGNMENT (dr_peel)/dr_peel_size); DR_MISALIGNMENT (dr) = 0; return; } *************** vect_update_misalignment_for_peel (struc *** 861,872 **** if (known_alignment_for_access_p (dr) && known_alignment_for_access_p (dr_peel)) { ! drsize = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr)))); ! DR_MISALIGNMENT (dr) += npeel * drsize; DR_MISALIGNMENT (dr) %= UNITS_PER_SIMD_WORD; return; } DR_MISALIGNMENT (dr) = -1; } --- 858,870 ---- if (known_alignment_for_access_p (dr) && known_alignment_for_access_p (dr_peel)) { ! DR_MISALIGNMENT (dr) += npeel * dr_size; DR_MISALIGNMENT (dr) %= UNITS_PER_SIMD_WORD; return; } + if (vect_print_dump_info (REPORT_DETAILS)) + fprintf (vect_dump, "Setting misalignment to -1."); DR_MISALIGNMENT (dr) = -1; } *************** vect_enhance_data_refs_alignment (loop_v *** 1011,1016 **** --- 1009,1017 ---- bool do_versioning = false; bool stat; + if (vect_print_dump_info (REPORT_DETAILS)) + fprintf (vect_dump, "=== vect_enhance_data_refs_alignment ==="); + /* While cost model enhancements are expected in the future, the high level view of the code at this time is as follows: *************** vect_enhance_data_refs_alignment (loop_v *** 1077,1082 **** --- 1078,1085 ---- mis = DR_MISALIGNMENT (dr0); mis /= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr0)))); npeel = LOOP_VINFO_VECT_FACTOR (loop_vinfo) - mis; + if (vect_print_dump_info (REPORT_DETAILS)) + fprintf (vect_dump, "Try peeling by %d",npeel); } /* Ensure that all data refs can be vectorized after the peel. 
*/ *************** vect_analyze_data_refs (loop_vec_info lo *** 1420,1433 **** static void vect_mark_relevant (VEC(tree,heap) **worklist, tree stmt, ! bool relevant_p, bool live_p) { stmt_vec_info stmt_info = vinfo_for_stmt (stmt); ! bool save_relevant_p = STMT_VINFO_RELEVANT_P (stmt_info); bool save_live_p = STMT_VINFO_LIVE_P (stmt_info); if (vect_print_dump_info (REPORT_DETAILS)) ! fprintf (vect_dump, "mark relevant %d, live %d.",relevant_p, live_p); if (STMT_VINFO_IN_PATTERN_P (stmt_info)) { --- 1423,1436 ---- static void vect_mark_relevant (VEC(tree,heap) **worklist, tree stmt, ! enum vect_relevant relevant, bool live_p) { stmt_vec_info stmt_info = vinfo_for_stmt (stmt); ! enum vect_relevant save_relevant = STMT_VINFO_RELEVANT (stmt_info); bool save_live_p = STMT_VINFO_LIVE_P (stmt_info); if (vect_print_dump_info (REPORT_DETAILS)) ! fprintf (vect_dump, "mark relevant %d, live %d.",relevant, live_p); if (STMT_VINFO_IN_PATTERN_P (stmt_info)) { *************** vect_mark_relevant (VEC(tree,heap) **wor *** 1442,1461 **** pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info); stmt_info = vinfo_for_stmt (pattern_stmt); gcc_assert (STMT_VINFO_RELATED_STMT (stmt_info) == stmt); ! save_relevant_p = STMT_VINFO_RELEVANT_P (stmt_info); save_live_p = STMT_VINFO_LIVE_P (stmt_info); stmt = pattern_stmt; } STMT_VINFO_LIVE_P (stmt_info) |= live_p; ! STMT_VINFO_RELEVANT_P (stmt_info) |= relevant_p; if (TREE_CODE (stmt) == PHI_NODE) /* Don't put phi-nodes in the worklist. Phis that are marked relevant or live will fail vectorization later on. */ return; ! if (STMT_VINFO_RELEVANT_P (stmt_info) == save_relevant_p && STMT_VINFO_LIVE_P (stmt_info) == save_live_p) { if (vect_print_dump_info (REPORT_DETAILS)) --- 1445,1465 ---- pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info); stmt_info = vinfo_for_stmt (pattern_stmt); gcc_assert (STMT_VINFO_RELATED_STMT (stmt_info) == stmt); ! 
save_relevant = STMT_VINFO_RELEVANT (stmt_info); save_live_p = STMT_VINFO_LIVE_P (stmt_info); stmt = pattern_stmt; } STMT_VINFO_LIVE_P (stmt_info) |= live_p; ! if (relevant > STMT_VINFO_RELEVANT (stmt_info)) ! STMT_VINFO_RELEVANT (stmt_info) = relevant; if (TREE_CODE (stmt) == PHI_NODE) /* Don't put phi-nodes in the worklist. Phis that are marked relevant or live will fail vectorization later on. */ return; ! if (STMT_VINFO_RELEVANT (stmt_info) == save_relevant && STMT_VINFO_LIVE_P (stmt_info) == save_live_p) { if (vect_print_dump_info (REPORT_DETAILS)) *************** vect_mark_relevant (VEC(tree,heap) **wor *** 1481,1487 **** static bool vect_stmt_relevant_p (tree stmt, loop_vec_info loop_vinfo, ! bool *relevant_p, bool *live_p) { struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); ssa_op_iter op_iter; --- 1485,1491 ---- static bool vect_stmt_relevant_p (tree stmt, loop_vec_info loop_vinfo, ! enum vect_relevant *relevant, bool *live_p) { struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); ssa_op_iter op_iter; *************** vect_stmt_relevant_p (tree stmt, loop_ve *** 1489,1500 **** use_operand_p use_p; def_operand_p def_p; ! *relevant_p = false; *live_p = false; /* cond stmt other than loop exit cond. */ if (is_ctrl_stmt (stmt) && (stmt != LOOP_VINFO_EXIT_COND (loop_vinfo))) ! *relevant_p = true; /* changing memory. */ if (TREE_CODE (stmt) != PHI_NODE) --- 1493,1504 ---- use_operand_p use_p; def_operand_p def_p; ! *relevant = vect_unused_in_loop; *live_p = false; /* cond stmt other than loop exit cond. */ if (is_ctrl_stmt (stmt) && (stmt != LOOP_VINFO_EXIT_COND (loop_vinfo))) ! *relevant = vect_used_in_loop; /* changing memory. */ if (TREE_CODE (stmt) != PHI_NODE) *************** vect_stmt_relevant_p (tree stmt, loop_ve *** 1502,1508 **** { if (vect_print_dump_info (REPORT_DETAILS)) fprintf (vect_dump, "vec_stmt_relevant_p: stmt has vdefs."); ! *relevant_p = true; } /* uses outside the loop. 
*/ --- 1506,1512 ---- { if (vect_print_dump_info (REPORT_DETAILS)) fprintf (vect_dump, "vec_stmt_relevant_p: stmt has vdefs."); ! *relevant = vect_used_in_loop; } /* uses outside the loop. */ *************** vect_stmt_relevant_p (tree stmt, loop_ve *** 1526,1532 **** } } ! return (*live_p || *relevant_p); } --- 1530,1536 ---- } } ! return (*live_p || *relevant); } *************** vect_mark_stmts_to_be_vectorized (loop_v *** 1561,1567 **** stmt_vec_info stmt_vinfo; basic_block bb; tree phi; ! bool relevant_p, live_p; tree def, def_stmt; enum vect_def_type dt; --- 1565,1572 ---- stmt_vec_info stmt_vinfo; basic_block bb; tree phi; ! bool live_p; ! enum vect_relevant relevant; tree def, def_stmt; enum vect_def_type dt; *************** vect_mark_stmts_to_be_vectorized (loop_v *** 1581,1588 **** print_generic_expr (vect_dump, phi, TDF_SLIM); } ! if (vect_stmt_relevant_p (phi, loop_vinfo, &relevant_p, &live_p)) ! vect_mark_relevant (&worklist, phi, relevant_p, live_p); } for (i = 0; i < nbbs; i++) --- 1586,1593 ---- print_generic_expr (vect_dump, phi, TDF_SLIM); } ! if (vect_stmt_relevant_p (phi, loop_vinfo, &relevant, &live_p)) ! vect_mark_relevant (&worklist, phi, relevant, live_p); } for (i = 0; i < nbbs; i++) *************** vect_mark_stmts_to_be_vectorized (loop_v *** 1598,1605 **** print_generic_expr (vect_dump, stmt, TDF_SLIM); } ! if (vect_stmt_relevant_p (stmt, loop_vinfo, &relevant_p, &live_p)) ! vect_mark_relevant (&worklist, stmt, relevant_p, live_p); } } --- 1603,1610 ---- print_generic_expr (vect_dump, stmt, TDF_SLIM); } ! if (vect_stmt_relevant_p (stmt, loop_vinfo, &relevant, &live_p)) ! vect_mark_relevant (&worklist, stmt, relevant, live_p); } } *************** vect_mark_stmts_to_be_vectorized (loop_v *** 1616,1622 **** print_generic_expr (vect_dump, stmt, TDF_SLIM); } ! /* Examine the USEs of STMT. 
For each ssa-name USE thta is defined in the loop, mark the stmt that defines it (DEF_STMT) as relevant/irrelevant and live/dead according to the liveness and relevance properties of STMT. --- 1621,1627 ---- print_generic_expr (vect_dump, stmt, TDF_SLIM); } ! /* Examine the USEs of STMT. For each ssa-name USE that is defined in the loop, mark the stmt that defines it (DEF_STMT) as relevant/irrelevant and live/dead according to the liveness and relevance properties of STMT. *************** vect_mark_stmts_to_be_vectorized (loop_v *** 1627,1639 **** ann = stmt_ann (stmt); stmt_vinfo = vinfo_for_stmt (stmt); ! relevant_p = STMT_VINFO_RELEVANT_P (stmt_vinfo); live_p = STMT_VINFO_LIVE_P (stmt_vinfo); /* Generally, the liveness and relevance properties of STMT are propagated to the DEF_STMTs of its USEs: STMT_VINFO_LIVE_P (DEF_STMT_info) <-- live_p ! STMT_VINFO_RELEVANT_P (DEF_STMT_info) <-- relevant_p Exceptions: --- 1632,1644 ---- ann = stmt_ann (stmt); stmt_vinfo = vinfo_for_stmt (stmt); ! relevant = STMT_VINFO_RELEVANT (stmt_vinfo); live_p = STMT_VINFO_LIVE_P (stmt_vinfo); /* Generally, the liveness and relevance properties of STMT are propagated to the DEF_STMTs of its USEs: STMT_VINFO_LIVE_P (DEF_STMT_info) <-- live_p ! STMT_VINFO_RELEVANT (DEF_STMT_info) <-- relevant Exceptions: *************** vect_mark_stmts_to_be_vectorized (loop_v *** 1656,1673 **** the def_stmt of these uses we want to set liveness/relevance as follows: STMT_VINFO_LIVE_P (DEF_STMT_info) <-- false ! STMT_VINFO_RELEVANT_P (DEF_STMT_info) <-- true because even though STMT is classified as live (since it defines a value that is used across loop iterations) and irrelevant (since it is not used inside the loop), it will be vectorized, and therefore the corresponding DEF_STMTs need to marked as relevant. */ /* case 2.2: */ if (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def) { ! gcc_assert (!relevant_p && live_p); ! 
relevant_p = true; live_p = false; } --- 1661,1682 ---- the def_stmt of these uses we want to set liveness/relevance as follows: STMT_VINFO_LIVE_P (DEF_STMT_info) <-- false ! STMT_VINFO_RELEVANT (DEF_STMT_info) <-- vect_used_by_reduction because even though STMT is classified as live (since it defines a value that is used across loop iterations) and irrelevant (since it is not used inside the loop), it will be vectorized, and therefore the corresponding DEF_STMTs need to marked as relevant. + We distinguish between two kinds of relevant stmts - those that are + used by a reduction conputation, and those that are (also) used by a regular computation. This allows us later on to identify stmts + that are used solely by a reduction, and therefore the order of + the results that they produce does not have to be kept. */ /* case 2.2: */ if (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def) { ! gcc_assert (relevant == vect_unused_in_loop && live_p); ! relevant = vect_used_by_reduction; live_p = false; } *************** vect_mark_stmts_to_be_vectorized (loop_v *** 1707,1713 **** && TREE_CODE (def_stmt) == PHI_NODE) continue; ! vect_mark_relevant (&worklist, def_stmt, relevant_p, live_p); } } /* while worklist */ --- 1716,1722 ---- && TREE_CODE (def_stmt) == PHI_NODE) continue; ! vect_mark_relevant (&worklist, def_stmt, relevant, live_p); } } /* while worklist */ *************** vect_can_advance_ivs_p (loop_vec_info lo *** 1735,1741 **** /* Analyze phi functions of the loop header. */ if (vect_print_dump_info (REPORT_DETAILS)) ! fprintf (vect_dump, "=== vect_can_advance_ivs_p ==="); for (phi = phi_nodes (bb); phi; phi = PHI_CHAIN (phi)) { --- 1744,1750 ---- /* Analyze phi functions of the loop header. */ if (vect_print_dump_info (REPORT_DETAILS)) ! 
fprintf (vect_dump, "vect_can_advance_ivs_p:"); for (phi = phi_nodes (bb); phi; phi = PHI_CHAIN (phi)) { Index: tree.def =================================================================== *** tree.def (revision 115817) --- tree.def (working copy) *************** DEFTREECODE (WIDEN_MULT_EXPR, "widen_mul *** 1073,1078 **** --- 1073,1100 ---- DEFTREECODE (VEC_LSHIFT_EXPR, "vec_lshift_expr", tcc_binary, 2) DEFTREECODE (VEC_RSHIFT_EXPR, "vec_rshift_expr", tcc_binary, 2) + /* Widening vector multiplication. + The two operands are vectors with N elements of size S. Multiplying the + elements of the two vectors will result in N products of size 2*S. + VEC_WIDEN_MULT_HI_EXPR computes the N/2 high products. + VEC_WIDEN_MULT_LO_EXPR computes the N/2 low products. */ + DEFTREECODE (VEC_WIDEN_MULT_HI_EXPR, "widen_mult_hi_expr", tcc_binary, 2) + DEFTREECODE (VEC_WIDEN_MULT_LO_EXPR, "widen_mult_lo_expr", tcc_binary, 2) + + /* Unpack (extract and promote/widen) the high/low elements of the input vector + into the output vector. The input vector has twice as many elements + as the output vector, that are half the size of the elements + of the output vector. This is used to support type promotion. */ + DEFTREECODE (VEC_UNPACK_HI_EXPR, "vec_unpack_hi_expr", tcc_unary, 1) + DEFTREECODE (VEC_UNPACK_LO_EXPR, "vec_unpack_lo_expr", tcc_unary, 1) + + /* Pack (demote/narrow and merge) the elements of the two input vectors + into the output vector, using modulo/saturating arithmetic. + The elements of the input vectors are twice the size of the elements of the + output vector. This is used to support type demotion. 
*/ + DEFTREECODE (VEC_PACK_MOD_EXPR, "vec_pack_mod_expr", tcc_binary, 2) + DEFTREECODE (VEC_PACK_SAT_EXPR, "vec_pack_sat_expr", tcc_binary, 2) + /* Local variables: mode:c Index: tree-vect-patterns.c =================================================================== *** tree-vect-patterns.c (revision 115817) --- tree-vect-patterns.c (working copy) *************** vect_recog_dot_prod_pattern (tree last_s *** 334,345 **** */ static tree ! vect_recog_widen_mult_pattern (tree last_stmt ATTRIBUTE_UNUSED, ! tree *type_in ATTRIBUTE_UNUSED, ! tree *type_out ATTRIBUTE_UNUSED) { ! /* Yet to be implemented. */ ! return NULL; } --- 334,402 ---- */ static tree ! vect_recog_widen_mult_pattern (tree last_stmt, ! tree *type_in, ! tree *type_out) { ! tree expr; ! tree def_stmt0, def_stmt1; ! tree oprnd0, oprnd1; ! tree type, half_type0, half_type1; ! tree pattern_expr; ! tree vectype; ! tree dummy; ! enum tree_code dummy_code; ! ! if (TREE_CODE (last_stmt) != MODIFY_EXPR) ! return NULL; ! ! expr = TREE_OPERAND (last_stmt, 1); ! type = TREE_TYPE (expr); ! ! /* Starting from LAST_STMT, follow the defs of its uses in search ! of the above pattern. */ ! ! if (TREE_CODE (expr) != MULT_EXPR) ! return NULL; ! ! oprnd0 = TREE_OPERAND (expr, 0); ! oprnd1 = TREE_OPERAND (expr, 1); ! if (TYPE_MAIN_VARIANT (TREE_TYPE (oprnd0)) != TYPE_MAIN_VARIANT (type) ! || TYPE_MAIN_VARIANT (TREE_TYPE (oprnd1)) != TYPE_MAIN_VARIANT (type)) ! return NULL; ! ! /* Check argument 0 */ ! if (!widened_name_p (oprnd0, last_stmt, &half_type0, &def_stmt0)) ! return NULL; ! oprnd0 = TREE_OPERAND (TREE_OPERAND (def_stmt0, 1), 0); ! ! /* Check argument 1 */ ! if (!widened_name_p (oprnd1, last_stmt, &half_type1, &def_stmt1)) ! return NULL; ! oprnd1 = TREE_OPERAND (TREE_OPERAND (def_stmt1, 1), 0); ! ! if (TYPE_MAIN_VARIANT (half_type0) != TYPE_MAIN_VARIANT (half_type1)) ! return NULL; ! ! /* Pattern detected. */ ! if (vect_print_dump_info (REPORT_DETAILS)) ! 
fprintf (vect_dump, "vect_recog_widen_mult_pattern: detected: "); ! ! /* Check target support */ ! vectype = get_vectype_for_scalar_type (half_type0); ! if (!supportable_widening_operation (WIDEN_MULT_EXPR, last_stmt, vectype, ! &dummy, &dummy, &dummy_code, ! &dummy_code)) ! return NULL; ! ! *type_in = vectype; ! *type_out = NULL_TREE; ! ! /* Pattern supported. Create a stmt to be used to replace the pattern: */ ! pattern_expr = build2 (WIDEN_MULT_EXPR, type, oprnd0, oprnd1); ! if (vect_print_dump_info (REPORT_DETAILS)) ! print_generic_expr (vect_dump, pattern_expr, TDF_SLIM); ! return pattern_expr; } Index: target-def.h =================================================================== *** target-def.h (revision 115817) --- target-def.h (working copy) *************** Foundation, 51 Franklin Street, Fifth Fl *** 326,334 **** TARGET_SCHED_SET_SCHED_FLAGS} #define TARGET_VECTORIZE_BUILTIN_MASK_FOR_LOAD 0 #define TARGET_VECTORIZE \ ! {TARGET_VECTORIZE_BUILTIN_MASK_FOR_LOAD} #define TARGET_DEFAULT_TARGET_FLAGS 0 --- 326,338 ---- TARGET_SCHED_SET_SCHED_FLAGS} #define TARGET_VECTORIZE_BUILTIN_MASK_FOR_LOAD 0 + #define TARGET_VECTORIZE_BUILTIN_MUL_WIDEN_EVEN 0 + #define TARGET_VECTORIZE_BUILTIN_MUL_WIDEN_ODD 0 #define TARGET_VECTORIZE \ ! {TARGET_VECTORIZE_BUILTIN_MASK_FOR_LOAD, \ ! TARGET_VECTORIZE_BUILTIN_MUL_WIDEN_EVEN, \ ! 
TARGET_VECTORIZE_BUILTIN_MUL_WIDEN_ODD} #define TARGET_DEFAULT_TARGET_FLAGS 0 Index: tree-inline.c =================================================================== *** tree-inline.c (revision 115817) --- tree-inline.c (working copy) *************** estimate_num_insns_1 (tree *tp, int *wal *** 1759,1764 **** --- 1759,1770 ---- case REDUC_PLUS_EXPR: case WIDEN_SUM_EXPR: case DOT_PROD_EXPR: + case VEC_WIDEN_MULT_HI_EXPR: + case VEC_WIDEN_MULT_LO_EXPR: + case VEC_UNPACK_HI_EXPR: + case VEC_UNPACK_LO_EXPR: + case VEC_PACK_MOD_EXPR: + case VEC_PACK_SAT_EXPR: case WIDEN_MULT_EXPR: Index: tree-vect-transform.c =================================================================== *** tree-vect-transform.c (revision 115817) --- tree-vect-transform.c (working copy) *************** Software Foundation, 51 Franklin Street, *** 47,59 **** /* Utility functions for the code transformation. */ static bool vect_transform_stmt (tree, block_stmt_iterator *); - static void vect_align_data_ref (tree); static tree vect_create_destination_var (tree, tree); static tree vect_create_data_ref_ptr ! (tree, block_stmt_iterator *, tree, tree *, bool); static tree vect_create_addr_base_for_vector_ref (tree, tree *, tree); static tree vect_get_new_vect_var (tree, enum vect_var_kind, const char *); static tree vect_get_vec_def_for_operand (tree, tree, tree *); static tree vect_init_vector (tree, tree); static void vect_finish_stmt_generation (tree stmt, tree vec_stmt, block_stmt_iterator *bsi); --- 47,61 ---- /* Utility functions for the code transformation. */ static bool vect_transform_stmt (tree, block_stmt_iterator *); static tree vect_create_destination_var (tree, tree); static tree vect_create_data_ref_ptr ! 
(tree, block_stmt_iterator *, tree, tree *, tree *, bool); static tree vect_create_addr_base_for_vector_ref (tree, tree *, tree); + static tree vect_setup_realignment (tree, block_stmt_iterator *, tree *); + static tree bump_vector_ptr (tree, tree, block_stmt_iterator *, tree); static tree vect_get_new_vect_var (tree, enum vect_var_kind, const char *); static tree vect_get_vec_def_for_operand (tree, tree, tree *); + static tree vect_get_vec_def_for_stmt_copy (enum vect_def_type, tree); static tree vect_init_vector (tree, tree); static void vect_finish_stmt_generation (tree stmt, tree vec_stmt, block_stmt_iterator *bsi); *************** static bool vect_is_simple_cond (tree, l *** 61,66 **** --- 63,70 ---- static void update_vuses_to_preheader (tree, struct loop*); static void vect_create_epilog_for_reduction (tree, tree, enum tree_code, tree); static tree get_initial_def_for_reduction (tree, tree, tree *); + static tree vect_gen_widened_results_half (enum tree_code, tree, tree, tree, + tree, int, tree, block_stmt_iterator *, tree); /* Utility function dealing with loop peeling (not peeling itself). */ static void vect_generate_tmps_on_preheader *************** vect_create_addr_base_for_vector_ref (tr *** 191,220 **** } - /* Function vect_align_data_ref. - - Handle misalignment of a memory accesses. - - FORNOW: Can't handle misaligned accesses. - Make sure that the dataref is aligned. */ - - static void - vect_align_data_ref (tree stmt) - { - stmt_vec_info stmt_info = vinfo_for_stmt (stmt); - struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info); - - /* FORNOW: can't handle misaligned accesses; - all accesses expected to be aligned. */ - gcc_assert (aligned_access_p (dr)); - } - - /* Function vect_create_data_ref_ptr. ! Create a memory reference expression for vector access, to be used in a ! vector load/store stmt. The reference is based on a new pointer to vector ! type (vp). Input: 1. STMT: a stmt that references memory. 
Expected to be of the form --- 195,208 ---- } /* Function vect_create_data_ref_ptr. ! Create a new pointer to vector type (vp), that points to the first location ! accessed in the loop by STMT, along with the def-use update chain to ! appropriately advance the pointer through the loop iterations. Also set ! aliasing information for the pointer. This vector pointer is used by the ! callers to this function to create a memory reference expression for vector ! load/store access. Input: 1. STMT: a stmt that references memory. Expected to be of the form *************** vect_align_data_ref (tree stmt) *** 240,256 **** Return the initial_address in INITIAL_ADDRESS. ! 2. If ONLY_INIT is true, return the initial pointer. Otherwise, create ! a data-reference in the loop based on the new vector pointer vp. This ! new data reference will by some means be updated each iteration of ! the loop. Return the pointer vp'. ! FORNOW: handle only aligned and consecutive accesses. */ static tree vect_create_data_ref_ptr (tree stmt, block_stmt_iterator *bsi ATTRIBUTE_UNUSED, ! tree offset, tree *initial_address, bool only_init) { tree base_name; stmt_vec_info stmt_info = vinfo_for_stmt (stmt); --- 228,245 ---- Return the initial_address in INITIAL_ADDRESS. ! 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also ! update the pointer in each iteration of the loop. ! Return the increment stmt that updates the pointer in PTR_INCR. ! ! 3. Return the pointer. */ static tree vect_create_data_ref_ptr (tree stmt, block_stmt_iterator *bsi ATTRIBUTE_UNUSED, ! tree offset, tree *initial_address, tree *ptr_incr, ! 
bool only_init) { tree base_name; stmt_vec_info stmt_info = vinfo_for_stmt (stmt); *************** vect_create_data_ref_ptr (tree stmt, *** 362,373 **** --- 351,435 ---- } merge_alias_info (vect_ptr_init, indx_before_incr); merge_alias_info (vect_ptr_init, indx_after_incr); + if (ptr_incr) + *ptr_incr = incr; return indx_before_incr; } } + /* Function bump_vector_ptr + + Increment a pointer (to a vector type) by vector-size. Connect the new + increment stmt to the existing def-use update-chain of the pointer. + + The pointer def-use update-chain before this function: + DATAREF_PTR = phi (p_0, p_2) + .... + PTR_INCR: p_2 = DATAREF_PTR + step + + The pointer def-use update-chain after this function: + DATAREF_PTR = phi (p_0, p_2) + .... + NEW_DATAREF_PTR = DATAREF_PTR + vector_size + .... + PTR_INCR: p_2 = NEW_DATAREF_PTR + step + + Input: + DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated + in the loop. + PTR_INCR - the stmt that updates the pointer in each iteration of the loop. + The increment amount across iterations is also expected to be + vector_size. + BSI - location where the new update stmt is to be placed. + STMT - the original scalar memory-access stmt that is being vectorized. + + Output: Return NEW_DATAREF_PTR as illustrated above. 
+ + */ + + static tree + bump_vector_ptr (tree dataref_ptr, tree ptr_incr, block_stmt_iterator *bsi, + tree stmt) + { + stmt_vec_info stmt_info = vinfo_for_stmt (stmt); + struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info); + tree vectype = STMT_VINFO_VECTYPE (stmt_info); + tree vptr_type = TREE_TYPE (dataref_ptr); + tree ptr_var = SSA_NAME_VAR (dataref_ptr); + tree update = fold_convert (vptr_type, TYPE_SIZE_UNIT (vectype)); + tree incr_stmt; + ssa_op_iter iter; + use_operand_p use_p; + tree new_dataref_ptr; + + incr_stmt = build2 (MODIFY_EXPR, vptr_type, ptr_var, + build2 (PLUS_EXPR, vptr_type, dataref_ptr, update)); + new_dataref_ptr = make_ssa_name (ptr_var, incr_stmt); + TREE_OPERAND (incr_stmt, 0) = new_dataref_ptr; + vect_finish_stmt_generation (stmt, incr_stmt, bsi); + + /* Update the vector-pointer's cross-iteration increment. */ + FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE) + { + tree use = USE_FROM_PTR (use_p); + + if (use == dataref_ptr) + SET_USE (use_p, new_dataref_ptr); + else + gcc_assert (tree_int_cst_compare (use, update) == 0); + } + + /* Copy the points-to information if it exists. */ + if (DR_PTR_INFO (dr)) + duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr)); + merge_alias_info (new_dataref_ptr, dataref_ptr); + + return new_dataref_ptr; + } + + /* Function vect_create_destination_var. Create a new temporary of type VECTYPE. */ *************** vect_get_vec_def_for_operand (tree op, t *** 568,581 **** } /* Function vect_finish_stmt_generation. Insert a new stmt. */ static void ! vect_finish_stmt_generation (tree stmt, tree vec_stmt, block_stmt_iterator *bsi) { bsi_insert_before (bsi, vec_stmt, BSI_SAME_STMT); if (vect_print_dump_info (REPORT_DETAILS)) { --- 630,728 ---- } + /* Function vect_get_vec_def_for_stmt_copy + + Return a vector-def for an operand. 
This function is used when the + vectorized stmt to be created (by the caller to this function) is a "copy" + created in case the vectorized result cannot fit in one vector, and several + copies of the vector-stmt are required. In this case the vector-def is + retrieved from the vector stmt recorded in the STMT_VINFO_RELATED_STMT field + of the stmt that defines VEC_OPRND. + DT is the type of the vector def VEC_OPRND. + + Context: + In case the vectorization factor (VF) is bigger than the number + of elements that can fit in a vectype (nunits), we have to generate + more than one vector stmt to vectorize the scalar stmt. This situation + arises when there are multiple data-types operated upon in the loop; the + smallest data-type determines the VF, and as a result, when vectorizing + stmts operating on wider types we need to create 'VF/nunits' "copies" of the + vector stmt (each computing a vector of 'nunits' results, and together + computing 'VF' results in each iteration). This function is called when + vectorizing such a stmt (e.g. vectorizing S2 in the illustration below, in + which VF=16 and nunits=4, so the number of copies required is 4): + + scalar stmt: vectorized into: STMT_VINFO_RELATED_STMT + + S1: x = load VS1.0: vx.0 = memref0 VS1.1 + VS1.1: vx.1 = memref1 VS1.2 + VS1.2: vx.2 = memref2 VS1.3 + VS1.3: vx.3 = memref3 + + S2: z = x + ... VSnew.0: vz0 = vx.0 + ... VSnew.1 + VSnew.1: vz1 = vx.1 + ... VSnew.2 + VSnew.2: vz2 = vx.2 + ... VSnew.3 + VSnew.3: vz3 = vx.3 + ... + + The vectorization of S1 is explained in vectorizable_load. + The vectorization of S2: + To create the first vector-stmt out of the 4 copies - VSnew.0 - + the function 'vect_get_vec_def_for_operand' is called to + get the relevant vector-def for each operand of S2. For operand x it + returns the vector-def 'vx.0'. + + To create the remaining copies of the vector-stmt (VSnew.j), this + function is called to get the relevant vector-def for each operand. 
It is + obtained from the respective VS1.j stmt, which is recorded in the + STMT_VINFO_RELATED_STMT field of the stmt that defines VEC_OPRND. + + For example, to obtain the vector-def 'vx.1' in order to create the + vector stmt 'VSnew.1', this function is called with VEC_OPRND='vx.0'. + Given 'vx0' we obtain the stmt that defines it ('VS1.0'); from the + STMT_VINFO_RELATED_STMT field of 'VS1.0' we obtain the next copy - 'VS1.1', + and return its def ('vx.1'). + Overall, to create the above sequence this function will be called 3 times: + vx.1 = vect_get_vec_def_for_stmt_copy (dt, vx.0); + vx.2 = vect_get_vec_def_for_stmt_copy (dt, vx.1); + vx.3 = vect_get_vec_def_for_stmt_copy (dt, vx.2); */ + + static tree + vect_get_vec_def_for_stmt_copy (enum vect_def_type dt, tree vec_oprnd) + { + tree vec_stmt_for_operand; + stmt_vec_info def_stmt_info; + + if (dt == vect_invariant_def || dt == vect_constant_def) + { + /* Do nothing; can reuse same def. */ ; + return vec_oprnd; + } + + vec_stmt_for_operand = SSA_NAME_DEF_STMT (vec_oprnd); + def_stmt_info = vinfo_for_stmt (vec_stmt_for_operand); + gcc_assert (def_stmt_info); + vec_stmt_for_operand = STMT_VINFO_RELATED_STMT (def_stmt_info); + gcc_assert (vec_stmt_for_operand); + vec_oprnd = TREE_OPERAND (vec_stmt_for_operand, 0); + + return vec_oprnd; + } + + /* Function vect_finish_stmt_generation. Insert a new stmt. */ static void ! vect_finish_stmt_generation (tree stmt, tree vec_stmt, ! block_stmt_iterator *bsi) { + stmt_vec_info stmt_info = vinfo_for_stmt (stmt); + loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); + bsi_insert_before (bsi, vec_stmt, BSI_SAME_STMT); + set_stmt_info (get_tree_ann (vec_stmt), + new_stmt_vec_info (vec_stmt, loop_vinfo)); if (vect_print_dump_info (REPORT_DETAILS)) { *************** vectorizable_reduction (tree stmt, block *** 1135,1141 **** tree vec_dest; tree scalar_dest; tree op; ! 
tree loop_vec_def0, loop_vec_def1; stmt_vec_info stmt_info = vinfo_for_stmt (stmt); tree vectype = STMT_VINFO_VECTYPE (stmt_info); loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); --- 1282,1288 ---- tree vec_dest; tree scalar_dest; tree op; ! tree loop_vec_def0 = NULL_TREE, loop_vec_def1 = NULL_TREE; stmt_vec_info stmt_info = vinfo_for_stmt (stmt); tree vectype = STMT_VINFO_VECTYPE (stmt_info); loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); *************** vectorizable_reduction (tree stmt, block *** 1145,1151 **** enum machine_mode vec_mode; int op_type; optab optab, reduc_optab; ! tree new_temp; tree def, def_stmt; enum vect_def_type dt; tree new_phi; --- 1292,1298 ---- enum machine_mode vec_mode; int op_type; optab optab, reduc_optab; ! tree new_temp = NULL_TREE; tree def, def_stmt; enum vect_def_type dt; tree new_phi; *************** vectorizable_reduction (tree stmt, block *** 1155,1160 **** --- 1302,1315 ---- stmt_vec_info orig_stmt_info; tree expr = NULL_TREE; int i; + int nunits = TYPE_VECTOR_SUBPARTS (vectype); + int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits; + stmt_vec_info prev_stmt_info; + tree reduc_def; + tree new_stmt = NULL_TREE; + int j; + + gcc_assert (ncopies >= 1); /* 1. Is vectorizable reduction? */ *************** vectorizable_reduction (tree stmt, block *** 1194,1200 **** operation = TREE_OPERAND (stmt, 1); code = TREE_CODE (operation); op_type = TREE_CODE_LENGTH (code); - if (op_type != binary_op && op_type != ternary_op) return false; scalar_dest = TREE_OPERAND (stmt, 0); --- 1349,1354 ---- *************** vectorizable_reduction (tree stmt, block *** 1339,1366 **** /* Create the reduction-phi that defines the reduction-operand. */ new_phi = create_phi_node (vec_dest, loop->header); ! /* Prepare the operand that is defined inside the loop body */ ! op = TREE_OPERAND (operation, 0); ! loop_vec_def0 = vect_get_vec_def_for_operand (op, stmt, NULL); ! if (op_type == binary_op) ! 
expr = build2 (code, vectype, loop_vec_def0, PHI_RESULT (new_phi)); ! else if (op_type == ternary_op) { ! op = TREE_OPERAND (operation, 1); ! loop_vec_def1 = vect_get_vec_def_for_operand (op, stmt, NULL); ! expr = build3 (code, vectype, loop_vec_def0, loop_vec_def1, ! PHI_RESULT (new_phi)); } ! ! /* Create the vectorized operation that computes the partial results */ ! *vec_stmt = build2 (MODIFY_EXPR, vectype, vec_dest, expr); ! new_temp = make_ssa_name (vec_dest, *vec_stmt); ! TREE_OPERAND (*vec_stmt, 0) = new_temp; ! vect_finish_stmt_generation (stmt, *vec_stmt, bsi); ! /* Finalize the reduction-phi (set it's arguments) and create the epilog reduction code. */ ! vect_create_epilog_for_reduction (new_temp, stmt, epilog_reduc_code, new_phi); return true; } --- 1493,1554 ---- /* Create the reduction-phi that defines the reduction-operand. */ new_phi = create_phi_node (vec_dest, loop->header); ! /* In case the vectorization factor (VF) is bigger than the number ! of elements that we can fit in a vectype (nunits), we have to generate ! more than one vector stmt - i.e - we need to "unroll" the ! vector stmt by a factor VF/nunits. For more details see documentation ! in vectorizable_operation. */ ! ! prev_stmt_info = NULL; ! for (j = 0; j < ncopies; j++) { ! /* Handle uses. */ ! if (j == 0) ! { ! op = TREE_OPERAND (operation, 0); ! loop_vec_def0 = vect_get_vec_def_for_operand (op, stmt, NULL); ! if (op_type == ternary_op) ! { ! op = TREE_OPERAND (operation, 1); ! loop_vec_def1 = vect_get_vec_def_for_operand (op, stmt, NULL); ! } ! ! /* Get the vector def for the reduction variable from the phi node */ ! reduc_def = PHI_RESULT (new_phi); ! } ! else ! { ! enum vect_def_type dt = vect_unknown_def_type; /* Dummy */ ! loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt, loop_vec_def0); ! if (op_type == ternary_op) ! loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt, loop_vec_def1); ! ! /* Get the vector def for the reduction variable from the vectorized ! 
reduction operation generated in the previous iteration (j-1) */ ! reduc_def = TREE_OPERAND (new_stmt ,0); ! } ! ! /* Arguments are ready. create the new vector stmt. */ ! ! if (op_type == binary_op) ! expr = build2 (code, vectype, loop_vec_def0, reduc_def); ! else ! expr = build3 (code, vectype, loop_vec_def0, loop_vec_def1, reduc_def); ! new_stmt = build2 (MODIFY_EXPR, vectype, vec_dest, expr); ! new_temp = make_ssa_name (vec_dest, new_stmt); ! TREE_OPERAND (new_stmt, 0) = new_temp; ! vect_finish_stmt_generation (stmt, new_stmt, bsi); ! ! if (j == 0) ! STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt; ! else ! STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt; ! prev_stmt_info = vinfo_for_stmt (new_stmt); } ! /* Finalize the reduction-phi (set it's arguments) and create the epilog reduction code. */ ! vect_create_epilog_for_reduction (new_temp, stmt, epilog_reduc_code, new_phi); return true; } *************** vectorizable_assignment (tree stmt, bloc *** 1385,1390 **** --- 1573,1584 ---- tree new_temp; tree def, def_stmt; enum vect_def_type dt; + int nunits = TYPE_VECTOR_SUBPARTS (vectype); + int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits; + + gcc_assert (ncopies >= 1); + if (ncopies > 1) + return false; /* FORNOW */ /* Is vectorizable assignment? */ if (!STMT_VINFO_RELEVANT_P (stmt_info)) *************** vectorizable_operation (tree stmt, block *** 1475,1495 **** tree scalar_dest; tree operation; tree op0, op1 = NULL; ! tree vec_oprnd0, vec_oprnd1=NULL; stmt_vec_info stmt_info = vinfo_for_stmt (stmt); tree vectype = STMT_VINFO_VECTYPE (stmt_info); loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); - int i; enum tree_code code; enum machine_mode vec_mode; tree new_temp; int op_type; - tree op; optab optab; int icode; enum machine_mode optab_op2_mode; tree def, def_stmt; ! enum vect_def_type dt; /* Is STMT a vectorizable binary/unary operation? 
*/ if (!STMT_VINFO_RELEVANT_P (stmt_info)) --- 1669,1696 ---- tree scalar_dest; tree operation; tree op0, op1 = NULL; ! tree vec_oprnd0 = NULL_TREE, vec_oprnd1 = NULL_TREE; stmt_vec_info stmt_info = vinfo_for_stmt (stmt); tree vectype = STMT_VINFO_VECTYPE (stmt_info); loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); enum tree_code code; enum machine_mode vec_mode; tree new_temp; int op_type; optab optab; int icode; enum machine_mode optab_op2_mode; tree def, def_stmt; ! enum vect_def_type dt0, dt1; ! tree new_stmt; ! stmt_vec_info prev_stmt_info; ! int nunits_in = TYPE_VECTOR_SUBPARTS (vectype); ! int nunits_out; ! tree vectype_out; ! int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in; ! int j; ! ! gcc_assert (ncopies >= 1); /* Is STMT a vectorizable binary/unary operation? */ if (!STMT_VINFO_RELEVANT_P (stmt_info)) *************** vectorizable_operation (tree stmt, block *** 1511,1516 **** --- 1712,1723 ---- if (TREE_CODE (TREE_OPERAND (stmt, 0)) != SSA_NAME) return false; + scalar_dest = TREE_OPERAND (stmt, 0); + vectype_out = get_vectype_for_scalar_type (TREE_TYPE (scalar_dest)); + nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out); + if (nunits_out != nunits_in) + return false; + operation = TREE_OPERAND (stmt, 1); code = TREE_CODE (operation); optab = optab_for_tree_code (code, vectype); *************** vectorizable_operation (tree stmt, block *** 1524,1539 **** return false; } ! for (i = 0; i < op_type; i++) { ! op = TREE_OPERAND (operation, i); ! if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt)) { if (vect_print_dump_info (REPORT_DETAILS)) fprintf (vect_dump, "use not simple."); return false; ! } ! } /* Supportable by target? */ if (!optab) --- 1731,1754 ---- return false; } ! op0 = TREE_OPERAND (operation, 0); ! if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt0)) { ! if (vect_print_dump_info (REPORT_DETAILS)) ! fprintf (vect_dump, "use not simple."); ! return false; ! } ! ! if (op_type == binary_op) ! { ! 
op1 = TREE_OPERAND (operation, 1); ! if (!vect_is_simple_use (op1, loop_vinfo, &def_stmt, &def, &dt1)) { if (vect_print_dump_info (REPORT_DETAILS)) fprintf (vect_dump, "use not simple."); return false; ! } ! } /* Supportable by target? */ if (!optab) *************** vectorizable_operation (tree stmt, block *** 1576,1583 **** by a scalar shift operand. */ optab_op2_mode = insn_data[icode].operand[2].mode; if (! (VECTOR_MODE_P (optab_op2_mode) ! || dt == vect_constant_def ! || dt == vect_invariant_def)) { if (vect_print_dump_info (REPORT_DETAILS)) fprintf (vect_dump, "operand mode requires invariant argument."); --- 1791,1798 ---- by a scalar shift operand. */ optab_op2_mode = insn_data[icode].operand[2].mode; if (! (VECTOR_MODE_P (optab_op2_mode) ! || dt1 == vect_constant_def ! || dt1 == vect_invariant_def)) { if (vect_print_dump_info (REPORT_DETAILS)) fprintf (vect_dump, "operand mode requires invariant argument."); *************** vectorizable_operation (tree stmt, block *** 1597,1645 **** fprintf (vect_dump, "transform binary/unary operation."); /* Handle def. */ - scalar_dest = TREE_OPERAND (stmt, 0); vec_dest = vect_create_destination_var (scalar_dest, vectype); ! /* Handle uses. */ op0 = TREE_OPERAND (operation, 0); ! vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL); if (op_type == binary_op) { op1 = TREE_OPERAND (operation, 1); ! if (code == LSHIFT_EXPR || code == RSHIFT_EXPR) ! { ! /* Vector shl and shr insn patterns can be defined with ! scalar operand 2 (shift operand). In this case, use ! constant or loop invariant op1 directly, without ! extending it to vector mode first. */ ! optab_op2_mode = insn_data[icode].operand[2].mode; ! if (!VECTOR_MODE_P (optab_op2_mode)) ! { ! if (vect_print_dump_info (REPORT_DETAILS)) ! fprintf (vect_dump, "operand 1 using scalar mode."); ! vec_oprnd1 = op1; ! } ! } ! if (!vec_oprnd1) ! vec_oprnd1 = vect_get_vec_def_for_operand (op1, stmt, NULL); } ! /* Arguments are ready. create the new vector stmt. */ ! 
if (op_type == binary_op) ! *vec_stmt = build2 (MODIFY_EXPR, vectype, vec_dest, ! build2 (code, vectype, vec_oprnd0, vec_oprnd1)); ! else ! *vec_stmt = build2 (MODIFY_EXPR, vectype, vec_dest, ! build1 (code, vectype, vec_oprnd0)); ! new_temp = make_ssa_name (vec_dest, *vec_stmt); ! TREE_OPERAND (*vec_stmt, 0) = new_temp; ! vect_finish_stmt_generation (stmt, *vec_stmt, bsi); return true; } --- 1812,2297 ---- fprintf (vect_dump, "transform binary/unary operation."); /* Handle def. */ vec_dest = vect_create_destination_var (scalar_dest, vectype); ! /* In case the vectorization factor (VF) is bigger than the number ! of elements that we can fit in a vectype (nunits), we have to generate ! more than one vector stmt - i.e - we need to "unroll" the ! vector stmt by a factor VF/nunits. In doing so, we record a pointer ! from one copy of the vector stmt to the next, in the field ! STMT_VINFO_RELATED_STMT. This is necessary in order to allow following ! stages to find the correct vector defs to be used when vectorizing ! stmts that use the defs of the current stmt. The example below illustrates ! the vectorization process when VF=16 and nunits=4 (i.e - we need to create ! 4 vectorized stmts): ! ! before vectorization: ! RELATED_STMT VEC_STMT ! S1: x = memref - - ! S2: z = x + 1 - - ! ! step 1: vectorize stmt S1 (done in vectorizable_load. See more details ! there): ! RELATED_STMT VEC_STMT ! VS1_0: vx0 = memref0 VS1_1 - ! VS1_1: vx1 = memref1 VS1_2 - ! VS1_2: vx2 = memref2 VS1_3 - ! VS1_3: vx3 = memref3 - - ! S1: x = load - VS1_0 ! S2: z = x + 1 - - ! ! step2: vectorize stmt S2 (done here): ! To vectorize stmt S2 we first need to find the relevant vector ! def for the first operand 'x'. This is, as usual, obtained from ! the vector stmt recorded in the STMT_VINFO_VEC_STMT of the stmt ! that defines 'x' (S1). This way we find the stmt VS1_0, and the ! relevant vector def 'vx0'. Having found 'vx0' we can generate ! the vector stmt VS2_0, and as usual, record it in the ! 
STMT_VINFO_VEC_STMT of stmt S2. ! When creating the second copy (VS2_1), we obtain the relevant vector ! def from the vector stmt recorded in the STMT_VINFO_RELATED_STMT of ! stmt VS1_0. This way we find the stmt VS1_1 and the relevant ! vector def 'vx1'. Using 'vx1' we create stmt VS2_1 and record a ! pointer to it in the STMT_VINFO_RELATED_STMT of the vector stmt VS2_0. ! Similarly when creating stmts VS2_2 and VS2_3. This is the resulting ! chain of stmts and pointers: ! RELATED_STMT VEC_STMT ! VS1_0: vx0 = memref0 VS1_1 - ! VS1_1: vx1 = memref1 VS1_2 - ! VS1_2: vx2 = memref2 VS1_3 - ! VS1_3: vx3 = memref3 - - ! S1: x = load - VS1_0 ! VS2_0: vz0 = vx0 + v1 VS2_1 - ! VS2_1: vz1 = vx1 + v1 VS2_2 - ! VS2_2: vz2 = vx2 + v1 VS2_3 - ! VS2_3: vz3 = vx3 + v1 - - ! S2: z = x + 1 - VS2_0 */ ! ! prev_stmt_info = NULL; ! for (j = 0; j < ncopies; j++) ! { ! /* Handle uses. */ ! if (j == 0) ! { ! vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL); ! if (op_type == binary_op) ! { ! if (code == LSHIFT_EXPR || code == RSHIFT_EXPR) ! { ! /* Vector shl and shr insn patterns can be defined with ! scalar operand 2 (shift operand). In this case, use ! constant or loop invariant op1 directly, without ! extending it to vector mode first. */ ! optab_op2_mode = insn_data[icode].operand[2].mode; ! if (!VECTOR_MODE_P (optab_op2_mode)) ! { ! if (vect_print_dump_info (REPORT_DETAILS)) ! fprintf (vect_dump, "operand 1 using scalar mode."); ! vec_oprnd1 = op1; ! } ! } ! if (!vec_oprnd1) ! vec_oprnd1 = vect_get_vec_def_for_operand (op1, stmt, NULL); ! } ! } ! else ! { ! vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt0, vec_oprnd0); ! if (op_type == binary_op) ! vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt1, vec_oprnd1); ! } ! ! /* Arguments are ready. create the new vector stmt. */ ! ! if (op_type == binary_op) ! new_stmt = build2 (MODIFY_EXPR, vectype, vec_dest, ! build2 (code, vectype, vec_oprnd0, vec_oprnd1)); ! else ! new_stmt = build2 (MODIFY_EXPR, vectype, vec_dest, ! 
build1 (code, vectype, vec_oprnd0)); ! new_temp = make_ssa_name (vec_dest, new_stmt); ! TREE_OPERAND (new_stmt, 0) = new_temp; ! vect_finish_stmt_generation (stmt, new_stmt, bsi); ! ! if (j == 0) ! STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt; ! else ! STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt; ! prev_stmt_info = vinfo_for_stmt (new_stmt); ! } ! ! return true; ! } ! ! ! /* Function vectorizable_type_demotion ! ! Check if STMT performs a binary or unary operation that involves ! type demotion, and if it can be vectorized. ! If VEC_STMT is also passed, vectorize the STMT: create a vectorized ! stmt to replace it, put it in VEC_STMT, and insert it at BSI. ! Return FALSE if not a vectorizable STMT, TRUE otherwise. */ ! ! bool ! vectorizable_type_demotion (tree stmt, block_stmt_iterator *bsi, ! tree *vec_stmt) ! { ! tree vec_dest; ! tree scalar_dest; ! tree operation; ! tree op0; ! tree vec_oprnd0=NULL, vec_oprnd1=NULL; ! stmt_vec_info stmt_info = vinfo_for_stmt (stmt); ! loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); ! enum tree_code code; ! tree new_temp; ! tree def, def_stmt; ! enum vect_def_type dt0; ! tree new_stmt; ! stmt_vec_info prev_stmt_info; ! int nunits_in; ! int nunits_out; ! tree vectype_out; ! int ncopies; ! int j; ! tree expr; ! tree vectype_in; ! tree scalar_type; ! optab optab; ! enum machine_mode vec_mode; ! ! /* Is STMT a vectorizable type-demotion operation? */ ! ! if (!STMT_VINFO_RELEVANT_P (stmt_info)) ! return false; ! ! gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_loop_def); ! ! if (STMT_VINFO_LIVE_P (stmt_info)) ! { ! /* FORNOW: not yet supported. */ ! if (vect_print_dump_info (REPORT_DETAILS)) ! fprintf (vect_dump, "value used after loop."); ! return false; ! } ! ! if (TREE_CODE (stmt) != MODIFY_EXPR) ! return false; ! ! if (TREE_CODE (TREE_OPERAND (stmt, 0)) != SSA_NAME) ! return false; ! ! operation = TREE_OPERAND (stmt, 1); ! code = TREE_CODE (operation); ! 
if (code != NOP_EXPR && code != CONVERT_EXPR) ! return false; ! op0 = TREE_OPERAND (operation, 0); ! vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op0)); ! nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in); ! ! scalar_dest = TREE_OPERAND (stmt, 0); ! scalar_type = TREE_TYPE (scalar_dest); ! vectype_out = get_vectype_for_scalar_type (scalar_type); ! nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out); ! if (nunits_in != nunits_out/2) /* FORNOW */ ! return false; ! ! ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_out; ! gcc_assert (ncopies >= 1); ! ! /* Check the operands of the operation. */ ! if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt0)) ! { ! if (vect_print_dump_info (REPORT_DETAILS)) ! fprintf (vect_dump, "use not simple."); ! return false; ! } ! ! /* Supportable by target? */ ! code = VEC_PACK_MOD_EXPR; ! optab = optab_for_tree_code (VEC_PACK_MOD_EXPR, vectype_in); ! if (!optab) ! return false; ! ! vec_mode = TYPE_MODE (vectype_in); ! if (optab->handlers[(int) vec_mode].insn_code == CODE_FOR_nothing) ! return false; ! ! STMT_VINFO_VECTYPE (stmt_info) = vectype_in; ! ! if (!vec_stmt) /* transformation not required. */ ! { ! STMT_VINFO_TYPE (stmt_info) = type_demotion_vec_info_type; ! return true; ! } ! ! /** Transform. **/ ! ! if (vect_print_dump_info (REPORT_DETAILS)) ! fprintf (vect_dump, "transform type demotion operation. ncopies = %d.", ! ncopies); ! ! /* Handle def. */ ! vec_dest = vect_create_destination_var (scalar_dest, vectype_out); ! ! /* In case the vectorization factor (VF) is bigger than the number ! of elements that we can fit in a vectype (nunits), we have to generate ! more than one vector stmt - i.e - we need to "unroll" the ! vector stmt by a factor VF/nunits. */ ! prev_stmt_info = NULL; ! for (j = 0; j < ncopies; j++) ! { ! /* Handle uses. */ ! if (j == 0) ! { ! enum vect_def_type dt = vect_unknown_def_type; /* Dummy */ ! vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL); ! 
vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt, vec_oprnd0); ! } ! else ! { ! vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt0, vec_oprnd1); ! vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt0, vec_oprnd0); ! } ! ! /* Arguments are ready. Create the new vector stmt. */ ! expr = build2 (code, vectype_out, vec_oprnd0, vec_oprnd1); ! new_stmt = build2 (MODIFY_EXPR, vectype_out, vec_dest, expr); ! new_temp = make_ssa_name (vec_dest, new_stmt); ! TREE_OPERAND (new_stmt, 0) = new_temp; ! vect_finish_stmt_generation (stmt, new_stmt, bsi); ! ! if (j == 0) ! STMT_VINFO_VEC_STMT (stmt_info) = new_stmt; ! else ! STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt; ! ! prev_stmt_info = vinfo_for_stmt (new_stmt); ! } ! ! *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info); ! return true; ! } ! ! ! /* Function vect_gen_widened_results_half ! ! Create a vector stmt whose code, type, number of arguments, and result ! variable are CODE, VECTYPE, OP_TYPE, and VEC_DEST, and its arguments are ! VEC_OPRND0 and VEC_OPRND1. The new vector stmt is to be inserted at BSI. ! In the case that CODE is a CALL_EXPR, this means that a call to DECL ! needs to be created (DECL is a function-decl of a target-builtin). ! STMT is the original scalar stmt that we are vectorizing. */ ! ! static tree ! vect_gen_widened_results_half (enum tree_code code, tree vectype, tree decl, ! tree vec_oprnd0, tree vec_oprnd1, int op_type, ! tree vec_dest, block_stmt_iterator *bsi, ! tree stmt) ! { ! tree vec_params; ! tree expr; ! tree new_stmt; ! tree new_temp; ! tree sym; ! ssa_op_iter iter; ! ! /* Generate half of the widened result: */ ! if (code == CALL_EXPR) ! { ! /* Target specific support */ ! vec_params = build_tree_list (NULL_TREE, vec_oprnd0); ! if (op_type == binary_op) ! vec_params = tree_cons (NULL_TREE, vec_oprnd1, vec_params); ! expr = build_function_call_expr (decl, vec_params); ! } ! else ! { ! /* Generic support */ ! gcc_assert (op_type == TREE_CODE_LENGTH (code)); ! if (op_type == binary_op) ! 
expr = build2 (code, vectype, vec_oprnd0, vec_oprnd1); ! else ! expr = build1 (code, vectype, vec_oprnd0); ! } ! new_stmt = build2 (MODIFY_EXPR, vectype, vec_dest, expr); ! new_temp = make_ssa_name (vec_dest, new_stmt); ! TREE_OPERAND (new_stmt, 0) = new_temp; ! vect_finish_stmt_generation (stmt, new_stmt, bsi); + if (code == CALL_EXPR) + { + FOR_EACH_SSA_TREE_OPERAND (sym, new_stmt, iter, SSA_OP_ALL_VIRTUALS) + { + if (TREE_CODE (sym) == SSA_NAME) + sym = SSA_NAME_VAR (sym); + mark_sym_for_renaming (sym); + } + } + + return new_stmt; + } + + + /* Function vectorizable_type_promotion + + Check if STMT performs a binary or unary operation that involves + type promotion, and if it can be vectorized. + If VEC_STMT is also passed, vectorize the STMT: create a vectorized + stmt to replace it, put it in VEC_STMT, and insert it at BSI. + Return FALSE if not a vectorizable STMT, TRUE otherwise. */ + + bool + vectorizable_type_promotion (tree stmt, block_stmt_iterator *bsi, + tree *vec_stmt) + { + tree vec_dest; + tree scalar_dest; + tree operation; + tree op0, op1 = NULL; + tree vec_oprnd0=NULL, vec_oprnd1=NULL; + stmt_vec_info stmt_info = vinfo_for_stmt (stmt); + loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); + enum tree_code code, code1 = CODE_FOR_nothing, code2 = CODE_FOR_nothing; + tree decl1 = NULL_TREE, decl2 = NULL_TREE; + int op_type; + tree def, def_stmt; + enum vect_def_type dt0, dt1; + tree new_stmt; + stmt_vec_info prev_stmt_info; + int nunits_in; + int nunits_out; + tree vectype_out; + int ncopies; + int j; + tree vectype_in; + + /* Is STMT a vectorizable type-promotion operation? */ + + if (!STMT_VINFO_RELEVANT_P (stmt_info)) + return false; + + gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_loop_def); + + if (STMT_VINFO_LIVE_P (stmt_info)) + { + /* FORNOW: not yet supported. 
*/ + if (vect_print_dump_info (REPORT_DETAILS)) + fprintf (vect_dump, "value used after loop."); + return false; + } + + if (TREE_CODE (stmt) != MODIFY_EXPR) + return false; + + if (TREE_CODE (TREE_OPERAND (stmt, 0)) != SSA_NAME) + return false; + + operation = TREE_OPERAND (stmt, 1); + code = TREE_CODE (operation); + if (code != NOP_EXPR && code != WIDEN_MULT_EXPR) + return false; + + op0 = TREE_OPERAND (operation, 0); + vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op0)); + nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in); + ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in; + gcc_assert (ncopies >= 1); + + scalar_dest = TREE_OPERAND (stmt, 0); + vectype_out = get_vectype_for_scalar_type (TREE_TYPE (scalar_dest)); + nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out); + if (nunits_out != nunits_in/2) /* FORNOW */ + return false; + + /* Check the operands of the operation. */ + if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt0)) + { + if (vect_print_dump_info (REPORT_DETAILS)) + fprintf (vect_dump, "use not simple."); + return false; + } + + op_type = TREE_CODE_LENGTH (code); if (op_type == binary_op) { op1 = TREE_OPERAND (operation, 1); + if (!vect_is_simple_use (op1, loop_vinfo, &def_stmt, &def, &dt1)) + { + if (vect_print_dump_info (REPORT_DETAILS)) + fprintf (vect_dump, "use not simple."); + return false; + } + } ! /* Supportable by target? */ ! if (!supportable_widening_operation (code, stmt, vectype_in, ! &decl1, &decl2, &code1, &code2)) ! return false; ! STMT_VINFO_VECTYPE (stmt_info) = vectype_in; ! if (!vec_stmt) /* transformation not required. */ ! { ! STMT_VINFO_TYPE (stmt_info) = type_promotion_vec_info_type; ! return true; } ! /** Transform. **/ ! if (vect_print_dump_info (REPORT_DETAILS)) ! fprintf (vect_dump, "transform type promotion operation. ncopies = %d.", ! ncopies); ! ! /* Handle def. */ ! 
vec_dest = vect_create_destination_var (scalar_dest, vectype_out); + /* In case the vectorization factor (VF) is bigger than the number + of elements that we can fit in a vectype (nunits), we have to generate + more than one vector stmt - i.e - we need to "unroll" the + vector stmt by a factor VF/nunits. */ + + prev_stmt_info = NULL; + for (j = 0; j < ncopies; j++) + { + /* Handle uses. */ + if (j == 0) + { + vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL); + if (op_type == binary_op) + vec_oprnd1 = vect_get_vec_def_for_operand (op1, stmt, NULL); + } + else + { + vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt0, vec_oprnd0); + if (op_type == binary_op) + vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt1, vec_oprnd1); + } + + /* Arguments are ready. Create the new vector stmt. We are creating + two vector defs because the widened result does not fit in one vector. + The vectorized stmt can be expressed as a call to a target builtin, + or using a tree-code. + */ + /* Generate first half of the widened result: */ + new_stmt = vect_gen_widened_results_half (code1, vectype_out, decl1, + vec_oprnd0, vec_oprnd1, op_type, vec_dest, bsi, stmt); + if (j == 0) + STMT_VINFO_VEC_STMT (stmt_info) = new_stmt; + else + STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt; + prev_stmt_info = vinfo_for_stmt (new_stmt); + + /* Generate second half of the widened result: */ + new_stmt = vect_gen_widened_results_half (code2, vectype_out, decl2, + vec_oprnd0, vec_oprnd1, op_type, vec_dest, bsi, stmt); + STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt; + prev_stmt_info = vinfo_for_stmt (new_stmt); + + } + + *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info); return true; } *************** vectorizable_store (tree stmt, block_stm *** 1658,1664 **** tree scalar_dest; tree data_ref; tree op; !
tree vec_oprnd1; stmt_vec_info stmt_info = vinfo_for_stmt (stmt); struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info); tree vectype = STMT_VINFO_VECTYPE (stmt_info); --- 2310,2316 ---- tree scalar_dest; tree data_ref; tree op; ! tree vec_oprnd = NULL_TREE; stmt_vec_info stmt_info = vinfo_for_stmt (stmt); struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info); tree vectype = STMT_VINFO_VECTYPE (stmt_info); *************** vectorizable_store (tree stmt, block_stm *** 1667,1674 **** --- 2319,2334 ---- tree dummy; enum dr_alignment_support alignment_support_cheme; ssa_op_iter iter; + def_operand_p def_p; tree def, def_stmt; enum vect_def_type dt; + stmt_vec_info prev_stmt_info; + tree dataref_ptr = NULL_TREE; + int nunits = TYPE_VECTOR_SUBPARTS (vectype); + int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits; + int j; + + gcc_assert (ncopies >= 1); /* Is vectorizable store? */ *************** vectorizable_store (tree stmt, block_stm *** 1707,1751 **** /** Transform. **/ if (vect_print_dump_info (REPORT_DETAILS)) ! fprintf (vect_dump, "transform store"); alignment_support_cheme = vect_supportable_dr_alignment (dr); gcc_assert (alignment_support_cheme); gcc_assert (alignment_support_cheme == dr_aligned); /* FORNOW */ ! /* Handle use - get the vectorized def from the defining stmt. */ ! vec_oprnd1 = vect_get_vec_def_for_operand (op, stmt, NULL); ! /* Handle def. */ ! /* FORNOW: make sure the data reference is aligned. */ ! vect_align_data_ref (stmt); ! data_ref = vect_create_data_ref_ptr (stmt, bsi, NULL_TREE, &dummy, false); ! data_ref = build_fold_indirect_ref (data_ref); ! /* Arguments are ready. create the new vector stmt. */ ! *vec_stmt = build2 (MODIFY_EXPR, vectype, data_ref, vec_oprnd1); ! vect_finish_stmt_generation (stmt, *vec_stmt, bsi); ! /* Copy the V_MAY_DEFS representing the aliasing of the original array ! element's definition to the vector's definition then update the ! defining statement. The original is being deleted so the same ! 
SSA_NAMEs can be used. */ ! copy_virtual_operands (*vec_stmt, stmt); ! ! FOR_EACH_SSA_TREE_OPERAND (def, stmt, iter, SSA_OP_VMAYDEF) ! { ! SSA_NAME_DEF_STMT (def) = *vec_stmt; ! ! /* If this virtual def has a use outside the loop and a loop peel is ! performed then the def may be renamed by the peel. Mark it for ! renaming so the later use will also be renamed. */ ! mark_sym_for_renaming (SSA_NAME_VAR (def)); } return true; } /* vectorizable_load. Check if STMT reads a non scalar data-ref (array/pointer/structure) that --- 2367,2553 ---- /** Transform. **/ if (vect_print_dump_info (REPORT_DETAILS)) ! fprintf (vect_dump, "transform store. ncopies = %d",ncopies); alignment_support_cheme = vect_supportable_dr_alignment (dr); gcc_assert (alignment_support_cheme); gcc_assert (alignment_support_cheme == dr_aligned); /* FORNOW */ ! /* In case the vectorization factor (VF) is bigger than the number ! of elements that we can fit in a vectype (nunits), we have to generate ! more than one vector stmt - i.e - we need to "unroll" the ! vector stmt by a factor VF/nunits. For more details see documentation in ! vect_get_vec_def_for_copy_stmt. */ ! prev_stmt_info = NULL; ! for (j = 0; j < ncopies; j++) ! { ! tree new_stmt; ! tree ptr_incr; ! if (j == 0) ! { ! vec_oprnd = vect_get_vec_def_for_operand (op, stmt, NULL); ! dataref_ptr = vect_create_data_ref_ptr (stmt, bsi, NULL_TREE, &dummy, ! &ptr_incr, false); ! } ! else ! { ! vec_oprnd = vect_get_vec_def_for_stmt_copy (dt, vec_oprnd); ! dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, bsi, stmt); ! } ! /* Arguments are ready. create the new vector stmt. */ ! data_ref = build_fold_indirect_ref (dataref_ptr); ! new_stmt = build2 (MODIFY_EXPR, vectype, data_ref, vec_oprnd); ! vect_finish_stmt_generation (stmt, new_stmt, bsi); ! ! /* Set the V_MAY_DEFS for the vector pointer. If this virtual def has a ! use outside the loop and a loop peel is performed then the def may be ! renamed by the peel. 
Mark it for renaming so the later use will also ! be renamed. */ ! copy_virtual_operands (new_stmt, stmt); ! if (j == 0) ! { ! /* The original store is deleted so the same SSA_NAMEs can be used. ! */ ! FOR_EACH_SSA_TREE_OPERAND (def, stmt, iter, SSA_OP_VMAYDEF) ! { ! SSA_NAME_DEF_STMT (def) = new_stmt; ! mark_sym_for_renaming (SSA_NAME_VAR (def)); ! } ! ! STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt; ! } ! else ! { ! /* Create new names for all the definitions created by COPY and ! add replacement mappings for each new name. */ ! FOR_EACH_SSA_DEF_OPERAND (def_p, new_stmt, iter, SSA_OP_VMAYDEF) ! { ! create_new_def_for (DEF_FROM_PTR (def_p), new_stmt, def_p); ! mark_sym_for_renaming (SSA_NAME_VAR (DEF_FROM_PTR (def_p))); ! } ! ! STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt; ! } ! ! prev_stmt_info = vinfo_for_stmt (new_stmt); } return true; } + /* Function vect_setup_realignment + + This function is called when vectorizing an unaligned load using + the dr_unaligned_software_pipeline scheme. + This function generates the following code at the loop prolog: + + p = initial_addr; + msq_init = *(floor(p)); # prolog load + realignment_token = call target_builtin; + loop: + msq = phi (msq_init, ---) + + The code above sets up a new (vector) pointer, pointing to the first + location accessed by STMT, and a "floor-aligned" load using that pointer. + It also generates code to compute the "realignment-token" (if the relevant + target hook was defined), and creates a phi-node at the loop-header bb + whose arguments are the result of the prolog-load (created by this + function) and the result of a load that takes place in the loop (to be + created by the caller to this function). 
+ The caller to this function uses the phi-result (msq) to create the + realignment code inside the loop, and sets up the missing phi argument, + as follows: + + loop: + msq = phi (msq_init, lsq) + lsq = *(floor(p')); # load in loop + result = realign_load (msq, lsq, realignment_token); + + Input: + STMT - (scalar) load stmt to be vectorized. This load accesses + a memory location that may be unaligned. + BSI - place where new code is to be inserted. + + Output: + REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load + target hook, if defined. + Return value - the result of the loop-header phi node. + */ + + static tree + vect_setup_realignment (tree stmt, block_stmt_iterator *bsi, + tree *realignment_token) + { + stmt_vec_info stmt_info = vinfo_for_stmt (stmt); + tree vectype = STMT_VINFO_VECTYPE (stmt_info); + loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); + struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); + edge pe = loop_preheader_edge (loop); + tree scalar_dest = TREE_OPERAND (stmt, 0); + tree vec_dest; + tree init_addr; + tree inc; + tree ptr; + tree data_ref; + tree new_stmt; + basic_block new_bb; + tree msq_init; + tree new_temp; + tree phi_stmt; + tree msq; + + /* 1. Create msq_init = *(floor(p1)) in the loop preheader */ + vec_dest = vect_create_destination_var (scalar_dest, vectype); + ptr = vect_create_data_ref_ptr (stmt, bsi, NULL_TREE, &init_addr, &inc, true); + data_ref = build1 (ALIGN_INDIRECT_REF, vectype, ptr); + new_stmt = build2 (MODIFY_EXPR, vectype, vec_dest, data_ref); + new_temp = make_ssa_name (vec_dest, new_stmt); + TREE_OPERAND (new_stmt, 0) = new_temp; + new_bb = bsi_insert_on_edge_immediate (pe, new_stmt); + gcc_assert (!new_bb); + msq_init = TREE_OPERAND (new_stmt, 0); + copy_virtual_operands (new_stmt, stmt); + update_vuses_to_preheader (new_stmt, loop); + + /* 2. Create permutation mask, if required, in loop preheader. 
*/ + if (targetm.vectorize.builtin_mask_for_load) + { + tree builtin_decl; + tree params = build_tree_list (NULL_TREE, init_addr); + + vec_dest = vect_create_destination_var (scalar_dest, vectype); + builtin_decl = targetm.vectorize.builtin_mask_for_load (); + new_stmt = build_function_call_expr (builtin_decl, params); + new_stmt = build2 (MODIFY_EXPR, vectype, vec_dest, new_stmt); + new_temp = make_ssa_name (vec_dest, new_stmt); + TREE_OPERAND (new_stmt, 0) = new_temp; + new_bb = bsi_insert_on_edge_immediate (pe, new_stmt); + gcc_assert (!new_bb); + *realignment_token = TREE_OPERAND (new_stmt, 0); + + /* The result of the CALL_EXPR to this builtin is determined from + the value of the parameter and no global variables are touched + which makes the builtin a "const" function. Requiring the + builtin to have the "const" attribute makes it unnecessary + to call mark_call_clobbered. */ + gcc_assert (TREE_READONLY (builtin_decl)); + } + + /* 3. Create msq = phi in loop */ + vec_dest = vect_create_destination_var (scalar_dest, vectype); + msq = make_ssa_name (vec_dest, NULL_TREE); + phi_stmt = create_phi_node (msq, loop->header); + SSA_NAME_DEF_STMT (msq) = phi_stmt; + add_phi_arg (phi_stmt, msq_init, loop_preheader_edge (loop)); + + return msq; + } + + /* vectorizable_load. Check if STMT reads a non scalar data-ref (array/pointer/structure) that *************** vectorizable_load (tree stmt, block_stmt *** 1762,1779 **** tree data_ref = NULL; tree op; stmt_vec_info stmt_info = vinfo_for_stmt (stmt); struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info); tree vectype = STMT_VINFO_VECTYPE (stmt_info); tree new_temp; int mode; - tree init_addr; tree new_stmt; tree dummy; - basic_block new_bb; - loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); - struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); - edge pe = loop_preheader_edge (loop); enum dr_alignment_support alignment_support_cheme; /* Is vectorizable load? 
*/ if (!STMT_VINFO_RELEVANT_P (stmt_info)) --- 2564,2588 ---- tree data_ref = NULL; tree op; stmt_vec_info stmt_info = vinfo_for_stmt (stmt); + stmt_vec_info prev_stmt_info; + loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); + struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info); tree vectype = STMT_VINFO_VECTYPE (stmt_info); tree new_temp; int mode; tree new_stmt; tree dummy; enum dr_alignment_support alignment_support_cheme; + tree dataref_ptr = NULL_TREE; + tree ptr_incr; + int nunits = TYPE_VECTOR_SUBPARTS (vectype); + int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits; + int j; + tree msq = NULL_TREE, lsq; + tree offset = NULL_TREE; + tree realignment_token = NULL_TREE; + tree phi_stmt = NULL_TREE; /* Is vectorizable load? */ if (!STMT_VINFO_RELEVANT_P (stmt_info)) *************** vectorizable_load (tree stmt, block_stmt *** 1828,1967 **** alignment_support_cheme = vect_supportable_dr_alignment (dr); gcc_assert (alignment_support_cheme); ! if (alignment_support_cheme == dr_aligned ! || alignment_support_cheme == dr_unaligned_supported) ! { ! /* Create: p = initial_addr; indx = 0; loop { vec_dest = *(p); indx = indx + 1; } - */ ! vec_dest = vect_create_destination_var (scalar_dest, vectype); ! data_ref = vect_create_data_ref_ptr (stmt, bsi, NULL_TREE, &dummy, false); ! if (aligned_access_p (dr)) ! data_ref = build_fold_indirect_ref (data_ref); ! else ! { ! int mis = DR_MISALIGNMENT (dr); ! tree tmis = (mis == -1 ? size_zero_node : size_int (mis)); ! tmis = size_binop (MULT_EXPR, tmis, size_int(BITS_PER_UNIT)); ! data_ref = build2 (MISALIGNED_INDIRECT_REF, vectype, data_ref, tmis); ! } ! new_stmt = build2 (MODIFY_EXPR, vectype, vec_dest, data_ref); ! new_temp = make_ssa_name (vec_dest, new_stmt); ! TREE_OPERAND (new_stmt, 0) = new_temp; ! vect_finish_stmt_generation (stmt, new_stmt, bsi); ! copy_virtual_operands (new_stmt, stmt); ! } ! 
else if (alignment_support_cheme == dr_unaligned_software_pipeline) { ! /* Create: ! p1 = initial_addr; ! msq_init = *(floor(p1)) ! p2 = initial_addr + VS - 1; ! magic = have_builtin ? builtin_result : initial_address; ! indx = 0; ! loop { ! p2' = p2 + indx * vectype_size ! lsq = *(floor(p2')) ! vec_dest = realign_load (msq, lsq, magic) ! indx = indx + 1; ! msq = lsq; ! } ! */ ! ! tree offset; ! tree magic; ! tree phi_stmt; ! tree msq_init; ! tree msq, lsq; ! tree dataref_ptr; ! tree params; ! /* <1> Create msq_init = *(floor(p1)) in the loop preheader */ ! vec_dest = vect_create_destination_var (scalar_dest, vectype); ! data_ref = vect_create_data_ref_ptr (stmt, bsi, NULL_TREE, ! &init_addr, true); ! data_ref = build1 (ALIGN_INDIRECT_REF, vectype, data_ref); ! new_stmt = build2 (MODIFY_EXPR, vectype, vec_dest, data_ref); ! new_temp = make_ssa_name (vec_dest, new_stmt); ! TREE_OPERAND (new_stmt, 0) = new_temp; ! new_bb = bsi_insert_on_edge_immediate (pe, new_stmt); ! gcc_assert (!new_bb); ! msq_init = TREE_OPERAND (new_stmt, 0); ! copy_virtual_operands (new_stmt, stmt); ! update_vuses_to_preheader (new_stmt, loop); ! /* <2> Create lsq = *(floor(p2')) in the loop */ ! offset = size_int (TYPE_VECTOR_SUBPARTS (vectype) - 1); vec_dest = vect_create_destination_var (scalar_dest, vectype); - dataref_ptr = vect_create_data_ref_ptr (stmt, bsi, offset, &dummy, false); - data_ref = build1 (ALIGN_INDIRECT_REF, vectype, dataref_ptr); new_stmt = build2 (MODIFY_EXPR, vectype, vec_dest, data_ref); new_temp = make_ssa_name (vec_dest, new_stmt); TREE_OPERAND (new_stmt, 0) = new_temp; vect_finish_stmt_generation (stmt, new_stmt, bsi); - lsq = TREE_OPERAND (new_stmt, 0); copy_virtual_operands (new_stmt, stmt); ! /* <3> */ ! if (targetm.vectorize.builtin_mask_for_load) ! { ! /* Create permutation mask, if required, in loop preheader. */ ! tree builtin_decl; ! params = build_tree_list (NULL_TREE, init_addr); ! vec_dest = vect_create_destination_var (scalar_dest, vectype); ! 
builtin_decl = targetm.vectorize.builtin_mask_for_load (); ! new_stmt = build_function_call_expr (builtin_decl, params); ! new_stmt = build2 (MODIFY_EXPR, vectype, vec_dest, new_stmt); ! new_temp = make_ssa_name (vec_dest, new_stmt); ! TREE_OPERAND (new_stmt, 0) = new_temp; ! new_bb = bsi_insert_on_edge_immediate (pe, new_stmt); ! gcc_assert (!new_bb); ! magic = TREE_OPERAND (new_stmt, 0); ! ! /* The result of the CALL_EXPR to this builtin is determined from ! the value of the parameter and no global variables are touched ! which makes the builtin a "const" function. Requiring the ! builtin to have the "const" attribute makes it unnecessary ! to call mark_call_clobbered. */ ! gcc_assert (TREE_READONLY (builtin_decl)); ! } else ! { ! /* Use current address instead of init_addr for reduced reg pressure. ! */ ! magic = dataref_ptr; ! } ! ! ! /* <4> Create msq = phi in loop */ ! vec_dest = vect_create_destination_var (scalar_dest, vectype); ! msq = make_ssa_name (vec_dest, NULL_TREE); ! phi_stmt = create_phi_node (msq, loop->header); /* CHECKME */ ! SSA_NAME_DEF_STMT (msq) = phi_stmt; ! add_phi_arg (phi_stmt, msq_init, loop_preheader_edge (loop)); ! add_phi_arg (phi_stmt, lsq, loop_latch_edge (loop)); ! ! ! /* <5> Create in loop */ ! vec_dest = vect_create_destination_var (scalar_dest, vectype); ! new_stmt = build3 (REALIGN_LOAD_EXPR, vectype, msq, lsq, magic); ! new_stmt = build2 (MODIFY_EXPR, vectype, vec_dest, new_stmt); ! new_temp = make_ssa_name (vec_dest, new_stmt); ! TREE_OPERAND (new_stmt, 0) = new_temp; ! vect_finish_stmt_generation (stmt, new_stmt, bsi); } - else - gcc_unreachable (); - *vec_stmt = new_stmt; return true; } --- 2637,2784 ---- alignment_support_cheme = vect_supportable_dr_alignment (dr); gcc_assert (alignment_support_cheme); ! /* In case the vectorization factor (VF) is bigger than the number ! of elements that we can fit in a vectype (nunits), we have to generate ! more than one vector stmt - i.e - we need to "unroll" the ! 
vector stmt by a factor VF/nunits. In doing so, we record a pointer ! from one copy of the vector stmt to the next, in the field ! STMT_VINFO_RELATED_STMT. This is necessary in order to allow following ! stages to find the correct vector defs to be used when vectorizing ! stmts that use the defs of the current stmt. The example below illustrates ! the vectorization process when VF=16 and nunits=4 (i.e - we need to create ! 4 vectorized stmts): ! ! before vectorization: ! RELATED_STMT VEC_STMT ! S1: x = memref - - ! S2: z = x + 1 - - ! ! step 1: vectorize stmt S1: ! We first create the vector stmt VS1_0, and, as usual, record a ! pointer to it in the STMT_VINFO_VEC_STMT of the scalar stmt S1. ! Next, we create the vector stmt VS1_1, and record a pointer to ! it in the STMT_VINFO_RELATED_STMT of the vector stmt VS1_0. ! Similarly, for VS1_2 and VS1_3. This is the resulting chain of ! stmts and pointers: ! RELATED_STMT VEC_STMT ! VS1_0: vx0 = memref0 VS1_1 - ! VS1_1: vx1 = memref1 VS1_2 - ! VS1_2: vx2 = memref2 VS1_3 - ! VS1_3: vx3 = memref3 - - ! S1: x = load - VS1_0 ! S2: z = x + 1 - - ! ! See in documentation in vect_get_vec_def_for_stmt_copy for how the ! information we recorded in RELATED_STMT field is used to vectorize ! stmt S2. */ ! ! /* If the data reference is aligned (dr_aligned) or potentially unaligned ! on a target that supports unaligned accesses (dr_unaligned_supported) ! we generate the following code: p = initial_addr; indx = 0; loop { + p = p + indx * vectype_size; vec_dest = *(p); indx = indx + 1; } ! Otherwise, the data reference is potentially unaligned on a target that ! does not support unaligned accesses (dr_unaligned_software_pipeline) - ! then generate the following code, in which the data in each iteration is ! obtained by two vector loads, one from the previous iteration, and one ! from the current iteration: ! p1 = initial_addr; ! msq_init = *(floor(p1)) ! p2 = initial_addr + VS - 1; ! realignment_token = call target_builtin; ! 
indx = 0; ! loop { ! p2 = p2 + indx * vectype_size ! lsq = *(floor(p2)) ! vec_dest = realign_load (msq, lsq, realignment_token) ! indx = indx + 1; ! msq = lsq; ! } ! */ ! ! if (alignment_support_cheme == dr_unaligned_software_pipeline) { ! msq = vect_setup_realignment (stmt, bsi, &realignment_token); ! phi_stmt = SSA_NAME_DEF_STMT (msq); ! offset = size_int (TYPE_VECTOR_SUBPARTS (vectype) - 1); ! } ! prev_stmt_info = NULL; ! for (j = 0; j < ncopies; j++) ! { ! /* 1. Create the vector pointer update chain. */ ! if (j == 0) ! dataref_ptr = vect_create_data_ref_ptr (stmt, bsi, offset, ! &dummy, &ptr_incr, false); ! else ! dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, bsi, stmt); + /* 2. Create the vector-load in the loop. */ + switch (alignment_support_cheme) + { + case dr_aligned: + gcc_assert (aligned_access_p (dr)); + data_ref = build_fold_indirect_ref (dataref_ptr); + break; + case dr_unaligned_supported: + { + int mis = DR_MISALIGNMENT (dr); + tree tmis = (mis == -1 ? size_zero_node : size_int (mis)); ! gcc_assert (!aligned_access_p (dr)); ! tmis = size_binop (MULT_EXPR, tmis, size_int(BITS_PER_UNIT)); ! data_ref = ! build2 (MISALIGNED_INDIRECT_REF, vectype, dataref_ptr, tmis); ! break; ! } ! case dr_unaligned_software_pipeline: ! gcc_assert (!aligned_access_p (dr)); ! data_ref = build1 (ALIGN_INDIRECT_REF, vectype, dataref_ptr); ! break; ! default: ! gcc_unreachable (); ! } vec_dest = vect_create_destination_var (scalar_dest, vectype); new_stmt = build2 (MODIFY_EXPR, vectype, vec_dest, data_ref); new_temp = make_ssa_name (vec_dest, new_stmt); TREE_OPERAND (new_stmt, 0) = new_temp; vect_finish_stmt_generation (stmt, new_stmt, bsi); copy_virtual_operands (new_stmt, stmt); + mark_new_vars_to_rename (new_stmt); + /* 3. Handle explicit realignment if necessary/supported. 
*/ + if (alignment_support_cheme == dr_unaligned_software_pipeline) + { + /* Create in loop: + */ + lsq = TREE_OPERAND (new_stmt, 0); + if (!realignment_token) + realignment_token = dataref_ptr; + vec_dest = vect_create_destination_var (scalar_dest, vectype); + new_stmt = + build3 (REALIGN_LOAD_EXPR, vectype, msq, lsq, realignment_token); + new_stmt = build2 (MODIFY_EXPR, vectype, vec_dest, new_stmt); + new_temp = make_ssa_name (vec_dest, new_stmt); + TREE_OPERAND (new_stmt, 0) = new_temp; + vect_finish_stmt_generation (stmt, new_stmt, bsi); + if (j == ncopies - 1) + add_phi_arg (phi_stmt, lsq, loop_latch_edge (loop)); + msq = lsq; + } ! if (j == 0) ! STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt; else ! STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt; ! prev_stmt_info = vinfo_for_stmt (new_stmt); } return true; } *************** vectorizable_condition (tree stmt, block *** 2091,2096 **** --- 2908,2919 ---- enum machine_mode vec_mode; tree def; enum vect_def_type dt; + int nunits = TYPE_VECTOR_SUBPARTS (vectype); + int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits; + + gcc_assert (ncopies >= 1); + if (ncopies > 1) + return false; /* FORNOW */ if (!STMT_VINFO_RELEVANT_P (stmt_info)) return false; *************** vect_transform_stmt (tree stmt, block_st *** 2201,2206 **** --- 3024,3039 ---- { switch (STMT_VINFO_TYPE (stmt_info)) { + case type_demotion_vec_info_type: + done = vectorizable_type_demotion (stmt, bsi, &vec_stmt); + gcc_assert (done); + break; + + case type_promotion_vec_info_type: + done = vectorizable_type_promotion (stmt, bsi, &vec_stmt); + gcc_assert (done); + break; + case op_vec_info_type: done = vectorizable_operation (stmt, bsi, &vec_stmt); gcc_assert (done); *************** vect_transform_stmt (tree stmt, block_st *** 2267,2278 **** done = vectorizable_live_operation (stmt, bsi, &vec_stmt); gcc_assert (done); } - - if (vec_stmt) - { - gcc_assert (!STMT_VINFO_VEC_STMT (stmt_info)); - STMT_VINFO_VEC_STMT (stmt_info) = vec_stmt; 
- } } return is_store; --- 3100,3105 ---- *************** vect_gen_niters_for_prolog_loop (loop_ve *** 2678,2684 **** if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0) { int byte_misalign = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo); ! int element_size = vectype_align/vf; int elem_misalign = byte_misalign / element_size; if (vect_print_dump_info (REPORT_DETAILS)) --- 3505,3511 ---- if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0) { int byte_misalign = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo); ! int element_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr)))); int elem_misalign = byte_misalign / element_size; if (vect_print_dump_info (REPORT_DETAILS)) *************** vect_transform_loop (loop_vec_info loop_ *** 3098,3108 **** bsi_next (&si); continue; } ! /* FORNOW: Verify that all stmts operate on the same number of ! units and no inner unrolling is necessary. */ ! gcc_assert ! (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)) ! == (unsigned HOST_WIDE_INT) vectorization_factor); /* -------- vectorize statement ------------ */ if (vect_print_dump_info (REPORT_DETAILS)) --- 3925,3935 ---- bsi_next (&si); continue; } ! ! if ((TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)) ! != (unsigned HOST_WIDE_INT) vectorization_factor) ! && vect_print_dump_info (REPORT_DETAILS)) ! fprintf (vect_dump, "multiple-types."); /* -------- vectorize statement ------------ */ if (vect_print_dump_info (REPORT_DETAILS)) Index: tree-vect-generic.c =================================================================== *** tree-vect-generic.c (revision 115817) --- tree-vect-generic.c (working copy) *************** expand_vector_operations_1 (block_stmt_i *** 411,419 **** gcc_assert (code != CONVERT_EXPR); op = optab_for_tree_code (code, type); ! /* For widening vector operations, the relevant type is of the arguments, ! not the widened result. */ ! 
if (code == WIDEN_SUM_EXPR) type = TREE_TYPE (TREE_OPERAND (rhs, 0)); /* Optabs will try converting a negation into a subtraction, so --- 411,425 ---- gcc_assert (code != CONVERT_EXPR); op = optab_for_tree_code (code, type); ! /* For widening/narrowing vector operations, the relevant type is of the ! arguments, not the widened result. */ ! if (code == WIDEN_SUM_EXPR ! || code == VEC_WIDEN_MULT_HI_EXPR ! || code == VEC_WIDEN_MULT_LO_EXPR ! || code == VEC_UNPACK_HI_EXPR ! || code == VEC_UNPACK_LO_EXPR ! || code == VEC_PACK_MOD_EXPR ! || code == VEC_PACK_SAT_EXPR) type = TREE_TYPE (TREE_OPERAND (rhs, 0)); /* Optabs will try converting a negation into a subtraction, so Index: config/rs6000/rs6000.c =================================================================== *** config/rs6000/rs6000.c (revision 115817) --- config/rs6000/rs6000.c (working copy) *************** static int pad_groups (FILE *, int, rtx, *** 663,668 **** --- 663,670 ---- static void rs6000_sched_finish (FILE *, int); static int rs6000_use_sched_lookahead (void); static tree rs6000_builtin_mask_for_load (void); + static tree rs6000_builtin_mul_widen_even (tree); + static tree rs6000_builtin_mul_widen_odd (tree); static void def_builtin (int, const char *, tree, int); static void rs6000_init_builtins (void); *************** static const char alt_reg_names[][8] = *** 916,921 **** --- 918,927 ---- #undef TARGET_VECTORIZE_BUILTIN_MASK_FOR_LOAD #define TARGET_VECTORIZE_BUILTIN_MASK_FOR_LOAD rs6000_builtin_mask_for_load + #undef TARGET_VECTORIZE_BUILTIN_MUL_WIDEN_EVEN + #define TARGET_VECTORIZE_BUILTIN_MUL_WIDEN_EVEN rs6000_builtin_mul_widen_even + #undef TARGET_VECTORIZE_BUILTIN_MUL_WIDEN_ODD + #define TARGET_VECTORIZE_BUILTIN_MUL_WIDEN_ODD rs6000_builtin_mul_widen_odd #undef TARGET_INIT_BUILTINS #define TARGET_INIT_BUILTINS rs6000_init_builtins *************** rs6000_builtin_mask_for_load (void) *** 1584,1589 **** --- 1590,1639 ---- return 0; } + /* Implement targetm.vectorize.builtin_mul_widen_even. 
*/ + static tree + rs6000_builtin_mul_widen_even (tree type) + { + if (!TARGET_ALTIVEC) + return NULL_TREE; + + switch (TYPE_MODE (type)) + { + case V8HImode: + return TYPE_UNSIGNED (type) ? + rs6000_builtin_decls[ALTIVEC_BUILTIN_VMULEUH] : + rs6000_builtin_decls[ALTIVEC_BUILTIN_VMULESH]; + case V16QImode: + return TYPE_UNSIGNED (type) ? + rs6000_builtin_decls[ALTIVEC_BUILTIN_VMULEUB] : + rs6000_builtin_decls[ALTIVEC_BUILTIN_VMULESB]; + default: + return NULL_TREE; + } + } + + /* Implement targetm.vectorize.builtin_mul_widen_odd. */ + static tree + rs6000_builtin_mul_widen_odd (tree type) + { + if (!TARGET_ALTIVEC) + return NULL_TREE; + + switch (TYPE_MODE (type)) + { + case V8HImode: + return TYPE_UNSIGNED (type) ? + rs6000_builtin_decls[ALTIVEC_BUILTIN_VMULOUH] : + rs6000_builtin_decls[ALTIVEC_BUILTIN_VMULOSH]; + case V16QImode: + return TYPE_UNSIGNED (type) ? + rs6000_builtin_decls[ALTIVEC_BUILTIN_VMULOUB] : + rs6000_builtin_decls[ALTIVEC_BUILTIN_VMULOSB]; + default: + return NULL_TREE; + } + } + /* Handle generic options of the form -mfoo=yes/no. NAME is the option name. VALUE is the option value. 
Index: config/rs6000/altivec.md =================================================================== *** config/rs6000/altivec.md (revision 115817) --- config/rs6000/altivec.md (working copy) *************** *** 122,127 **** --- 122,141 ---- (UNSPEC_VCONDU_V4SI 305) (UNSPEC_VCONDU_V8HI 306) (UNSPEC_VCONDU_V16QI 307) + (UNSPEC_VMULWHUB 308) + (UNSPEC_VMULWLUB 309) + (UNSPEC_VMULWHSB 310) + (UNSPEC_VMULWLSB 311) + (UNSPEC_VMULWHUH 312) + (UNSPEC_VMULWLUH 313) + (UNSPEC_VMULWHSH 314) + (UNSPEC_VMULWLSH 315) + (UNSPEC_VUPKHUB 316) + (UNSPEC_VUPKHUH 317) + (UNSPEC_VUPKLUB 318) + (UNSPEC_VUPKLUH 319) + (UNSPEC_VPERMSI 320) + (UNSPEC_VPERMHI 321) ]) (define_constants *************** *** 2203,2208 **** --- 2217,2587 ---- DONE; }") + (define_expand "vec_unpacks_hi_v16qi" + [(set (match_operand:V8HI 0 "register_operand" "=v") + (unspec:V8HI [(match_operand:V16QI 1 "register_operand" "v")] + UNSPEC_VUPKHSB))] + "TARGET_ALTIVEC" + " + { + emit_insn (gen_altivec_vupkhsb (operands[0], operands[1])); + DONE; + }") + + (define_expand "vec_unpacks_hi_v8hi" + [(set (match_operand:V4SI 0 "register_operand" "=v") + (unspec:V4SI [(match_operand:V8HI 1 "register_operand" "v")] + UNSPEC_VUPKHSH))] + "TARGET_ALTIVEC" + " + { + emit_insn (gen_altivec_vupkhsh (operands[0], operands[1])); + DONE; + }") + + (define_expand "vec_unpacks_lo_v16qi" + [(set (match_operand:V8HI 0 "register_operand" "=v") + (unspec:V8HI [(match_operand:V16QI 1 "register_operand" "v")] + UNSPEC_VUPKLSB))] + "TARGET_ALTIVEC" + " + { + emit_insn (gen_altivec_vupklsb (operands[0], operands[1])); + DONE; + }") + + (define_expand "vec_unpacks_lo_v8hi" + [(set (match_operand:V4SI 0 "register_operand" "=v") + (unspec:V4SI [(match_operand:V8HI 1 "register_operand" "v")] + UNSPEC_VUPKLSH))] + "TARGET_ALTIVEC" + " + { + emit_insn (gen_altivec_vupklsh (operands[0], operands[1])); + DONE; + }") + + (define_insn "vperm_v8hiv4si" + [(set (match_operand:V4SI 0 "register_operand" "=v") + (unspec:V4SI [(match_operand:V8HI 1 
"register_operand" "v") + (match_operand:V4SI 2 "register_operand" "v") + (match_operand:V16QI 3 "register_operand" "v")] + UNSPEC_VPERMSI))] + "TARGET_ALTIVEC" + "vperm %0,%1,%2,%3" + [(set_attr "type" "vecperm")]) + + (define_insn "vperm_v16qiv8hi" + [(set (match_operand:V8HI 0 "register_operand" "=v") + (unspec:V8HI [(match_operand:V16QI 1 "register_operand" "v") + (match_operand:V8HI 2 "register_operand" "v") + (match_operand:V16QI 3 "register_operand" "v")] + UNSPEC_VPERMHI))] + "TARGET_ALTIVEC" + "vperm %0,%1,%2,%3" + [(set_attr "type" "vecperm")]) + + + (define_expand "vec_unpacku_hi_v16qi" + [(set (match_operand:V8HI 0 "register_operand" "=v") + (unspec:V8HI [(match_operand:V16QI 1 "register_operand" "v")] + UNSPEC_VUPKHUB))] + "TARGET_ALTIVEC" + " + { + rtx vzero = gen_reg_rtx (V8HImode); + rtx mask = gen_reg_rtx (V16QImode); + rtvec v = rtvec_alloc (16); + + emit_insn (gen_altivec_vspltish (vzero, const0_rtx)); + + RTVEC_ELT (v, 0) = gen_rtx_CONST_INT (QImode, 16); + RTVEC_ELT (v, 1) = gen_rtx_CONST_INT (QImode, 0); + RTVEC_ELT (v, 2) = gen_rtx_CONST_INT (QImode, 16); + RTVEC_ELT (v, 3) = gen_rtx_CONST_INT (QImode, 1); + RTVEC_ELT (v, 4) = gen_rtx_CONST_INT (QImode, 16); + RTVEC_ELT (v, 5) = gen_rtx_CONST_INT (QImode, 2); + RTVEC_ELT (v, 6) = gen_rtx_CONST_INT (QImode, 16); + RTVEC_ELT (v, 7) = gen_rtx_CONST_INT (QImode, 3); + RTVEC_ELT (v, 8) = gen_rtx_CONST_INT (QImode, 16); + RTVEC_ELT (v, 9) = gen_rtx_CONST_INT (QImode, 4); + RTVEC_ELT (v, 10) = gen_rtx_CONST_INT (QImode, 16); + RTVEC_ELT (v, 11) = gen_rtx_CONST_INT (QImode, 5); + RTVEC_ELT (v, 12) = gen_rtx_CONST_INT (QImode, 16); + RTVEC_ELT (v, 13) = gen_rtx_CONST_INT (QImode, 6); + RTVEC_ELT (v, 14) = gen_rtx_CONST_INT (QImode, 16); + RTVEC_ELT (v, 15) = gen_rtx_CONST_INT (QImode, 7); + + emit_insn (gen_vec_initv16qi (mask, gen_rtx_PARALLEL (V16QImode, v))); + emit_insn (gen_vperm_v16qiv8hi (operands[0], operands[1], vzero, mask)); + DONE; + }") + + (define_expand "vec_unpacku_hi_v8hi" + [(set 
(match_operand:V4SI 0 "register_operand" "=v") + (unspec:V4SI [(match_operand:V8HI 1 "register_operand" "v")] + UNSPEC_VUPKHUH))] + "TARGET_ALTIVEC" + " + { + rtx vzero = gen_reg_rtx (V4SImode); + rtx mask = gen_reg_rtx (V16QImode); + rtvec v = rtvec_alloc (16); + + emit_insn (gen_altivec_vspltisw (vzero, const0_rtx)); + + RTVEC_ELT (v, 0) = gen_rtx_CONST_INT (QImode, 16); + RTVEC_ELT (v, 1) = gen_rtx_CONST_INT (QImode, 17); + RTVEC_ELT (v, 2) = gen_rtx_CONST_INT (QImode, 0); + RTVEC_ELT (v, 3) = gen_rtx_CONST_INT (QImode, 1); + RTVEC_ELT (v, 4) = gen_rtx_CONST_INT (QImode, 16); + RTVEC_ELT (v, 5) = gen_rtx_CONST_INT (QImode, 17); + RTVEC_ELT (v, 6) = gen_rtx_CONST_INT (QImode, 2); + RTVEC_ELT (v, 7) = gen_rtx_CONST_INT (QImode, 3); + RTVEC_ELT (v, 8) = gen_rtx_CONST_INT (QImode, 16); + RTVEC_ELT (v, 9) = gen_rtx_CONST_INT (QImode, 17); + RTVEC_ELT (v, 10) = gen_rtx_CONST_INT (QImode, 4); + RTVEC_ELT (v, 11) = gen_rtx_CONST_INT (QImode, 5); + RTVEC_ELT (v, 12) = gen_rtx_CONST_INT (QImode, 16); + RTVEC_ELT (v, 13) = gen_rtx_CONST_INT (QImode, 17); + RTVEC_ELT (v, 14) = gen_rtx_CONST_INT (QImode, 6); + RTVEC_ELT (v, 15) = gen_rtx_CONST_INT (QImode, 7); + + emit_insn (gen_vec_initv16qi (mask, gen_rtx_PARALLEL (V16QImode, v))); + emit_insn (gen_vperm_v8hiv4si (operands[0], operands[1], vzero, mask)); + DONE; + }") + + (define_expand "vec_unpacku_lo_v16qi" + [(set (match_operand:V8HI 0 "register_operand" "=v") + (unspec:V8HI [(match_operand:V16QI 1 "register_operand" "v")] + UNSPEC_VUPKLUB))] + "TARGET_ALTIVEC" + " + { + rtx vzero = gen_reg_rtx (V8HImode); + rtx mask = gen_reg_rtx (V16QImode); + rtvec v = rtvec_alloc (16); + + emit_insn (gen_altivec_vspltish (vzero, const0_rtx)); + + RTVEC_ELT (v, 0) = gen_rtx_CONST_INT (QImode, 16); + RTVEC_ELT (v, 1) = gen_rtx_CONST_INT (QImode, 8); + RTVEC_ELT (v, 2) = gen_rtx_CONST_INT (QImode, 16); + RTVEC_ELT (v, 3) = gen_rtx_CONST_INT (QImode, 9); + RTVEC_ELT (v, 4) = gen_rtx_CONST_INT (QImode, 16); + RTVEC_ELT (v, 5) = 
gen_rtx_CONST_INT (QImode, 10); + RTVEC_ELT (v, 6) = gen_rtx_CONST_INT (QImode, 16); + RTVEC_ELT (v, 7) = gen_rtx_CONST_INT (QImode, 11); + RTVEC_ELT (v, 8) = gen_rtx_CONST_INT (QImode, 16); + RTVEC_ELT (v, 9) = gen_rtx_CONST_INT (QImode, 12); + RTVEC_ELT (v, 10) = gen_rtx_CONST_INT (QImode, 16); + RTVEC_ELT (v, 11) = gen_rtx_CONST_INT (QImode, 13); + RTVEC_ELT (v, 12) = gen_rtx_CONST_INT (QImode, 16); + RTVEC_ELT (v, 13) = gen_rtx_CONST_INT (QImode, 14); + RTVEC_ELT (v, 14) = gen_rtx_CONST_INT (QImode, 16); + RTVEC_ELT (v, 15) = gen_rtx_CONST_INT (QImode, 15); + + emit_insn (gen_vec_initv16qi (mask, gen_rtx_PARALLEL (V16QImode, v))); + emit_insn (gen_vperm_v16qiv8hi (operands[0], operands[1], vzero, mask)); + DONE; + }") + + (define_expand "vec_unpacku_lo_v8hi" + [(set (match_operand:V4SI 0 "register_operand" "=v") + (unspec:V4SI [(match_operand:V8HI 1 "register_operand" "v")] + UNSPEC_VUPKLUH))] + "TARGET_ALTIVEC" + " + { + rtx vzero = gen_reg_rtx (V4SImode); + rtx mask = gen_reg_rtx (V16QImode); + rtvec v = rtvec_alloc (16); + + emit_insn (gen_altivec_vspltisw (vzero, const0_rtx)); + + RTVEC_ELT (v, 0) = gen_rtx_CONST_INT (QImode, 16); + RTVEC_ELT (v, 1) = gen_rtx_CONST_INT (QImode, 17); + RTVEC_ELT (v, 2) = gen_rtx_CONST_INT (QImode, 8); + RTVEC_ELT (v, 3) = gen_rtx_CONST_INT (QImode, 9); + RTVEC_ELT (v, 4) = gen_rtx_CONST_INT (QImode, 16); + RTVEC_ELT (v, 5) = gen_rtx_CONST_INT (QImode, 17); + RTVEC_ELT (v, 6) = gen_rtx_CONST_INT (QImode, 10); + RTVEC_ELT (v, 7) = gen_rtx_CONST_INT (QImode, 11); + RTVEC_ELT (v, 8) = gen_rtx_CONST_INT (QImode, 16); + RTVEC_ELT (v, 9) = gen_rtx_CONST_INT (QImode, 17); + RTVEC_ELT (v, 10) = gen_rtx_CONST_INT (QImode, 12); + RTVEC_ELT (v, 11) = gen_rtx_CONST_INT (QImode, 13); + RTVEC_ELT (v, 12) = gen_rtx_CONST_INT (QImode, 16); + RTVEC_ELT (v, 13) = gen_rtx_CONST_INT (QImode, 17); + RTVEC_ELT (v, 14) = gen_rtx_CONST_INT (QImode, 14); + RTVEC_ELT (v, 15) = gen_rtx_CONST_INT (QImode, 15); + + emit_insn (gen_vec_initv16qi (mask, 
gen_rtx_PARALLEL (V16QImode, v))); + emit_insn (gen_vperm_v8hiv4si (operands[0], operands[1], vzero, mask)); + DONE; + }") + + (define_expand "vec_widen_umult_hi_v16qi" + [(set (match_operand:V8HI 0 "register_operand" "=v") + (unspec:V8HI [(match_operand:V16QI 1 "register_operand" "v") + (match_operand:V16QI 2 "register_operand" "v")] + UNSPEC_VMULWHUB))] + "TARGET_ALTIVEC" + " + { + rtx ve = gen_reg_rtx (V8HImode); + rtx vo = gen_reg_rtx (V8HImode); + + emit_insn (gen_altivec_vmuleub (ve, operands[1], operands[2])); + emit_insn (gen_altivec_vmuloub (vo, operands[1], operands[2])); + emit_insn (gen_altivec_vmrghh (operands[0], ve, vo)); + DONE; + }") + + (define_expand "vec_widen_umult_lo_v16qi" + [(set (match_operand:V8HI 0 "register_operand" "=v") + (unspec:V8HI [(match_operand:V16QI 1 "register_operand" "v") + (match_operand:V16QI 2 "register_operand" "v")] + UNSPEC_VMULWLUB))] + "TARGET_ALTIVEC" + " + { + rtx ve = gen_reg_rtx (V8HImode); + rtx vo = gen_reg_rtx (V8HImode); + + emit_insn (gen_altivec_vmuleub (ve, operands[1], operands[2])); + emit_insn (gen_altivec_vmuloub (vo, operands[1], operands[2])); + emit_insn (gen_altivec_vmrglh (operands[0], ve, vo)); + DONE; + }") + + (define_expand "vec_widen_smult_hi_v16qi" + [(set (match_operand:V8HI 0 "register_operand" "=v") + (unspec:V8HI [(match_operand:V16QI 1 "register_operand" "v") + (match_operand:V16QI 2 "register_operand" "v")] + UNSPEC_VMULWHSB))] + "TARGET_ALTIVEC" + " + { + rtx ve = gen_reg_rtx (V8HImode); + rtx vo = gen_reg_rtx (V8HImode); + + emit_insn (gen_altivec_vmulesb (ve, operands[1], operands[2])); + emit_insn (gen_altivec_vmulosb (vo, operands[1], operands[2])); + emit_insn (gen_altivec_vmrghh (operands[0], ve, vo)); + DONE; + }") + + (define_expand "vec_widen_smult_lo_v16qi" + [(set (match_operand:V8HI 0 "register_operand" "=v") + (unspec:V8HI [(match_operand:V16QI 1 "register_operand" "v") + (match_operand:V16QI 2 "register_operand" "v")] + UNSPEC_VMULWLSB))] + "TARGET_ALTIVEC" + " + { + rtx 
ve = gen_reg_rtx (V8HImode); + rtx vo = gen_reg_rtx (V8HImode); + + emit_insn (gen_altivec_vmulesb (ve, operands[1], operands[2])); + emit_insn (gen_altivec_vmulosb (vo, operands[1], operands[2])); + emit_insn (gen_altivec_vmrglh (operands[0], ve, vo)); + DONE; + }") + + (define_expand "vec_widen_umult_hi_v8hi" + [(set (match_operand:V4SI 0 "register_operand" "=v") + (unspec:V4SI [(match_operand:V8HI 1 "register_operand" "v") + (match_operand:V8HI 2 "register_operand" "v")] + UNSPEC_VMULWHUH))] + "TARGET_ALTIVEC" + " + { + rtx ve = gen_reg_rtx (V4SImode); + rtx vo = gen_reg_rtx (V4SImode); + + emit_insn (gen_altivec_vmuleuh (ve, operands[1], operands[2])); + emit_insn (gen_altivec_vmulouh (vo, operands[1], operands[2])); + emit_insn (gen_altivec_vmrghw (operands[0], ve, vo)); + DONE; + }") + + (define_expand "vec_widen_umult_lo_v8hi" + [(set (match_operand:V4SI 0 "register_operand" "=v") + (unspec:V4SI [(match_operand:V8HI 1 "register_operand" "v") + (match_operand:V8HI 2 "register_operand" "v")] + UNSPEC_VMULWLUH))] + "TARGET_ALTIVEC" + " + { + rtx ve = gen_reg_rtx (V4SImode); + rtx vo = gen_reg_rtx (V4SImode); + + emit_insn (gen_altivec_vmuleuh (ve, operands[1], operands[2])); + emit_insn (gen_altivec_vmulouh (vo, operands[1], operands[2])); + emit_insn (gen_altivec_vmrglw (operands[0], ve, vo)); + DONE; + }") + + (define_expand "vec_widen_smult_hi_v8hi" + [(set (match_operand:V4SI 0 "register_operand" "=v") + (unspec:V4SI [(match_operand:V8HI 1 "register_operand" "v") + (match_operand:V8HI 2 "register_operand" "v")] + UNSPEC_VMULWHSH))] + "TARGET_ALTIVEC" + " + { + rtx ve = gen_reg_rtx (V4SImode); + rtx vo = gen_reg_rtx (V4SImode); + + emit_insn (gen_altivec_vmulesh (ve, operands[1], operands[2])); + emit_insn (gen_altivec_vmulosh (vo, operands[1], operands[2])); + emit_insn (gen_altivec_vmrghw (operands[0], ve, vo)); + DONE; + }") + + (define_expand "vec_widen_smult_lo_v8hi" + [(set (match_operand:V4SI 0 "register_operand" "=v") + (unspec:V4SI 
[(match_operand:V8HI 1 "register_operand" "v") + (match_operand:V8HI 2 "register_operand" "v")] + UNSPEC_VMULWLSH))] + "TARGET_ALTIVEC" + " + { + rtx ve = gen_reg_rtx (V4SImode); + rtx vo = gen_reg_rtx (V4SImode); + + emit_insn (gen_altivec_vmulesh (ve, operands[1], operands[2])); + emit_insn (gen_altivec_vmulosh (vo, operands[1], operands[2])); + emit_insn (gen_altivec_vmrglw (operands[0], ve, vo)); + DONE; + }") + + (define_expand "vec_pack_mod_v8hi" + [(set (match_operand:V16QI 0 "register_operand" "=v") + (unspec:V16QI [(match_operand:V8HI 1 "register_operand" "v") + (match_operand:V8HI 2 "register_operand" "v")] + UNSPEC_VPKUHUM))] + "TARGET_ALTIVEC" + " + { + emit_insn (gen_altivec_vpkuhum (operands[0], operands[1], operands[2])); + DONE; + }") + + (define_expand "vec_pack_mod_v4si" + [(set (match_operand:V8HI 0 "register_operand" "=v") + (unspec:V8HI [(match_operand:V4SI 1 "register_operand" "v") + (match_operand:V4SI 2 "register_operand" "v")] + UNSPEC_VPKUWUM))] + "TARGET_ALTIVEC" + " + { + emit_insn (gen_altivec_vpkuwum (operands[0], operands[1], operands[2])); + DONE; + }") + (define_expand "negv4sf2" [(use (match_operand:V4SF 0 "register_operand" "")) (use (match_operand:V4SF 1 "register_operand" ""))]