gcc.git / gcc/tree-vect-slp.cc
(blob at commit: vect: Move VMAT_LOAD_STORE_LANES handlings from final loop nest)
1 /* SLP - Basic Block Vectorization
2 Copyright (C) 2007-2023 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #define INCLUDE_ALGORITHM
24 #include "system.h"
25 #include "coretypes.h"
26 #include "backend.h"
27 #include "target.h"
28 #include "rtl.h"
29 #include "tree.h"
30 #include "gimple.h"
31 #include "tree-pass.h"
32 #include "ssa.h"
33 #include "optabs-tree.h"
34 #include "insn-config.h"
35 #include "recog.h" /* FIXME: for insn_data */
36 #include "fold-const.h"
37 #include "stor-layout.h"
38 #include "gimple-iterator.h"
39 #include "cfgloop.h"
40 #include "tree-vectorizer.h"
41 #include "langhooks.h"
42 #include "gimple-walk.h"
43 #include "dbgcnt.h"
44 #include "tree-vector-builder.h"
45 #include "vec-perm-indices.h"
46 #include "gimple-fold.h"
47 #include "internal-fn.h"
48 #include "dump-context.h"
49 #include "cfganal.h"
50 #include "tree-eh.h"
51 #include "tree-cfg.h"
52 #include "alloc-pool.h"
53 #include "sreal.h"
54 #include "predict.h"
55
56 static bool vect_transform_slp_perm_load_1 (vec_info *, slp_tree,
57 load_permutation_t &,
58 const vec<tree> &,
59 gimple_stmt_iterator *,
60 poly_uint64, bool, bool,
61 unsigned *,
62 unsigned * = nullptr,
63 bool = false);
64 static int vectorizable_slp_permutation_1 (vec_info *, gimple_stmt_iterator *,
65 slp_tree, lane_permutation_t &,
66 vec<slp_tree> &, bool);
67 static bool vectorizable_slp_permutation (vec_info *, gimple_stmt_iterator *,
68 slp_tree, stmt_vector_for_cost *);
69 static void vect_print_slp_tree (dump_flags_t, dump_location_t, slp_tree);
70
71 static object_allocator<_slp_tree> *slp_tree_pool;
72 static slp_tree slp_first_node;
73
74 void
75 vect_slp_init (void)
76 {
77 slp_tree_pool = new object_allocator<_slp_tree> ("SLP nodes");
78 }
79
80 void
81 vect_slp_fini (void)
82 {
83 while (slp_first_node)
84 delete slp_first_node;
85 delete slp_tree_pool;
86 slp_tree_pool = NULL;
87 }
88
89 void *
90 _slp_tree::operator new (size_t n)
91 {
92 gcc_assert (n == sizeof (_slp_tree));
93 return slp_tree_pool->allocate_raw ();
94 }
95
96 void
97 _slp_tree::operator delete (void *node, size_t n)
98 {
99 gcc_assert (n == sizeof (_slp_tree));
100 slp_tree_pool->remove_raw (node);
101 }
102
103
104 /* Initialize a SLP node. */
105
106 _slp_tree::_slp_tree ()
107 {
108 this->prev_node = NULL;
109 if (slp_first_node)
110 slp_first_node->prev_node = this;
111 this->next_node = slp_first_node;
112 slp_first_node = this;
113 SLP_TREE_SCALAR_STMTS (this) = vNULL;
114 SLP_TREE_SCALAR_OPS (this) = vNULL;
115 SLP_TREE_VEC_DEFS (this) = vNULL;
116 SLP_TREE_NUMBER_OF_VEC_STMTS (this) = 0;
117 SLP_TREE_CHILDREN (this) = vNULL;
118 SLP_TREE_LOAD_PERMUTATION (this) = vNULL;
119 SLP_TREE_LANE_PERMUTATION (this) = vNULL;
120 SLP_TREE_DEF_TYPE (this) = vect_uninitialized_def;
121 SLP_TREE_CODE (this) = ERROR_MARK;
122 SLP_TREE_VECTYPE (this) = NULL_TREE;
123 SLP_TREE_REPRESENTATIVE (this) = NULL;
124 SLP_TREE_REF_COUNT (this) = 1;
125 this->failed = NULL;
126 this->max_nunits = 1;
127 this->lanes = 0;
128 }
129
130 /* Tear down a SLP node. */
131
132 _slp_tree::~_slp_tree ()
133 {
134 if (this->prev_node)
135 this->prev_node->next_node = this->next_node;
136 else
137 slp_first_node = this->next_node;
138 if (this->next_node)
139 this->next_node->prev_node = this->prev_node;
140 SLP_TREE_CHILDREN (this).release ();
141 SLP_TREE_SCALAR_STMTS (this).release ();
142 SLP_TREE_SCALAR_OPS (this).release ();
143 SLP_TREE_VEC_DEFS (this).release ();
144 SLP_TREE_LOAD_PERMUTATION (this).release ();
145 SLP_TREE_LANE_PERMUTATION (this).release ();
146 if (this->failed)
147 free (failed);
148 }
149
150 /* Push the single SSA definition in DEF to the vector of vector defs. */
151
152 void
153 _slp_tree::push_vec_def (gimple *def)
154 {
155 if (gphi *phi = dyn_cast <gphi *> (def))
156 vec_defs.quick_push (gimple_phi_result (phi));
157 else
158 {
159 def_operand_p defop = single_ssa_def_operand (def, SSA_OP_ALL_DEFS);
160 vec_defs.quick_push (get_def_from_ptr (defop));
161 }
162 }
163
164 /* Recursively free the memory allocated for the SLP tree rooted at NODE. */
165
166 void
167 vect_free_slp_tree (slp_tree node)
168 {
169 int i;
170 slp_tree child;
171
172 if (--SLP_TREE_REF_COUNT (node) != 0)
173 return;
174
175 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
176 if (child)
177 vect_free_slp_tree (child);
178
179 /* If the node defines any SLP only patterns then those patterns are no
180 longer valid and should be removed. */
181 stmt_vec_info rep_stmt_info = SLP_TREE_REPRESENTATIVE (node);
182 if (rep_stmt_info && STMT_VINFO_SLP_VECT_ONLY_PATTERN (rep_stmt_info))
183 {
184 stmt_vec_info stmt_info = vect_orig_stmt (rep_stmt_info);
185 STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
186 STMT_SLP_TYPE (stmt_info) = STMT_SLP_TYPE (rep_stmt_info);
187 }
188
189 delete node;
190 }
191
192 /* Return a location suitable for dumps related to the SLP instance. */
193
194 dump_user_location_t
195 _slp_instance::location () const
196 {
197 if (!root_stmts.is_empty ())
198 return root_stmts[0]->stmt;
199 else
200 return SLP_TREE_SCALAR_STMTS (root)[0]->stmt;
201 }
202
203
204 /* Free the memory allocated for the SLP instance. */
205
206 void
207 vect_free_slp_instance (slp_instance instance)
208 {
209 vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
210 SLP_INSTANCE_LOADS (instance).release ();
211 SLP_INSTANCE_ROOT_STMTS (instance).release ();
212 SLP_INSTANCE_REMAIN_DEFS (instance).release ();
213 instance->subgraph_entries.release ();
214 instance->cost_vec.release ();
215 free (instance);
216 }
217
218
219 /* Create an SLP node for SCALAR_STMTS. */
220
221 slp_tree
222 vect_create_new_slp_node (unsigned nops, tree_code code)
223 {
224 slp_tree node = new _slp_tree;
225 SLP_TREE_SCALAR_STMTS (node) = vNULL;
226 SLP_TREE_CHILDREN (node).create (nops);
227 SLP_TREE_DEF_TYPE (node) = vect_internal_def;
228 SLP_TREE_CODE (node) = code;
229 return node;
230 }
231 /* Create an SLP node for SCALAR_STMTS. */
232
233 static slp_tree
234 vect_create_new_slp_node (slp_tree node,
235 vec<stmt_vec_info> scalar_stmts, unsigned nops)
236 {
237 SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
238 SLP_TREE_CHILDREN (node).create (nops);
239 SLP_TREE_DEF_TYPE (node) = vect_internal_def;
240 SLP_TREE_REPRESENTATIVE (node) = scalar_stmts[0];
241 SLP_TREE_LANES (node) = scalar_stmts.length ();
242 return node;
243 }
244
245 /* Create an SLP node for SCALAR_STMTS. */
246
247 static slp_tree
248 vect_create_new_slp_node (vec<stmt_vec_info> scalar_stmts, unsigned nops)
249 {
250 return vect_create_new_slp_node (new _slp_tree, scalar_stmts, nops);
251 }
252
253 /* Create an SLP node for OPS. */
254
255 static slp_tree
256 vect_create_new_slp_node (slp_tree node, vec<tree> ops)
257 {
258 SLP_TREE_SCALAR_OPS (node) = ops;
259 SLP_TREE_DEF_TYPE (node) = vect_external_def;
260 SLP_TREE_LANES (node) = ops.length ();
261 return node;
262 }
263
264 /* Create an SLP node for OPS. */
265
266 static slp_tree
267 vect_create_new_slp_node (vec<tree> ops)
268 {
269 return vect_create_new_slp_node (new _slp_tree, ops);
270 }
271
272
273 /* This structure is used in creation of an SLP tree. Each instance
274 corresponds to the same operand in a group of scalar stmts in an SLP
275 node. */
276 typedef struct _slp_oprnd_info
277 {
278 /* Def-stmts for the operands. */
279 vec<stmt_vec_info> def_stmts;
280 /* Operands. */
281 vec<tree> ops;
282 /* Information about the first statement: its vector def-type, its
283 operand type, and an indication whether any of the defs is a
284 pattern stmt. */
285 tree first_op_type;
286 enum vect_def_type first_dt;
287 bool any_pattern;
288 } *slp_oprnd_info;
289
290
291 /* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
292 operand. */
293 static vec<slp_oprnd_info>
294 vect_create_oprnd_info (int nops, int group_size)
295 {
296 int i;
297 slp_oprnd_info oprnd_info;
298 vec<slp_oprnd_info> oprnds_info;
299
300 oprnds_info.create (nops);
301 for (i = 0; i < nops; i++)
302 {
303 oprnd_info = XNEW (struct _slp_oprnd_info);
304 oprnd_info->def_stmts.create (group_size);
305 oprnd_info->ops.create (group_size);
306 oprnd_info->first_dt = vect_uninitialized_def;
307 oprnd_info->first_op_type = NULL_TREE;
308 oprnd_info->any_pattern = false;
309 oprnds_info.quick_push (oprnd_info);
310 }
311
312 return oprnds_info;
313 }
314
315
316 /* Free operands info. */
317
318 static void
319 vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
320 {
321 int i;
322 slp_oprnd_info oprnd_info;
323
324 FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
325 {
326 oprnd_info->def_stmts.release ();
327 oprnd_info->ops.release ();
328 XDELETE (oprnd_info);
329 }
330
331 oprnds_info.release ();
332 }
333
334 /* Return the execution frequency of NODE (so that a higher value indicates
335 a "more important" node when optimizing for speed). */
336
337 static sreal
338 vect_slp_node_weight (slp_tree node)
339 {
340 stmt_vec_info stmt_info = vect_orig_stmt (SLP_TREE_REPRESENTATIVE (node));
341 basic_block bb = gimple_bb (stmt_info->stmt);
342 return bb->count.to_sreal_scale (ENTRY_BLOCK_PTR_FOR_FN (cfun)->count);
343 }
344
345 /* Return true if STMTS contains a pattern statement. */
346
347 static bool
348 vect_contains_pattern_stmt_p (vec<stmt_vec_info> stmts)
349 {
350 stmt_vec_info stmt_info;
351 unsigned int i;
352 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
353 if (is_pattern_stmt_p (stmt_info))
354 return true;
355 return false;
356 }
357
358 /* Return true when all lanes in the external or constant NODE have
359 the same value. */
360
361 static bool
362 vect_slp_tree_uniform_p (slp_tree node)
363 {
364 gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_constant_def
365 || SLP_TREE_DEF_TYPE (node) == vect_external_def);
366
367 /* Pre-existing vectors. */
368 if (SLP_TREE_SCALAR_OPS (node).is_empty ())
369 return false;
370
371 unsigned i;
372 tree op, first = NULL_TREE;
373 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
374 if (!first)
375 first = op;
376 else if (!operand_equal_p (first, op, 0))
377 return false;
378
379 return true;
380 }
381
382 /* Find the place of the data-ref in STMT_INFO in the interleaving chain
383 that starts from FIRST_STMT_INFO. Return -1 if the data-ref is not a part
384 of the chain. */
385
386 int
387 vect_get_place_in_interleaving_chain (stmt_vec_info stmt_info,
388 stmt_vec_info first_stmt_info)
389 {
390 stmt_vec_info next_stmt_info = first_stmt_info;
391 int result = 0;
392
393 if (first_stmt_info != DR_GROUP_FIRST_ELEMENT (stmt_info))
394 return -1;
395
396 do
397 {
398 if (next_stmt_info == stmt_info)
399 return result;
400 next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
401 if (next_stmt_info)
402 result += DR_GROUP_GAP (next_stmt_info);
403 }
404 while (next_stmt_info);
405
406 return -1;
407 }
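
/* A worked example (a sketch, assuming the usual DR_GROUP_GAP convention
   that every non-leading chain member records its element distance from
   the previous member): for an interleaving chain that loads a[0], a[1]
   and a[3] with FIRST_STMT_INFO being the load of a[0], the gaps stored
   on the second and third members are 1 and 2, so the places returned
   for the three loads are 0, 1 and 3.  A stmt whose
   DR_GROUP_FIRST_ELEMENT is not FIRST_STMT_INFO yields -1.  */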
408
409 /* Check whether it is possible to load COUNT elements of type ELT_TYPE
410 using the method implemented by duplicate_and_interleave. Return true
411 if so, returning the number of intermediate vectors in *NVECTORS_OUT
412 (if nonnull) and the type of each intermediate vector in *VECTOR_TYPE_OUT
413 (if nonnull). */
414
415 bool
416 can_duplicate_and_interleave_p (vec_info *vinfo, unsigned int count,
417 tree elt_type, unsigned int *nvectors_out,
418 tree *vector_type_out,
419 tree *permutes)
420 {
421 tree base_vector_type = get_vectype_for_scalar_type (vinfo, elt_type, count);
422 if (!base_vector_type || !VECTOR_MODE_P (TYPE_MODE (base_vector_type)))
423 return false;
424
425 machine_mode base_vector_mode = TYPE_MODE (base_vector_type);
426 poly_int64 elt_bytes = count * GET_MODE_UNIT_SIZE (base_vector_mode);
427 unsigned int nvectors = 1;
428 for (;;)
429 {
430 scalar_int_mode int_mode;
431 poly_int64 elt_bits = elt_bytes * BITS_PER_UNIT;
432 if (int_mode_for_size (elt_bits, 1).exists (&int_mode))
433 {
434 /* Get the natural vector type for this SLP group size. */
435 tree int_type = build_nonstandard_integer_type
436 (GET_MODE_BITSIZE (int_mode), 1);
437 tree vector_type
438 = get_vectype_for_scalar_type (vinfo, int_type, count);
439 poly_int64 half_nelts;
440 if (vector_type
441 && VECTOR_MODE_P (TYPE_MODE (vector_type))
442 && known_eq (GET_MODE_SIZE (TYPE_MODE (vector_type)),
443 GET_MODE_SIZE (base_vector_mode))
444 && multiple_p (GET_MODE_NUNITS (TYPE_MODE (vector_type)),
445 2, &half_nelts))
446 {
447 /* Try fusing consecutive sequences of COUNT / NVECTORS elements
448 together into elements of type INT_TYPE and using the result
449 to build NVECTORS vectors. */
450 poly_uint64 nelts = GET_MODE_NUNITS (TYPE_MODE (vector_type));
451 vec_perm_builder sel1 (nelts, 2, 3);
452 vec_perm_builder sel2 (nelts, 2, 3);
453
454 for (unsigned int i = 0; i < 3; ++i)
455 {
456 sel1.quick_push (i);
457 sel1.quick_push (i + nelts);
458 sel2.quick_push (half_nelts + i);
459 sel2.quick_push (half_nelts + i + nelts);
460 }
461 vec_perm_indices indices1 (sel1, 2, nelts);
462 vec_perm_indices indices2 (sel2, 2, nelts);
463 machine_mode vmode = TYPE_MODE (vector_type);
464 if (can_vec_perm_const_p (vmode, vmode, indices1)
465 && can_vec_perm_const_p (vmode, vmode, indices2))
466 {
467 if (nvectors_out)
468 *nvectors_out = nvectors;
469 if (vector_type_out)
470 *vector_type_out = vector_type;
471 if (permutes)
472 {
473 permutes[0] = vect_gen_perm_mask_checked (vector_type,
474 indices1);
475 permutes[1] = vect_gen_perm_mask_checked (vector_type,
476 indices2);
477 }
478 return true;
479 }
480 }
481 }
482 if (!multiple_p (elt_bytes, 2, &elt_bytes))
483 return false;
484 nvectors *= 2;
485 }
486 }
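
/* Illustration of the permutations built above (a sketch; the numbers
   assume a vector mode with NELTS == 8, although the function mostly
   matters for variable-length vectors): INDICES1 encodes the interleave
   of the low halves of the two input vectors,
     { 0, 8, 1, 9, 2, 10, 3, 11 }
   and INDICES2 the interleave of the high halves,
     { 4, 12, 5, 13, 6, 14, 7, 15 }.
   The outer loop first tries to fuse all COUNT elements into a single
   integer element (NVECTORS == 1); each time that fails ELT_BYTES is
   halved and NVECTORS doubled, so COUNT / NVECTORS elements are fused
   per integer element instead.  */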
487
488 /* Return true if DTA and DTB match. */
489
490 static bool
491 vect_def_types_match (enum vect_def_type dta, enum vect_def_type dtb)
492 {
493 return (dta == dtb
494 || ((dta == vect_external_def || dta == vect_constant_def)
495 && (dtb == vect_external_def || dtb == vect_constant_def)));
496 }
497
498 static const int cond_expr_maps[3][5] = {
499 { 4, -1, -2, 1, 2 },
500 { 4, -2, -1, 1, 2 },
501 { 4, -1, -2, 2, 1 }
502 };
503 static const int arg1_map[] = { 1, 1 };
504 static const int arg2_map[] = { 1, 2 };
505 static const int arg1_arg4_map[] = { 2, 1, 4 };
506 static const int op1_op0_map[] = { 2, 1, 0 };
507
508 /* For most SLP statements, there is a one-to-one mapping between
509 gimple arguments and child nodes. If that is not true for STMT,
510 return an array that contains:
511
512 - the number of child nodes, followed by
513 - for each child node, the index of the argument associated with that node.
514 The special index -1 is the first operand of an embedded comparison and
515 the special index -2 is the second operand of an embedded comparison.
516
517 SWAP is as for vect_get_and_check_slp_defs. */
518
519 static const int *
520 vect_get_operand_map (const gimple *stmt, unsigned char swap = 0)
521 {
522 if (auto assign = dyn_cast<const gassign *> (stmt))
523 {
524 if (gimple_assign_rhs_code (assign) == COND_EXPR
525 && COMPARISON_CLASS_P (gimple_assign_rhs1 (assign)))
526 return cond_expr_maps[swap];
527 if (TREE_CODE_CLASS (gimple_assign_rhs_code (assign)) == tcc_comparison
528 && swap)
529 return op1_op0_map;
530 }
531 gcc_assert (!swap);
532 if (auto call = dyn_cast<const gcall *> (stmt))
533 {
534 if (gimple_call_internal_p (call))
535 switch (gimple_call_internal_fn (call))
536 {
537 case IFN_MASK_LOAD:
538 return arg2_map;
539
540 case IFN_GATHER_LOAD:
541 return arg1_map;
542
543 case IFN_MASK_GATHER_LOAD:
544 return arg1_arg4_map;
545
546 default:
547 break;
548 }
549 }
550 return nullptr;
551 }
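
/* Two illustrative uses of the map encoding (a sketch, not part of the
   original file): for x = a < b ? c : d, a COND_EXPR with an embedded
   comparison, the map returned for SWAP == 0 is cond_expr_maps[0]
   == { 4, -1, -2, 1, 2 }, i.e. four child nodes built from a and b
   (the comparison operands, special indices -1/-2) and from gimple
   args 1 and 2 (c and d); SWAP == 1 instead yields b, a, c, d.  For an
   IFN_MASK_GATHER_LOAD call, arg1_arg4_map == { 2, 1, 4 } says only
   call arguments 1 and 4 (the vector offset and the mask) become SLP
   children.  */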
552
553 /* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
554 they are of a valid type and that they match the defs of the first stmt of
555 the SLP group (stored in OPRNDS_INFO). This function tries to match stmts
556 by swapping operands of STMTS[STMT_NUM] when possible. Non-zero SWAP
557 indicates a swap is required for cond_expr stmts. Specifically, SWAP
558 is 1 if STMT is a cond and the operands of the comparison need to be swapped;
559 SWAP is 2 if STMT is a cond and the code of the comparison needs to be inverted.
560
561 If there was a fatal error return -1; if the error could be corrected by
562 swapping operands of the father node of this one, return 1; if everything
563 is ok return 0. */
564 static int
565 vect_get_and_check_slp_defs (vec_info *vinfo, unsigned char swap,
566 bool *skip_args,
567 vec<stmt_vec_info> stmts, unsigned stmt_num,
568 vec<slp_oprnd_info> *oprnds_info)
569 {
570 stmt_vec_info stmt_info = stmts[stmt_num];
571 tree oprnd;
572 unsigned int i, number_of_oprnds;
573 enum vect_def_type dt = vect_uninitialized_def;
574 slp_oprnd_info oprnd_info;
575 unsigned int commutative_op = -1U;
576 bool first = stmt_num == 0;
577
578 if (!is_a<gcall *> (stmt_info->stmt)
579 && !is_a<gassign *> (stmt_info->stmt)
580 && !is_a<gphi *> (stmt_info->stmt))
581 return -1;
582
583 number_of_oprnds = gimple_num_args (stmt_info->stmt);
584 const int *map = vect_get_operand_map (stmt_info->stmt, swap);
585 if (map)
586 number_of_oprnds = *map++;
587 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
588 {
589 if (gimple_call_internal_p (stmt))
590 {
591 internal_fn ifn = gimple_call_internal_fn (stmt);
592 commutative_op = first_commutative_argument (ifn);
593 }
594 }
595 else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
596 {
597 if (commutative_tree_code (gimple_assign_rhs_code (stmt)))
598 commutative_op = 0;
599 }
600
601 bool swapped = (swap != 0);
602 bool backedge = false;
603 enum vect_def_type *dts = XALLOCAVEC (enum vect_def_type, number_of_oprnds);
604 for (i = 0; i < number_of_oprnds; i++)
605 {
606 int opno = map ? map[i] : int (i);
607 if (opno < 0)
608 oprnd = TREE_OPERAND (gimple_arg (stmt_info->stmt, 0), -1 - opno);
609 else
610 {
611 oprnd = gimple_arg (stmt_info->stmt, opno);
612 if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
613 backedge = dominated_by_p (CDI_DOMINATORS,
614 gimple_phi_arg_edge (stmt, opno)->src,
615 gimple_bb (stmt_info->stmt));
616 }
617 if (TREE_CODE (oprnd) == VIEW_CONVERT_EXPR)
618 oprnd = TREE_OPERAND (oprnd, 0);
619
620 oprnd_info = (*oprnds_info)[i];
621
622 stmt_vec_info def_stmt_info;
623 if (!vect_is_simple_use (oprnd, vinfo, &dts[i], &def_stmt_info))
624 {
625 if (dump_enabled_p ())
626 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
627 "Build SLP failed: can't analyze def for %T\n",
628 oprnd);
629
630 return -1;
631 }
632
633 if (skip_args[i])
634 {
635 oprnd_info->def_stmts.quick_push (NULL);
636 oprnd_info->ops.quick_push (NULL_TREE);
637 oprnd_info->first_dt = vect_uninitialized_def;
638 continue;
639 }
640
641 oprnd_info->def_stmts.quick_push (def_stmt_info);
642 oprnd_info->ops.quick_push (oprnd);
643
644 if (def_stmt_info
645 && is_pattern_stmt_p (def_stmt_info))
646 {
647 if (STMT_VINFO_RELATED_STMT (vect_orig_stmt (def_stmt_info))
648 != def_stmt_info)
649 oprnd_info->any_pattern = true;
650 else
651 /* If we promote this to external use the original stmt def. */
652 oprnd_info->ops.last ()
653 = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
654 }
655
656 /* If there's an extern def on a backedge make sure we can
657 code-generate at the region start.
658 ??? This is another case that could be fixed by adjusting
659 how we split the function but at the moment we'd have conflicting
660 goals there. */
661 if (backedge
662 && dts[i] == vect_external_def
663 && is_a <bb_vec_info> (vinfo)
664 && TREE_CODE (oprnd) == SSA_NAME
665 && !SSA_NAME_IS_DEFAULT_DEF (oprnd)
666 && !dominated_by_p (CDI_DOMINATORS,
667 as_a <bb_vec_info> (vinfo)->bbs[0],
668 gimple_bb (SSA_NAME_DEF_STMT (oprnd))))
669 {
670 if (dump_enabled_p ())
671 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
672 "Build SLP failed: extern def %T only defined "
673 "on backedge\n", oprnd);
674 return -1;
675 }
676
677 if (first)
678 {
679 tree type = TREE_TYPE (oprnd);
680 dt = dts[i];
681 if ((dt == vect_constant_def
682 || dt == vect_external_def)
683 && !GET_MODE_SIZE (vinfo->vector_mode).is_constant ()
684 && (TREE_CODE (type) == BOOLEAN_TYPE
685 || !can_duplicate_and_interleave_p (vinfo, stmts.length (),
686 type)))
687 {
688 if (dump_enabled_p ())
689 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
690 "Build SLP failed: invalid type of def "
691 "for variable-length SLP %T\n", oprnd);
692 return -1;
693 }
694
695 /* For the swapping logic below force vect_reduction_def
696 for the reduction op in a SLP reduction group. */
697 if (!STMT_VINFO_DATA_REF (stmt_info)
698 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
699 && (int)i == STMT_VINFO_REDUC_IDX (stmt_info)
700 && def_stmt_info)
701 dts[i] = dt = vect_reduction_def;
702
703 /* Check the types of the definition. */
704 switch (dt)
705 {
706 case vect_external_def:
707 case vect_constant_def:
708 case vect_internal_def:
709 case vect_reduction_def:
710 case vect_induction_def:
711 case vect_nested_cycle:
712 case vect_first_order_recurrence:
713 break;
714
715 default:
716 /* FORNOW: Not supported. */
717 if (dump_enabled_p ())
718 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
719 "Build SLP failed: illegal type of def %T\n",
720 oprnd);
721 return -1;
722 }
723
724 oprnd_info->first_dt = dt;
725 oprnd_info->first_op_type = type;
726 }
727 }
728 if (first)
729 return 0;
730
731 /* Now match the operand definition types to that of the first stmt. */
732 for (i = 0; i < number_of_oprnds;)
733 {
734 if (skip_args[i])
735 {
736 ++i;
737 continue;
738 }
739
740 oprnd_info = (*oprnds_info)[i];
741 dt = dts[i];
742 stmt_vec_info def_stmt_info = oprnd_info->def_stmts[stmt_num];
743 oprnd = oprnd_info->ops[stmt_num];
744 tree type = TREE_TYPE (oprnd);
745
746 if (!types_compatible_p (oprnd_info->first_op_type, type))
747 {
748 if (dump_enabled_p ())
749 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
750 "Build SLP failed: different operand types\n");
751 return 1;
752 }
753
754 /* Not first stmt of the group, check that the def-stmt/s match
755 the def-stmt/s of the first stmt. Allow different definition
756 types for reduction chains: the first stmt must be a
757 vect_reduction_def (a phi node), and the rest
758 end in the reduction chain. */
759 if ((!vect_def_types_match (oprnd_info->first_dt, dt)
760 && !(oprnd_info->first_dt == vect_reduction_def
761 && !STMT_VINFO_DATA_REF (stmt_info)
762 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
763 && def_stmt_info
764 && !STMT_VINFO_DATA_REF (def_stmt_info)
765 && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
766 == REDUC_GROUP_FIRST_ELEMENT (stmt_info))))
767 || (!STMT_VINFO_DATA_REF (stmt_info)
768 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
769 && ((!def_stmt_info
770 || STMT_VINFO_DATA_REF (def_stmt_info)
771 || (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
772 != REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
773 != (oprnd_info->first_dt != vect_reduction_def))))
774 {
775 /* Try swapping operands if we got a mismatch. For BB
776 vectorization only in case it will clearly improve things. */
777 if (i == commutative_op && !swapped
778 && (!is_a <bb_vec_info> (vinfo)
779 || (!vect_def_types_match ((*oprnds_info)[i+1]->first_dt,
780 dts[i+1])
781 && (vect_def_types_match (oprnd_info->first_dt, dts[i+1])
782 || vect_def_types_match
783 ((*oprnds_info)[i+1]->first_dt, dts[i])))))
784 {
785 if (dump_enabled_p ())
786 dump_printf_loc (MSG_NOTE, vect_location,
787 "trying swapped operands\n");
788 std::swap (dts[i], dts[i+1]);
789 std::swap ((*oprnds_info)[i]->def_stmts[stmt_num],
790 (*oprnds_info)[i+1]->def_stmts[stmt_num]);
791 std::swap ((*oprnds_info)[i]->ops[stmt_num],
792 (*oprnds_info)[i+1]->ops[stmt_num]);
793 swapped = true;
794 continue;
795 }
796
797 if (is_a <bb_vec_info> (vinfo)
798 && !oprnd_info->any_pattern)
799 {
800 /* Now for commutative ops we should see whether we can
801 make the other operand match. */
802 if (dump_enabled_p ())
803 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
804 "treating operand as external\n");
805 oprnd_info->first_dt = dt = vect_external_def;
806 }
807 else
808 {
809 if (dump_enabled_p ())
810 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
811 "Build SLP failed: different types\n");
812 return 1;
813 }
814 }
815
816 /* Make sure to demote the overall operand to external. */
817 if (dt == vect_external_def)
818 oprnd_info->first_dt = vect_external_def;
819 /* For a SLP reduction chain we want to duplicate the reduction to
820 each of the chain members. That gets us a sane SLP graph (though
821 the stmts are not 100% correct wrt the initial values). */
822 else if ((dt == vect_internal_def
823 || dt == vect_reduction_def)
824 && oprnd_info->first_dt == vect_reduction_def
825 && !STMT_VINFO_DATA_REF (stmt_info)
826 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
827 && !STMT_VINFO_DATA_REF (def_stmt_info)
828 && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
829 == REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
830 {
831 oprnd_info->def_stmts[stmt_num] = oprnd_info->def_stmts[0];
832 oprnd_info->ops[stmt_num] = oprnd_info->ops[0];
833 }
834
835 ++i;
836 }
837
838 /* Swap operands. */
839 if (swapped)
840 {
841 if (dump_enabled_p ())
842 dump_printf_loc (MSG_NOTE, vect_location,
843 "swapped operands to match def types in %G",
844 stmt_info->stmt);
845 }
846
847 return 0;
848 }
849
850 /* Return true if call statements CALL1 and CALL2 are similar enough
851 to be combined into the same SLP group. */
852
853 bool
854 compatible_calls_p (gcall *call1, gcall *call2)
855 {
856 unsigned int nargs = gimple_call_num_args (call1);
857 if (nargs != gimple_call_num_args (call2))
858 return false;
859
860 if (gimple_call_combined_fn (call1) != gimple_call_combined_fn (call2))
861 return false;
862
863 if (gimple_call_internal_p (call1))
864 {
865 if (!types_compatible_p (TREE_TYPE (gimple_call_lhs (call1)),
866 TREE_TYPE (gimple_call_lhs (call2))))
867 return false;
868 for (unsigned int i = 0; i < nargs; ++i)
869 if (!types_compatible_p (TREE_TYPE (gimple_call_arg (call1, i)),
870 TREE_TYPE (gimple_call_arg (call2, i))))
871 return false;
872 }
873 else
874 {
875 if (!operand_equal_p (gimple_call_fn (call1),
876 gimple_call_fn (call2), 0))
877 return false;
878
879 if (gimple_call_fntype (call1) != gimple_call_fntype (call2))
880 return false;
881 }
882
883 /* Check that any unvectorized arguments are equal. */
884 if (const int *map = vect_get_operand_map (call1))
885 {
886 unsigned int nkept = *map++;
887 unsigned int mapi = 0;
888 for (unsigned int i = 0; i < nargs; ++i)
889 if (mapi < nkept && map[mapi] == int (i))
890 mapi += 1;
891 else if (!operand_equal_p (gimple_call_arg (call1, i),
892 gimple_call_arg (call2, i)))
893 return false;
894 }
895
896 return true;
897 }
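
/* For example (a sketch): two IFN_MASK_GATHER_LOAD calls can only be
   grouped when their lhs and argument types are compatible and the
   arguments that do not become SLP children per vect_get_operand_map
   (everything except arguments 1 and 4, e.g. the base pointer and the
   scale) are operand_equal_p; the mapped arguments themselves may
   differ per lane.  */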
898
899 /* A subroutine of vect_build_slp_tree for checking VECTYPE, which is the
900 caller's attempt to find the vector type in STMT_INFO with the narrowest
901 element type. Return true if VECTYPE is nonnull and if it is valid
902 for STMT_INFO. When returning true, update MAX_NUNITS to reflect the
903 number of units in VECTYPE. GROUP_SIZE and MAX_NUNITS are as for
904 vect_build_slp_tree. */
905
906 static bool
907 vect_record_max_nunits (vec_info *vinfo, stmt_vec_info stmt_info,
908 unsigned int group_size,
909 tree vectype, poly_uint64 *max_nunits)
910 {
911 if (!vectype)
912 {
913 if (dump_enabled_p ())
914 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
915 "Build SLP failed: unsupported data-type in %G\n",
916 stmt_info->stmt);
917 /* Fatal mismatch. */
918 return false;
919 }
920
921 /* If populating the vector type requires unrolling then fail
922 before adjusting *max_nunits for basic-block vectorization. */
923 if (is_a <bb_vec_info> (vinfo)
924 && !multiple_p (group_size, TYPE_VECTOR_SUBPARTS (vectype)))
925 {
926 if (dump_enabled_p ())
927 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
928 "Build SLP failed: unrolling required "
929 "in basic block SLP\n");
930 /* Fatal mismatch. */
931 return false;
932 }
933
934 /* In case of multiple types we need to detect the smallest type. */
935 vect_update_max_nunits (max_nunits, vectype);
936 return true;
937 }
938
939 /* Check whether the scalar stmts STMTS are isomorphic, whether they
940 require data permutation and whether they use supported operations.
941 Return true if they can form an SLP node, otherwise return false and
942 indicate in *MATCHES which stmts are not isomorphic to the first one.
943 If MATCHES[0] is false then this indicates the comparison could not
944 be carried out or the stmts will never be vectorized by SLP.
945
946 Note a COND_EXPR is possibly isomorphic to another one after swapping
947 its operands. Set SWAP[i] to 1 if stmt I is a COND_EXPR and isomorphic
948 to the first stmt by swapping the two operands of the comparison; set
949 SWAP[i] to 2 if stmt I is isomorphic to the first stmt by inverting the
950 comparison code. Taking A1 >= B1 ? X1 : Y1 as an example, it can be
951 swapped to (B1 <= A1 ? X1 : Y1) or inverted to (A1 < B1) ? Y1 : X1. */
952
953 static bool
954 vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
955 vec<stmt_vec_info> stmts, unsigned int group_size,
956 poly_uint64 *max_nunits, bool *matches,
957 bool *two_operators, tree *node_vectype)
958 {
959 unsigned int i;
960 stmt_vec_info first_stmt_info = stmts[0];
961 code_helper first_stmt_code = ERROR_MARK;
962 code_helper alt_stmt_code = ERROR_MARK;
963 code_helper rhs_code = ERROR_MARK;
964 code_helper first_cond_code = ERROR_MARK;
965 tree lhs;
966 bool need_same_oprnds = false;
967 tree vectype = NULL_TREE, first_op1 = NULL_TREE;
968 stmt_vec_info first_load = NULL, prev_first_load = NULL;
969 bool first_stmt_load_p = false, load_p = false;
970 bool first_stmt_phi_p = false, phi_p = false;
971 bool maybe_soft_fail = false;
972 tree soft_fail_nunits_vectype = NULL_TREE;
973
974 /* For every stmt in NODE find its def stmt/s. */
975 stmt_vec_info stmt_info;
976 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
977 {
978 gimple *stmt = stmt_info->stmt;
979 swap[i] = 0;
980 matches[i] = false;
981
982 if (dump_enabled_p ())
983 dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for %G", stmt);
984
985 /* Fail to vectorize statements marked as unvectorizable, that can
986 throw internally or that have volatile operands. */
987 if (!STMT_VINFO_VECTORIZABLE (stmt_info)
988 || stmt_can_throw_internal (cfun, stmt)
989 || gimple_has_volatile_ops (stmt))
990 {
991 if (dump_enabled_p ())
992 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
993 "Build SLP failed: unvectorizable statement %G",
994 stmt);
995 /* ??? For BB vectorization we want to commutate operands in a way
996 to shuffle all unvectorizable defs into one operand and have
997 the other still vectorized. The following doesn't reliably
998 achieve this, but it's the easiest we can do here. */
999 if (is_a <bb_vec_info> (vinfo) && i != 0)
1000 continue;
1001 /* Fatal mismatch. */
1002 matches[0] = false;
1003 return false;
1004 }
1005
1006 lhs = gimple_get_lhs (stmt);
1007 if (lhs == NULL_TREE)
1008 {
1009 if (dump_enabled_p ())
1010 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1011 "Build SLP failed: not GIMPLE_ASSIGN nor "
1012 "GIMPLE_CALL %G", stmt);
1013 if (is_a <bb_vec_info> (vinfo) && i != 0)
1014 continue;
1015 /* Fatal mismatch. */
1016 matches[0] = false;
1017 return false;
1018 }
1019
1020 tree nunits_vectype;
1021 if (!vect_get_vector_types_for_stmt (vinfo, stmt_info, &vectype,
1022 &nunits_vectype, group_size))
1023 {
1024 if (is_a <bb_vec_info> (vinfo) && i != 0)
1025 continue;
1026 /* Fatal mismatch. */
1027 matches[0] = false;
1028 return false;
1029 }
1030 /* Record nunits required but continue analysis, producing matches[]
1031 as if nunits was not an issue. This allows splitting of groups
1032 to happen. */
1033 if (nunits_vectype
1034 && !vect_record_max_nunits (vinfo, stmt_info, group_size,
1035 nunits_vectype, max_nunits))
1036 {
1037 gcc_assert (is_a <bb_vec_info> (vinfo));
1038 maybe_soft_fail = true;
1039 soft_fail_nunits_vectype = nunits_vectype;
1040 }
1041
1042 gcc_assert (vectype);
1043
1044 gcall *call_stmt = dyn_cast <gcall *> (stmt);
1045 if (call_stmt)
1046 {
1047 combined_fn cfn = gimple_call_combined_fn (call_stmt);
1048 if (cfn != CFN_LAST)
1049 rhs_code = cfn;
1050 else
1051 rhs_code = CALL_EXPR;
1052
1053 if (cfn == CFN_MASK_LOAD
1054 || cfn == CFN_GATHER_LOAD
1055 || cfn == CFN_MASK_GATHER_LOAD)
1056 load_p = true;
1057 else if ((internal_fn_p (cfn)
1058 && !vectorizable_internal_fn_p (as_internal_fn (cfn)))
1059 || gimple_call_tail_p (call_stmt)
1060 || gimple_call_noreturn_p (call_stmt)
1061 || gimple_call_chain (call_stmt))
1062 {
1063 if (dump_enabled_p ())
1064 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1065 "Build SLP failed: unsupported call type %G",
1066 (gimple *) call_stmt);
1067 if (is_a <bb_vec_info> (vinfo) && i != 0)
1068 continue;
1069 /* Fatal mismatch. */
1070 matches[0] = false;
1071 return false;
1072 }
1073 }
1074 else if (gimple_code (stmt) == GIMPLE_PHI)
1075 {
1076 rhs_code = ERROR_MARK;
1077 phi_p = true;
1078 }
1079 else
1080 {
1081 rhs_code = gimple_assign_rhs_code (stmt);
1082 load_p = gimple_vuse (stmt);
1083 }
1084
1085 /* Check the operation. */
1086 if (i == 0)
1087 {
1088 *node_vectype = vectype;
1089 first_stmt_code = rhs_code;
1090 first_stmt_load_p = load_p;
1091 first_stmt_phi_p = phi_p;
1092
1093 /* Shift arguments should be equal in all the packed stmts for a
1094 vector shift with a scalar shift operand. */
1095 if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
1096 || rhs_code == LROTATE_EXPR
1097 || rhs_code == RROTATE_EXPR)
1098 {
1099 /* First see if we have a vector/vector shift. */
1100 if (!directly_supported_p (rhs_code, vectype, optab_vector))
1101 {
1102 /* No vector/vector shift, try for a vector/scalar shift. */
1103 if (!directly_supported_p (rhs_code, vectype, optab_scalar))
1104 {
1105 if (dump_enabled_p ())
1106 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1107 "Build SLP failed: "
1108 "op not supported by target.\n");
1109 if (is_a <bb_vec_info> (vinfo) && i != 0)
1110 continue;
1111 /* Fatal mismatch. */
1112 matches[0] = false;
1113 return false;
1114 }
1115 need_same_oprnds = true;
1116 first_op1 = gimple_assign_rhs2 (stmt);
1117 }
1118 }
1119 else if (rhs_code == WIDEN_LSHIFT_EXPR)
1120 {
1121 need_same_oprnds = true;
1122 first_op1 = gimple_assign_rhs2 (stmt);
1123 }
1124 else if (!load_p
1125 && rhs_code == BIT_FIELD_REF)
1126 {
1127 tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt), 0);
1128 if (!is_a <bb_vec_info> (vinfo)
1129 || TREE_CODE (vec) != SSA_NAME
1130 /* When the element types are not compatible we pun the
1131 source to the target vectype which requires equal size. */
1132 || ((!VECTOR_TYPE_P (TREE_TYPE (vec))
1133 || !types_compatible_p (TREE_TYPE (vectype),
1134 TREE_TYPE (TREE_TYPE (vec))))
1135 && !operand_equal_p (TYPE_SIZE (vectype),
1136 TYPE_SIZE (TREE_TYPE (vec)))))
1137 {
1138 if (dump_enabled_p ())
1139 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1140 "Build SLP failed: "
1141 "BIT_FIELD_REF not supported\n");
1142 /* Fatal mismatch. */
1143 matches[0] = false;
1144 return false;
1145 }
1146 }
1147 else if (rhs_code == CFN_DIV_POW2)
1148 {
1149 need_same_oprnds = true;
1150 first_op1 = gimple_call_arg (call_stmt, 1);
1151 }
1152 }
1153 else
1154 {
1155 if (first_stmt_code != rhs_code
1156 && alt_stmt_code == ERROR_MARK)
1157 alt_stmt_code = rhs_code;
1158 if ((first_stmt_code != rhs_code
1159 && (first_stmt_code != IMAGPART_EXPR
1160 || rhs_code != REALPART_EXPR)
1161 && (first_stmt_code != REALPART_EXPR
1162 || rhs_code != IMAGPART_EXPR)
1163 /* Handle mismatches in plus/minus by computing both
1164 and merging the results. */
1165 && !((first_stmt_code == PLUS_EXPR
1166 || first_stmt_code == MINUS_EXPR)
1167 && (alt_stmt_code == PLUS_EXPR
1168 || alt_stmt_code == MINUS_EXPR)
1169 && rhs_code == alt_stmt_code)
1170 && !(first_stmt_code.is_tree_code ()
1171 && rhs_code.is_tree_code ()
1172 && (TREE_CODE_CLASS (tree_code (first_stmt_code))
1173 == tcc_comparison)
1174 && (swap_tree_comparison (tree_code (first_stmt_code))
1175 == tree_code (rhs_code)))
1176 && !(STMT_VINFO_GROUPED_ACCESS (stmt_info)
1177 && (first_stmt_code == ARRAY_REF
1178 || first_stmt_code == BIT_FIELD_REF
1179 || first_stmt_code == INDIRECT_REF
1180 || first_stmt_code == COMPONENT_REF
1181 || first_stmt_code == MEM_REF)
1182 && (rhs_code == ARRAY_REF
1183 || rhs_code == BIT_FIELD_REF
1184 || rhs_code == INDIRECT_REF
1185 || rhs_code == COMPONENT_REF
1186 || rhs_code == MEM_REF)))
1187 || first_stmt_load_p != load_p
1188 || first_stmt_phi_p != phi_p)
1189 {
1190 if (dump_enabled_p ())
1191 {
1192 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1193 "Build SLP failed: different operation "
1194 "in stmt %G", stmt);
1195 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1196 "original stmt %G", first_stmt_info->stmt);
1197 }
1198 /* Mismatch. */
1199 continue;
1200 }
1201
1202 if (!load_p
1203 && first_stmt_code == BIT_FIELD_REF
1204 && (TREE_OPERAND (gimple_assign_rhs1 (first_stmt_info->stmt), 0)
1205 != TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0)))
1206 {
1207 if (dump_enabled_p ())
1208 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1209 "Build SLP failed: different BIT_FIELD_REF "
1210 "arguments in %G", stmt);
1211 /* Mismatch. */
1212 continue;
1213 }
1214
1215 if (call_stmt && first_stmt_code != CFN_MASK_LOAD)
1216 {
1217 if (!compatible_calls_p (as_a <gcall *> (stmts[0]->stmt),
1218 call_stmt))
1219 {
1220 if (dump_enabled_p ())
1221 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1222 "Build SLP failed: different calls in %G",
1223 stmt);
1224 /* Mismatch. */
1225 continue;
1226 }
1227 }
1228
1229 if ((phi_p || gimple_could_trap_p (stmt_info->stmt))
1230 && (gimple_bb (first_stmt_info->stmt)
1231 != gimple_bb (stmt_info->stmt)))
1232 {
1233 if (dump_enabled_p ())
1234 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1235 "Build SLP failed: different BB for PHI "
1236 "or possibly trapping operation in %G", stmt);
1237 /* Mismatch. */
1238 continue;
1239 }
1240
1241 if (need_same_oprnds)
1242 {
1243 tree other_op1 = gimple_arg (stmt, 1);
1244 if (!operand_equal_p (first_op1, other_op1, 0))
1245 {
1246 if (dump_enabled_p ())
1247 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1248 "Build SLP failed: different shift "
1249 "arguments in %G", stmt);
1250 /* Mismatch. */
1251 continue;
1252 }
1253 }
1254
1255 if (!types_compatible_p (vectype, *node_vectype))
1256 {
1257 if (dump_enabled_p ())
1258 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1259 "Build SLP failed: different vector type "
1260 "in %G", stmt);
1261 /* Mismatch. */
1262 continue;
1263 }
1264 }
1265
1266 /* Grouped store or load. */
1267 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1268 {
1269 if (REFERENCE_CLASS_P (lhs))
1270 {
1271 /* Store. */
1272 ;
1273 }
1274 else
1275 {
1276 /* Load. */
1277 first_load = DR_GROUP_FIRST_ELEMENT (stmt_info);
1278 if (prev_first_load)
1279 {
1280 /* Check that there are no loads from different interleaving
1281 chains in the same node. */
1282 if (prev_first_load != first_load)
1283 {
1284 if (dump_enabled_p ())
1285 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
1286 vect_location,
1287 "Build SLP failed: different "
1288 "interleaving chains in one node %G",
1289 stmt);
1290 /* Mismatch. */
1291 continue;
1292 }
1293 }
1294 else
1295 prev_first_load = first_load;
1296 }
1297 } /* Grouped access. */
1298 else
1299 {
1300 if (load_p
1301 && rhs_code != CFN_GATHER_LOAD
1302 && rhs_code != CFN_MASK_GATHER_LOAD
1303 /* Non-grouped loads are handled as externals for BB
1304 vectorization. For loop vectorization we can handle
1305 splats the same way we handle single element interleaving. */
1306 && (is_a <bb_vec_info> (vinfo)
1307 || stmt_info != first_stmt_info
1308 || STMT_VINFO_GATHER_SCATTER_P (stmt_info)))
1309 {
1310 /* Not grouped load. */
1311 if (dump_enabled_p ())
1312 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1313 "Build SLP failed: not grouped load %G", stmt);
1314
1315 if (i != 0)
1316 continue;
1317 /* Fatal mismatch. */
1318 matches[0] = false;
1319 return false;
1320 }
1321
1322 /* Not memory operation. */
1323 if (!load_p
1324 && !phi_p
1325 && rhs_code.is_tree_code ()
1326 && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_binary
1327 && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_unary
1328 && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_expression
1329 && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_comparison
1330 && rhs_code != VIEW_CONVERT_EXPR
1331 && rhs_code != CALL_EXPR
1332 && rhs_code != BIT_FIELD_REF)
1333 {
1334 if (dump_enabled_p ())
1335 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1336 "Build SLP failed: operation unsupported %G",
1337 stmt);
1338 if (is_a <bb_vec_info> (vinfo) && i != 0)
1339 continue;
1340 /* Fatal mismatch. */
1341 matches[0] = false;
1342 return false;
1343 }
1344
1345 if (rhs_code == COND_EXPR)
1346 {
1347 tree cond_expr = gimple_assign_rhs1 (stmt);
1348 enum tree_code cond_code = TREE_CODE (cond_expr);
1349 enum tree_code swap_code = ERROR_MARK;
1350 enum tree_code invert_code = ERROR_MARK;
1351
1352 if (i == 0)
1353 first_cond_code = TREE_CODE (cond_expr);
1354 else if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
1355 {
1356 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
1357 swap_code = swap_tree_comparison (cond_code);
1358 invert_code = invert_tree_comparison (cond_code, honor_nans);
1359 }
1360
1361 if (first_cond_code == cond_code)
1362 ;
1363 /* Isomorphism can be achieved by swapping. */
1364 else if (first_cond_code == swap_code)
1365 swap[i] = 1;
1366 /* Isomorphism can be achieved by inverting. */
1367 else if (first_cond_code == invert_code)
1368 swap[i] = 2;
1369 else
1370 {
1371 if (dump_enabled_p ())
1372 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1373 "Build SLP failed: different"
1374 " operation %G", stmt);
1375 /* Mismatch. */
1376 continue;
1377 }
1378 }
1379
1380 if (rhs_code.is_tree_code ()
1381 && TREE_CODE_CLASS ((tree_code)rhs_code) == tcc_comparison
1382 && (swap_tree_comparison ((tree_code)first_stmt_code)
1383 == (tree_code)rhs_code))
1384 swap[i] = 1;
1385 }
1386
1387 matches[i] = true;
1388 }
1389
1390 for (i = 0; i < group_size; ++i)
1391 if (!matches[i])
1392 return false;
1393
1394 /* If we allowed a two-operation SLP node verify the target can cope
1395 with the permute we are going to use. */
1396 if (alt_stmt_code != ERROR_MARK
1397 && (!alt_stmt_code.is_tree_code ()
1398 || (TREE_CODE_CLASS (tree_code (alt_stmt_code)) != tcc_reference
1399 && TREE_CODE_CLASS (tree_code (alt_stmt_code)) != tcc_comparison)))
1400 {
1401 *two_operators = true;
1402 }
1403
1404 if (maybe_soft_fail)
1405 {
1406 unsigned HOST_WIDE_INT const_nunits;
1407 if (!TYPE_VECTOR_SUBPARTS
1408 (soft_fail_nunits_vectype).is_constant (&const_nunits)
1409 || const_nunits > group_size)
1410 matches[0] = false;
1411 else
1412 {
1413 /* With a constant number of vector elements simulate a mismatch
1414 at the point we need to split. */
1415 unsigned tail = group_size & (const_nunits - 1);
1416 memset (&matches[group_size - tail], 0, sizeof (bool) * tail);
1417 }
1418 return false;
1419 }
1420
1421 return true;
1422 }
1423
1424 /* Traits for the hash_set to record failed SLP builds for a stmt set.
1425 Note we never remove apart from at destruction time so we do not
1426 need a special value for deleted that differs from empty. */
1427 struct bst_traits
1428 {
1429 typedef vec <stmt_vec_info> value_type;
1430 typedef vec <stmt_vec_info> compare_type;
1431 static inline hashval_t hash (value_type);
1432 static inline bool equal (value_type existing, value_type candidate);
1433 static inline bool is_empty (value_type x) { return !x.exists (); }
1434 static inline bool is_deleted (value_type x) { return !x.exists (); }
1435 static const bool empty_zero_p = true;
1436 static inline void mark_empty (value_type &x) { x.release (); }
1437 static inline void mark_deleted (value_type &x) { x.release (); }
1438 static inline void remove (value_type &x) { x.release (); }
1439 };
1440 inline hashval_t
1441 bst_traits::hash (value_type x)
1442 {
1443 inchash::hash h;
1444 for (unsigned i = 0; i < x.length (); ++i)
1445 h.add_int (gimple_uid (x[i]->stmt));
1446 return h.end ();
1447 }
1448 inline bool
1449 bst_traits::equal (value_type existing, value_type candidate)
1450 {
1451 if (existing.length () != candidate.length ())
1452 return false;
1453 for (unsigned i = 0; i < existing.length (); ++i)
1454 if (existing[i] != candidate[i])
1455 return false;
1456 return true;
1457 }
1458
1459 /* ??? This was std::pair<std::pair<tree_code, vect_def_type>, tree>
1460 but then vec::insert does memmove and that's not compatible with
1461 std::pair. */
1462 struct chain_op_t
1463 {
1464 chain_op_t (tree_code code_, vect_def_type dt_, tree op_)
1465 : code (code_), dt (dt_), op (op_) {}
1466 tree_code code;
1467 vect_def_type dt;
1468 tree op;
1469 };
1470
1471 /* Comparator for sorting associatable chains. */
1472
1473 static int
1474 dt_sort_cmp (const void *op1_, const void *op2_, void *)
1475 {
1476 auto *op1 = (const chain_op_t *) op1_;
1477 auto *op2 = (const chain_op_t *) op2_;
1478 if (op1->dt != op2->dt)
1479 return (int)op1->dt - (int)op2->dt;
1480 return (int)op1->code - (int)op2->code;
1481 }
1482
1483 /* Linearize the associatable expression chain at START with the
1484 associatable operation CODE (where PLUS_EXPR also allows MINUS_EXPR),
1485 filling CHAIN with the result and using WORKLIST as intermediate storage.
1486 CODE_STMT and ALT_CODE_STMT are filled with the first stmt using CODE
1487 or MINUS_EXPR. *CHAIN_STMTS if not NULL is filled with all computation
1488 stmts, starting with START. */
1489
1490 static void
1491 vect_slp_linearize_chain (vec_info *vinfo,
1492 vec<std::pair<tree_code, gimple *> > &worklist,
1493 vec<chain_op_t> &chain,
1494 enum tree_code code, gimple *start,
1495 gimple *&code_stmt, gimple *&alt_code_stmt,
1496 vec<gimple *> *chain_stmts)
1497 {
1498 /* For each lane linearize the addition/subtraction (or other
1499 uniform associatable operation) expression tree. */
1500 worklist.safe_push (std::make_pair (code, start));
1501 while (!worklist.is_empty ())
1502 {
1503 auto entry = worklist.pop ();
1504 gassign *stmt = as_a <gassign *> (entry.second);
1505 enum tree_code in_code = entry.first;
1506 enum tree_code this_code = gimple_assign_rhs_code (stmt);
1507 /* Pick some stmts suitable for SLP_TREE_REPRESENTATIVE. */
1508 if (!code_stmt
1509 && gimple_assign_rhs_code (stmt) == code)
1510 code_stmt = stmt;
1511 else if (!alt_code_stmt
1512 && gimple_assign_rhs_code (stmt) == MINUS_EXPR)
1513 alt_code_stmt = stmt;
1514 if (chain_stmts)
1515 chain_stmts->safe_push (stmt);
1516 for (unsigned opnum = 1; opnum <= 2; ++opnum)
1517 {
1518 tree op = gimple_op (stmt, opnum);
1519 vect_def_type dt;
1520 stmt_vec_info def_stmt_info;
1521 bool res = vect_is_simple_use (op, vinfo, &dt, &def_stmt_info);
1522 gcc_assert (res);
1523 if (dt == vect_internal_def
1524 && is_pattern_stmt_p (def_stmt_info))
1525 op = gimple_get_lhs (def_stmt_info->stmt);
1526 gimple *use_stmt;
1527 use_operand_p use_p;
1528 if (dt == vect_internal_def
1529 && single_imm_use (op, &use_p, &use_stmt)
1530 && is_gimple_assign (def_stmt_info->stmt)
1531 && (gimple_assign_rhs_code (def_stmt_info->stmt) == code
1532 || (code == PLUS_EXPR
1533 && (gimple_assign_rhs_code (def_stmt_info->stmt)
1534 == MINUS_EXPR))))
1535 {
1536 tree_code op_def_code = this_code;
1537 if (op_def_code == MINUS_EXPR && opnum == 1)
1538 op_def_code = PLUS_EXPR;
1539 if (in_code == MINUS_EXPR)
1540 op_def_code = op_def_code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR;
1541 worklist.safe_push (std::make_pair (op_def_code,
1542 def_stmt_info->stmt));
1543 }
1544 else
1545 {
1546 tree_code op_def_code = this_code;
1547 if (op_def_code == MINUS_EXPR && opnum == 1)
1548 op_def_code = PLUS_EXPR;
1549 if (in_code == MINUS_EXPR)
1550 op_def_code = op_def_code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR;
1551 chain.safe_push (chain_op_t (op_def_code, dt, op));
1552 }
1553 }
1554 }
1555 }
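
/* A worked example (a sketch): linearizing x = (a - b) + c with
   CODE == PLUS_EXPR, and assuming the intermediate a - b has a single
   use, pushes the MINUS_EXPR stmt onto the worklist and produces the
   chain { (+, c), (+, a), (-, b) }, i.e. the leaves together with
   their effective signs; CODE_STMT is set to the PLUS_EXPR stmt and
   ALT_CODE_STMT to the MINUS_EXPR stmt.  */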
1556
1557 typedef hash_map <vec <stmt_vec_info>, slp_tree,
1558 simple_hashmap_traits <bst_traits, slp_tree> >
1559 scalar_stmts_to_slp_tree_map_t;
1560
1561 static slp_tree
1562 vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
1563 vec<stmt_vec_info> stmts, unsigned int group_size,
1564 poly_uint64 *max_nunits,
1565 bool *matches, unsigned *limit, unsigned *tree_size,
1566 scalar_stmts_to_slp_tree_map_t *bst_map);
1567
1568 static slp_tree
1569 vect_build_slp_tree (vec_info *vinfo,
1570 vec<stmt_vec_info> stmts, unsigned int group_size,
1571 poly_uint64 *max_nunits,
1572 bool *matches, unsigned *limit, unsigned *tree_size,
1573 scalar_stmts_to_slp_tree_map_t *bst_map)
1574 {
1575 if (slp_tree *leader = bst_map->get (stmts))
1576 {
1577 if (dump_enabled_p ())
1578 dump_printf_loc (MSG_NOTE, vect_location, "re-using %sSLP tree %p\n",
1579 !(*leader)->failed ? "" : "failed ",
1580 (void *) *leader);
1581 if (!(*leader)->failed)
1582 {
1583 SLP_TREE_REF_COUNT (*leader)++;
1584 vect_update_max_nunits (max_nunits, (*leader)->max_nunits);
1585 stmts.release ();
1586 return *leader;
1587 }
1588 memcpy (matches, (*leader)->failed, sizeof (bool) * group_size);
1589 return NULL;
1590 }
1591
1592 /* Seed the bst_map with a stub node to be filled by vect_build_slp_tree_2
1593 so we can pick up backedge destinations during discovery. */
1594 slp_tree res = new _slp_tree;
1595 SLP_TREE_DEF_TYPE (res) = vect_internal_def;
1596 SLP_TREE_SCALAR_STMTS (res) = stmts;
1597 bst_map->put (stmts.copy (), res);
1598
1599 if (*limit == 0)
1600 {
1601 if (dump_enabled_p ())
1602 dump_printf_loc (MSG_NOTE, vect_location,
1603 "SLP discovery limit exceeded\n");
1604 /* Mark the node invalid so we can detect those when still in use
1605 as backedge destinations. */
1606 SLP_TREE_SCALAR_STMTS (res) = vNULL;
1607 SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def;
1608 res->failed = XNEWVEC (bool, group_size);
1609 memset (res->failed, 0, sizeof (bool) * group_size);
1610 memset (matches, 0, sizeof (bool) * group_size);
1611 return NULL;
1612 }
1613 --*limit;
1614
1615 if (dump_enabled_p ())
1616 dump_printf_loc (MSG_NOTE, vect_location,
1617 "starting SLP discovery for node %p\n", (void *) res);
1618
1619 poly_uint64 this_max_nunits = 1;
1620 slp_tree res_ = vect_build_slp_tree_2 (vinfo, res, stmts, group_size,
1621 &this_max_nunits,
1622 matches, limit, tree_size, bst_map);
1623 if (!res_)
1624 {
1625 if (dump_enabled_p ())
1626 dump_printf_loc (MSG_NOTE, vect_location,
1627 "SLP discovery for node %p failed\n", (void *) res);
1628 /* Mark the node invalid so we can detect those when still in use
1629 as backedge destinations. */
1630 SLP_TREE_SCALAR_STMTS (res) = vNULL;
1631 SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def;
1632 res->failed = XNEWVEC (bool, group_size);
1633 if (flag_checking)
1634 {
1635 unsigned i;
1636 for (i = 0; i < group_size; ++i)
1637 if (!matches[i])
1638 break;
1639 gcc_assert (i < group_size);
1640 }
1641 memcpy (res->failed, matches, sizeof (bool) * group_size);
1642 }
1643 else
1644 {
1645 if (dump_enabled_p ())
1646 dump_printf_loc (MSG_NOTE, vect_location,
1647 "SLP discovery for node %p succeeded\n",
1648 (void *) res);
1649 gcc_assert (res_ == res);
1650 res->max_nunits = this_max_nunits;
1651 vect_update_max_nunits (max_nunits, this_max_nunits);
1652 /* Keep a reference for the bst_map use. */
1653 SLP_TREE_REF_COUNT (res)++;
1654 }
1655 return res_;
1656 }
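
/* Usage sketch of the caching protocol above: the first discovery of a
   stmt set enters a stub node into BST_MAP before recursing so that
   backedge uses met during the recursion can resolve to it; a later
   query for the same stmts either re-uses the finished node (bumping
   its reference count) or, if discovery had failed, replays the
   recorded matches[] so the caller splits the group at the same lane
   again.  */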
1657
1658 /* Helper for building an associated SLP node chain. */
1659
1660 static void
1661 vect_slp_build_two_operator_nodes (slp_tree perm, tree vectype,
1662 slp_tree op0, slp_tree op1,
1663 stmt_vec_info oper1, stmt_vec_info oper2,
1664 vec<std::pair<unsigned, unsigned> > lperm)
1665 {
1666 unsigned group_size = SLP_TREE_LANES (op1);
1667
1668 slp_tree child1 = new _slp_tree;
1669 SLP_TREE_DEF_TYPE (child1) = vect_internal_def;
1670 SLP_TREE_VECTYPE (child1) = vectype;
1671 SLP_TREE_LANES (child1) = group_size;
1672 SLP_TREE_CHILDREN (child1).create (2);
1673 SLP_TREE_CHILDREN (child1).quick_push (op0);
1674 SLP_TREE_CHILDREN (child1).quick_push (op1);
1675 SLP_TREE_REPRESENTATIVE (child1) = oper1;
1676
1677 slp_tree child2 = new _slp_tree;
1678 SLP_TREE_DEF_TYPE (child2) = vect_internal_def;
1679 SLP_TREE_VECTYPE (child2) = vectype;
1680 SLP_TREE_LANES (child2) = group_size;
1681 SLP_TREE_CHILDREN (child2).create (2);
1682 SLP_TREE_CHILDREN (child2).quick_push (op0);
1683 SLP_TREE_REF_COUNT (op0)++;
1684 SLP_TREE_CHILDREN (child2).quick_push (op1);
1685 SLP_TREE_REF_COUNT (op1)++;
1686 SLP_TREE_REPRESENTATIVE (child2) = oper2;
1687
1688 SLP_TREE_DEF_TYPE (perm) = vect_internal_def;
1689 SLP_TREE_CODE (perm) = VEC_PERM_EXPR;
1690 SLP_TREE_VECTYPE (perm) = vectype;
1691 SLP_TREE_LANES (perm) = group_size;
1692 /* ??? We should set this NULL but that's not expected. */
1693 SLP_TREE_REPRESENTATIVE (perm) = oper1;
1694 SLP_TREE_LANE_PERMUTATION (perm) = lperm;
1695 SLP_TREE_CHILDREN (perm).quick_push (child1);
1696 SLP_TREE_CHILDREN (perm).quick_push (child2);
1697 }
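
/* For example (a sketch): given the two-operator group
   { a0 + b0, a1 - b1 } with OPER1 the PLUS_EXPR stmt and OPER2 the
   MINUS_EXPR stmt, CHILD1 computes both lanes with '+', CHILD2 both
   lanes with '-', and LPERM == { (0, 0), (1, 1) } makes the
   VEC_PERM_EXPR node PERM pick lane 0 from CHILD1 and lane 1 from
   CHILD2, yielding { a0 + b0, a1 - b1 }.  */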
1698
1699 /* Recursively build an SLP tree starting from NODE.
1700 Fail (and return NULL) if the def-stmts are not isomorphic,
1701 require data permutation or are of unsupported types of
1702 operation, and indicate in MATCHES which of the initial
1703 stmts caused the mismatch.
1704 Otherwise return the newly built SLP node. */
1705
1706 static slp_tree
1707 vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
1708 vec<stmt_vec_info> stmts, unsigned int group_size,
1709 poly_uint64 *max_nunits,
1710 bool *matches, unsigned *limit, unsigned *tree_size,
1711 scalar_stmts_to_slp_tree_map_t *bst_map)
1712 {
1713 unsigned nops, i, this_tree_size = 0;
1714 poly_uint64 this_max_nunits = *max_nunits;
1715
1716 matches[0] = false;
1717
1718 stmt_vec_info stmt_info = stmts[0];
1719 if (!is_a<gcall *> (stmt_info->stmt)
1720 && !is_a<gassign *> (stmt_info->stmt)
1721 && !is_a<gphi *> (stmt_info->stmt))
1722 return NULL;
1723
1724 nops = gimple_num_args (stmt_info->stmt);
1725 if (const int *map = vect_get_operand_map (stmt_info->stmt))
1726 nops = map[0];
1727
1728 /* If the SLP node is a PHI (induction or reduction), terminate
1729 the recursion. */
1730 bool *skip_args = XALLOCAVEC (bool, nops);
1731 memset (skip_args, 0, sizeof (bool) * nops);
1732 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
1733 if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
1734 {
1735 tree scalar_type = TREE_TYPE (PHI_RESULT (stmt));
1736 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
1737 group_size);
1738 if (!vect_record_max_nunits (vinfo, stmt_info, group_size, vectype,
1739 max_nunits))
1740 return NULL;
1741
1742 vect_def_type def_type = STMT_VINFO_DEF_TYPE (stmt_info);
1743 if (def_type == vect_induction_def)
1744 {
1745 /* Induction PHIs are not cycles but walk the initial
1746 value. Only for inner loops though, for outer loops
1747 we need to pick up the value from the actual PHIs
1748 to more easily support peeling and epilogue vectorization. */
1749 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1750 if (!nested_in_vect_loop_p (loop, stmt_info))
1751 skip_args[loop_preheader_edge (loop)->dest_idx] = true;
1752 else
1753 loop = loop->inner;
1754 skip_args[loop_latch_edge (loop)->dest_idx] = true;
1755 }
1756 else if (def_type == vect_reduction_def
1757 || def_type == vect_double_reduction_def
1758 || def_type == vect_nested_cycle
1759 || def_type == vect_first_order_recurrence)
1760 {
1761 /* Else def types have to match. */
1762 stmt_vec_info other_info;
1763 bool all_same = true;
1764 FOR_EACH_VEC_ELT (stmts, i, other_info)
1765 {
1766 if (STMT_VINFO_DEF_TYPE (other_info) != def_type)
1767 return NULL;
1768 if (other_info != stmt_info)
1769 all_same = false;
1770 }
1771 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1772 /* Reduction initial values are not explicitly represented. */
1773 if (def_type != vect_first_order_recurrence
1774 && !nested_in_vect_loop_p (loop, stmt_info))
1775 skip_args[loop_preheader_edge (loop)->dest_idx] = true;
1776 /* Reduction chain backedge defs are filled manually.
1777 ??? Need a better way to identify a SLP reduction chain PHI.
1778 Or a better overall way to SLP match those. */
1779 if (all_same && def_type == vect_reduction_def)
1780 skip_args[loop_latch_edge (loop)->dest_idx] = true;
1781 }
1782 else if (def_type != vect_internal_def)
1783 return NULL;
1784 }
1785
1786
1787 bool two_operators = false;
1788 unsigned char *swap = XALLOCAVEC (unsigned char, group_size);
1789 tree vectype = NULL_TREE;
1790 if (!vect_build_slp_tree_1 (vinfo, swap, stmts, group_size,
1791 &this_max_nunits, matches, &two_operators,
1792 &vectype))
1793 return NULL;
1794
1795 /* If the SLP node is a load, terminate the recursion unless masked. */
1796 if (STMT_VINFO_DATA_REF (stmt_info)
1797 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1798 {
1799 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
1800 gcc_assert (gimple_call_internal_p (stmt, IFN_MASK_LOAD)
1801 || gimple_call_internal_p (stmt, IFN_GATHER_LOAD)
1802 || gimple_call_internal_p (stmt, IFN_MASK_GATHER_LOAD));
1803 else
1804 {
1805 *max_nunits = this_max_nunits;
1806 (*tree_size)++;
1807 node = vect_create_new_slp_node (node, stmts, 0);
1808 SLP_TREE_VECTYPE (node) = vectype;
1809 /* And compute the load permutation. Whether it is actually
1810 a permutation depends on the unrolling factor which is
1811 decided later. */
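/* For illustration (hypothetical example): with a group of four loads
   from a[0..3] appearing in this node in the order a[2], a[0], a[3],
   a[1], the positions relative to the interleaving chain head a[0]
   yield the load permutation { 2, 0, 3, 1 }.  */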
1812 vec<unsigned> load_permutation;
1813 int j;
1814 stmt_vec_info load_info;
1815 load_permutation.create (group_size);
1816 stmt_vec_info first_stmt_info
1817 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
1818 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
1819 {
1820 int load_place;
1821 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1822 load_place = vect_get_place_in_interleaving_chain
1823 (load_info, first_stmt_info);
1824 else
1825 load_place = 0;
1826 gcc_assert (load_place != -1);
1827 load_permutation.safe_push (load_place);
1828 }
1829 SLP_TREE_LOAD_PERMUTATION (node) = load_permutation;
1830 return node;
1831 }
1832 }
1833 else if (gimple_assign_single_p (stmt_info->stmt)
1834 && !gimple_vuse (stmt_info->stmt)
1835 && gimple_assign_rhs_code (stmt_info->stmt) == BIT_FIELD_REF)
1836 {
1837 /* vect_build_slp_tree_1 determined all BIT_FIELD_REFs reference
1838 the same SSA name vector of a type compatible with vectype. */
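/* A sketch of the lane computation below (hypothetical example): for a
   V4SI vector VEC, the stmt _1 = BIT_FIELD_REF <VEC, 32, 64> extracts
   32 bits at bit offset 64, i.e. lane 64 / 32 == 2, so the lane
   permutation records the pair (0, 2) for that lane.  */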
1839 vec<std::pair<unsigned, unsigned> > lperm = vNULL;
1840 tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0);
1841 stmt_vec_info estmt_info;
1842 FOR_EACH_VEC_ELT (stmts, i, estmt_info)
1843 {
1844 gassign *estmt = as_a <gassign *> (estmt_info->stmt);
1845 tree bfref = gimple_assign_rhs1 (estmt);
1846 HOST_WIDE_INT lane;
1847 if (!known_eq (bit_field_size (bfref),
1848 tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (vectype))))
1849 || !constant_multiple_p (bit_field_offset (bfref),
1850 bit_field_size (bfref), &lane))
1851 {
1852 lperm.release ();
1853 matches[0] = false;
1854 return NULL;
1855 }
1856 lperm.safe_push (std::make_pair (0, (unsigned)lane));
1857 }
1858 slp_tree vnode = vect_create_new_slp_node (vNULL);
1859 if (operand_equal_p (TYPE_SIZE (vectype), TYPE_SIZE (TREE_TYPE (vec))))
1860 /* ??? We record vectype here but hide the eventually necessary
1861 punning, relying on code generation to materialize
1862 VIEW_CONVERT_EXPRs as necessary. We should instead make
1863 this explicit somehow. */
1864 SLP_TREE_VECTYPE (vnode) = vectype;
1865 else
1866 {
1867 /* For different size but compatible elements we can still
1868 use VEC_PERM_EXPR without punning. */
1869 gcc_assert (VECTOR_TYPE_P (TREE_TYPE (vec))
1870 && types_compatible_p (TREE_TYPE (vectype),
1871 TREE_TYPE (TREE_TYPE (vec))));
1872 SLP_TREE_VECTYPE (vnode) = TREE_TYPE (vec);
1873 }
1874 auto nunits = TYPE_VECTOR_SUBPARTS (SLP_TREE_VECTYPE (vnode));
1875 unsigned HOST_WIDE_INT const_nunits;
1876 if (nunits.is_constant (&const_nunits))
1877 SLP_TREE_LANES (vnode) = const_nunits;
1878 SLP_TREE_VEC_DEFS (vnode).safe_push (vec);
1879 /* We are always building a permutation node even if it is an identity
1880 permute to shield the rest of the vectorizer from the odd node
1881 representing an actual vector without any scalar ops.
1882 ??? We could hide it completely by making the permute node
1883 external? */
1884 node = vect_create_new_slp_node (node, stmts, 1);
1885 SLP_TREE_CODE (node) = VEC_PERM_EXPR;
1886 SLP_TREE_LANE_PERMUTATION (node) = lperm;
1887 SLP_TREE_VECTYPE (node) = vectype;
1888 SLP_TREE_CHILDREN (node).quick_push (vnode);
1889 return node;
1890 }
1891 /* When discovery reaches an associatable operation, see whether we can
1892 improve that to match up lanes in a way superior to the operand
1893 swapping code, which at most looks at two defs.
1894 ??? For BB vectorization we cannot do the brute-force search
1895 for matching as we can succeed by means of builds from scalars
1896 and have no good way to "cost" one build against another. */
1897 else if (is_a <loop_vec_info> (vinfo)
1898 /* ??? We don't handle !vect_internal_def defs below. */
1899 && STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
1900 && is_gimple_assign (stmt_info->stmt)
1901 && (associative_tree_code (gimple_assign_rhs_code (stmt_info->stmt))
1902 || gimple_assign_rhs_code (stmt_info->stmt) == MINUS_EXPR)
1903 && ((FLOAT_TYPE_P (vectype) && flag_associative_math)
1904 || (INTEGRAL_TYPE_P (TREE_TYPE (vectype))
1905 && TYPE_OVERFLOW_WRAPS (TREE_TYPE (vectype)))))
1906 {
1907 /* See if we have a chain of (mixed) adds or subtracts or other
1908 associatable ops. */
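/* Hypothetical illustration: a lane whose scalar code is
   _1 = a - b;  _2 = _1 + c;
   is linearized into the chain { (+, a), (-, b), (+, c) }, and the
   per-lane chains are then matched up entry by entry below.  */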
1909 enum tree_code code = gimple_assign_rhs_code (stmt_info->stmt);
1910 if (code == MINUS_EXPR)
1911 code = PLUS_EXPR;
1912 stmt_vec_info other_op_stmt_info = NULL;
1913 stmt_vec_info op_stmt_info = NULL;
1914 unsigned chain_len = 0;
1915 auto_vec<chain_op_t> chain;
1916 auto_vec<std::pair<tree_code, gimple *> > worklist;
1917 auto_vec<vec<chain_op_t> > chains (group_size);
1918 auto_vec<slp_tree, 4> children;
1919 bool hard_fail = true;
1920 for (unsigned lane = 0; lane < group_size; ++lane)
1921 {
1922 /* For each lane linearize the addition/subtraction (or other
1923 uniform associatable operation) expression tree. */
1924 gimple *op_stmt = NULL, *other_op_stmt = NULL;
1925 vect_slp_linearize_chain (vinfo, worklist, chain, code,
1926 stmts[lane]->stmt, op_stmt, other_op_stmt,
1927 NULL);
1928 if (!op_stmt_info && op_stmt)
1929 op_stmt_info = vinfo->lookup_stmt (op_stmt);
1930 if (!other_op_stmt_info && other_op_stmt)
1931 other_op_stmt_info = vinfo->lookup_stmt (other_op_stmt);
1932 if (chain.length () == 2)
1933 {
1934 /* In a chain of just two elements resort to the regular
1935 operand swapping scheme. If we run into a length
1936 mismatch still hard-FAIL. */
1937 if (chain_len == 0)
1938 hard_fail = false;
1939 else
1940 {
1941 matches[lane] = false;
1942 /* ??? We might want to process the other lanes, but
1943 make sure to not give false matching hints to the
1944 caller for lanes we did not process. */
1945 if (lane != group_size - 1)
1946 matches[0] = false;
1947 }
1948 break;
1949 }
1950 else if (chain_len == 0)
1951 chain_len = chain.length ();
1952 else if (chain.length () != chain_len)
1953 {
1954 /* ??? Here we could slip in magic to compensate with
1955 neutral operands. */
1956 matches[lane] = false;
1957 if (lane != group_size - 1)
1958 matches[0] = false;
1959 break;
1960 }
1961 chains.quick_push (chain.copy ());
1962 chain.truncate (0);
1963 }
1964 if (chains.length () == group_size)
1965 {
1966 /* We cannot yet use SLP_TREE_CODE to communicate the operation. */
1967 if (!op_stmt_info)
1968 {
1969 hard_fail = false;
1970 goto out;
1971 }
1972 /* Now we have a set of chains with the same length. */
1973 /* 1. pre-sort according to def_type and operation. */
1974 for (unsigned lane = 0; lane < group_size; ++lane)
1975 chains[lane].stablesort (dt_sort_cmp, vinfo);
1976 if (dump_enabled_p ())
1977 {
1978 dump_printf_loc (MSG_NOTE, vect_location,
1979 "pre-sorted chains of %s\n",
1980 get_tree_code_name (code));
1981 for (unsigned lane = 0; lane < group_size; ++lane)
1982 {
1983 for (unsigned opnum = 0; opnum < chain_len; ++opnum)
1984 dump_printf (MSG_NOTE, "%s %T ",
1985 get_tree_code_name (chains[lane][opnum].code),
1986 chains[lane][opnum].op);
1987 dump_printf (MSG_NOTE, "\n");
1988 }
1989 }
1990 /* 2. try to build children nodes, associating as necessary. */
1991 for (unsigned n = 0; n < chain_len; ++n)
1992 {
1993 vect_def_type dt = chains[0][n].dt;
1994 unsigned lane;
1995 for (lane = 0; lane < group_size; ++lane)
1996 if (chains[lane][n].dt != dt)
1997 {
1998 if (dt == vect_constant_def
1999 && chains[lane][n].dt == vect_external_def)
2000 dt = vect_external_def;
2001 else if (dt == vect_external_def
2002 && chains[lane][n].dt == vect_constant_def)
2003 ;
2004 else
2005 break;
2006 }
2007 if (lane != group_size)
2008 {
2009 if (dump_enabled_p ())
2010 dump_printf_loc (MSG_NOTE, vect_location,
2011 "giving up on chain due to mismatched "
2012 "def types\n");
2013 matches[lane] = false;
2014 if (lane != group_size - 1)
2015 matches[0] = false;
2016 goto out;
2017 }
2018 if (dt == vect_constant_def
2019 || dt == vect_external_def)
2020 {
2021 /* Check whether we can build the invariant. If we can't
2022 we never will be able to. */
2023 tree type = TREE_TYPE (chains[0][n].op);
2024 if (!GET_MODE_SIZE (vinfo->vector_mode).is_constant ()
2025 && (TREE_CODE (type) == BOOLEAN_TYPE
2026 || !can_duplicate_and_interleave_p (vinfo, group_size,
2027 type)))
2028 {
2029 matches[0] = false;
2030 goto out;
2031 }
2032 vec<tree> ops;
2033 ops.create (group_size);
2034 for (lane = 0; lane < group_size; ++lane)
2035 ops.quick_push (chains[lane][n].op);
2036 slp_tree child = vect_create_new_slp_node (ops);
2037 SLP_TREE_DEF_TYPE (child) = dt;
2038 children.safe_push (child);
2039 }
2040 else if (dt != vect_internal_def)
2041 {
2042 /* Not sure, we might need sth special.
2043 gcc.dg/vect/pr96854.c,
2044 gfortran.dg/vect/fast-math-pr37021.f90
2045 and gfortran.dg/vect/pr61171.f trigger. */
2046 /* Soft-fail for now. */
2047 hard_fail = false;
2048 goto out;
2049 }
2050 else
2051 {
2052 vec<stmt_vec_info> op_stmts;
2053 op_stmts.create (group_size);
2054 slp_tree child = NULL;
2055 /* Brute-force our way. We have to consider a lane
2056 failing after fixing an earlier fail up in the
2057 SLP discovery recursion. So track the current
2058 permute per lane. */
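/* E.g. if lane L fails to match at chain position N, the entries at
   positions N+1, N+2, ... are swapped into position N one at a time
   (tracked in PERMS[L]) and discovery is retried; when no later entry
   remains for a failing lane the search terminates.  */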
2059 unsigned *perms = XALLOCAVEC (unsigned, group_size);
2060 memset (perms, 0, sizeof (unsigned) * group_size);
2061 do
2062 {
2063 op_stmts.truncate (0);
2064 for (lane = 0; lane < group_size; ++lane)
2065 op_stmts.quick_push
2066 (vinfo->lookup_def (chains[lane][n].op));
2067 child = vect_build_slp_tree (vinfo, op_stmts,
2068 group_size, &this_max_nunits,
2069 matches, limit,
2070 &this_tree_size, bst_map);
2071 /* ??? We're likely getting too many fatal mismatches
2072 here so maybe we want to ignore them (but then we
2073 have no idea which lanes fatally mismatched). */
2074 if (child || !matches[0])
2075 break;
2076 /* Swap another lane we have not yet matched up into
2077 lanes that did not match. If we run out of
2078 permute possibilities for a lane terminate the
2079 search. */
2080 bool term = false;
2081 for (lane = 1; lane < group_size; ++lane)
2082 if (!matches[lane])
2083 {
2084 if (n + perms[lane] + 1 == chain_len)
2085 {
2086 term = true;
2087 break;
2088 }
2089 std::swap (chains[lane][n],
2090 chains[lane][n + perms[lane] + 1]);
2091 perms[lane]++;
2092 }
2093 if (term)
2094 break;
2095 }
2096 while (1);
2097 if (!child)
2098 {
2099 if (dump_enabled_p ())
2100 dump_printf_loc (MSG_NOTE, vect_location,
2101 "failed to match up op %d\n", n);
2102 op_stmts.release ();
2103 if (lane != group_size - 1)
2104 matches[0] = false;
2105 else
2106 matches[lane] = false;
2107 goto out;
2108 }
2109 if (dump_enabled_p ())
2110 {
2111 dump_printf_loc (MSG_NOTE, vect_location,
2112 "matched up op %d to\n", n);
2113 vect_print_slp_tree (MSG_NOTE, vect_location, child);
2114 }
2115 children.safe_push (child);
2116 }
2117 }
2118 /* 3. build SLP nodes to combine the chain. */
2119 for (unsigned lane = 0; lane < group_size; ++lane)
2120 if (chains[lane][0].code != code)
2121 {
2122 /* See if there's any alternate all-PLUS entry. */
2123 unsigned n;
2124 for (n = 1; n < chain_len; ++n)
2125 {
2126 for (lane = 0; lane < group_size; ++lane)
2127 if (chains[lane][n].code != code)
2128 break;
2129 if (lane == group_size)
2130 break;
2131 }
2132 if (n != chain_len)
2133 {
2134 /* Swap that in at first position. */
2135 std::swap (children[0], children[n]);
2136 for (lane = 0; lane < group_size; ++lane)
2137 std::swap (chains[lane][0], chains[lane][n]);
2138 }
2139 else
2140 {
2141 /* ??? When this triggers and we end up with two
2142 vect_constant/external_def up-front things break (ICE)
2143 spectacularly finding an insertion place for the
2144 all-constant op. We should have a fully
2145 vect_internal_def operand though(?) so we can swap
2146 that into first place and then prepend the all-zero
2147 constant. */
2148 if (dump_enabled_p ())
2149 dump_printf_loc (MSG_NOTE, vect_location,
2150 "inserting constant zero to compensate "
2151 "for (partially) negated first "
2152 "operand\n");
2153 chain_len++;
2154 for (lane = 0; lane < group_size; ++lane)
2155 chains[lane].safe_insert
2156 (0, chain_op_t (code, vect_constant_def, NULL_TREE));
2157 vec<tree> zero_ops;
2158 zero_ops.create (group_size);
2159 zero_ops.quick_push (build_zero_cst (TREE_TYPE (vectype)));
2160 for (lane = 1; lane < group_size; ++lane)
2161 zero_ops.quick_push (zero_ops[0]);
2162 slp_tree zero = vect_create_new_slp_node (zero_ops);
2163 SLP_TREE_DEF_TYPE (zero) = vect_constant_def;
2164 children.safe_insert (0, zero);
2165 }
2166 break;
2167 }
2168 for (unsigned i = 1; i < children.length (); ++i)
2169 {
2170 slp_tree op0 = children[i - 1];
2171 slp_tree op1 = children[i];
2172 bool this_two_op = false;
2173 for (unsigned lane = 0; lane < group_size; ++lane)
2174 if (chains[lane][i].code != chains[0][i].code)
2175 {
2176 this_two_op = true;
2177 break;
2178 }
2179 slp_tree child;
2180 if (i == children.length () - 1)
2181 child = vect_create_new_slp_node (node, stmts, 2);
2182 else
2183 child = vect_create_new_slp_node (2, ERROR_MARK);
2184 if (this_two_op)
2185 {
2186 vec<std::pair<unsigned, unsigned> > lperm;
2187 lperm.create (group_size);
2188 for (unsigned lane = 0; lane < group_size; ++lane)
2189 lperm.quick_push (std::make_pair
2190 (chains[lane][i].code != chains[0][i].code, lane));
2191 vect_slp_build_two_operator_nodes (child, vectype, op0, op1,
2192 (chains[0][i].code == code
2193 ? op_stmt_info
2194 : other_op_stmt_info),
2195 (chains[0][i].code == code
2196 ? other_op_stmt_info
2197 : op_stmt_info),
2198 lperm);
2199 }
2200 else
2201 {
2202 SLP_TREE_DEF_TYPE (child) = vect_internal_def;
2203 SLP_TREE_VECTYPE (child) = vectype;
2204 SLP_TREE_LANES (child) = group_size;
2205 SLP_TREE_CHILDREN (child).quick_push (op0);
2206 SLP_TREE_CHILDREN (child).quick_push (op1);
2207 SLP_TREE_REPRESENTATIVE (child)
2208 = (chains[0][i].code == code
2209 ? op_stmt_info : other_op_stmt_info);
2210 }
2211 children[i] = child;
2212 }
2213 *tree_size += this_tree_size + 1;
2214 *max_nunits = this_max_nunits;
2215 while (!chains.is_empty ())
2216 chains.pop ().release ();
2217 return node;
2218 }
2219 out:
2220 while (!children.is_empty ())
2221 vect_free_slp_tree (children.pop ());
2222 while (!chains.is_empty ())
2223 chains.pop ().release ();
2224 /* Hard-fail, otherwise we might run into quadratic processing of the
2225 chains starting one stmt into the chain again. */
2226 if (hard_fail)
2227 return NULL;
2228 /* Fall thru to normal processing. */
2229 }
2230
2231 /* Get at the operands, verifying they are compatible. */
2232 vec<slp_oprnd_info> oprnds_info = vect_create_oprnd_info (nops, group_size);
2233 slp_oprnd_info oprnd_info;
2234 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
2235 {
2236 int res = vect_get_and_check_slp_defs (vinfo, swap[i], skip_args,
2237 stmts, i, &oprnds_info);
2238 if (res != 0)
2239 matches[(res == -1) ? 0 : i] = false;
2240 if (!matches[0])
2241 break;
2242 }
2243 for (i = 0; i < group_size; ++i)
2244 if (!matches[i])
2245 {
2246 vect_free_oprnd_info (oprnds_info);
2247 return NULL;
2248 }
2249 swap = NULL;
2250
2251 auto_vec<slp_tree, 4> children;
2252
2253 stmt_info = stmts[0];
2254
2255 /* Create SLP_TREE nodes for the definition node/s. */
2256 FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
2257 {
2258 slp_tree child;
2259 unsigned int j;
2260
2261 /* We're skipping certain operands from processing, for example
2262 outer loop reduction initial defs. */
2263 if (skip_args[i])
2264 {
2265 children.safe_push (NULL);
2266 continue;
2267 }
2268
2269 if (oprnd_info->first_dt == vect_uninitialized_def)
2270 {
2271 /* COND_EXPRs may end up with one operand too many when the condition
2272 is an SSA name. */
2273 gcc_assert (i == 3 && nops == 4);
2274 continue;
2275 }
2276
2277 if (is_a <bb_vec_info> (vinfo)
2278 && oprnd_info->first_dt == vect_internal_def
2279 && !oprnd_info->any_pattern)
2280 {
2281 /* For BB vectorization, if all defs are the same do not
2282 bother to continue the build along the single-lane
2283 graph but use a splat of the scalar value. */
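/* Hypothetical example: if every lane uses the same SSA def _5 and its
   definition is not a load we may be able to CSE, the operand is
   demoted to vect_external_def below and code generation emits a splat
   of _5 instead of continuing the single-lane SLP build.  */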
2284 stmt_vec_info first_def = oprnd_info->def_stmts[0];
2285 for (j = 1; j < group_size; ++j)
2286 if (oprnd_info->def_stmts[j] != first_def)
2287 break;
2288 if (j == group_size
2289 /* But avoid doing this for loads where we may be
2290 able to CSE things, unless the stmt is not
2291 vectorizable. */
2292 && (!STMT_VINFO_VECTORIZABLE (first_def)
2293 || !gimple_vuse (first_def->stmt)))
2294 {
2295 if (dump_enabled_p ())
2296 dump_printf_loc (MSG_NOTE, vect_location,
2297 "Using a splat of the uniform operand %G",
2298 first_def->stmt);
2299 oprnd_info->first_dt = vect_external_def;
2300 }
2301 }
2302
2303 if (oprnd_info->first_dt == vect_external_def
2304 || oprnd_info->first_dt == vect_constant_def)
2305 {
2306 slp_tree invnode = vect_create_new_slp_node (oprnd_info->ops);
2307 SLP_TREE_DEF_TYPE (invnode) = oprnd_info->first_dt;
2308 oprnd_info->ops = vNULL;
2309 children.safe_push (invnode);
2310 continue;
2311 }
2312
2313 if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
2314 group_size, &this_max_nunits,
2315 matches, limit,
2316 &this_tree_size, bst_map)) != NULL)
2317 {
2318 oprnd_info->def_stmts = vNULL;
2319 children.safe_push (child);
2320 continue;
2321 }
2322
2323 /* If the SLP build for operand zero failed and operand zero
2324 and one can be commuted, try that for the scalar stmts
2325 that failed the match. */
2326 if (i == 0
2327 /* A first scalar stmt mismatch signals a fatal mismatch. */
2328 && matches[0]
2329 /* ??? For COND_EXPRs we can swap the comparison operands
2330 as well as the arms under some constraints. */
2331 && nops == 2
2332 && oprnds_info[1]->first_dt == vect_internal_def
2333 && is_gimple_assign (stmt_info->stmt)
2334 /* Swapping operands for reductions breaks assumptions later on. */
2335 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
2336 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
2337 {
2338 /* See whether we can swap the matching or the non-matching
2339 stmt operands. */
2340 bool swap_not_matching = true;
2341 do
2342 {
2343 for (j = 0; j < group_size; ++j)
2344 {
2345 if (matches[j] != !swap_not_matching)
2346 continue;
2347 stmt_vec_info stmt_info = stmts[j];
2348 /* Verify if we can swap operands of this stmt. */
2349 gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt);
2350 if (!stmt
2351 || !commutative_tree_code (gimple_assign_rhs_code (stmt)))
2352 {
2353 if (!swap_not_matching)
2354 goto fail;
2355 swap_not_matching = false;
2356 break;
2357 }
2358 }
2359 }
2360 while (j != group_size);
2361
2362 /* Swap mismatched definition stmts. */
2363 if (dump_enabled_p ())
2364 dump_printf_loc (MSG_NOTE, vect_location,
2365 "Re-trying with swapped operands of stmts ");
2366 for (j = 0; j < group_size; ++j)
2367 if (matches[j] == !swap_not_matching)
2368 {
2369 std::swap (oprnds_info[0]->def_stmts[j],
2370 oprnds_info[1]->def_stmts[j]);
2371 std::swap (oprnds_info[0]->ops[j],
2372 oprnds_info[1]->ops[j]);
2373 if (dump_enabled_p ())
2374 dump_printf (MSG_NOTE, "%d ", j);
2375 }
2376 if (dump_enabled_p ())
2377 dump_printf (MSG_NOTE, "\n");
2378 /* After swapping some operands we lost track of whether an
2379 operand has any pattern defs, so be conservative here. */
2380 if (oprnds_info[0]->any_pattern || oprnds_info[1]->any_pattern)
2381 oprnds_info[0]->any_pattern = oprnds_info[1]->any_pattern = true;
2382 /* And try again with scratch 'matches' ... */
2383 bool *tem = XALLOCAVEC (bool, group_size);
2384 if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
2385 group_size, &this_max_nunits,
2386 tem, limit,
2387 &this_tree_size, bst_map)) != NULL)
2388 {
2389 oprnd_info->def_stmts = vNULL;
2390 children.safe_push (child);
2391 continue;
2392 }
2393 }
2394 fail:
2395
2396 /* If the SLP build failed and we analyze a basic-block
2397 simply treat nodes we fail to build as externally defined
2398 (and thus build vectors from the scalar defs).
2399 The cost model will reject outright expensive cases.
2400 ??? This doesn't treat cases where permutation ultimately
2401 fails (or we don't try permutation below). Ideally we'd
2402 even compute a permutation that will end up with the maximum
2403 SLP tree size... */
2404 if (is_a <bb_vec_info> (vinfo)
2405 /* ??? Rejecting patterns this way doesn't work. We'd have to
2406 do extra work to cancel the pattern so the uses see the
2407 scalar version. */
2408 && !is_pattern_stmt_p (stmt_info)
2409 && !oprnd_info->any_pattern)
2410 {
2411 /* But if there's a leading vector sized set of matching stmts
2412 fail here so we can split the group. This matches the condition
2413 vect_analyze_slp_instance uses. */
2414 /* ??? We might want to split here and combine the results to support
2415 multiple vector sizes better. */
2416 for (j = 0; j < group_size; ++j)
2417 if (!matches[j])
2418 break;
2419 if (!known_ge (j, TYPE_VECTOR_SUBPARTS (vectype)))
2420 {
2421 if (dump_enabled_p ())
2422 dump_printf_loc (MSG_NOTE, vect_location,
2423 "Building vector operands from scalars\n");
2424 this_tree_size++;
2425 child = vect_create_new_slp_node (oprnd_info->ops);
2426 children.safe_push (child);
2427 oprnd_info->ops = vNULL;
2428 continue;
2429 }
2430 }
2431
2432 gcc_assert (child == NULL);
2433 FOR_EACH_VEC_ELT (children, j, child)
2434 if (child)
2435 vect_free_slp_tree (child);
2436 vect_free_oprnd_info (oprnds_info);
2437 return NULL;
2438 }
2439
2440 vect_free_oprnd_info (oprnds_info);
2441
2442 /* If all children of this node are built up from uniform scalars, or
2443 if building them requires more than one possibly expensive vector
2444 construction, just throw the node away, causing it to be built up from
2445 scalars instead. The exception is the SLP node for the vector store. */
2446 if (is_a <bb_vec_info> (vinfo)
2447 && !STMT_VINFO_GROUPED_ACCESS (stmt_info)
2448 /* ??? Rejecting patterns this way doesn't work. We'd have to
2449 do extra work to cancel the pattern so the uses see the
2450 scalar version. */
2451 && !is_pattern_stmt_p (stmt_info))
2452 {
2453 slp_tree child;
2454 unsigned j;
2455 bool all_uniform_p = true;
2456 unsigned n_vector_builds = 0;
2457 FOR_EACH_VEC_ELT (children, j, child)
2458 {
2459 if (!child)
2460 ;
2461 else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
2462 all_uniform_p = false;
2463 else if (!vect_slp_tree_uniform_p (child))
2464 {
2465 all_uniform_p = false;
2466 if (SLP_TREE_DEF_TYPE (child) == vect_external_def)
2467 n_vector_builds++;
2468 }
2469 }
2470 if (all_uniform_p
2471 || n_vector_builds > 1
2472 || (n_vector_builds == children.length ()
2473 && is_a <gphi *> (stmt_info->stmt)))
2474 {
2475 /* Roll back. */
2476 matches[0] = false;
2477 FOR_EACH_VEC_ELT (children, j, child)
2478 if (child)
2479 vect_free_slp_tree (child);
2480
2481 if (dump_enabled_p ())
2482 dump_printf_loc (MSG_NOTE, vect_location,
2483 "Building parent vector operands from "
2484 "scalars instead\n");
2485 return NULL;
2486 }
2487 }
2488
2489 *tree_size += this_tree_size + 1;
2490 *max_nunits = this_max_nunits;
2491
2492 if (two_operators)
2493 {
2494 /* ??? We'd likely want to either cache in bst_map sth like
2495 { a+b, NULL, a+b, NULL } and { NULL, a-b, NULL, a-b } or
2496 the true { a+b, a+b, a+b, a+b } ... but there we don't have
2497 explicit stmts to put in so the keying on 'stmts' doesn't
2498 work (but we have the same issue with nodes that use 'ops'). */
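/* Hypothetical example: for a two-operator group
   { a0 + b0, a1 - b1, a2 + b2, a3 - b3 } node ONE gets the all-PLUS
   variant and node TWO the all-MINUS variant over the same children;
   the VEC_PERM node built below blends them using the lane
   permutation { (0,0), (1,1), (0,2), (1,3) }.  */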
2499 slp_tree one = new _slp_tree;
2500 slp_tree two = new _slp_tree;
2501 SLP_TREE_DEF_TYPE (one) = vect_internal_def;
2502 SLP_TREE_DEF_TYPE (two) = vect_internal_def;
2503 SLP_TREE_VECTYPE (one) = vectype;
2504 SLP_TREE_VECTYPE (two) = vectype;
2505 SLP_TREE_CHILDREN (one).safe_splice (children);
2506 SLP_TREE_CHILDREN (two).safe_splice (children);
2507 slp_tree child;
2508 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (two), i, child)
2509 SLP_TREE_REF_COUNT (child)++;
2510
2511 /* Here we record the original defs since this
2512 node represents the final lane configuration. */
2513 node = vect_create_new_slp_node (node, stmts, 2);
2514 SLP_TREE_VECTYPE (node) = vectype;
2515 SLP_TREE_CODE (node) = VEC_PERM_EXPR;
2516 SLP_TREE_CHILDREN (node).quick_push (one);
2517 SLP_TREE_CHILDREN (node).quick_push (two);
2518 gassign *stmt = as_a <gassign *> (stmts[0]->stmt);
2519 enum tree_code code0 = gimple_assign_rhs_code (stmt);
2520 enum tree_code ocode = ERROR_MARK;
2521 stmt_vec_info ostmt_info;
2522 unsigned j = 0;
2523 FOR_EACH_VEC_ELT (stmts, i, ostmt_info)
2524 {
2525 gassign *ostmt = as_a <gassign *> (ostmt_info->stmt);
2526 if (gimple_assign_rhs_code (ostmt) != code0)
2527 {
2528 SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (1, i));
2529 ocode = gimple_assign_rhs_code (ostmt);
2530 j = i;
2531 }
2532 else
2533 SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (0, i));
2534 }
2535 SLP_TREE_CODE (one) = code0;
2536 SLP_TREE_CODE (two) = ocode;
2537 SLP_TREE_LANES (one) = stmts.length ();
2538 SLP_TREE_LANES (two) = stmts.length ();
2539 SLP_TREE_REPRESENTATIVE (one) = stmts[0];
2540 SLP_TREE_REPRESENTATIVE (two) = stmts[j];
2541 return node;
2542 }
2543
2544 node = vect_create_new_slp_node (node, stmts, nops);
2545 SLP_TREE_VECTYPE (node) = vectype;
2546 SLP_TREE_CHILDREN (node).splice (children);
2547 return node;
2548 }
2549
2550 /* Dump a single SLP tree NODE. */
2551
2552 static void
2553 vect_print_slp_tree (dump_flags_t dump_kind, dump_location_t loc,
2554 slp_tree node)
2555 {
2556 unsigned i, j;
2557 slp_tree child;
2558 stmt_vec_info stmt_info;
2559 tree op;
2560
2561 dump_metadata_t metadata (dump_kind, loc.get_impl_location ());
2562 dump_user_location_t user_loc = loc.get_user_location ();
2563 dump_printf_loc (metadata, user_loc,
2564 "node%s %p (max_nunits=" HOST_WIDE_INT_PRINT_UNSIGNED
2565 ", refcnt=%u)",
2566 SLP_TREE_DEF_TYPE (node) == vect_external_def
2567 ? " (external)"
2568 : (SLP_TREE_DEF_TYPE (node) == vect_constant_def
2569 ? " (constant)"
2570 : ""), (void *) node,
2571 estimated_poly_value (node->max_nunits),
2572 SLP_TREE_REF_COUNT (node));
2573 if (SLP_TREE_VECTYPE (node))
2574 dump_printf (metadata, " %T", SLP_TREE_VECTYPE (node));
2575 dump_printf (metadata, "\n");
2576 if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
2577 {
2578 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
2579 dump_printf_loc (metadata, user_loc, "op: VEC_PERM_EXPR\n");
2580 else
2581 dump_printf_loc (metadata, user_loc, "op template: %G",
2582 SLP_TREE_REPRESENTATIVE (node)->stmt);
2583 }
2584 if (SLP_TREE_SCALAR_STMTS (node).exists ())
2585 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2586 dump_printf_loc (metadata, user_loc, "\tstmt %u %G", i, stmt_info->stmt);
2587 else
2588 {
2589 dump_printf_loc (metadata, user_loc, "\t{ ");
2590 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
2591 dump_printf (metadata, "%T%s ", op,
2592 i < SLP_TREE_SCALAR_OPS (node).length () - 1 ? "," : "");
2593 dump_printf (metadata, "}\n");
2594 }
2595 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2596 {
2597 dump_printf_loc (metadata, user_loc, "\tload permutation {");
2598 FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (node), i, j)
2599 dump_printf (dump_kind, " %u", j);
2600 dump_printf (dump_kind, " }\n");
2601 }
2602 if (SLP_TREE_LANE_PERMUTATION (node).exists ())
2603 {
2604 dump_printf_loc (metadata, user_loc, "\tlane permutation {");
2605 for (i = 0; i < SLP_TREE_LANE_PERMUTATION (node).length (); ++i)
2606 dump_printf (dump_kind, " %u[%u]",
2607 SLP_TREE_LANE_PERMUTATION (node)[i].first,
2608 SLP_TREE_LANE_PERMUTATION (node)[i].second);
2609 dump_printf (dump_kind, " }\n");
2610 }
2611 if (SLP_TREE_CHILDREN (node).is_empty ())
2612 return;
2613 dump_printf_loc (metadata, user_loc, "\tchildren");
2614 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2615 dump_printf (dump_kind, " %p", (void *)child);
2616 dump_printf (dump_kind, "\n");
2617 }
2618
2619 DEBUG_FUNCTION void
2620 debug (slp_tree node)
2621 {
2622 debug_dump_context ctx;
2623 vect_print_slp_tree (MSG_NOTE,
2624 dump_location_t::from_location_t (UNKNOWN_LOCATION),
2625 node);
2626 }
2627
2628 /* Recursive helper for the dot producer below. */
2629
2630 static void
2631 dot_slp_tree (FILE *f, slp_tree node, hash_set<slp_tree> &visited)
2632 {
2633 if (visited.add (node))
2634 return;
2635
2636 fprintf (f, "\"%p\" [label=\"", (void *)node);
2637 vect_print_slp_tree (MSG_NOTE,
2638 dump_location_t::from_location_t (UNKNOWN_LOCATION),
2639 node);
2640 fprintf (f, "\"];\n");
2641
2642
2643 for (slp_tree child : SLP_TREE_CHILDREN (node))
2644 fprintf (f, "\"%p\" -> \"%p\";", (void *)node, (void *)child);
2645
2646 for (slp_tree child : SLP_TREE_CHILDREN (node))
2647 if (child)
2648 dot_slp_tree (f, child, visited);
2649 }
2650
2651 DEBUG_FUNCTION void
2652 dot_slp_tree (const char *fname, slp_tree node)
2653 {
2654 FILE *f = fopen (fname, "w");
2655 fprintf (f, "digraph {\n");
2656 fflush (f);
2657 {
2658 debug_dump_context ctx (f);
2659 hash_set<slp_tree> visited;
2660 dot_slp_tree (f, node, visited);
2661 }
2662 fflush (f);
2663 fprintf (f, "}\n");
2664 fclose (f);
2665 }
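/* Typical use is from a debugger, e.g.
     (gdb) call dot_slp_tree ("/tmp/slp.dot", node)
   followed by rendering the dump with graphviz, e.g.
     dot -Tsvg /tmp/slp.dot -o slp.svg  */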
2666
2667 /* Dump a slp tree NODE using flags specified in DUMP_KIND. */
2668
2669 static void
2670 vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
2671 slp_tree node, hash_set<slp_tree> &visited)
2672 {
2673 unsigned i;
2674 slp_tree child;
2675
2676 if (visited.add (node))
2677 return;
2678
2679 vect_print_slp_tree (dump_kind, loc, node);
2680
2681 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2682 if (child)
2683 vect_print_slp_graph (dump_kind, loc, child, visited);
2684 }
2685
2686 static void
2687 vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
2688 slp_tree entry)
2689 {
2690 hash_set<slp_tree> visited;
2691 vect_print_slp_graph (dump_kind, loc, entry, visited);
2692 }
2693
2694 /* Mark the tree rooted at NODE with PURE_SLP. */
2695
2696 static void
2697 vect_mark_slp_stmts (slp_tree node, hash_set<slp_tree> &visited)
2698 {
2699 int i;
2700 stmt_vec_info stmt_info;
2701 slp_tree child;
2702
2703 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2704 return;
2705
2706 if (visited.add (node))
2707 return;
2708
2709 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2710 STMT_SLP_TYPE (stmt_info) = pure_slp;
2711
2712 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2713 if (child)
2714 vect_mark_slp_stmts (child, visited);
2715 }
2716
2717 static void
2718 vect_mark_slp_stmts (slp_tree node)
2719 {
2720 hash_set<slp_tree> visited;
2721 vect_mark_slp_stmts (node, visited);
2722 }
2723
2724 /* Mark the statements of the tree rooted at NODE as relevant (vect_used). */
2725
2726 static void
2727 vect_mark_slp_stmts_relevant (slp_tree node, hash_set<slp_tree> &visited)
2728 {
2729 int i;
2730 stmt_vec_info stmt_info;
2731 slp_tree child;
2732
2733 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2734 return;
2735
2736 if (visited.add (node))
2737 return;
2738
2739 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2740 {
2741 gcc_assert (!STMT_VINFO_RELEVANT (stmt_info)
2742 || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope);
2743 STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope;
2744 }
2745
2746 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2747 if (child)
2748 vect_mark_slp_stmts_relevant (child, visited);
2749 }
2750
2751 static void
2752 vect_mark_slp_stmts_relevant (slp_tree node)
2753 {
2754 hash_set<slp_tree> visited;
2755 vect_mark_slp_stmts_relevant (node, visited);
2756 }
2757
2758
2759 /* Gather loads in the SLP graph rooted at NODE and populate the LOADS array. */
2760
2761 static void
2762 vect_gather_slp_loads (vec<slp_tree> &loads, slp_tree node,
2763 hash_set<slp_tree> &visited)
2764 {
2765 if (!node || visited.add (node))
2766 return;
2767
2768 if (SLP_TREE_CHILDREN (node).length () == 0)
2769 {
2770 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2771 return;
2772 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2773 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2774 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
2775 loads.safe_push (node);
2776 }
2777 else
2778 {
2779 unsigned i;
2780 slp_tree child;
2781 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2782 vect_gather_slp_loads (loads, child, visited);
2783 }
2784 }
2785
2786
2787 /* Find the last scalar stmt in NODE. */
2788
2789 stmt_vec_info
2790 vect_find_last_scalar_stmt_in_slp (slp_tree node)
2791 {
2792 stmt_vec_info last = NULL;
2793 stmt_vec_info stmt_vinfo;
2794
2795 for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
2796 {
2797 stmt_vinfo = vect_orig_stmt (stmt_vinfo);
2798 last = last ? get_later_stmt (stmt_vinfo, last) : stmt_vinfo;
2799 }
2800
2801 return last;
2802 }
2803
2804 /* Find the first stmt in NODE. */
2805
2806 stmt_vec_info
2807 vect_find_first_scalar_stmt_in_slp (slp_tree node)
2808 {
2809 stmt_vec_info first = NULL;
2810 stmt_vec_info stmt_vinfo;
2811
2812 for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
2813 {
2814 stmt_vinfo = vect_orig_stmt (stmt_vinfo);
2815 if (!first
2816 || get_later_stmt (stmt_vinfo, first) == first)
2817 first = stmt_vinfo;
2818 }
2819
2820 return first;
2821 }
2822
2823 /* Splits a group of stores, currently beginning at FIRST_VINFO, into
2824 two groups: one (still beginning at FIRST_VINFO) of size GROUP1_SIZE
2825 (also containing the first GROUP1_SIZE stmts, since stores are
2826 consecutive), the second containing the remainder.
2827 Return the first stmt in the second group. */
2828
2829 static stmt_vec_info
2830 vect_split_slp_store_group (stmt_vec_info first_vinfo, unsigned group1_size)
2831 {
2832 gcc_assert (DR_GROUP_FIRST_ELEMENT (first_vinfo) == first_vinfo);
2833 gcc_assert (group1_size > 0);
2834 int group2_size = DR_GROUP_SIZE (first_vinfo) - group1_size;
2835 gcc_assert (group2_size > 0);
2836 DR_GROUP_SIZE (first_vinfo) = group1_size;
2837
2838 stmt_vec_info stmt_info = first_vinfo;
2839 for (unsigned i = group1_size; i > 1; i--)
2840 {
2841 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info);
2842 gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
2843 }
2844 /* STMT is now the last element of the first group. */
2845 stmt_vec_info group2 = DR_GROUP_NEXT_ELEMENT (stmt_info);
2846 DR_GROUP_NEXT_ELEMENT (stmt_info) = 0;
2847
2848 DR_GROUP_SIZE (group2) = group2_size;
2849 for (stmt_info = group2; stmt_info;
2850 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info))
2851 {
2852 DR_GROUP_FIRST_ELEMENT (stmt_info) = group2;
2853 gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
2854 }
2855
2856 /* For the second group, the DR_GROUP_GAP is that before the original group,
2857 plus skipping over the first vector. */
2858 DR_GROUP_GAP (group2) = DR_GROUP_GAP (first_vinfo) + group1_size;
2859
2860 /* DR_GROUP_GAP of the first group now has to skip over the second group too. */
2861 DR_GROUP_GAP (first_vinfo) += group2_size;
2862
2863 if (dump_enabled_p ())
2864 dump_printf_loc (MSG_NOTE, vect_location, "Split group into %d and %d\n",
2865 group1_size, group2_size);
2866
2867 return group2;
2868 }
2869
2870 /* Calculate the unrolling factor for an SLP instance with GROUP_SIZE
2871 statements and a vector of NUNITS elements. */
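/* For example, with NUNITS == 4 and GROUP_SIZE == 6 the common multiple
   is 12, giving an unrolling factor of 12 / 6 == 2: two copies of the
   scalar group exactly fill three vectors.  */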
2872
2873 static poly_uint64
2874 calculate_unrolling_factor (poly_uint64 nunits, unsigned int group_size)
2875 {
2876 return exact_div (common_multiple (nunits, group_size), group_size);
2877 }
2878
2879 /* Helper that checks to see if a node is a load node. */
2880
2881 static inline bool
2882 vect_is_slp_load_node (slp_tree root)
2883 {
2884 return SLP_TREE_DEF_TYPE (root) == vect_internal_def
2885 && STMT_VINFO_GROUPED_ACCESS (SLP_TREE_REPRESENTATIVE (root))
2886 && DR_IS_READ (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (root)));
2887 }
2888
2889
2890 /* Helper function of optimize_load_redistribution that performs the operation
2891 recursively. */
2892
2893 static slp_tree
2894 optimize_load_redistribution_1 (scalar_stmts_to_slp_tree_map_t *bst_map,
2895 vec_info *vinfo, unsigned int group_size,
2896 hash_map<slp_tree, slp_tree> *load_map,
2897 slp_tree root)
2898 {
2899 if (slp_tree *leader = load_map->get (root))
2900 return *leader;
2901
2902 slp_tree node;
2903 unsigned i;
2904
2905 /* For now, we don't know anything about externals so do not do anything. */
2906 if (!root || SLP_TREE_DEF_TYPE (root) != vect_internal_def)
2907 return NULL;
2908 else if (SLP_TREE_CODE (root) == VEC_PERM_EXPR)
2909 {
2910 /* First convert this node into a load node and add it to the leaves
2911 list and flatten the permute from a lane to a load one. If it's
2912 unneeded it will be elided later. */
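/* Hypothetical sketch: a VEC_PERM node with lane permutation
   { (0,1), (1,0) } over two single-DR load children is rebuilt as one
   load node over the scalar stmts those pairs select; any reordering
   then shows up as that node's load permutation instead.  */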
2913 vec<stmt_vec_info> stmts;
2914 stmts.create (SLP_TREE_LANES (root));
2915 lane_permutation_t lane_perm = SLP_TREE_LANE_PERMUTATION (root);
2916 for (unsigned j = 0; j < lane_perm.length (); j++)
2917 {
2918 std::pair<unsigned, unsigned> perm = lane_perm[j];
2919 node = SLP_TREE_CHILDREN (root)[perm.first];
2920
2921 if (!vect_is_slp_load_node (node)
2922 || SLP_TREE_CHILDREN (node).exists ())
2923 {
2924 stmts.release ();
2925 goto next;
2926 }
2927
2928 stmts.quick_push (SLP_TREE_SCALAR_STMTS (node)[perm.second]);
2929 }
2930
2931 if (dump_enabled_p ())
2932 dump_printf_loc (MSG_NOTE, vect_location,
2933 "converting stmts on permute node %p\n",
2934 (void *) root);
2935
2936 bool *matches = XALLOCAVEC (bool, group_size);
2937 poly_uint64 max_nunits = 1;
2938 unsigned tree_size = 0, limit = 1;
2939 node = vect_build_slp_tree (vinfo, stmts, group_size, &max_nunits,
2940 matches, &limit, &tree_size, bst_map);
2941 if (!node)
2942 stmts.release ();
2943
2944 load_map->put (root, node);
2945 return node;
2946 }
2947
2948 next:
2949 load_map->put (root, NULL);
2950
2951 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
2952 {
2953 slp_tree value
2954 = optimize_load_redistribution_1 (bst_map, vinfo, group_size, load_map,
2955 node);
2956 if (value)
2957 {
2958 SLP_TREE_REF_COUNT (value)++;
2959 SLP_TREE_CHILDREN (root)[i] = value;
2960 /* ??? We know the original leaves of the replaced nodes will
2961 be referenced by bst_map, only the permutes created by
2962 pattern matching are not. */
2963 if (SLP_TREE_REF_COUNT (node) == 1)
2964 load_map->remove (node);
2965 vect_free_slp_tree (node);
2966 }
2967 }
2968
2969 return NULL;
2970 }
2971
2972 /* Temporary workaround for loads not being CSEd during SLP build. This
2973 function will traverse the SLP tree rooted in ROOT and find
2974 VEC_PERM nodes that blend vectors from multiple nodes that all read from the
2975 same DR such that the final operation is equal to a permuted load. Such
2976 NODES are then directly converted into LOADS themselves. The nodes are
2977 CSEd using BST_MAP. */
2978
2979 static void
2980 optimize_load_redistribution (scalar_stmts_to_slp_tree_map_t *bst_map,
2981 vec_info *vinfo, unsigned int group_size,
2982 hash_map<slp_tree, slp_tree> *load_map,
2983 slp_tree root)
2984 {
2985 slp_tree node;
2986 unsigned i;
2987
2988 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
2989 {
2990 slp_tree value
2991 = optimize_load_redistribution_1 (bst_map, vinfo, group_size, load_map,
2992 node);
2993 if (value)
2994 {
2995 SLP_TREE_REF_COUNT (value)++;
2996 SLP_TREE_CHILDREN (root)[i] = value;
2997 /* ??? We know the original leaves of the replaced nodes will
2998 be referenced by bst_map, only the permutes created by
2999 pattern matching are not. */
3000 if (SLP_TREE_REF_COUNT (node) == 1)
3001 load_map->remove (node);
3002 vect_free_slp_tree (node);
3003 }
3004 }
3005 }
3006
3007 /* Helper function of vect_match_slp_patterns.
3008
3009 Attempts to match patterns against the slp tree rooted in REF_NODE using
3010 VINFO. Patterns are matched in post-order traversal.
3011
3012 Returns true if any pattern matched in the subtree, in which case the
3013 matched nodes (possibly including REF_NODE itself) were replaced in place. */
3014
3015 static bool
3016 vect_match_slp_patterns_2 (slp_tree *ref_node, vec_info *vinfo,
3017 slp_tree_to_load_perm_map_t *perm_cache,
3018 slp_compat_nodes_map_t *compat_cache,
3019 hash_set<slp_tree> *visited)
3020 {
3021 unsigned i;
3022 slp_tree node = *ref_node;
3023 bool found_p = false;
3024 if (!node || visited->add (node))
3025 return false;
3026
3027 slp_tree child;
3028 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3029 found_p |= vect_match_slp_patterns_2 (&SLP_TREE_CHILDREN (node)[i],
3030 vinfo, perm_cache, compat_cache,
3031 visited);
3032
3033 for (unsigned x = 0; x < num__slp_patterns; x++)
3034 {
3035 vect_pattern *pattern
3036 = slp_patterns[x] (perm_cache, compat_cache, ref_node);
3037 if (pattern)
3038 {
3039 pattern->build (vinfo);
3040 delete pattern;
3041 found_p = true;
3042 }
3043 }
3044
3045 return found_p;
3046 }
3047
3048 /* Applies pattern matching to the SLP tree rooted at INSTANCE using
3049 vec_info VINFO.
3050
3051 The tree is modified in place and true is returned if any pattern
3052 matched. Patterns are tried in order and multiple patterns may match. */
3053
3054 static bool
3055 vect_match_slp_patterns (slp_instance instance, vec_info *vinfo,
3056 hash_set<slp_tree> *visited,
3057 slp_tree_to_load_perm_map_t *perm_cache,
3058 slp_compat_nodes_map_t *compat_cache)
3059 {
3060 DUMP_VECT_SCOPE ("vect_match_slp_patterns");
3061 slp_tree *ref_node = &SLP_INSTANCE_TREE (instance);
3062
3063 if (dump_enabled_p ())
3064 dump_printf_loc (MSG_NOTE, vect_location,
3065 "Analyzing SLP tree %p for patterns\n",
3066 (void *) SLP_INSTANCE_TREE (instance));
3067
3068 return vect_match_slp_patterns_2 (ref_node, vinfo, perm_cache, compat_cache,
3069 visited);
3070 }
3071
3072 /* STMT_INFO is a store group of size GROUP_SIZE that we are considering
3073 splitting into two, with the first split group having size NEW_GROUP_SIZE.
3074 Return true if we could use IFN_STORE_LANES instead and if that appears
3075 to be the better approach. */
3076
3077 static bool
3078 vect_slp_prefer_store_lanes_p (vec_info *vinfo, stmt_vec_info stmt_info,
3079 unsigned int group_size,
3080 unsigned int new_group_size)
3081 {
3082 tree scalar_type = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
3083 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
3084 if (!vectype)
3085 return false;
3086 /* Allow the split if one of the two new groups would operate on full
3087 vectors *within* rather than across one scalar loop iteration.
3088 This is purely a heuristic, but it should work well for group
3089 sizes of 3 and 4, where the possible splits are:
3090
3091 3->2+1: OK if the vector has exactly two elements
3092 4->2+2: Likewise
3093 4->3+1: Less clear-cut. */
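/* Concretely: splitting a group of 3 with NEW_GROUP_SIZE == 2 and a
   2-element vector makes NEW_GROUP_SIZE a multiple of the vector
   subparts, so we return false and let the split happen; with a
   4-element vector neither part fills a vector and we prefer
   IFN_STORE_LANES if the target supports it.  */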
3094 if (multiple_p (group_size - new_group_size, TYPE_VECTOR_SUBPARTS (vectype))
3095 || multiple_p (new_group_size, TYPE_VECTOR_SUBPARTS (vectype)))
3096 return false;
3097 return vect_store_lanes_supported (vectype, group_size, false) != IFN_LAST;
3098 }
3099
3100 /* Analyze an SLP instance starting from a group of grouped stores. Call
3101 vect_build_slp_tree to build a tree of packed stmts if possible.
3102 Return FALSE if it's impossible to SLP any stmt in the loop. */
3103
3104 static bool
3105 vect_analyze_slp_instance (vec_info *vinfo,
3106 scalar_stmts_to_slp_tree_map_t *bst_map,
3107 stmt_vec_info stmt_info, slp_instance_kind kind,
3108 unsigned max_tree_size, unsigned *limit);
3109
3110 /* Analyze an SLP instance starting from SCALAR_STMTS, which form a group
3111 of kind KIND. Return true if successful. */
3112
3113 static bool
3114 vect_build_slp_instance (vec_info *vinfo,
3115 slp_instance_kind kind,
3116 vec<stmt_vec_info> &scalar_stmts,
3117 vec<stmt_vec_info> &root_stmt_infos,
3118 vec<tree> &remain,
3119 unsigned max_tree_size, unsigned *limit,
3120 scalar_stmts_to_slp_tree_map_t *bst_map,
3121 /* ??? We need stmt_info for group splitting. */
3122 stmt_vec_info stmt_info_)
3123 {
3124 if (kind == slp_inst_kind_ctor)
3125 {
3126 if (dump_enabled_p ())
3127 dump_printf_loc (MSG_NOTE, vect_location,
3128 "Analyzing vectorizable constructor: %G\n",
3129 root_stmt_infos[0]->stmt);
3130 }
3131
3132 if (dump_enabled_p ())
3133 {
3134 dump_printf_loc (MSG_NOTE, vect_location,
3135 "Starting SLP discovery for\n");
3136 for (unsigned i = 0; i < scalar_stmts.length (); ++i)
3137 dump_printf_loc (MSG_NOTE, vect_location,
3138 " %G", scalar_stmts[i]->stmt);
3139 }
3140
3141 /* When a BB reduction doesn't have an even number of lanes
3142 strip it down, treating the remaining lane as scalar.
3143 ??? Selecting the optimal set of lanes to vectorize would be nice
3144 but SLP build for all lanes will fail quickly because we think
3145 we're going to need unrolling. */
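/* E.g. a five-lane BB reduction is stripped down to four SLP lanes here
   and the leftover scalar def is recorded in REMAIN, to be folded into
   the reduction result separately when the root stmt is generated.  */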
3146 if (kind == slp_inst_kind_bb_reduc
3147 && (scalar_stmts.length () & 1))
3148 remain.safe_insert (0, gimple_get_lhs (scalar_stmts.pop ()->stmt));
3149
3150 /* Build the tree for the SLP instance. */
3151 unsigned int group_size = scalar_stmts.length ();
3152 bool *matches = XALLOCAVEC (bool, group_size);
3153 poly_uint64 max_nunits = 1;
3154 unsigned tree_size = 0;
3155 unsigned i;
3156 slp_tree node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
3157 &max_nunits, matches, limit,
3158 &tree_size, bst_map);
3159 if (node != NULL)
3160 {
3161 /* Calculate the unrolling factor based on the smallest type. */
3162 poly_uint64 unrolling_factor
3163 = calculate_unrolling_factor (max_nunits, group_size);
3164
3165 if (maybe_ne (unrolling_factor, 1U)
3166 && is_a <bb_vec_info> (vinfo))
3167 {
3168 unsigned HOST_WIDE_INT const_max_nunits;
3169 if (!max_nunits.is_constant (&const_max_nunits)
3170 || const_max_nunits > group_size)
3171 {
3172 if (dump_enabled_p ())
3173 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3174 "Build SLP failed: store group "
3175 "size not a multiple of the vector size "
3176 "in basic block SLP\n");
3177 vect_free_slp_tree (node);
3178 return false;
3179 }
3180 /* Fatal mismatch. */
3181 if (dump_enabled_p ())
3182 dump_printf_loc (MSG_NOTE, vect_location,
3183 "SLP discovery succeeded but node needs "
3184 "splitting\n");
3185 memset (matches, true, group_size);
3186 matches[group_size / const_max_nunits * const_max_nunits] = false;
3187 vect_free_slp_tree (node);
3188 }
3189 else
3190 {
3191 /* Create a new SLP instance. */
3192 slp_instance new_instance = XNEW (class _slp_instance);
3193 SLP_INSTANCE_TREE (new_instance) = node;
3194 SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
3195 SLP_INSTANCE_LOADS (new_instance) = vNULL;
3196 SLP_INSTANCE_ROOT_STMTS (new_instance) = root_stmt_infos;
3197 SLP_INSTANCE_REMAIN_DEFS (new_instance) = remain;
3198 SLP_INSTANCE_KIND (new_instance) = kind;
3199 new_instance->reduc_phis = NULL;
3200 new_instance->cost_vec = vNULL;
3201 new_instance->subgraph_entries = vNULL;
3202
3203 if (dump_enabled_p ())
3204 dump_printf_loc (MSG_NOTE, vect_location,
3205 "SLP size %u vs. limit %u.\n",
3206 tree_size, max_tree_size);
3207
3208 /* Fixup SLP reduction chains. */
3209 if (kind == slp_inst_kind_reduc_chain)
3210 {
3211 /* If this is a reduction chain with a conversion in front
3212 amend the SLP tree with a node for that. */
3213 gimple *scalar_def
3214 = vect_orig_stmt (scalar_stmts[group_size - 1])->stmt;
3215 if (STMT_VINFO_DEF_TYPE (scalar_stmts[0]) != vect_reduction_def)
3216 {
3217 /* Get at the conversion stmt - we know it's the single use
3218 of the last stmt of the reduction chain. */
3219 use_operand_p use_p;
3220 bool r = single_imm_use (gimple_assign_lhs (scalar_def),
3221 &use_p, &scalar_def);
3222 gcc_assert (r);
3223 stmt_vec_info next_info = vinfo->lookup_stmt (scalar_def);
3224 next_info = vect_stmt_to_vectorize (next_info);
3225 scalar_stmts = vNULL;
3226 scalar_stmts.create (group_size);
3227 for (unsigned i = 0; i < group_size; ++i)
3228 scalar_stmts.quick_push (next_info);
3229 slp_tree conv = vect_create_new_slp_node (scalar_stmts, 1);
3230 SLP_TREE_VECTYPE (conv) = STMT_VINFO_VECTYPE (next_info);
3231 SLP_TREE_CHILDREN (conv).quick_push (node);
3232 SLP_INSTANCE_TREE (new_instance) = conv;
3233 /* We also have to fake this conversion stmt as SLP reduction
3234 group so we don't have to mess with too much code
3235 elsewhere. */
3236 REDUC_GROUP_FIRST_ELEMENT (next_info) = next_info;
3237 REDUC_GROUP_NEXT_ELEMENT (next_info) = NULL;
3238 }
3239 /* Fill the backedge child of the PHI SLP node. The
3240 general matching code cannot find it because the
3241 scalar code does not reflect how we vectorize the
3242 reduction. */
3243 use_operand_p use_p;
3244 imm_use_iterator imm_iter;
3245 class loop *loop = LOOP_VINFO_LOOP (as_a <loop_vec_info> (vinfo));
3246 FOR_EACH_IMM_USE_FAST (use_p, imm_iter,
3247 gimple_get_lhs (scalar_def))
3248 /* There are exactly two non-debug uses, the reduction
3249 PHI and the loop-closed PHI node. */
3250 if (!is_gimple_debug (USE_STMT (use_p))
3251 && gimple_bb (USE_STMT (use_p)) == loop->header)
3252 {
3253 auto_vec<stmt_vec_info, 64> phis (group_size);
3254 stmt_vec_info phi_info
3255 = vinfo->lookup_stmt (USE_STMT (use_p));
3256 for (unsigned i = 0; i < group_size; ++i)
3257 phis.quick_push (phi_info);
3258 slp_tree *phi_node = bst_map->get (phis);
3259 unsigned dest_idx = loop_latch_edge (loop)->dest_idx;
3260 SLP_TREE_CHILDREN (*phi_node)[dest_idx]
3261 = SLP_INSTANCE_TREE (new_instance);
3262 SLP_INSTANCE_TREE (new_instance)->refcnt++;
3263 }
3264 }
3265
3266 vinfo->slp_instances.safe_push (new_instance);
3267
3268 /* ??? We've replaced the old SLP_INSTANCE_GROUP_SIZE with
3269 the number of scalar stmts in the root in a few places.
3270 Verify that assumption holds. */
3271 gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
3272 .length () == group_size);
3273
3274 if (dump_enabled_p ())
3275 {
3276 dump_printf_loc (MSG_NOTE, vect_location,
3277 "Final SLP tree for instance %p:\n",
3278 (void *) new_instance);
3279 vect_print_slp_graph (MSG_NOTE, vect_location,
3280 SLP_INSTANCE_TREE (new_instance));
3281 }
3282
3283 return true;
3284 }
3285 }
3286 else
3287 {
3288 /* Failed to SLP. */
3289 /* Free the allocated memory. */
3290 scalar_stmts.release ();
3291 }
3292
3293 stmt_vec_info stmt_info = stmt_info_;
3294 /* Try to break the group up into pieces. */
3295 if (kind == slp_inst_kind_store)
3296 {
3297 /* ??? We could delay all the actual splitting of store-groups
3298 until after SLP discovery of the original group completed.
3299 Then we can recurse to vect_build_slp_instance directly. */
3300 for (i = 0; i < group_size; i++)
3301 if (!matches[i])
3302 break;
3303
3304 /* For basic block SLP, try to break the group up into multiples of
3305 a vector size. */
3306 if (is_a <bb_vec_info> (vinfo)
3307 && (i > 1 && i < group_size))
3308 {
3309 tree scalar_type
3310 = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
3311 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
3312 1 << floor_log2 (i));
3313 unsigned HOST_WIDE_INT const_nunits;
3314 if (vectype
3315 && TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits))
3316 {
3317 /* Split into two groups at the first vector boundary. */
3318 gcc_assert ((const_nunits & (const_nunits - 1)) == 0);
3319 unsigned group1_size = i & ~(const_nunits - 1);
3320
3321 if (dump_enabled_p ())
3322 dump_printf_loc (MSG_NOTE, vect_location,
3323 "Splitting SLP group at stmt %u\n", i);
3324 stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
3325 group1_size);
3326 bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
3327 kind, max_tree_size,
3328 limit);
3329 /* Split the rest at the failure point and possibly
3330 re-analyze the remaining matching part if it has
3331 at least two lanes. */
3332 if (group1_size < i
3333 && (i + 1 < group_size
3334 || i - group1_size > 1))
3335 {
3336 stmt_vec_info rest2 = rest;
3337 rest = vect_split_slp_store_group (rest, i - group1_size);
3338 if (i - group1_size > 1)
3339 res |= vect_analyze_slp_instance (vinfo, bst_map, rest2,
3340 kind, max_tree_size,
3341 limit);
3342 }
3343 /* Re-analyze the non-matching tail if it has at least
3344 two lanes. */
3345 if (i + 1 < group_size)
3346 res |= vect_analyze_slp_instance (vinfo, bst_map,
3347 rest, kind, max_tree_size,
3348 limit);
3349 return res;
3350 }
3351 }
3352
3353 /* For loop vectorization split into arbitrary pieces of size > 1. */
3354 if (is_a <loop_vec_info> (vinfo)
3355 && (i > 1 && i < group_size)
3356 && !vect_slp_prefer_store_lanes_p (vinfo, stmt_info, group_size, i))
3357 {
3358 unsigned group1_size = i;
3359
3360 if (dump_enabled_p ())
3361 dump_printf_loc (MSG_NOTE, vect_location,
3362 "Splitting SLP group at stmt %u\n", i);
3363
3364 stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
3365 group1_size);
3366 /* Loop vectorization cannot handle gaps in stores, make sure
3367 the split group appears as strided. */
3368 STMT_VINFO_STRIDED_P (rest) = 1;
3369 DR_GROUP_GAP (rest) = 0;
3370 STMT_VINFO_STRIDED_P (stmt_info) = 1;
3371 DR_GROUP_GAP (stmt_info) = 0;
3372
3373 bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
3374 kind, max_tree_size, limit);
3375 if (i + 1 < group_size)
3376 res |= vect_analyze_slp_instance (vinfo, bst_map,
3377 rest, kind, max_tree_size, limit);
3378
3379 return res;
3380 }
3381
3382 /* Even though the first vector did not all match, we might be able to SLP
3383 (some) of the remainder. FORNOW ignore this possibility. */
3384 }
3385
3386 /* Failed to SLP. */
3387 if (dump_enabled_p ())
3388 dump_printf_loc (MSG_NOTE, vect_location, "SLP discovery failed\n");
3389 return false;
3390 }
3391
3392
3393 /* Analyze an SLP instance starting from a group of grouped stores. Call
3394 vect_build_slp_tree to build a tree of packed stmts if possible.
3395 Return FALSE if it's impossible to SLP any stmt in the loop. */
3396
3397 static bool
3398 vect_analyze_slp_instance (vec_info *vinfo,
3399 scalar_stmts_to_slp_tree_map_t *bst_map,
3400 stmt_vec_info stmt_info,
3401 slp_instance_kind kind,
3402 unsigned max_tree_size, unsigned *limit)
3403 {
3404 unsigned int i;
3405 vec<stmt_vec_info> scalar_stmts;
3406
3407 if (is_a <bb_vec_info> (vinfo))
3408 vect_location = stmt_info->stmt;
3409
3410 stmt_vec_info next_info = stmt_info;
3411 if (kind == slp_inst_kind_store)
3412 {
3413 /* Collect the stores and store them in scalar_stmts. */
3414 scalar_stmts.create (DR_GROUP_SIZE (stmt_info));
3415 while (next_info)
3416 {
3417 scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info));
3418 next_info = DR_GROUP_NEXT_ELEMENT (next_info);
3419 }
3420 }
3421 else if (kind == slp_inst_kind_reduc_chain)
3422 {
3423 /* Collect the reduction stmts and store them in scalar_stmts. */
3424 scalar_stmts.create (REDUC_GROUP_SIZE (stmt_info));
3425 while (next_info)
3426 {
3427 scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info));
3428 next_info = REDUC_GROUP_NEXT_ELEMENT (next_info);
3429 }
3430 /* Mark the first element of the reduction chain as reduction to properly
3431 transform the node. In the reduction analysis phase only the last
3432 element of the chain is marked as reduction. */
3433 STMT_VINFO_DEF_TYPE (stmt_info)
3434 = STMT_VINFO_DEF_TYPE (scalar_stmts.last ());
3435 STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))
3436 = STMT_VINFO_REDUC_DEF (vect_orig_stmt (scalar_stmts.last ()));
3437 }
3438 else if (kind == slp_inst_kind_reduc_group)
3439 {
3440 /* Collect reduction statements. */
3441 const vec<stmt_vec_info> &reductions
3442 = as_a <loop_vec_info> (vinfo)->reductions;
3443 scalar_stmts.create (reductions.length ());
3444 for (i = 0; reductions.iterate (i, &next_info); i++)
3445 if ((STMT_VINFO_RELEVANT_P (next_info)
3446 || STMT_VINFO_LIVE_P (next_info))
3447 /* ??? Make sure we didn't skip a conversion around a reduction
3448 path. In that case we'd have to reverse engineer that conversion
3449 stmt following the chain using reduc_idx and from the PHI
3450 using reduc_def. */
3451 && STMT_VINFO_DEF_TYPE (next_info) == vect_reduction_def)
3452 scalar_stmts.quick_push (next_info);
3453 /* If fewer than two were relevant/live there's nothing to SLP. */
3454 if (scalar_stmts.length () < 2)
3455 return false;
3456 }
3457 else
3458 gcc_unreachable ();
3459
3460 vec<stmt_vec_info> roots = vNULL;
3461 vec<tree> remain = vNULL;
3462 /* Build the tree for the SLP instance. */
3463 bool res = vect_build_slp_instance (vinfo, kind, scalar_stmts,
3464 roots, remain,
3465 max_tree_size, limit, bst_map,
3466 kind == slp_inst_kind_store
3467 ? stmt_info : NULL);
3468
3469 /* ??? If this is slp_inst_kind_store and the above succeeded here's
3470 where we should do store group splitting. */
3471
3472 return res;
3473 }
3474
3475 /* Check if there are stmts in the loop that can be vectorized using SLP. Build SLP
3476 trees of packed scalar stmts if SLP is possible. */
3477
3478 opt_result
3479 vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
3480 {
3481 unsigned int i;
3482 stmt_vec_info first_element;
3483 slp_instance instance;
3484
3485 DUMP_VECT_SCOPE ("vect_analyze_slp");
3486
3487 unsigned limit = max_tree_size;
3488
3489 scalar_stmts_to_slp_tree_map_t *bst_map
3490 = new scalar_stmts_to_slp_tree_map_t ();
3491
3492 /* Find SLP sequences starting from groups of grouped stores. */
3493 FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
3494 vect_analyze_slp_instance (vinfo, bst_map, first_element,
3495 slp_inst_kind_store, max_tree_size, &limit);
3496
3497 if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
3498 {
3499 for (unsigned i = 0; i < bb_vinfo->roots.length (); ++i)
3500 {
3501 vect_location = bb_vinfo->roots[i].roots[0]->stmt;
3502 if (vect_build_slp_instance (bb_vinfo, bb_vinfo->roots[i].kind,
3503 bb_vinfo->roots[i].stmts,
3504 bb_vinfo->roots[i].roots,
3505 bb_vinfo->roots[i].remain,
3506 max_tree_size, &limit, bst_map, NULL))
3507 {
3508 bb_vinfo->roots[i].stmts = vNULL;
3509 bb_vinfo->roots[i].roots = vNULL;
3510 bb_vinfo->roots[i].remain = vNULL;
3511 }
3512 }
3513 }
3514
3515 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3516 {
3517 /* Find SLP sequences starting from reduction chains. */
3518 FOR_EACH_VEC_ELT (loop_vinfo->reduction_chains, i, first_element)
3519 if (! STMT_VINFO_RELEVANT_P (first_element)
3520 && ! STMT_VINFO_LIVE_P (first_element))
3521 ;
3522 else if (! vect_analyze_slp_instance (vinfo, bst_map, first_element,
3523 slp_inst_kind_reduc_chain,
3524 max_tree_size, &limit))
3525 {
3526 /* Dissolve reduction chain group. */
3527 stmt_vec_info vinfo = first_element;
3528 stmt_vec_info last = NULL;
3529 while (vinfo)
3530 {
3531 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
3532 REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
3533 REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
3534 last = vinfo;
3535 vinfo = next;
3536 }
3537 STMT_VINFO_DEF_TYPE (first_element) = vect_internal_def;
3538 /* It can still be vectorized as part of an SLP reduction. */
3539 loop_vinfo->reductions.safe_push (last);
3540 }
3541
3542 /* Find SLP sequences starting from groups of reductions. */
3543 if (loop_vinfo->reductions.length () > 1)
3544 vect_analyze_slp_instance (vinfo, bst_map, loop_vinfo->reductions[0],
3545 slp_inst_kind_reduc_group, max_tree_size,
3546 &limit);
3547 }
3548
3549 hash_set<slp_tree> visited_patterns;
3550 slp_tree_to_load_perm_map_t perm_cache;
3551 slp_compat_nodes_map_t compat_cache;
3552
3553 /* See if any patterns can be found in the SLP tree. */
3554 bool pattern_found = false;
3555 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
3556 pattern_found |= vect_match_slp_patterns (instance, vinfo,
3557 &visited_patterns, &perm_cache,
3558 &compat_cache);
3559
3560 /* If any were found, optimize permutations of loads. */
3561 if (pattern_found)
3562 {
3563 hash_map<slp_tree, slp_tree> load_map;
3564 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
3565 {
3566 slp_tree root = SLP_INSTANCE_TREE (instance);
3567 optimize_load_redistribution (bst_map, vinfo, SLP_TREE_LANES (root),
3568 &load_map, root);
3569 }
3570 }
3571
3572
3573
3574 /* The map keeps a reference to the SLP nodes built; release that. */
3575 for (scalar_stmts_to_slp_tree_map_t::iterator it = bst_map->begin ();
3576 it != bst_map->end (); ++it)
3577 if ((*it).second)
3578 vect_free_slp_tree ((*it).second);
3579 delete bst_map;
3580
3581 if (pattern_found && dump_enabled_p ())
3582 {
3583 dump_printf_loc (MSG_NOTE, vect_location,
3584 "Pattern matched SLP tree\n");
3585 hash_set<slp_tree> visited;
3586 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
3587 vect_print_slp_graph (MSG_NOTE, vect_location,
3588 SLP_INSTANCE_TREE (instance), visited);
3589 }
3590
3591 return opt_result::success ();
3592 }
3593
3594 /* Estimates the cost of inserting layout changes into the SLP graph.
3595 It can also say that the insertion is impossible. */
3596
3597 struct slpg_layout_cost
3598 {
3599 slpg_layout_cost () = default;
3600 slpg_layout_cost (sreal, bool);
3601
3602 static slpg_layout_cost impossible () { return { sreal::max (), 0 }; }
3603 bool is_possible () const { return depth != sreal::max (); }
3604
3605 bool operator== (const slpg_layout_cost &) const;
3606 bool operator!= (const slpg_layout_cost &) const;
3607
3608 bool is_better_than (const slpg_layout_cost &, bool) const;
3609
3610 void add_parallel_cost (const slpg_layout_cost &);
3611 void add_serial_cost (const slpg_layout_cost &);
3612 void split (unsigned int);
3613
3614 /* The longest sequence of layout changes needed during any traversal
3615 of the partition dag, weighted by execution frequency.
3616
3617 This is the most important metric when optimizing for speed, since
3618 it helps to ensure that we keep the number of operations on
3619 critical paths to a minimum. */
3620 sreal depth = 0;
3621
3622 /* An estimate of the total number of operations needed. It is weighted by
3623 execution frequency when optimizing for speed but not when optimizing for
3624 size. In order to avoid double-counting, a node with a fanout of N will
3625 distribute 1/N of its total cost to each successor.
3626
3627 This is the most important metric when optimizing for size, since
3628 it helps to keep the total number of operations to a minimum. */
3629 sreal total = 0;
3630 };
3631
3632 /* Construct costs for a node with weight WEIGHT. A higher weight
3633 indicates more frequent execution. IS_FOR_SIZE is true if we are
3634 optimizing for size rather than speed. */
3635
3636 slpg_layout_cost::slpg_layout_cost (sreal weight, bool is_for_size)
3637 : depth (weight), total (is_for_size && weight > 0 ? 1 : weight)
3638 {
3639 }
3640
3641 bool
3642 slpg_layout_cost::operator== (const slpg_layout_cost &other) const
3643 {
3644 return depth == other.depth && total == other.total;
3645 }
3646
3647 bool
3648 slpg_layout_cost::operator!= (const slpg_layout_cost &other) const
3649 {
3650 return !operator== (other);
3651 }
3652
3653 /* Return true if these costs are better than OTHER. IS_FOR_SIZE is
3654 true if we are optimizing for size rather than speed. */
3655
3656 bool
3657 slpg_layout_cost::is_better_than (const slpg_layout_cost &other,
3658 bool is_for_size) const
3659 {
3660 if (is_for_size)
3661 {
3662 if (total != other.total)
3663 return total < other.total;
3664 return depth < other.depth;
3665 }
3666 else
3667 {
3668 if (depth != other.depth)
3669 return depth < other.depth;
3670 return total < other.total;
3671 }
3672 }
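/* For example, when optimizing for speed a cost of { depth 2, total 5 }
   is better than { depth 3, total 1 }, since depth is compared first;
   when optimizing for size the ordering of those two costs reverses,
   since total is compared first. */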
3673
3674 /* Increase the costs to account for something with cost INPUT_COST
3675 happening in parallel with the current costs. */
3676
3677 void
3678 slpg_layout_cost::add_parallel_cost (const slpg_layout_cost &input_cost)
3679 {
3680 depth = std::max (depth, input_cost.depth);
3681 total += input_cost.total;
3682 }
3683
3684 /* Increase the costs to account for something with cost INPUT_COST
3685 happening in series with the current costs. */
3686
3687 void
3688 slpg_layout_cost::add_serial_cost (const slpg_layout_cost &other)
3689 {
3690 depth += other.depth;
3691 total += other.total;
3692 }
3693
3694 /* Split the total cost among TIMES successors or predecessors. */
3695
3696 void
3697 slpg_layout_cost::split (unsigned int times)
3698 {
3699 if (times > 1)
3700 total /= times;
3701 }
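/* For example, accumulating two parallel inputs that each cost
   { depth 2, total 1 } with add_parallel_cost gives { depth 2, total 2 };
   following that with add_serial_cost ({ depth 1, total 1 }) gives
   { depth 3, total 3 }, and split (2) then halves the total to 1.5
   while leaving the depth at 3. */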
3702
3703 /* Information about one node in the SLP graph, for use during
3704 vect_optimize_slp_pass. */
3705
3706 struct slpg_vertex
3707 {
3708 slpg_vertex (slp_tree node_) : node (node_) {}
3709
3710 /* The node itself. */
3711 slp_tree node;
3712
3713 /* Which partition the node belongs to, or -1 if none. Nodes outside of
3714 partitions are flexible; they can have whichever layout consumers
3715 want them to have. */
3716 int partition = -1;
3717
3718 /* The number of nodes that directly use the result of this one
3719 (i.e. the number of nodes that count this one as a child). */
3720 unsigned int out_degree = 0;
3721
3722 /* The execution frequency of the node. */
3723 sreal weight = 0;
3724
3725 /* The total execution frequency of all nodes that directly use the
3726 result of this one. */
3727 sreal out_weight = 0;
3728 };
3729
3730 /* Information about one partition of the SLP graph, for use during
3731 vect_optimize_slp_pass. */
3732
3733 struct slpg_partition_info
3734 {
3735 /* The nodes in the partition occupy indices [NODE_BEGIN, NODE_END)
3736 of m_partitioned_nodes. */
3737 unsigned int node_begin = 0;
3738 unsigned int node_end = 0;
3739
3740 /* Which layout we've chosen to use for this partition, or -1 if
3741 we haven't picked one yet. */
3742 int layout = -1;
3743
3744 /* The number of predecessors and successors in the partition dag.
3745 The predecessors always have lower partition numbers and the
3746 successors always have higher partition numbers.
3747
3748 Note that the directions of these edges are not necessarily the
3749 same as in the data flow graph. For example, if an SCC has separate
3750 partitions for an inner loop and an outer loop, the inner loop's
3751 partition will have at least two incoming edges from the outer loop's
3752 partition: one for a live-in value and one for a live-out value.
3753 In data flow terms, one of these edges would also be from the outer loop
3754 to the inner loop, but the other would be in the opposite direction. */
3755 unsigned int in_degree = 0;
3756 unsigned int out_degree = 0;
3757 };
3758
3759 /* Information about the costs of using a particular layout for a
3760 particular partition. It can also say that the combination is
3761 impossible. */
3762
3763 struct slpg_partition_layout_costs
3764 {
3765 bool is_possible () const { return internal_cost.is_possible (); }
3766 void mark_impossible () { internal_cost = slpg_layout_cost::impossible (); }
3767
3768 /* The costs inherited from predecessor partitions. */
3769 slpg_layout_cost in_cost;
3770
3771 /* The inherent cost of the layout within the node itself. For example,
3772 this is nonzero for a load if choosing a particular layout would require
3773 the load to permute the loaded elements. It is nonzero for a
3774 VEC_PERM_EXPR if the permutation cannot be eliminated or converted
3775 to full-vector moves. */
3776 slpg_layout_cost internal_cost;
3777
3778 /* The costs inherited from successor partitions. */
3779 slpg_layout_cost out_cost;
3780 };
3781
3782 /* This class tries to optimize the layout of vectors in order to avoid
3783 unnecessary shuffling. At the moment, the set of possible layouts are
3784 restricted to bijective permutations.
3785
3786 The goal of the pass depends on whether we're optimizing for size or
3787 for speed. When optimizing for size, the goal is to reduce the overall
3788 number of layout changes (including layout changes implied by things
3789 like load permutations). When optimizing for speed, the goal is to
3790 reduce the maximum latency attributable to layout changes on any
3791 non-cyclical path through the data flow graph.
3792
3793 For example, when optimizing a loop nest for speed, we will prefer
3794 to make layout changes outside of a loop rather than inside of a loop,
3795 and will prefer to make layout changes in parallel rather than serially,
3796 even if that increases the overall number of layout changes.
3797
3798 The high-level procedure is:
3799
3800 (1) Build a graph in which edges go from uses (parents) to definitions
3801 (children).
3802
3803 (2) Divide the graph into a dag of strongly-connected components (SCCs).
3804
3805 (3) When optimizing for speed, partition the nodes in each SCC based
3806 on their containing cfg loop. When optimizing for size, treat
3807 each SCC as a single partition.
3808
3809 This gives us a dag of partitions. The goal is now to assign a
3810 layout to each partition.
3811
3812 (4) Construct a set of vector layouts that are worth considering.
3813 Record which nodes must keep their current layout.
3814
3815 (5) Perform a forward walk over the partition dag (from loads to stores)
3816 accumulating the "forward" cost of using each layout. When visiting
3817 each partition, assign a tentative choice of layout to the partition
3818 and use that choice when calculating the cost of using a different
3819 layout in successor partitions.
3820
3821 (6) Perform a backward walk over the partition dag (from stores to loads),
3822 accumulating the "backward" cost of using each layout. When visiting
3823 each partition, make a final choice of layout for that partition based
3824 on the accumulated forward costs (from (5)) and backward costs
3825 (from (6)).
3826
3827 (7) Apply the chosen layouts to the SLP graph.
3828
3829 For example, consider the SLP statements:
3830
3831 S1: a_1 = load
3832 loop:
3833 S2: a_2 = PHI<a_1, a_3>
3834 S3: b_1 = load
3835 S4: a_3 = a_2 + b_1
3836 exit:
3837 S5: a_4 = PHI<a_3>
3838 S6: store a_4
3839
3840 S2 and S4 form an SCC and are part of the same loop. Every other
3841 statement is in a singleton SCC. In this example there is a one-to-one
3842 mapping between SCCs and partitions, and the partition dag looks like this:
3843
3844   S1    S3
3845     \   /
3846      S2+S4
3847        |
3848       S5
3849        |
3850       S6
3851
3852 S2, S3 and S4 will have a higher execution frequency than the other
3853 statements, so when optimizing for speed, the goal is to avoid any
3854 layout changes:
3855
3856 - within S3
3857 - within S2+S4
3858 - on the S3->S2+S4 edge
3859
3860 For example, if S3 was originally a reversing load, the goal of the
3861 pass is to make it an unreversed load and change the layout on the
3862 S1->S2+S4 and S2+S4->S5 edges to compensate. (Changing the layout
3863 on S1->S2+S4 and S5->S6 would also be acceptable.)
3864
3865 The difference between SCCs and partitions becomes important if we
3866 add an outer loop:
3867
3868 S1: a_1 = ...
3869 loop1:
3870 S2: a_2 = PHI<a_1, a_6>
3871 S3: b_1 = load
3872 S4: a_3 = a_2 + b_1
3873 loop2:
3874 S5: a_4 = PHI<a_3, a_5>
3875 S6: c_1 = load
3876 S7: a_5 = a_4 + c_1
3877 exit2:
3878 S8: a_6 = PHI<a_5>
3879 S9: store a_6
3880 exit1:
3881
3882 Here, S2, S4, S5, S7 and S8 form a single SCC. However, when optimizing
3883 for speed, we usually do not want restrictions in the outer loop to "infect"
3884 the decision for the inner loop. For example, if an outer-loop node
3885 in the SCC contains a statement with a fixed layout, that should not
3886 prevent the inner loop from using a different layout. Conversely,
3887 the inner loop should not dictate a layout to the outer loop: if the
3888 outer loop does a lot of computation, then it may not be efficient to
3889 do all of that computation in the inner loop's preferred layout.
3890
3891 So when optimizing for speed, we partition the SCC into S2+S4+S8 (outer)
3892 and S5+S7 (inner). We also try to arrange partitions so that:
3893
3894 - the partition for an outer loop comes before the partition for
3895 an inner loop
3896
3897 - if a sibling loop A dominates a sibling loop B, A's partition
3898 comes before B's
3899
3900 This gives the following partition dag for the example above:
3901
3902   S1        S3
3903     \       /
3904    S2+S4+S8   S6
3905     |   \\    /
3906     |    S5+S7
3907     |
3908    S9
3909
3910 There are two edges from S2+S4+S8 to S5+S7: one for the edge S4->S5 and
3911 one for a reversal of the edge S7->S8.
3912
3913 The backward walk picks a layout for S5+S7 before S2+S4+S8. The choice
3914 for S2+S4+S8 therefore has to balance the cost of using the outer loop's
3915 preferred layout against the cost of changing the layout on entry to the
3916 inner loop (S4->S5) and on exit from the inner loop (S7->S8 reversed).
3917
3918 Although this works well when optimizing for speed, it has the downside
3919 when optimizing for size that the choice of layout for S5+S7 is completely
3920 independent of S9, which lessens the chance of reducing the overall number
3921 of permutations. We therefore do not partition SCCs when optimizing
3922 for size.
3923
3924 To give a concrete example of the difference between optimizing
3925 for size and speed, consider:
3926
3927 a[0] = (b[1] << c[3]) - d[1];
3928 a[1] = (b[0] << c[2]) - d[0];
3929 a[2] = (b[3] << c[1]) - d[3];
3930 a[3] = (b[2] << c[0]) - d[2];
3931
3932 There are three different layouts here: one for a, one for b and d,
3933 and one for c. When optimizing for speed it is better to permute each
3934 of b, c and d into the order required by a, since those permutations
3935 happen in parallel. But when optimizing for size, it is better to:
3936
3937 - permute c into the same order as b
3938 - do the arithmetic
3939 - permute the result into the order required by a
3940
3941 This gives 2 permutations rather than 3. */
3942
3943 class vect_optimize_slp_pass
3944 {
3945 public:
3946 vect_optimize_slp_pass (vec_info *vinfo) : m_vinfo (vinfo) {}
3947 void run ();
3948
3949 private:
3950 /* Graph building. */
3951 struct loop *containing_loop (slp_tree);
3952 bool is_cfg_latch_edge (graph_edge *);
3953 void build_vertices (hash_set<slp_tree> &, slp_tree);
3954 void build_vertices ();
3955 void build_graph ();
3956
3957 /* Partitioning. */
3958 void create_partitions ();
3959 template<typename T> void for_each_partition_edge (unsigned int, T);
3960
3961 /* Layout selection. */
3962 bool is_compatible_layout (slp_tree, unsigned int);
3963 int change_layout_cost (slp_tree, unsigned int, unsigned int);
3964 slpg_partition_layout_costs &partition_layout_costs (unsigned int,
3965 unsigned int);
3966 void change_vec_perm_layout (slp_tree, lane_permutation_t &,
3967 int, unsigned int);
3968 int internal_node_cost (slp_tree, int, unsigned int);
3969 void start_choosing_layouts ();
3970
3971 /* Cost propagation. */
3972 slpg_layout_cost edge_layout_cost (graph_edge *, unsigned int,
3973 unsigned int, unsigned int);
3974 slpg_layout_cost total_in_cost (unsigned int);
3975 slpg_layout_cost forward_cost (graph_edge *, unsigned int, unsigned int);
3976 slpg_layout_cost backward_cost (graph_edge *, unsigned int, unsigned int);
3977 void forward_pass ();
3978 void backward_pass ();
3979
3980 /* Rematerialization. */
3981 slp_tree get_result_with_layout (slp_tree, unsigned int);
3982 void materialize ();
3983
3984 /* Clean-up. */
3985 void remove_redundant_permutations ();
3986
3987 void dump ();
3988
3989 vec_info *m_vinfo;
3990
3991 /* True if we should optimize the graph for size, false if we should
3992 optimize it for speed. (It wouldn't be easy to make this decision
3993 more locally.) */
3994 bool m_optimize_size;
3995
3996 /* A graph of all SLP nodes, with edges leading from uses to definitions.
3997 In other words, a node's predecessors are its slp_tree parents and
3998 a node's successors are its slp_tree children. */
3999 graph *m_slpg = nullptr;
4000
4001 /* The vertices of M_SLPG, indexed by slp_tree::vertex. */
4002 auto_vec<slpg_vertex> m_vertices;
4003
4004 /* The list of all leaves of M_SLPG, such as external definitions, constants,
4005 and loads. */
4006 auto_vec<int> m_leafs;
4007
4008 /* This array has one entry for every vector layout that we're considering.
4009 Element 0 is null and indicates "no change". Other entries describe
4010 permutations that are inherent in the current graph and that we would
4011 like to reverse if possible.
4012
4013 For example, a permutation { 1, 2, 3, 0 } means that something has
4014 effectively been permuted in that way, such as a load group
4015 { a[1], a[2], a[3], a[0] } (viewed as a permutation of a[0:3]).
4016 We'd then like to apply the reverse permutation { 3, 0, 1, 2 }
4017 in order to put things "back" in order. */
4018 auto_vec<vec<unsigned> > m_perms;
4019
4020 /* A partitioning of the nodes for which a layout must be chosen.
4021 Each partition represents an <SCC, cfg loop> pair; that is,
4022 nodes in different SCCs belong to different partitions, and nodes
4023 within an SCC can be further partitioned according to a containing
4024 cfg loop. Partition <SCC1, L1> comes before <SCC2, L2> if:
4025
4026 - SCC1 != SCC2 and SCC1 is a predecessor of SCC2 in a forward walk
4027 from leaves (such as loads) to roots (such as stores).
4028
4029 - SCC1 == SCC2 and L1's header strictly dominates L2's header. */
4030 auto_vec<slpg_partition_info> m_partitions;
4031
4032 /* The list of all nodes for which a layout must be chosen. Nodes for
4033 partition P come before the nodes for partition P+1. Nodes within a
4034 partition are in reverse postorder. */
4035 auto_vec<unsigned int> m_partitioned_nodes;
4036
4037 /* Index P * num-layouts + L contains the cost of using layout L
4038 for partition P. */
4039 auto_vec<slpg_partition_layout_costs> m_partition_layout_costs;
4040
4041 /* Index N * num-layouts + L, if nonnull, is a node that provides the
4042 original output of node N adjusted to have layout L. */
4043 auto_vec<slp_tree> m_node_layouts;
4044 };
4045
4046 /* Fill the vertices and leafs vector with all nodes in the SLP graph.
4047 Also record whether we should optimize anything for speed rather
4048 than size. */
4049
4050 void
4051 vect_optimize_slp_pass::build_vertices (hash_set<slp_tree> &visited,
4052 slp_tree node)
4053 {
4054 unsigned i;
4055 slp_tree child;
4056
4057 if (visited.add (node))
4058 return;
4059
4060 if (stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node))
4061 {
4062 basic_block bb = gimple_bb (vect_orig_stmt (rep)->stmt);
4063 if (optimize_bb_for_speed_p (bb))
4064 m_optimize_size = false;
4065 }
4066
4067 node->vertex = m_vertices.length ();
4068 m_vertices.safe_push (slpg_vertex (node));
4069
4070 bool leaf = true;
4071 bool force_leaf = false;
4072 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4073 if (child)
4074 {
4075 leaf = false;
4076 build_vertices (visited, child);
4077 }
4078 else
4079 force_leaf = true;
4080 /* Since SLP discovery works along use-def edges all cycles have an
4081 entry - but there's the exception of cycles where we do not handle
4082 the entry explicitly (but with a NULL SLP node), like some reductions
4083 and inductions. Force those SLP PHIs to act as leafs to make them
4084 backwards reachable. */
4085 if (leaf || force_leaf)
4086 m_leafs.safe_push (node->vertex);
4087 }
4088
4089 /* Fill the vertices and leafs vector with all nodes in the SLP graph. */
4090
4091 void
4092 vect_optimize_slp_pass::build_vertices ()
4093 {
4094 hash_set<slp_tree> visited;
4095 unsigned i;
4096 slp_instance instance;
4097 FOR_EACH_VEC_ELT (m_vinfo->slp_instances, i, instance)
4098 build_vertices (visited, SLP_INSTANCE_TREE (instance));
4099 }
4100
4101 /* Apply (reverse) bijective PERM to VEC. */
4102
4103 template <class T>
4104 static void
4105 vect_slp_permute (vec<unsigned> perm,
4106 vec<T> &vec, bool reverse)
4107 {
4108 auto_vec<T, 64> saved;
4109 saved.create (vec.length ());
4110 for (unsigned i = 0; i < vec.length (); ++i)
4111 saved.quick_push (vec[i]);
4112
4113 if (reverse)
4114 {
4115 for (unsigned i = 0; i < vec.length (); ++i)
4116 vec[perm[i]] = saved[i];
4117 for (unsigned i = 0; i < vec.length (); ++i)
4118 gcc_assert (vec[perm[i]] == saved[i]);
4119 }
4120 else
4121 {
4122 for (unsigned i = 0; i < vec.length (); ++i)
4123 vec[i] = saved[perm[i]];
4124 for (unsigned i = 0; i < vec.length (); ++i)
4125 gcc_assert (vec[i] == saved[perm[i]]);
4126 }
4127 }
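/* For example, with PERM = { 1, 2, 3, 0 } and VEC = { a, b, c, d }, a
   forward application (REVERSE false) computes vec[i] = saved[perm[i]]
   and yields { b, c, d, a }, whereas a reverse application (REVERSE true)
   computes vec[perm[i]] = saved[i] and yields { d, a, b, c }, undoing
   the forward application. */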
4128
4129 /* Return the cfg loop that contains NODE. */
4130
4131 struct loop *
4132 vect_optimize_slp_pass::containing_loop (slp_tree node)
4133 {
4134 stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node);
4135 if (!rep)
4136 return ENTRY_BLOCK_PTR_FOR_FN (cfun)->loop_father;
4137 return gimple_bb (vect_orig_stmt (rep)->stmt)->loop_father;
4138 }
4139
4140 /* Return true if UD (an edge from a use to a definition) is associated
4141 with a loop latch edge in the cfg. */
4142
4143 bool
4144 vect_optimize_slp_pass::is_cfg_latch_edge (graph_edge *ud)
4145 {
4146 slp_tree use = m_vertices[ud->src].node;
4147 slp_tree def = m_vertices[ud->dest].node;
4148 if (SLP_TREE_DEF_TYPE (use) != vect_internal_def
4149 || SLP_TREE_DEF_TYPE (def) != vect_internal_def)
4150 return false;
4151
4152 stmt_vec_info use_rep = vect_orig_stmt (SLP_TREE_REPRESENTATIVE (use));
4153 return (is_a<gphi *> (use_rep->stmt)
4154 && bb_loop_header_p (gimple_bb (use_rep->stmt))
4155 && containing_loop (def) == containing_loop (use));
4156 }
4157
4158 /* Build the graph. Mark edges that correspond to cfg loop latch edges with
4159 a nonnull data field. */
4160
4161 void
4162 vect_optimize_slp_pass::build_graph ()
4163 {
4164 m_optimize_size = true;
4165 build_vertices ();
4166
4167 m_slpg = new_graph (m_vertices.length ());
4168 for (slpg_vertex &v : m_vertices)
4169 for (slp_tree child : SLP_TREE_CHILDREN (v.node))
4170 if (child)
4171 {
4172 graph_edge *ud = add_edge (m_slpg, v.node->vertex, child->vertex);
4173 if (is_cfg_latch_edge (ud))
4174 ud->data = this;
4175 }
4176 }
4177
4178 /* Return true if E corresponds to a loop latch edge in the cfg. */
4179
4180 static bool
4181 skip_cfg_latch_edges (graph_edge *e)
4182 {
4183 return e->data;
4184 }
4185
4186 /* Create the node partitions. */
4187
4188 void
4189 vect_optimize_slp_pass::create_partitions ()
4190 {
4191 /* Calculate a postorder of the graph, ignoring edges that correspond
4192 to natural latch edges in the cfg. Reading the vector from the end
4193 to the beginning gives the reverse postorder. */
4194 auto_vec<int> initial_rpo;
4195 graphds_dfs (m_slpg, &m_leafs[0], m_leafs.length (), &initial_rpo,
4196 false, NULL, skip_cfg_latch_edges);
4197 gcc_assert (initial_rpo.length () == m_vertices.length ());
4198
4199 /* Calculate the strongly connected components of the graph. */
4200 auto_vec<int> scc_grouping;
4201 unsigned int num_sccs = graphds_scc (m_slpg, NULL, NULL, &scc_grouping);
4202
4203 /* Create a new index order in which all nodes from the same SCC are
4204 consecutive. Use scc_pos to record the index of the first node in
4205 each SCC. */
4206 auto_vec<unsigned int> scc_pos (num_sccs);
4207 int last_component = -1;
4208 unsigned int node_count = 0;
4209 for (unsigned int node_i : scc_grouping)
4210 {
4211 if (last_component != m_slpg->vertices[node_i].component)
4212 {
4213 last_component = m_slpg->vertices[node_i].component;
4214 gcc_assert (last_component == int (scc_pos.length ()));
4215 scc_pos.quick_push (node_count);
4216 }
4217 node_count += 1;
4218 }
4219 gcc_assert (node_count == initial_rpo.length ()
4220 && last_component + 1 == int (num_sccs));
4221
4222 /* Use m_partitioned_nodes to group nodes into SCC order, with the nodes
4223 inside each SCC following the RPO we calculated above. The fact that
4224 we ignored natural latch edges when calculating the RPO should ensure
4225 that, for natural loop nests:
4226
4227 - the first node that we encounter in a cfg loop is the loop header phi
4228 - the loop header phis are in dominance order
4229
4230 Arranging for this is an optimization (see below) rather than a
4231 correctness issue. Unnatural loops with a tangled mess of backedges
4232 will still work correctly, but might give poorer results.
4233
4234 Also update scc_pos so that it gives 1 + the index of the last node
4235 in the SCC. */
4236 m_partitioned_nodes.safe_grow (node_count);
4237 for (unsigned int old_i = initial_rpo.length (); old_i-- > 0;)
4238 {
4239 unsigned int node_i = initial_rpo[old_i];
4240 unsigned int new_i = scc_pos[m_slpg->vertices[node_i].component]++;
4241 m_partitioned_nodes[new_i] = node_i;
4242 }
4243
4244 /* When optimizing for speed, partition each SCC based on the containing
4245 cfg loop. The order we constructed above should ensure that, for natural
4246 cfg loops, we'll create sub-SCC partitions for outer loops before
4247 the corresponding sub-SCC partitions for inner loops. Similarly,
4248 when one sibling loop A dominates another sibling loop B, we should
4249 create a sub-SCC partition for A before a sub-SCC partition for B.
4250
4251 As above, nothing depends for correctness on whether this achieves
4252 a natural nesting, but we should get better results when it does. */
4253 m_partitions.reserve (m_vertices.length ());
4254 unsigned int next_partition_i = 0;
4255 hash_map<struct loop *, int> loop_partitions;
4256 unsigned int rpo_begin = 0;
4257 unsigned int num_partitioned_nodes = 0;
4258 for (unsigned int rpo_end : scc_pos)
4259 {
4260 loop_partitions.empty ();
4261 unsigned int partition_i = next_partition_i;
4262 for (unsigned int rpo_i = rpo_begin; rpo_i < rpo_end; ++rpo_i)
4263 {
4264 /* Handle externals and constants optimistically throughout.
4265 But treat existing vectors as fixed since we do not handle
4266 permuting them. */
4267 unsigned int node_i = m_partitioned_nodes[rpo_i];
4268 auto &vertex = m_vertices[node_i];
4269 if ((SLP_TREE_DEF_TYPE (vertex.node) == vect_external_def
4270 && !SLP_TREE_VEC_DEFS (vertex.node).exists ())
4271 || SLP_TREE_DEF_TYPE (vertex.node) == vect_constant_def)
4272 vertex.partition = -1;
4273 else
4274 {
4275 bool existed;
4276 if (m_optimize_size)
4277 existed = next_partition_i > partition_i;
4278 else
4279 {
4280 struct loop *loop = containing_loop (vertex.node);
4281 auto &entry = loop_partitions.get_or_insert (loop, &existed);
4282 if (!existed)
4283 entry = next_partition_i;
4284 partition_i = entry;
4285 }
4286 if (!existed)
4287 {
4288 m_partitions.quick_push (slpg_partition_info ());
4289 next_partition_i += 1;
4290 }
4291 vertex.partition = partition_i;
4292 num_partitioned_nodes += 1;
4293 m_partitions[partition_i].node_end += 1;
4294 }
4295 }
4296 rpo_begin = rpo_end;
4297 }
4298
4299 /* Assign ranges of consecutive node indices to each partition,
4300 in partition order. Start with node_end being the same as
4301 node_begin so that the next loop can use it as a counter. */
4302 unsigned int node_begin = 0;
4303 for (auto &partition : m_partitions)
4304 {
4305 partition.node_begin = node_begin;
4306 node_begin += partition.node_end;
4307 partition.node_end = partition.node_begin;
4308 }
4309 gcc_assert (node_begin == num_partitioned_nodes);
4310
4311 /* Finally build the list of nodes in partition order. */
4312 m_partitioned_nodes.truncate (num_partitioned_nodes);
4313 for (unsigned int node_i = 0; node_i < m_vertices.length (); ++node_i)
4314 {
4315 int partition_i = m_vertices[node_i].partition;
4316 if (partition_i >= 0)
4317 {
4318 unsigned int order_i = m_partitions[partition_i].node_end++;
4319 m_partitioned_nodes[order_i] = node_i;
4320 }
4321 }
4322 }
4323
4324 /* Look for edges from earlier partitions into node NODE_I and edges from
4325 node NODE_I into later partitions. Call:
4326
4327 FN (ud, other_node_i)
4328
4329 for each such use-to-def edge ud, where other_node_i is the node at the
4330 other end of the edge. */
4331
4332 template<typename T>
4333 void
4334 vect_optimize_slp_pass::for_each_partition_edge (unsigned int node_i, T fn)
4335 {
4336 int partition_i = m_vertices[node_i].partition;
4337 for (graph_edge *pred = m_slpg->vertices[node_i].pred;
4338 pred; pred = pred->pred_next)
4339 {
4340 int src_partition_i = m_vertices[pred->src].partition;
4341 if (src_partition_i >= 0 && src_partition_i != partition_i)
4342 fn (pred, pred->src);
4343 }
4344 for (graph_edge *succ = m_slpg->vertices[node_i].succ;
4345 succ; succ = succ->succ_next)
4346 {
4347 int dest_partition_i = m_vertices[succ->dest].partition;
4348 if (dest_partition_i >= 0 && dest_partition_i != partition_i)
4349 fn (succ, succ->dest);
4350 }
4351 }
4352
4353 /* Return true if layout LAYOUT_I is compatible with the number of SLP lanes
4354 that NODE would operate on. This test is independent of NODE's actual
4355 operation. */
4356
4357 bool
4358 vect_optimize_slp_pass::is_compatible_layout (slp_tree node,
4359 unsigned int layout_i)
4360 {
4361 if (layout_i == 0)
4362 return true;
4363
4364 if (SLP_TREE_LANES (node) != m_perms[layout_i].length ())
4365 return false;
4366
4367 return true;
4368 }
4369
4370 /* Return the cost (in arbitrary units) of going from layout FROM_LAYOUT_I
4371 to layout TO_LAYOUT_I for a node like NODE. Return -1 if either of the
4372 layouts is incompatible with NODE or if the change is not possible for
4373 some other reason.
4374
4375 The properties taken from NODE include the number of lanes and the
4376 vector type. The actual operation doesn't matter. */
4377
4378 int
4379 vect_optimize_slp_pass::change_layout_cost (slp_tree node,
4380 unsigned int from_layout_i,
4381 unsigned int to_layout_i)
4382 {
4383 if (!is_compatible_layout (node, from_layout_i)
4384 || !is_compatible_layout (node, to_layout_i))
4385 return -1;
4386
4387 if (from_layout_i == to_layout_i)
4388 return 0;
4389
4390 auto_vec<slp_tree, 1> children (1);
4391 children.quick_push (node);
4392 auto_lane_permutation_t perm (SLP_TREE_LANES (node));
4393 if (from_layout_i > 0)
4394 for (unsigned int i : m_perms[from_layout_i])
4395 perm.quick_push ({ 0, i });
4396 else
4397 for (unsigned int i = 0; i < SLP_TREE_LANES (node); ++i)
4398 perm.quick_push ({ 0, i });
4399 if (to_layout_i > 0)
4400 vect_slp_permute (m_perms[to_layout_i], perm, true);
4401 auto count = vectorizable_slp_permutation_1 (m_vinfo, nullptr, node, perm,
4402 children, false);
4403 if (count >= 0)
4404 return MAX (count, 1);
4405
4406 /* ??? In principle we could try changing via layout 0, giving two
4407 layout changes rather than 1. Doing that would require
4408 corresponding support in get_result_with_layout. */
4409 return -1;
4410 }
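/* For example, with a 4-lane NODE, FROM_LAYOUT_I == 0 and
   m_perms[TO_LAYOUT_I] == { 1, 0, 3, 2 }, PERM starts out as the identity
   selection { (0,0), (0,1), (0,2), (0,3) } and the reverse application of
   { 1, 0, 3, 2 } turns it into { (0,1), (0,0), (0,3), (0,2) }, which is
   the single permutation that vectorizable_slp_permutation_1 is asked
   to price. */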
4411
4412 /* Return the costs of assigning layout LAYOUT_I to partition PARTITION_I. */
4413
4414 inline slpg_partition_layout_costs &
4415 vect_optimize_slp_pass::partition_layout_costs (unsigned int partition_i,
4416 unsigned int layout_i)
4417 {
4418 return m_partition_layout_costs[partition_i * m_perms.length () + layout_i];
4419 }
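/* For example, with three layouts in M_PERMS, the costs for partition 2
   occupy indices 6, 7 and 8 of m_partition_layout_costs. */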
4420
4421 /* Change PERM in one of two ways:
4422
4423 - if IN_LAYOUT_I < 0, accept input operand I in the layout that has been
4424 chosen for child I of NODE.
4425
4426 - if IN_LAYOUT_I >= 0, accept all input operands with that layout.
4427
4428 In both cases, arrange for the output to have layout OUT_LAYOUT_I. */
4429
4430 void
4431 vect_optimize_slp_pass::
4432 change_vec_perm_layout (slp_tree node, lane_permutation_t &perm,
4433 int in_layout_i, unsigned int out_layout_i)
4434 {
4435 for (auto &entry : perm)
4436 {
4437 int this_in_layout_i = in_layout_i;
4438 if (this_in_layout_i < 0)
4439 {
4440 slp_tree in_node = SLP_TREE_CHILDREN (node)[entry.first];
4441 unsigned int in_partition_i = m_vertices[in_node->vertex].partition;
4442 this_in_layout_i = m_partitions[in_partition_i].layout;
4443 }
4444 if (this_in_layout_i > 0)
4445 entry.second = m_perms[this_in_layout_i][entry.second];
4446 }
4447 if (out_layout_i > 0)
4448 vect_slp_permute (m_perms[out_layout_i], perm, true);
4449 }
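/* For example, if NODE has a single child whose partition chose layout 1
   with m_perms[1] == { 1, 0 }, then with IN_LAYOUT_I == -1 and
   OUT_LAYOUT_I == 0 the identity selection { (0,0), (0,1) } is rewritten
   to { (0,1), (0,0) }: each lane index is looked up through m_perms[1],
   so the permutation now selects from the child's permuted layout. */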
4450
4451 /* Check whether the target allows NODE to be rearranged so that the node's
4452 output has layout OUT_LAYOUT_I. Return the cost of the change if so,
4453 in the same arbitrary units as for change_layout_cost. Return -1 otherwise.
4454
4455 If NODE is a VEC_PERM_EXPR and IN_LAYOUT_I < 0, also check whether
4456 NODE can adapt to the layout changes that have (perhaps provisionally)
4457 been chosen for NODE's children, so that no extra permutations are
4458 needed on either the input or the output of NODE.
4459
4460 If NODE is a VEC_PERM_EXPR and IN_LAYOUT_I >= 0, instead assume
4461 that all inputs will be forced into layout IN_LAYOUT_I beforehand.
4462
4463 IN_LAYOUT_I has no meaning for other types of node.
4464
4465 Keeping the node as-is is always valid. If the target doesn't appear
4466 to support the node as-is, but might realistically support other layouts,
4467 then layout 0 instead has the cost of a worst-case permutation. On the
4468 one hand, this ensures that every node has at least one valid layout,
4469 avoiding what would otherwise be an awkward special case. On the other,
4470 it still encourages the pass to change an invalid pre-existing layout
4471 choice into a valid one. */
4472
4473 int
4474 vect_optimize_slp_pass::internal_node_cost (slp_tree node, int in_layout_i,
4475 unsigned int out_layout_i)
4476 {
4477 const int fallback_cost = 1;
4478
4479 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
4480 {
4481 auto_lane_permutation_t tmp_perm;
4482 tmp_perm.safe_splice (SLP_TREE_LANE_PERMUTATION (node));
4483
4484 /* Check that the child nodes support the chosen layout. Checking
4485 the first child is enough, since any second child would have the
4486 same shape. */
4487 auto first_child = SLP_TREE_CHILDREN (node)[0];
4488 if (in_layout_i > 0
4489 && !is_compatible_layout (first_child, in_layout_i))
4490 return -1;
4491
4492 change_vec_perm_layout (node, tmp_perm, in_layout_i, out_layout_i);
4493 int count = vectorizable_slp_permutation_1 (m_vinfo, nullptr,
4494 node, tmp_perm,
4495 SLP_TREE_CHILDREN (node),
4496 false);
4497 if (count < 0)
4498 {
4499 if (in_layout_i == 0 && out_layout_i == 0)
4500 {
4501 /* Use the fallback cost if the node could in principle support
4502 some nonzero layout for both the inputs and the outputs.
4503 Otherwise assume that the node will be rejected later
4504 and rebuilt from scalars. */
4505 if (SLP_TREE_LANES (node) == SLP_TREE_LANES (first_child))
4506 return fallback_cost;
4507 return 0;
4508 }
4509 return -1;
4510 }
4511
4512 /* We currently have no way of telling whether the new layout is cheaper
4513 or more expensive than the old one. But at least in principle,
4514 it should be worth making zero permutations (whole-vector shuffles)
4515 cheaper than real permutations, in case the pass is able to remove
4516 the latter. */
4517 return count == 0 ? 0 : 1;
4518 }
4519
4520 stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node);
4521 if (rep
4522 && STMT_VINFO_DATA_REF (rep)
4523 && DR_IS_READ (STMT_VINFO_DATA_REF (rep))
4524 && SLP_TREE_LOAD_PERMUTATION (node).exists ())
4525 {
4526 auto_load_permutation_t tmp_perm;
4527 tmp_perm.safe_splice (SLP_TREE_LOAD_PERMUTATION (node));
4528 if (out_layout_i > 0)
4529 vect_slp_permute (m_perms[out_layout_i], tmp_perm, true);
4530
4531 poly_uint64 vf = 1;
4532 if (auto loop_vinfo = dyn_cast<loop_vec_info> (m_vinfo))
4533 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
4534 unsigned int n_perms;
4535 if (!vect_transform_slp_perm_load_1 (m_vinfo, node, tmp_perm, vNULL,
4536 nullptr, vf, true, false, &n_perms))
4537 {
4538 auto rep = SLP_TREE_REPRESENTATIVE (node);
4539 if (out_layout_i == 0)
4540 {
4541 /* Use the fallback cost if the load is an N-to-N permutation.
4542 Otherwise assume that the node will be rejected later
4543 and rebuilt from scalars. */
4544 if (STMT_VINFO_GROUPED_ACCESS (rep)
4545 && (DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (rep))
4546 == SLP_TREE_LANES (node)))
4547 return fallback_cost;
4548 return 0;
4549 }
4550 return -1;
4551 }
4552
4553 /* See the comment above the corresponding VEC_PERM_EXPR handling. */
4554 return n_perms == 0 ? 0 : 1;
4555 }
4556
4557 return 0;
4558 }
4559
4560 /* Decide which element layouts we should consider using. Calculate the
4561 weights associated with inserting layout changes on partition edges.
4562 Also mark partitions that cannot change layout, by setting their
4563 layout to zero. */
4564
4565 void
4566 vect_optimize_slp_pass::start_choosing_layouts ()
4567 {
4568 /* Used to assign unique permutation indices. */
4569 using perm_hash = unbounded_hashmap_traits<
4570 vec_free_hash_base<int_hash_base<unsigned>>,
4571 int_hash<int, -1, -2>
4572 >;
4573 hash_map<vec<unsigned>, int, perm_hash> layout_ids;
4574
4575 /* Layout 0 is "no change". */
4576 m_perms.safe_push (vNULL);
4577
4578 /* Create layouts from existing permutations. */
4579 auto_load_permutation_t tmp_perm;
4580 for (unsigned int node_i : m_partitioned_nodes)
4581 {
4582 /* Leafs also double as entries to the reverse graph. Allow the
4583 layout of those to be changed. */
4584 auto &vertex = m_vertices[node_i];
4585 auto &partition = m_partitions[vertex.partition];
4586 if (!m_slpg->vertices[node_i].succ)
4587 partition.layout = 0;
4588
4589 /* Loads and VEC_PERM_EXPRs are the only things generating permutes. */
4590 slp_tree node = vertex.node;
4591 stmt_vec_info dr_stmt = SLP_TREE_REPRESENTATIVE (node);
4592 slp_tree child;
4593 unsigned HOST_WIDE_INT imin, imax = 0;
4594 bool any_permute = false;
4595 tmp_perm.truncate (0);
4596 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
4597 {
4598 /* If splitting out a SLP_TREE_LANE_PERMUTATION can make the node
4599 unpermuted, record a layout that reverses this permutation.
4600
4601 We would need more work to cope with loads that are internally
4602 permuted and also have inputs (such as masks for
4603 IFN_MASK_LOADs). */
4604 gcc_assert (partition.layout == 0 && !m_slpg->vertices[node_i].succ);
4605 if (!STMT_VINFO_GROUPED_ACCESS (dr_stmt))
4606 {
4607 partition.layout = -1;
4608 continue;
4609 }
4610 dr_stmt = DR_GROUP_FIRST_ELEMENT (dr_stmt);
4611 imin = DR_GROUP_SIZE (dr_stmt) + 1;
4612 tmp_perm.safe_splice (SLP_TREE_LOAD_PERMUTATION (node));
4613 }
4614 else if (SLP_TREE_CODE (node) == VEC_PERM_EXPR
4615 && SLP_TREE_CHILDREN (node).length () == 1
4616 && (child = SLP_TREE_CHILDREN (node)[0])
4617 && (TYPE_VECTOR_SUBPARTS (SLP_TREE_VECTYPE (child))
4618 .is_constant (&imin)))
4619 {
4620 /* If the child has the same vector size as this node,
4621 reversing the permutation can make the permutation a no-op.
4622 In other cases it can change a true permutation into a
4623 full-vector extract. */
4624 tmp_perm.reserve (SLP_TREE_LANES (node));
4625 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
4626 tmp_perm.quick_push (SLP_TREE_LANE_PERMUTATION (node)[j].second);
4627 }
4628 else
4629 continue;
4630
4631 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
4632 {
4633 unsigned idx = tmp_perm[j];
4634 imin = MIN (imin, idx);
4635 imax = MAX (imax, idx);
4636 if (idx - tmp_perm[0] != j)
4637 any_permute = true;
4638 }
4639 /* If the span doesn't match we'd disrupt VF computation; avoid
4640 that for now. */
4641 if (imax - imin + 1 != SLP_TREE_LANES (node))
4642 continue;
4643 /* If there's no permute there's no need to split one out. In this case
4644 we can consider turning a load into a permuted load, if that
4645 turns out to be cheaper than alternatives. */
4646 if (!any_permute)
4647 {
4648 partition.layout = -1;
4649 continue;
4650 }
4651
4652 /* For now only handle true permutes, like
4653 vect_attempt_slp_rearrange_stmts did. This allows us to be lazy
4654 when permuting constants and invariants, keeping the permute
4655 bijective. */
4656 auto_sbitmap load_index (SLP_TREE_LANES (node));
4657 bitmap_clear (load_index);
4658 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
4659 bitmap_set_bit (load_index, tmp_perm[j] - imin);
4660 unsigned j;
4661 for (j = 0; j < SLP_TREE_LANES (node); ++j)
4662 if (!bitmap_bit_p (load_index, j))
4663 break;
4664 if (j != SLP_TREE_LANES (node))
4665 continue;
4666
4667 vec<unsigned> perm = vNULL;
4668 perm.safe_grow (SLP_TREE_LANES (node), true);
4669 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
4670 perm[j] = tmp_perm[j] - imin;
4671
4672 if (int (m_perms.length ()) >= param_vect_max_layout_candidates)
4673 {
4674 /* Continue to use existing layouts, but don't add any more. */
4675 int *entry = layout_ids.get (perm);
4676 partition.layout = entry ? *entry : 0;
4677 perm.release ();
4678 }
4679 else
4680 {
4681 bool existed;
4682 int &layout_i = layout_ids.get_or_insert (perm, &existed);
4683 if (existed)
4684 perm.release ();
4685 else
4686 {
4687 layout_i = m_perms.length ();
4688 m_perms.safe_push (perm);
4689 }
4690 partition.layout = layout_i;
4691 }
4692 }
4693
4694 /* Initially assume that every layout is possible and has zero cost
4695 in every partition. */
4696 m_partition_layout_costs.safe_grow_cleared (m_partitions.length ()
4697 * m_perms.length ());
4698
4699 /* We have to mark outgoing permutations facing non-associating-reduction
4700 graph entries that are not represented, as needing to be materialized.
4701 slp_inst_kind_bb_reduc currently only covers associatable reductions. */
4702 for (slp_instance instance : m_vinfo->slp_instances)
4703 if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_ctor)
4704 {
4705 unsigned int node_i = SLP_INSTANCE_TREE (instance)->vertex;
4706 m_partitions[m_vertices[node_i].partition].layout = 0;
4707 }
4708 else if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_reduc_chain)
4709 {
4710 stmt_vec_info stmt_info
4711 = SLP_TREE_REPRESENTATIVE (SLP_INSTANCE_TREE (instance));
4712 stmt_vec_info reduc_info = info_for_reduction (m_vinfo, stmt_info);
4713 if (needs_fold_left_reduction_p (TREE_TYPE
4714 (gimple_get_lhs (stmt_info->stmt)),
4715 STMT_VINFO_REDUC_CODE (reduc_info)))
4716 {
4717 unsigned int node_i = SLP_INSTANCE_TREE (instance)->vertex;
4718 m_partitions[m_vertices[node_i].partition].layout = 0;
4719 }
4720 }
4721
4722 /* Check which layouts each node and partition can handle. Calculate the
4723 weights associated with inserting layout changes on edges. */
4724 for (unsigned int node_i : m_partitioned_nodes)
4725 {
4726 auto &vertex = m_vertices[node_i];
4727 auto &partition = m_partitions[vertex.partition];
4728 slp_tree node = vertex.node;
4729
4730 if (stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node))
4731 {
4732 vertex.weight = vect_slp_node_weight (node);
4733
4734 /* We do not handle stores with a permutation, so all
4735 incoming permutations must have been materialized.
4736
4737 We also don't handle masked grouped loads, which lack a
4738 permutation vector. In this case the memory locations
4739 form an implicit second input to the loads, on top of the
4740 explicit mask input, and the memory input's layout cannot
4741 be changed.
4742
4743 On the other hand, we do support permuting gather loads and
4744 masked gather loads, where each scalar load is independent
4745 of the others. This can be useful if the address/index input
4746 benefits from permutation. */
4747 if (STMT_VINFO_DATA_REF (rep)
4748 && STMT_VINFO_GROUPED_ACCESS (rep)
4749 && !SLP_TREE_LOAD_PERMUTATION (node).exists ())
4750 partition.layout = 0;
4751
4752 /* We cannot change the layout of an operation that is
4753 not independent of lanes. Note this is an explicit
4754 negative list since that's much shorter than the respective
4755 positive one, but it's critical to keep maintaining it. */
4756 if (is_gimple_call (STMT_VINFO_STMT (rep)))
4757 switch (gimple_call_combined_fn (STMT_VINFO_STMT (rep)))
4758 {
4759 case CFN_COMPLEX_ADD_ROT90:
4760 case CFN_COMPLEX_ADD_ROT270:
4761 case CFN_COMPLEX_MUL:
4762 case CFN_COMPLEX_MUL_CONJ:
4763 case CFN_VEC_ADDSUB:
4764 case CFN_VEC_FMADDSUB:
4765 case CFN_VEC_FMSUBADD:
4766 partition.layout = 0;
4767 default:;
4768 }
4769 }
4770
4771 auto process_edge = [&](graph_edge *ud, unsigned int other_node_i)
4772 {
4773 auto &other_vertex = m_vertices[other_node_i];
4774
4775 /* Count the number of edges from earlier partitions and the number
4776 of edges to later partitions. */
4777 if (other_vertex.partition < vertex.partition)
4778 partition.in_degree += 1;
4779 else
4780 partition.out_degree += 1;
4781
4782 /* If the current node uses the result of OTHER_NODE_I, accumulate
4783 the effects of that. */
4784 if (ud->src == int (node_i))
4785 {
4786 other_vertex.out_weight += vertex.weight;
4787 other_vertex.out_degree += 1;
4788 }
4789 };
4790 for_each_partition_edge (node_i, process_edge);
4791 }
4792 }
4793
4794 /* Return the incoming costs for node NODE_I, assuming that each input keeps
4795 its current (provisional) choice of layout. The inputs do not necessarily
4796 have the same layout as each other. */
4797
4798 slpg_layout_cost
4799 vect_optimize_slp_pass::total_in_cost (unsigned int node_i)
4800 {
4801 auto &vertex = m_vertices[node_i];
4802 slpg_layout_cost cost;
4803 auto add_cost = [&](graph_edge *, unsigned int other_node_i)
4804 {
4805 auto &other_vertex = m_vertices[other_node_i];
4806 if (other_vertex.partition < vertex.partition)
4807 {
4808 auto &other_partition = m_partitions[other_vertex.partition];
4809 auto &other_costs = partition_layout_costs (other_vertex.partition,
4810 other_partition.layout);
4811 slpg_layout_cost this_cost = other_costs.in_cost;
4812 this_cost.add_serial_cost (other_costs.internal_cost);
4813 this_cost.split (other_partition.out_degree);
4814 cost.add_parallel_cost (this_cost);
4815 }
4816 };
4817 for_each_partition_edge (node_i, add_cost);
4818 return cost;
4819 }
4820
4821 /* Return the cost of switching between layout LAYOUT1_I (at node NODE1_I)
4822 and layout LAYOUT2_I on cross-partition use-to-def edge UD. Return
4823 slpg_layout_cost::impossible () if the change isn't possible. */
4824
4825 slpg_layout_cost
4826 vect_optimize_slp_pass::
4827 edge_layout_cost (graph_edge *ud, unsigned int node1_i, unsigned int layout1_i,
4828 unsigned int layout2_i)
4829 {
4830 auto &def_vertex = m_vertices[ud->dest];
4831 auto &use_vertex = m_vertices[ud->src];
4832 auto def_layout_i = ud->dest == int (node1_i) ? layout1_i : layout2_i;
4833 auto use_layout_i = ud->dest == int (node1_i) ? layout2_i : layout1_i;
4834 auto factor = change_layout_cost (def_vertex.node, def_layout_i,
4835 use_layout_i);
4836 if (factor < 0)
4837 return slpg_layout_cost::impossible ();
4838
4839 /* We have a choice of putting the layout change at the site of the
4840 definition or at the site of the use. Prefer the former when
4841 optimizing for size or when the execution frequency of the
4842 definition is no greater than the combined execution frequencies of
4843 the uses. When putting the layout change at the site of the definition,
4844 divvy up the cost among all consumers. */
4845 if (m_optimize_size || def_vertex.weight <= def_vertex.out_weight)
4846 {
4847 slpg_layout_cost cost = { def_vertex.weight * factor, m_optimize_size };
4848 cost.split (def_vertex.out_degree);
4849 return cost;
4850 }
4851 return { use_vertex.weight * factor, m_optimize_size };
4852 }
4853
4854 /* UD represents a use-def link between FROM_NODE_I and a node in a later
4855 partition; FROM_NODE_I could be the definition node or the use node.
4856 The node at the other end of the link wants to use layout TO_LAYOUT_I.
4857 Return the cost of any necessary fix-ups on edge UD, or return
4858 slpg_layout_cost::impossible () if the change isn't possible.
4859
4860 At this point, FROM_NODE_I's partition has chosen the cheapest
4861 layout based on the information available so far, but this choice
4862 is only provisional. */
4863
4864 slpg_layout_cost
4865 vect_optimize_slp_pass::forward_cost (graph_edge *ud, unsigned int from_node_i,
4866 unsigned int to_layout_i)
4867 {
4868 auto &from_vertex = m_vertices[from_node_i];
4869 unsigned int from_partition_i = from_vertex.partition;
4870 slpg_partition_info &from_partition = m_partitions[from_partition_i];
4871 gcc_assert (from_partition.layout >= 0);
4872
4873 /* First calculate the cost on the assumption that FROM_PARTITION sticks
4874 with its current layout preference. */
4875 slpg_layout_cost cost = slpg_layout_cost::impossible ();
4876 auto edge_cost = edge_layout_cost (ud, from_node_i,
4877 from_partition.layout, to_layout_i);
4878 if (edge_cost.is_possible ())
4879 {
4880 auto &from_costs = partition_layout_costs (from_partition_i,
4881 from_partition.layout);
4882 cost = from_costs.in_cost;
4883 cost.add_serial_cost (from_costs.internal_cost);
4884 cost.split (from_partition.out_degree);
4885 cost.add_serial_cost (edge_cost);
4886 }
4887
4888 /* Take the minimum of that cost and the cost that applies if
4889 FROM_PARTITION instead switches to TO_LAYOUT_I. */
4890 auto &direct_layout_costs = partition_layout_costs (from_partition_i,
4891 to_layout_i);
4892 if (direct_layout_costs.is_possible ())
4893 {
4894 slpg_layout_cost direct_cost = direct_layout_costs.in_cost;
4895 direct_cost.add_serial_cost (direct_layout_costs.internal_cost);
4896 direct_cost.split (from_partition.out_degree);
4897 if (!cost.is_possible ()
4898 || direct_cost.is_better_than (cost, m_optimize_size))
4899 cost = direct_cost;
4900 }
4901
4902 return cost;
4903 }
4904
4905 /* UD represents a use-def link between TO_NODE_I and a node in an earlier
4906 partition; TO_NODE_I could be the definition node or the use node.
4907 The node at the other end of the link wants to use layout FROM_LAYOUT_I;
4908 return the cost of any necessary fix-ups on edge UD, or
4909 slpg_layout_cost::impossible () if the choice cannot be made.
4910
4911 At this point, TO_NODE_I's partition has a fixed choice of layout. */
4912
4913 slpg_layout_cost
4914 vect_optimize_slp_pass::backward_cost (graph_edge *ud, unsigned int to_node_i,
4915 unsigned int from_layout_i)
4916 {
4917 auto &to_vertex = m_vertices[to_node_i];
4918 unsigned int to_partition_i = to_vertex.partition;
4919 slpg_partition_info &to_partition = m_partitions[to_partition_i];
4920 gcc_assert (to_partition.layout >= 0);
4921
4922 /* If TO_NODE_I is a VEC_PERM_EXPR consumer, see whether it can be
4923 adjusted for this input having layout FROM_LAYOUT_I. Assume that
4924 any other inputs keep their current choice of layout. */
4925 auto &to_costs = partition_layout_costs (to_partition_i,
4926 to_partition.layout);
4927 if (ud->src == int (to_node_i)
4928 && SLP_TREE_CODE (to_vertex.node) == VEC_PERM_EXPR)
4929 {
4930 auto &from_partition = m_partitions[m_vertices[ud->dest].partition];
4931 auto old_layout = from_partition.layout;
4932 from_partition.layout = from_layout_i;
4933 int factor = internal_node_cost (to_vertex.node, -1,
4934 to_partition.layout);
4935 from_partition.layout = old_layout;
4936 if (factor >= 0)
4937 {
4938 slpg_layout_cost cost = to_costs.out_cost;
4939 cost.add_serial_cost ({ to_vertex.weight * factor,
4940 m_optimize_size });
4941 cost.split (to_partition.in_degree);
4942 return cost;
4943 }
4944 }
4945
4946 /* Compute the cost if we insert any necessary layout change on edge UD. */
4947 auto edge_cost = edge_layout_cost (ud, to_node_i,
4948 to_partition.layout, from_layout_i);
4949 if (edge_cost.is_possible ())
4950 {
4951 slpg_layout_cost cost = to_costs.out_cost;
4952 cost.add_serial_cost (to_costs.internal_cost);
4953 cost.split (to_partition.in_degree);
4954 cost.add_serial_cost (edge_cost);
4955 return cost;
4956 }
4957
4958 return slpg_layout_cost::impossible ();
4959 }
4960
4961 /* Make a forward pass through the partitions, accumulating input costs.
4962 Make a tentative (provisional) choice of layout for each partition,
4963 ensuring that this choice still allows later partitions to keep
4964 their original layout. */
4965
4966 void
4967 vect_optimize_slp_pass::forward_pass ()
4968 {
4969 for (unsigned int partition_i = 0; partition_i < m_partitions.length ();
4970 ++partition_i)
4971 {
4972 auto &partition = m_partitions[partition_i];
4973
4974 /* If the partition consists of a single VEC_PERM_EXPR, precompute
4975 the incoming cost that would apply if every predecessor partition
4976 keeps its current layout. This is used within the loop below. */
4977 slpg_layout_cost in_cost;
4978 slp_tree single_node = nullptr;
4979 if (partition.node_end == partition.node_begin + 1)
4980 {
4981 unsigned int node_i = m_partitioned_nodes[partition.node_begin];
4982 single_node = m_vertices[node_i].node;
4983 if (SLP_TREE_CODE (single_node) == VEC_PERM_EXPR)
4984 in_cost = total_in_cost (node_i);
4985 }
4986
4987 /* Go through the possible layouts. Decide which ones are valid
4988 for this partition and record which of the valid layouts has
4989 the lowest cost. */
4990 unsigned int min_layout_i = 0;
4991 slpg_layout_cost min_layout_cost = slpg_layout_cost::impossible ();
4992 for (unsigned int layout_i = 0; layout_i < m_perms.length (); ++layout_i)
4993 {
4994 auto &layout_costs = partition_layout_costs (partition_i, layout_i);
4995 if (!layout_costs.is_possible ())
4996 continue;
4997
4998 /* If the recorded layout is already 0 then the layout cannot
4999 change. */
5000 if (partition.layout == 0 && layout_i != 0)
5001 {
5002 layout_costs.mark_impossible ();
5003 continue;
5004 }
5005
5006 bool is_possible = true;
5007 for (unsigned int order_i = partition.node_begin;
5008 order_i < partition.node_end; ++order_i)
5009 {
5010 unsigned int node_i = m_partitioned_nodes[order_i];
5011 auto &vertex = m_vertices[node_i];
5012
5013 /* Reject the layout if it is individually incompatible
5014 with any node in the partition. */
5015 if (!is_compatible_layout (vertex.node, layout_i))
5016 {
5017 is_possible = false;
5018 break;
5019 }
5020
5021 auto add_cost = [&](graph_edge *ud, unsigned int other_node_i)
5022 {
5023 auto &other_vertex = m_vertices[other_node_i];
5024 if (other_vertex.partition < vertex.partition)
5025 {
5026 /* Accumulate the incoming costs from earlier
5027 partitions, plus the cost of any layout changes
5028 on UD itself. */
5029 auto cost = forward_cost (ud, other_node_i, layout_i);
5030 if (!cost.is_possible ())
5031 is_possible = false;
5032 else
5033 layout_costs.in_cost.add_parallel_cost (cost);
5034 }
5035 else
5036 /* Reject the layout if it would make layout 0 impossible
5037 for later partitions. This amounts to testing that the
5038 target supports reversing the layout change on edges
5039 to later partitions.
5040
5041 In principle, it might be possible to push a layout
5042 change all the way down a graph, so that it never
5043 needs to be reversed and so that the target doesn't
5044 need to support the reverse operation. But it would
5045 be awkward to bail out if we hit a partition that
5046 does not support the new layout, especially since
5047 we are not dealing with a lattice. */
5048 is_possible &= edge_layout_cost (ud, other_node_i, 0,
5049 layout_i).is_possible ();
5050 };
5051 for_each_partition_edge (node_i, add_cost);
5052
5053 /* Accumulate the cost of using LAYOUT_I within NODE,
5054 both for the inputs and the outputs. */
5055 int factor = internal_node_cost (vertex.node, layout_i,
5056 layout_i);
5057 if (factor < 0)
5058 {
5059 is_possible = false;
5060 break;
5061 }
5062 else if (factor)
5063 layout_costs.internal_cost.add_serial_cost
5064 ({ vertex.weight * factor, m_optimize_size });
5065 }
5066 if (!is_possible)
5067 {
5068 layout_costs.mark_impossible ();
5069 continue;
5070 }
5071
5072 /* Combine the incoming and partition-internal costs. */
5073 slpg_layout_cost combined_cost = layout_costs.in_cost;
5074 combined_cost.add_serial_cost (layout_costs.internal_cost);
5075
5076 /* If this partition consists of a single VEC_PERM_EXPR, see
5077 if the VEC_PERM_EXPR can be changed to support output layout
5078 LAYOUT_I while keeping all the provisional choices of input
5079 layout. */
5080 if (single_node
5081 && SLP_TREE_CODE (single_node) == VEC_PERM_EXPR)
5082 {
5083 int factor = internal_node_cost (single_node, -1, layout_i);
5084 if (factor >= 0)
5085 {
5086 auto weight = m_vertices[single_node->vertex].weight;
5087 slpg_layout_cost internal_cost
5088 = { weight * factor, m_optimize_size };
5089
5090 slpg_layout_cost alt_cost = in_cost;
5091 alt_cost.add_serial_cost (internal_cost);
5092 if (alt_cost.is_better_than (combined_cost, m_optimize_size))
5093 {
5094 combined_cost = alt_cost;
5095 layout_costs.in_cost = in_cost;
5096 layout_costs.internal_cost = internal_cost;
5097 }
5098 }
5099 }
5100
5101 /* Record the layout with the lowest cost. Prefer layout 0 in
5102 the event of a tie between it and another layout. */
5103 if (!min_layout_cost.is_possible ()
5104 || combined_cost.is_better_than (min_layout_cost,
5105 m_optimize_size))
5106 {
5107 min_layout_i = layout_i;
5108 min_layout_cost = combined_cost;
5109 }
5110 }
5111
5112 /* This loop's handling of earlier partitions should ensure that
5113 choosing the original layout for the current partition is no
5114 less valid than it was in the original graph, even with the
5115 provisional layout choices for those earlier partitions. */
5116 gcc_assert (min_layout_cost.is_possible ());
5117 partition.layout = min_layout_i;
5118 }
5119 }
5120
5121 /* Make a backward pass through the partitions, accumulating output costs.
5122 Make a final choice of layout for each partition. */
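/* Together with the forward pass this forms a two-sweep scheme over the
   partition DAG: in_cost is accumulated forwards, out_cost backwards, and
   the final choice minimizes in_cost + internal_cost + out_cost for each
   partition while guaranteeing that earlier partitions can keep the layout
   they chose in the forward pass.  */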
5123
5124 void
5125 vect_optimize_slp_pass::backward_pass ()
5126 {
5127 for (unsigned int partition_i = m_partitions.length (); partition_i-- > 0;)
5128 {
5129 auto &partition = m_partitions[partition_i];
5130
5131 unsigned int min_layout_i = 0;
5132 slpg_layout_cost min_layout_cost = slpg_layout_cost::impossible ();
5133 for (unsigned int layout_i = 0; layout_i < m_perms.length (); ++layout_i)
5134 {
5135 auto &layout_costs = partition_layout_costs (partition_i, layout_i);
5136 if (!layout_costs.is_possible ())
5137 continue;
5138
5139 /* Accumulate the costs from successor partitions. */
5140 bool is_possible = true;
5141 for (unsigned int order_i = partition.node_begin;
5142 order_i < partition.node_end; ++order_i)
5143 {
5144 unsigned int node_i = m_partitioned_nodes[order_i];
5145 auto &vertex = m_vertices[node_i];
5146 auto add_cost = [&](graph_edge *ud, unsigned int other_node_i)
5147 {
5148 auto &other_vertex = m_vertices[other_node_i];
5149 auto &other_partition = m_partitions[other_vertex.partition];
5150 if (other_vertex.partition > vertex.partition)
5151 {
5152 /* Accumulate the incoming costs from later
5153 partitions, plus the cost of any layout changes
5154 on UD itself. */
5155 auto cost = backward_cost (ud, other_node_i, layout_i);
5156 if (!cost.is_possible ())
5157 is_possible = false;
5158 else
5159 layout_costs.out_cost.add_parallel_cost (cost);
5160 }
5161 else
5162 /* Make sure that earlier partitions can (if necessary
5163 or beneficial) keep the layout that they chose in
5164 the forward pass. This ensures that there is at
5165 least one valid choice of layout. */
5166 is_possible &= edge_layout_cost (ud, other_node_i,
5167 other_partition.layout,
5168 layout_i).is_possible ();
5169 };
5170 for_each_partition_edge (node_i, add_cost);
5171 }
5172 if (!is_possible)
5173 {
5174 layout_costs.mark_impossible ();
5175 continue;
5176 }
5177
5178 /* Locally combine the costs from the forward and backward passes.
5179 (This combined cost is not passed on, since that would lead
5180 to double counting.) */
5181 slpg_layout_cost combined_cost = layout_costs.in_cost;
5182 combined_cost.add_serial_cost (layout_costs.internal_cost);
5183 combined_cost.add_serial_cost (layout_costs.out_cost);
5184
5185 /* Record the layout with the lowest cost. Prefer layout 0 in
5186 the event of a tie between it and another layout. */
5187 if (!min_layout_cost.is_possible ()
5188 || combined_cost.is_better_than (min_layout_cost,
5189 m_optimize_size))
5190 {
5191 min_layout_i = layout_i;
5192 min_layout_cost = combined_cost;
5193 }
5194 }
5195
5196 gcc_assert (min_layout_cost.is_possible ());
5197 partition.layout = min_layout_i;
5198 }
5199 }
5200
5201 /* Return a node that applies layout TO_LAYOUT_I to the original form of NODE.
5202 NODE already has the layout that was selected for its partition. */
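/* In outline, the cases handled below are: constant or external nodes,
   whose scalar operands can simply be reordered; nodes whose partition
   already uses TO_LAYOUT_I, which are returned unchanged; VEC_PERM_EXPR
   nodes, which may absorb the layout change into a duplicated permutation
   node; and all remaining nodes, in front of which a single-input
   VEC_PERM_EXPR is inserted.  Results are cached in m_node_layouts.  */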
5203
5204 slp_tree
5205 vect_optimize_slp_pass::get_result_with_layout (slp_tree node,
5206 unsigned int to_layout_i)
5207 {
5208 unsigned int result_i = node->vertex * m_perms.length () + to_layout_i;
5209 slp_tree result = m_node_layouts[result_i];
5210 if (result)
5211 return result;
5212
5213 if (SLP_TREE_DEF_TYPE (node) == vect_constant_def
5214 || (SLP_TREE_DEF_TYPE (node) == vect_external_def
5215 /* We can't permute vector defs in place. */
5216 && SLP_TREE_VEC_DEFS (node).is_empty ()))
5217 {
5218 /* If the vector is uniform or unchanged, there's nothing to do. */
5219 if (to_layout_i == 0 || vect_slp_tree_uniform_p (node))
5220 result = node;
5221 else
5222 {
5223 auto scalar_ops = SLP_TREE_SCALAR_OPS (node).copy ();
5224 result = vect_create_new_slp_node (scalar_ops);
5225 vect_slp_permute (m_perms[to_layout_i], scalar_ops, true);
5226 }
5227 }
5228 else
5229 {
5230 unsigned int partition_i = m_vertices[node->vertex].partition;
5231 unsigned int from_layout_i = m_partitions[partition_i].layout;
5232 if (from_layout_i == to_layout_i)
5233 return node;
5234
5235 /* If NODE is itself a VEC_PERM_EXPR, try to create a parallel
5236 permutation instead of a serial one. Leave the new permutation
5237 in TMP_PERM on success. */
5238 auto_lane_permutation_t tmp_perm;
5239 unsigned int num_inputs = 1;
5240 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
5241 {
5242 tmp_perm.safe_splice (SLP_TREE_LANE_PERMUTATION (node));
5243 if (from_layout_i != 0)
5244 vect_slp_permute (m_perms[from_layout_i], tmp_perm, false);
5245 if (to_layout_i != 0)
5246 vect_slp_permute (m_perms[to_layout_i], tmp_perm, true);
5247 if (vectorizable_slp_permutation_1 (m_vinfo, nullptr, node,
5248 tmp_perm,
5249 SLP_TREE_CHILDREN (node),
5250 false) >= 0)
5251 num_inputs = SLP_TREE_CHILDREN (node).length ();
5252 else
5253 tmp_perm.truncate (0);
5254 }
5255
5256 if (dump_enabled_p ())
5257 {
5258 if (tmp_perm.length () > 0)
5259 dump_printf_loc (MSG_NOTE, vect_location,
5260 "duplicating permutation node %p with"
5261 " layout %d\n",
5262 (void *) node, to_layout_i);
5263 else
5264 dump_printf_loc (MSG_NOTE, vect_location,
5265 "inserting permutation node in place of %p\n",
5266 (void *) node);
5267 }
5268
5269 unsigned int num_lanes = SLP_TREE_LANES (node);
5270 result = vect_create_new_slp_node (num_inputs, VEC_PERM_EXPR);
5271 if (SLP_TREE_SCALAR_STMTS (node).length ())
5272 {
5273 auto &stmts = SLP_TREE_SCALAR_STMTS (result);
5274 stmts.safe_splice (SLP_TREE_SCALAR_STMTS (node));
5275 if (from_layout_i != 0)
5276 vect_slp_permute (m_perms[from_layout_i], stmts, false);
5277 if (to_layout_i != 0)
5278 vect_slp_permute (m_perms[to_layout_i], stmts, true);
5279 }
5280 SLP_TREE_REPRESENTATIVE (result) = SLP_TREE_REPRESENTATIVE (node);
5281 SLP_TREE_LANES (result) = num_lanes;
5282 SLP_TREE_VECTYPE (result) = SLP_TREE_VECTYPE (node);
5283 result->vertex = -1;
5284
5285 auto &lane_perm = SLP_TREE_LANE_PERMUTATION (result);
5286 if (tmp_perm.length ())
5287 {
5288 lane_perm.safe_splice (tmp_perm);
5289 SLP_TREE_CHILDREN (result).safe_splice (SLP_TREE_CHILDREN (node));
5290 }
5291 else
5292 {
5293 lane_perm.create (num_lanes);
5294 for (unsigned j = 0; j < num_lanes; ++j)
5295 lane_perm.quick_push ({ 0, j });
5296 if (from_layout_i != 0)
5297 vect_slp_permute (m_perms[from_layout_i], lane_perm, false);
5298 if (to_layout_i != 0)
5299 vect_slp_permute (m_perms[to_layout_i], lane_perm, true);
5300 SLP_TREE_CHILDREN (result).safe_push (node);
5301 }
5302 for (slp_tree child : SLP_TREE_CHILDREN (result))
5303 child->refcnt++;
5304 }
5305 m_node_layouts[result_i] = result;
5306 return result;
5307 }
5308
5309 /* Apply the chosen vector layouts to the SLP graph. */
5310
5311 void
5312 vect_optimize_slp_pass::materialize ()
5313 {
5314 /* We no longer need the costs, so avoid having two O(N * P) arrays
5315 live at the same time. */
5316 m_partition_layout_costs.release ();
5317 m_node_layouts.safe_grow_cleared (m_vertices.length () * m_perms.length ());
5318
5319 auto_sbitmap fully_folded (m_vertices.length ());
5320 bitmap_clear (fully_folded);
5321 for (unsigned int node_i : m_partitioned_nodes)
5322 {
5323 auto &vertex = m_vertices[node_i];
5324 slp_tree node = vertex.node;
5325 int layout_i = m_partitions[vertex.partition].layout;
5326 gcc_assert (layout_i >= 0);
5327
5328 /* Rearrange the scalar statements to match the chosen layout. */
5329 if (layout_i > 0)
5330 vect_slp_permute (m_perms[layout_i],
5331 SLP_TREE_SCALAR_STMTS (node), true);
5332
5333 /* Update load and lane permutations. */
5334 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
5335 {
5336 /* First try to absorb the input vector layouts. If that fails,
5337 force the inputs to have layout LAYOUT_I too. We checked that
5338 that was possible before deciding to use nonzero output layouts.
5339 (Note that at this stage we don't really have any guarantee that
5340 the target supports the original VEC_PERM_EXPR.) */
5341 auto &perm = SLP_TREE_LANE_PERMUTATION (node);
5342 auto_lane_permutation_t tmp_perm;
5343 tmp_perm.safe_splice (perm);
5344 change_vec_perm_layout (node, tmp_perm, -1, layout_i);
5345 if (vectorizable_slp_permutation_1 (m_vinfo, nullptr, node,
5346 tmp_perm,
5347 SLP_TREE_CHILDREN (node),
5348 false) >= 0)
5349 {
5350 if (dump_enabled_p ()
5351 && !std::equal (tmp_perm.begin (), tmp_perm.end (),
5352 perm.begin ()))
5353 dump_printf_loc (MSG_NOTE, vect_location,
5354 "absorbing input layouts into %p\n",
5355 (void *) node);
5356 std::copy (tmp_perm.begin (), tmp_perm.end (), perm.begin ());
5357 bitmap_set_bit (fully_folded, node_i);
5358 }
5359 else
5360 {
5361 /* Not MSG_MISSED because it would make no sense to users. */
5362 if (dump_enabled_p ())
5363 dump_printf_loc (MSG_NOTE, vect_location,
5364 "failed to absorb input layouts into %p\n",
5365 (void *) node);
5366 change_vec_perm_layout (nullptr, perm, layout_i, layout_i);
5367 }
5368 }
5369 else
5370 {
5371 gcc_assert (!SLP_TREE_LANE_PERMUTATION (node).exists ());
5372 auto &load_perm = SLP_TREE_LOAD_PERMUTATION (node);
5373 if (layout_i > 0)
5374 /* ??? When we handle non-bijective permutes the idea
5375 is that we can force the load-permutation to be
5376 { min, min + 1, min + 2, ... max }. But then the
5377 scalar defs might no longer match the lane content
5378 which means wrong-code with live lane vectorization.
5379 So we possibly have to have NULL entries for those. */
5380 vect_slp_permute (m_perms[layout_i], load_perm, true);
5381 }
5382 }
5383
5384 /* Do this before any nodes disappear, since it involves a walk
5385 over the leaves. */
5386 remove_redundant_permutations ();
5387
5388 /* Replace each child with a correctly laid-out version. */
5389 for (unsigned int node_i : m_partitioned_nodes)
5390 {
5391 /* Skip nodes that have already been handled above. */
5392 if (bitmap_bit_p (fully_folded, node_i))
5393 continue;
5394
5395 auto &vertex = m_vertices[node_i];
5396 int in_layout_i = m_partitions[vertex.partition].layout;
5397 gcc_assert (in_layout_i >= 0);
5398
5399 unsigned j;
5400 slp_tree child;
5401 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (vertex.node), j, child)
5402 {
5403 if (!child)
5404 continue;
5405
5406 slp_tree new_child = get_result_with_layout (child, in_layout_i);
5407 if (new_child != child)
5408 {
5409 vect_free_slp_tree (child);
5410 SLP_TREE_CHILDREN (vertex.node)[j] = new_child;
5411 new_child->refcnt += 1;
5412 }
5413 }
5414 }
5415 }
5416
5417 /* Elide load permutations that are not necessary. Such permutations might
5418 be pre-existing, rather than created by the layout optimizations. */
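/* For example (a sketch): a load node whose load permutation is the
   identity over a gap-free group that it covers completely needs no
   run-time permutation, so the permutation vector can simply be
   released.  */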
5419
5420 void
5421 vect_optimize_slp_pass::remove_redundant_permutations ()
5422 {
5423 for (unsigned int node_i : m_leafs)
5424 {
5425 slp_tree node = m_vertices[node_i].node;
5426 if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
5427 continue;
5428
5429 /* In basic block vectorization we allow any subchain of an interleaving
5430 chain.
5431 FORNOW: not in loop SLP because of realignment complications. */
5432 if (is_a <bb_vec_info> (m_vinfo))
5433 {
5434 bool subchain_p = true;
5435 stmt_vec_info next_load_info = NULL;
5436 stmt_vec_info load_info;
5437 unsigned j;
5438 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
5439 {
5440 if (j != 0
5441 && (next_load_info != load_info
5442 || DR_GROUP_GAP (load_info) != 1))
5443 {
5444 subchain_p = false;
5445 break;
5446 }
5447 next_load_info = DR_GROUP_NEXT_ELEMENT (load_info);
5448 }
5449 if (subchain_p)
5450 {
5451 SLP_TREE_LOAD_PERMUTATION (node).release ();
5452 continue;
5453 }
5454 }
5455 else
5456 {
5457 loop_vec_info loop_vinfo = as_a<loop_vec_info> (m_vinfo);
5458 stmt_vec_info load_info;
5459 bool this_load_permuted = false;
5460 unsigned j;
5461 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
5462 if (SLP_TREE_LOAD_PERMUTATION (node)[j] != j)
5463 {
5464 this_load_permuted = true;
5465 break;
5466 }
5467 /* When this isn't a grouped access we know it's a single element
5468 and contiguous. */
5469 if (!STMT_VINFO_GROUPED_ACCESS (SLP_TREE_SCALAR_STMTS (node)[0]))
5470 {
5471 if (!this_load_permuted
5472 && (known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U)
5473 || SLP_TREE_LANES (node) == 1))
5474 SLP_TREE_LOAD_PERMUTATION (node).release ();
5475 continue;
5476 }
5477 stmt_vec_info first_stmt_info
5478 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
5479 if (!this_load_permuted
5480 /* The load requires permutation when unrolling exposes
5481 a gap either because the group is larger than the SLP
5482 group-size or because there is a gap between the groups. */
5483 && (known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U)
5484 || ((SLP_TREE_LANES (node) == DR_GROUP_SIZE (first_stmt_info))
5485 && DR_GROUP_GAP (first_stmt_info) == 0)))
5486 {
5487 SLP_TREE_LOAD_PERMUTATION (node).release ();
5488 continue;
5489 }
5490 }
5491 }
5492 }
5493
5494 /* Print the partition graph and layout information to the dump file. */
5495
5496 void
5497 vect_optimize_slp_pass::dump ()
5498 {
5499 dump_printf_loc (MSG_NOTE, vect_location,
5500 "SLP optimize permutations:\n");
5501 for (unsigned int layout_i = 1; layout_i < m_perms.length (); ++layout_i)
5502 {
5503 dump_printf_loc (MSG_NOTE, vect_location, " %d: { ", layout_i);
5504 const char *sep = "";
5505 for (unsigned int idx : m_perms[layout_i])
5506 {
5507 dump_printf (MSG_NOTE, "%s%d", sep, idx);
5508 sep = ", ";
5509 }
5510 dump_printf (MSG_NOTE, " }\n");
5511 }
5512 dump_printf_loc (MSG_NOTE, vect_location,
5513 "SLP optimize partitions:\n");
5514 for (unsigned int partition_i = 0; partition_i < m_partitions.length ();
5515 ++partition_i)
5516 {
5517 auto &partition = m_partitions[partition_i];
5518 dump_printf_loc (MSG_NOTE, vect_location, " -------------\n");
5519 dump_printf_loc (MSG_NOTE, vect_location,
5520 " partition %d (layout %d):\n",
5521 partition_i, partition.layout);
5522 dump_printf_loc (MSG_NOTE, vect_location, " nodes:\n");
5523 for (unsigned int order_i = partition.node_begin;
5524 order_i < partition.node_end; ++order_i)
5525 {
5526 auto &vertex = m_vertices[m_partitioned_nodes[order_i]];
5527 dump_printf_loc (MSG_NOTE, vect_location, " - %p:\n",
5528 (void *) vertex.node);
5529 dump_printf_loc (MSG_NOTE, vect_location,
5530 " weight: %f\n",
5531 vertex.weight.to_double ());
5532 if (vertex.out_degree)
5533 dump_printf_loc (MSG_NOTE, vect_location,
5534 " out weight: %f (degree %d)\n",
5535 vertex.out_weight.to_double (),
5536 vertex.out_degree);
5537 if (SLP_TREE_CODE (vertex.node) == VEC_PERM_EXPR)
5538 dump_printf_loc (MSG_NOTE, vect_location,
5539 " op: VEC_PERM_EXPR\n");
5540 else if (auto rep = SLP_TREE_REPRESENTATIVE (vertex.node))
5541 dump_printf_loc (MSG_NOTE, vect_location,
5542 " op template: %G", rep->stmt);
5543 }
5544 dump_printf_loc (MSG_NOTE, vect_location, " edges:\n");
5545 for (unsigned int order_i = partition.node_begin;
5546 order_i < partition.node_end; ++order_i)
5547 {
5548 unsigned int node_i = m_partitioned_nodes[order_i];
5549 auto &vertex = m_vertices[node_i];
5550 auto print_edge = [&](graph_edge *, unsigned int other_node_i)
5551 {
5552 auto &other_vertex = m_vertices[other_node_i];
5553 if (other_vertex.partition < vertex.partition)
5554 dump_printf_loc (MSG_NOTE, vect_location,
5555 " - %p [%d] --> %p\n",
5556 (void *) other_vertex.node,
5557 other_vertex.partition,
5558 (void *) vertex.node);
5559 else
5560 dump_printf_loc (MSG_NOTE, vect_location,
5561 " - %p --> [%d] %p\n",
5562 (void *) vertex.node,
5563 other_vertex.partition,
5564 (void *) other_vertex.node);
5565 };
5566 for_each_partition_edge (node_i, print_edge);
5567 }
5568
5569 for (unsigned int layout_i = 0; layout_i < m_perms.length (); ++layout_i)
5570 {
5571 auto &layout_costs = partition_layout_costs (partition_i, layout_i);
5572 if (layout_costs.is_possible ())
5573 {
5574 dump_printf_loc (MSG_NOTE, vect_location,
5575 " layout %d:%s\n", layout_i,
5576 partition.layout == int (layout_i)
5577 ? " (*)" : "");
5578 slpg_layout_cost combined_cost = layout_costs.in_cost;
5579 combined_cost.add_serial_cost (layout_costs.internal_cost);
5580 combined_cost.add_serial_cost (layout_costs.out_cost);
5581 #define TEMPLATE "{depth: %f, total: %f}"
5582 dump_printf_loc (MSG_NOTE, vect_location,
5583 " " TEMPLATE "\n",
5584 layout_costs.in_cost.depth.to_double (),
5585 layout_costs.in_cost.total.to_double ());
5586 dump_printf_loc (MSG_NOTE, vect_location,
5587 " + " TEMPLATE "\n",
5588 layout_costs.internal_cost.depth.to_double (),
5589 layout_costs.internal_cost.total.to_double ());
5590 dump_printf_loc (MSG_NOTE, vect_location,
5591 " + " TEMPLATE "\n",
5592 layout_costs.out_cost.depth.to_double (),
5593 layout_costs.out_cost.total.to_double ());
5594 dump_printf_loc (MSG_NOTE, vect_location,
5595 " = " TEMPLATE "\n",
5596 combined_cost.depth.to_double (),
5597 combined_cost.total.to_double ());
5598 #undef TEMPLATE
5599 }
5600 else
5601 dump_printf_loc (MSG_NOTE, vect_location,
5602 " layout %d: rejected\n", layout_i);
5603 }
5604 }
5605 }
5606
5607 /* Main entry point for the SLP graph optimization pass. */
5608
5609 void
5610 vect_optimize_slp_pass::run ()
5611 {
5612 build_graph ();
5613 create_partitions ();
5614 start_choosing_layouts ();
5615 if (m_perms.length () > 1)
5616 {
5617 forward_pass ();
5618 backward_pass ();
5619 if (dump_enabled_p ())
5620 dump ();
5621 materialize ();
5622 while (!m_perms.is_empty ())
5623 m_perms.pop ().release ();
5624 }
5625 else
5626 remove_redundant_permutations ();
5627 free_graph (m_slpg);
5628 }
5629
5630 /* Optimize the SLP graph of VINFO. */
5631
5632 void
5633 vect_optimize_slp (vec_info *vinfo)
5634 {
5635 if (vinfo->slp_instances.is_empty ())
5636 return;
5637 vect_optimize_slp_pass (vinfo).run ();
5638 }
5639
5640 /* Gather loads reachable from the individual SLP graph entries. */
5641
5642 void
5643 vect_gather_slp_loads (vec_info *vinfo)
5644 {
5645 unsigned i;
5646 slp_instance instance;
5647 FOR_EACH_VEC_ELT (vinfo->slp_instances, i, instance)
5648 {
5649 hash_set<slp_tree> visited;
5650 vect_gather_slp_loads (SLP_INSTANCE_LOADS (instance),
5651 SLP_INSTANCE_TREE (instance), visited);
5652 }
5653 }
5654
5655
5656 /* For each possible SLP instance decide whether to SLP it and calculate overall
5657 unrolling factor needed to SLP the loop. Return TRUE if decided to SLP at
5658 least one instance. */
5659
5660 bool
5661 vect_make_slp_decision (loop_vec_info loop_vinfo)
5662 {
5663 unsigned int i;
5664 poly_uint64 unrolling_factor = 1;
5665 const vec<slp_instance> &slp_instances
5666 = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
5667 slp_instance instance;
5668 int decided_to_slp = 0;
5669
5670 DUMP_VECT_SCOPE ("vect_make_slp_decision");
5671
5672 FOR_EACH_VEC_ELT (slp_instances, i, instance)
5673 {
5674 /* FORNOW: SLP if you can. */
5675 /* All unroll factors have the form:
5676
5677 GET_MODE_SIZE (vinfo->vector_mode) * X
5678
5679 for some rational X, so they must have a common multiple. */
5680 unrolling_factor
5681 = force_common_multiple (unrolling_factor,
5682 SLP_INSTANCE_UNROLLING_FACTOR (instance));
5683
5684 /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts. Later we
5685 call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
5686 loop-based vectorization. Such stmts will be marked as HYBRID. */
5687 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
5688 decided_to_slp++;
5689 }
5690
5691 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = unrolling_factor;
5692
5693 if (decided_to_slp && dump_enabled_p ())
5694 {
5695 dump_printf_loc (MSG_NOTE, vect_location,
5696 "Decided to SLP %d instances. Unrolling factor ",
5697 decided_to_slp);
5698 dump_dec (MSG_NOTE, unrolling_factor);
5699 dump_printf (MSG_NOTE, "\n");
5700 }
5701
5702 return (decided_to_slp > 0);
5703 }
5704
5705 /* Private data for vect_detect_hybrid_slp. */
5706 struct vdhs_data
5707 {
5708 loop_vec_info loop_vinfo;
5709 vec<stmt_vec_info> *worklist;
5710 };
5711
5712 /* Walker for walk_gimple_op. */
5713
5714 static tree
5715 vect_detect_hybrid_slp (tree *tp, int *, void *data)
5716 {
5717 walk_stmt_info *wi = (walk_stmt_info *)data;
5718 vdhs_data *dat = (vdhs_data *)wi->info;
5719
5720 if (wi->is_lhs)
5721 return NULL_TREE;
5722
5723 stmt_vec_info def_stmt_info = dat->loop_vinfo->lookup_def (*tp);
5724 if (!def_stmt_info)
5725 return NULL_TREE;
5726 def_stmt_info = vect_stmt_to_vectorize (def_stmt_info);
5727 if (PURE_SLP_STMT (def_stmt_info))
5728 {
5729 if (dump_enabled_p ())
5730 dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: %G",
5731 def_stmt_info->stmt);
5732 STMT_SLP_TYPE (def_stmt_info) = hybrid;
5733 dat->worklist->safe_push (def_stmt_info);
5734 }
5735
5736 return NULL_TREE;
5737 }
5738
5739 /* Check whether all uses of STMT_INFO's defs are consumed by SLP and mark
5740 it pure_slp if so; otherwise push it to WORKLIST. */
5741
5742 static void
5743 maybe_push_to_hybrid_worklist (vec_info *vinfo,
5744 vec<stmt_vec_info> &worklist,
5745 stmt_vec_info stmt_info)
5746 {
5747 if (dump_enabled_p ())
5748 dump_printf_loc (MSG_NOTE, vect_location,
5749 "Processing hybrid candidate : %G", stmt_info->stmt);
5750 stmt_vec_info orig_info = vect_orig_stmt (stmt_info);
5751 imm_use_iterator iter2;
5752 ssa_op_iter iter1;
5753 use_operand_p use_p;
5754 def_operand_p def_p;
5755 bool any_def = false;
5756 FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_info->stmt, iter1, SSA_OP_DEF)
5757 {
5758 any_def = true;
5759 FOR_EACH_IMM_USE_FAST (use_p, iter2, DEF_FROM_PTR (def_p))
5760 {
5761 if (is_gimple_debug (USE_STMT (use_p)))
5762 continue;
5763 stmt_vec_info use_info = vinfo->lookup_stmt (USE_STMT (use_p));
5764 /* An out-of-loop use means this is a loop_vect sink. */
5765 if (!use_info)
5766 {
5767 if (dump_enabled_p ())
5768 dump_printf_loc (MSG_NOTE, vect_location,
5769 "Found loop_vect sink: %G", stmt_info->stmt);
5770 worklist.safe_push (stmt_info);
5771 return;
5772 }
5773 else if (!STMT_SLP_TYPE (vect_stmt_to_vectorize (use_info)))
5774 {
5775 if (dump_enabled_p ())
5776 dump_printf_loc (MSG_NOTE, vect_location,
5777 "Found loop_vect use: %G", use_info->stmt);
5778 worklist.safe_push (stmt_info);
5779 return;
5780 }
5781 }
5782 }
5783 /* No def means this is a loop_vect sink. */
5784 if (!any_def)
5785 {
5786 if (dump_enabled_p ())
5787 dump_printf_loc (MSG_NOTE, vect_location,
5788 "Found loop_vect sink: %G", stmt_info->stmt);
5789 worklist.safe_push (stmt_info);
5790 return;
5791 }
5792 if (dump_enabled_p ())
5793 dump_printf_loc (MSG_NOTE, vect_location,
5794 "Marked SLP consumed stmt pure: %G", stmt_info->stmt);
5795 STMT_SLP_TYPE (stmt_info) = pure_slp;
5796 }
5797
5798 /* Find stmts that must be both vectorized and SLPed. */
5799
5800 void
5801 vect_detect_hybrid_slp (loop_vec_info loop_vinfo)
5802 {
5803 DUMP_VECT_SCOPE ("vect_detect_hybrid_slp");
5804
5805 /* All stmts participating in SLP are marked pure_slp; all other
5806 stmts are loop_vect.
5807 First collect all loop_vect stmts into a worklist.
5808 SLP patterns cause not all original scalar stmts to appear in
5809 SLP_TREE_SCALAR_STMTS, so not all of them are marked pure_slp.
5810 Rectify this here: do a backward walk over the IL, consider a stmt
5811 loop_vect only when it is used by a loop_vect stmt, and otherwise
5812 mark it as pure_slp. */
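/* A hypothetical example (not from any particular testcase):

     a[2*i]   = b[2*i]   + 1;   // pure_slp, part of an SLP store group
     a[2*i+1] = b[2*i+1] + 1;   // pure_slp
     sum += b[2*i];             // reduction left to loop vectorization

   the load of b[2*i] is pure_slp but also feeds the loop_vect reduction,
   so the walk below marks it hybrid.  */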
5813 auto_vec<stmt_vec_info> worklist;
5814 for (int i = LOOP_VINFO_LOOP (loop_vinfo)->num_nodes - 1; i >= 0; --i)
5815 {
5816 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
5817 for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
5818 gsi_next (&gsi))
5819 {
5820 gphi *phi = gsi.phi ();
5821 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (phi);
5822 if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
5823 maybe_push_to_hybrid_worklist (loop_vinfo,
5824 worklist, stmt_info);
5825 }
5826 for (gimple_stmt_iterator gsi = gsi_last_bb (bb); !gsi_end_p (gsi);
5827 gsi_prev (&gsi))
5828 {
5829 gimple *stmt = gsi_stmt (gsi);
5830 if (is_gimple_debug (stmt))
5831 continue;
5832 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
5833 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
5834 {
5835 for (gimple_stmt_iterator gsi2
5836 = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info));
5837 !gsi_end_p (gsi2); gsi_next (&gsi2))
5838 {
5839 stmt_vec_info patt_info
5840 = loop_vinfo->lookup_stmt (gsi_stmt (gsi2));
5841 if (!STMT_SLP_TYPE (patt_info)
5842 && STMT_VINFO_RELEVANT (patt_info))
5843 maybe_push_to_hybrid_worklist (loop_vinfo,
5844 worklist, patt_info);
5845 }
5846 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
5847 }
5848 if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
5849 maybe_push_to_hybrid_worklist (loop_vinfo,
5850 worklist, stmt_info);
5851 }
5852 }
5853
5854 /* Now we have a worklist of non-SLP stmts, follow use->def chains and
5855 mark any SLP vectorized stmt as hybrid.
5856 ??? We're visiting def stmts N times (once for each non-SLP and
5857 once for each hybrid-SLP use). */
5858 walk_stmt_info wi;
5859 vdhs_data dat;
5860 dat.worklist = &worklist;
5861 dat.loop_vinfo = loop_vinfo;
5862 memset (&wi, 0, sizeof (wi));
5863 wi.info = (void *)&dat;
5864 while (!worklist.is_empty ())
5865 {
5866 stmt_vec_info stmt_info = worklist.pop ();
5867 /* Since SSA operands are not set up for pattern stmts we need
5868 to use walk_gimple_op. */
5869 wi.is_lhs = 0;
5870 walk_gimple_op (stmt_info->stmt, vect_detect_hybrid_slp, &wi);
5871 /* For gather/scatter make sure to walk the offset operand, that
5872 can be a scaling and conversion away. */
5873 gather_scatter_info gs_info;
5874 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
5875 && vect_check_gather_scatter (stmt_info, loop_vinfo, &gs_info))
5876 {
5877 int dummy;
5878 vect_detect_hybrid_slp (&gs_info.offset, &dummy, &wi);
5879 }
5880 }
5881 }
5882
5883
5884 /* Initialize a bb_vec_info struct for the statements in BBS basic blocks. */
5885
5886 _bb_vec_info::_bb_vec_info (vec<basic_block> _bbs, vec_info_shared *shared)
5887 : vec_info (vec_info::bb, shared),
5888 bbs (_bbs),
5889 roots (vNULL)
5890 {
5891 for (unsigned i = 0; i < bbs.length (); ++i)
5892 {
5893 if (i != 0)
5894 for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
5895 gsi_next (&si))
5896 {
5897 gphi *phi = si.phi ();
5898 gimple_set_uid (phi, 0);
5899 add_stmt (phi);
5900 }
5901 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
5902 !gsi_end_p (gsi); gsi_next (&gsi))
5903 {
5904 gimple *stmt = gsi_stmt (gsi);
5905 gimple_set_uid (stmt, 0);
5906 if (is_gimple_debug (stmt))
5907 continue;
5908 add_stmt (stmt);
5909 }
5910 }
5911 }
5912
5913
5914 /* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
5915 stmts in the basic block. */
5916
5917 _bb_vec_info::~_bb_vec_info ()
5918 {
5919 /* Reset region marker. */
5920 for (unsigned i = 0; i < bbs.length (); ++i)
5921 {
5922 if (i != 0)
5923 for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
5924 gsi_next (&si))
5925 {
5926 gphi *phi = si.phi ();
5927 gimple_set_uid (phi, -1);
5928 }
5929 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
5930 !gsi_end_p (gsi); gsi_next (&gsi))
5931 {
5932 gimple *stmt = gsi_stmt (gsi);
5933 gimple_set_uid (stmt, -1);
5934 }
5935 }
5936
5937 for (unsigned i = 0; i < roots.length (); ++i)
5938 {
5939 roots[i].stmts.release ();
5940 roots[i].roots.release ();
5941 roots[i].remain.release ();
5942 }
5943 roots.release ();
5944 }
5945
5946 /* Subroutine of vect_slp_analyze_node_operations. Handle the root of NODE,
5947 given that child nodes have already been processed, and that
5948 their def types currently match their SLP node's def type. */
5949
5950 static bool
5951 vect_slp_analyze_node_operations_1 (vec_info *vinfo, slp_tree node,
5952 slp_instance node_instance,
5953 stmt_vector_for_cost *cost_vec)
5954 {
5955 stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
5956
5957 /* Calculate the number of vector statements to be created for the
5958 scalar stmts in this node. For SLP reductions it is equal to the
5959 number of vector statements in the children (which has already been
5960 calculated by the recursive call). Otherwise it is the number of
5961 scalar elements in one scalar iteration (DR_GROUP_SIZE) multiplied by
5962 VF divided by the number of elements in a vector. */
5963 if (SLP_TREE_CODE (node) != VEC_PERM_EXPR
5964 && !STMT_VINFO_DATA_REF (stmt_info)
5965 && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
5966 {
5967 for (unsigned i = 0; i < SLP_TREE_CHILDREN (node).length (); ++i)
5968 if (SLP_TREE_DEF_TYPE (SLP_TREE_CHILDREN (node)[i]) == vect_internal_def)
5969 {
5970 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
5971 = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_CHILDREN (node)[i]);
5972 break;
5973 }
5974 }
5975 else
5976 {
5977 poly_uint64 vf;
5978 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
5979 vf = loop_vinfo->vectorization_factor;
5980 else
5981 vf = 1;
5982 unsigned int group_size = SLP_TREE_LANES (node);
5983 tree vectype = SLP_TREE_VECTYPE (node);
5984 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
5985 = vect_get_num_vectors (vf * group_size, vectype);
5986 }
5987
5988 /* Handle purely internal nodes. */
5989 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
5990 {
5991 if (!vectorizable_slp_permutation (vinfo, NULL, node, cost_vec))
5992 return false;
5993
5994 stmt_vec_info slp_stmt_info;
5995 unsigned int i;
5996 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, slp_stmt_info)
5997 {
5998 if (STMT_VINFO_LIVE_P (slp_stmt_info)
5999 && !vectorizable_live_operation (vinfo, slp_stmt_info, node,
6000 node_instance, i,
6001 false, cost_vec))
6002 return false;
6003 }
6004 return true;
6005 }
6006
6007 bool dummy;
6008 return vect_analyze_stmt (vinfo, stmt_info, &dummy,
6009 node, node_instance, cost_vec);
6010 }
6011
6012 /* Try to build NODE from scalars, returning true on success.
6013 NODE_INSTANCE is the SLP instance that contains NODE. */
6014
6015 static bool
6016 vect_slp_convert_to_external (vec_info *vinfo, slp_tree node,
6017 slp_instance node_instance)
6018 {
6019 stmt_vec_info stmt_info;
6020 unsigned int i;
6021
6022 if (!is_a <bb_vec_info> (vinfo)
6023 || node == SLP_INSTANCE_TREE (node_instance)
6024 || !SLP_TREE_SCALAR_STMTS (node).exists ()
6025 || vect_contains_pattern_stmt_p (SLP_TREE_SCALAR_STMTS (node))
6026 /* Force the mask use to be built from scalars instead. */
6027 || VECTOR_BOOLEAN_TYPE_P (SLP_TREE_VECTYPE (node)))
6028 return false;
6029
6030 if (dump_enabled_p ())
6031 dump_printf_loc (MSG_NOTE, vect_location,
6032 "Building vector operands of %p from scalars instead\n",
6033 (void *) node);
6034
6035 /* Don't remove and free the child nodes here, since they could be
6036 referenced by other structures. The analysis and scheduling phases
6037 (need to) ignore child nodes of anything that isn't vect_internal_def. */
6038 unsigned int group_size = SLP_TREE_LANES (node);
6039 SLP_TREE_DEF_TYPE (node) = vect_external_def;
6040 /* Invariants get their vector type from the uses. */
6041 SLP_TREE_VECTYPE (node) = NULL_TREE;
6042 SLP_TREE_SCALAR_OPS (node).safe_grow (group_size, true);
6043 SLP_TREE_LOAD_PERMUTATION (node).release ();
6044 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
6045 {
6046 tree lhs = gimple_get_lhs (vect_orig_stmt (stmt_info)->stmt);
6047 SLP_TREE_SCALAR_OPS (node)[i] = lhs;
6048 }
6049 return true;
6050 }
6051
6052 /* Return true if all elements of the slice are the same. */
6053 bool
6054 vect_scalar_ops_slice::all_same_p () const
6055 {
6056 for (unsigned int i = 1; i < length; ++i)
6057 if (!operand_equal_p (op (0), op (i)))
6058 return false;
6059 return true;
6060 }
6061
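/* Hash a slice by combining the hash values of its scalar operands.  */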
6062 hashval_t
6063 vect_scalar_ops_slice_hash::hash (const value_type &s)
6064 {
6065 hashval_t hash = 0;
6066 for (unsigned i = 0; i < s.length; ++i)
6067 hash = iterative_hash_expr (s.op (i), hash);
6068 return hash;
6069 }
6070
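/* Return true if slices S1 and S2 have the same length and pairwise
   equal operands.  */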
6071 bool
6072 vect_scalar_ops_slice_hash::equal (const value_type &s1,
6073 const compare_type &s2)
6074 {
6075 if (s1.length != s2.length)
6076 return false;
6077 for (unsigned i = 0; i < s1.length; ++i)
6078 if (!operand_equal_p (s1.op (i), s2.op (i)))
6079 return false;
6080 return true;
6081 }
6082
6083 /* Compute the prologue cost for invariant or constant operands represented
6084 by NODE. */
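/* For example (a sketch): with scalar ops { x, y, x, y } and two-element
   vectors, both vector stmts are built from the same { x, y } slice, so
   the slice hash set below leads to only one construction cost being
   recorded.  */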
6085
6086 static void
6087 vect_prologue_cost_for_slp (slp_tree node,
6088 stmt_vector_for_cost *cost_vec)
6089 {
6090 /* There's a special case of an existing vector, which costs nothing. */
6091 if (SLP_TREE_SCALAR_OPS (node).length () == 0
6092 && !SLP_TREE_VEC_DEFS (node).is_empty ())
6093 return;
6094 /* Without looking at the actual initializer a vector of
6095 constants can be implemented as a load from the constant pool.
6096 When all elements are the same we can use a splat. */
6097 tree vectype = SLP_TREE_VECTYPE (node);
6098 unsigned group_size = SLP_TREE_SCALAR_OPS (node).length ();
6099 unsigned HOST_WIDE_INT const_nunits;
6100 unsigned nelt_limit;
6101 auto ops = &SLP_TREE_SCALAR_OPS (node);
6102 auto_vec<unsigned int> starts (SLP_TREE_NUMBER_OF_VEC_STMTS (node));
6103 if (TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits)
6104 && ! multiple_p (const_nunits, group_size))
6105 {
6106 nelt_limit = const_nunits;
6107 hash_set<vect_scalar_ops_slice_hash> vector_ops;
6108 for (unsigned int i = 0; i < SLP_TREE_NUMBER_OF_VEC_STMTS (node); ++i)
6109 if (!vector_ops.add ({ ops, i * const_nunits, const_nunits }))
6110 starts.quick_push (i * const_nunits);
6111 }
6112 else
6113 {
6114 /* If either the vector has variable length or the vectors
6115 are composed of repeated whole groups we only need to
6116 cost construction once. All vectors will be the same. */
6117 nelt_limit = group_size;
6118 starts.quick_push (0);
6119 }
6120 /* ??? We're just tracking whether vectors in a single node are the same.
6121 Ideally we'd do something more global. */
6122 bool passed = false;
6123 for (unsigned int start : starts)
6124 {
6125 vect_cost_for_stmt kind;
6126 if (SLP_TREE_DEF_TYPE (node) == vect_constant_def)
6127 kind = vector_load;
6128 else if (vect_scalar_ops_slice { ops, start, nelt_limit }.all_same_p ())
6129 kind = scalar_to_vec;
6130 else
6131 kind = vec_construct;
6132 /* The target cost hook has no idea which part of the SLP node
6133 we are costing so avoid passing it down more than once. Pass
6134 it to the first vec_construct or scalar_to_vec part since for those
6135 the x86 backend tries to account for GPR to XMM register moves. */
6136 record_stmt_cost (cost_vec, 1, kind,
6137 (kind != vector_load && !passed) ? node : nullptr,
6138 vectype, 0, vect_prologue);
6139 if (kind != vector_load)
6140 passed = true;
6141 }
6142 }
6143
6144 /* Analyze statements contained in SLP tree NODE after recursively analyzing
6145 the subtree. NODE_INSTANCE contains NODE and VINFO contains NODE_INSTANCE.
6146
6147 Return true if the operations are supported. */
6148
6149 static bool
6150 vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node,
6151 slp_instance node_instance,
6152 hash_set<slp_tree> &visited_set,
6153 vec<slp_tree> &visited_vec,
6154 stmt_vector_for_cost *cost_vec)
6155 {
6156 int i, j;
6157 slp_tree child;
6158
6159 /* Assume we can code-generate all invariants. */
6160 if (!node
6161 || SLP_TREE_DEF_TYPE (node) == vect_constant_def
6162 || SLP_TREE_DEF_TYPE (node) == vect_external_def)
6163 return true;
6164
6165 if (SLP_TREE_DEF_TYPE (node) == vect_uninitialized_def)
6166 {
6167 if (dump_enabled_p ())
6168 dump_printf_loc (MSG_NOTE, vect_location,
6169 "Failed cyclic SLP reference in %p\n", (void *) node);
6170 return false;
6171 }
6172 gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_internal_def);
6173
6174 /* If we already analyzed the exact same set of scalar stmts we're done.
6175 We share the generated vector stmts for those. */
6176 if (visited_set.add (node))
6177 return true;
6178 visited_vec.safe_push (node);
6179
6180 bool res = true;
6181 unsigned visited_rec_start = visited_vec.length ();
6182 unsigned cost_vec_rec_start = cost_vec->length ();
6183 bool seen_non_constant_child = false;
6184 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6185 {
6186 res = vect_slp_analyze_node_operations (vinfo, child, node_instance,
6187 visited_set, visited_vec,
6188 cost_vec);
6189 if (!res)
6190 break;
6191 if (child && SLP_TREE_DEF_TYPE (child) != vect_constant_def)
6192 seen_non_constant_child = true;
6193 }
6194 /* We're having difficulties scheduling nodes with just constant
6195 operands and no scalar stmts since we then cannot compute a stmt
6196 insertion place. */
6197 if (!seen_non_constant_child && SLP_TREE_SCALAR_STMTS (node).is_empty ())
6198 {
6199 if (dump_enabled_p ())
6200 dump_printf_loc (MSG_NOTE, vect_location,
6201 "Cannot vectorize all-constant op node %p\n",
6202 (void *) node);
6203 res = false;
6204 }
6205
6206 if (res)
6207 res = vect_slp_analyze_node_operations_1 (vinfo, node, node_instance,
6208 cost_vec);
6209 /* If analysis failed we have to pop all recursive visited nodes
6210 plus ourselves. */
6211 if (!res)
6212 {
6213 while (visited_vec.length () >= visited_rec_start)
6214 visited_set.remove (visited_vec.pop ());
6215 cost_vec->truncate (cost_vec_rec_start);
6216 }
6217
6218 /* When the node can be vectorized, cost the invariant nodes it references.
6219 This is not done in DFS order to allow the referring node's
6220 vectorizable_* calls to nail down the invariant node's vector type
6221 and possibly unshare it if it needs a different vector type than
6222 other referrers. */
6223 if (res)
6224 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
6225 if (child
6226 && (SLP_TREE_DEF_TYPE (child) == vect_constant_def
6227 || SLP_TREE_DEF_TYPE (child) == vect_external_def)
6228 /* Perform the usual caching; note that code generation still
6229 code-gens these nodes multiple times, but we expect
6230 to CSE them later. */
6231 && !visited_set.add (child))
6232 {
6233 visited_vec.safe_push (child);
6234 /* ??? After auditing more code paths make a "default"
6235 and push the vector type from NODE to all children
6236 if it is not already set. */
6237 /* Compute the number of vectors to be generated. */
6238 tree vector_type = SLP_TREE_VECTYPE (child);
6239 if (!vector_type)
6240 {
6241 /* For shifts with a scalar argument we don't need
6242 to cost or code-generate anything.
6243 ??? Represent this more explicitly. */
6244 gcc_assert ((STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (node))
6245 == shift_vec_info_type)
6246 && j == 1);
6247 continue;
6248 }
6249 unsigned group_size = SLP_TREE_LANES (child);
6250 poly_uint64 vf = 1;
6251 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
6252 vf = loop_vinfo->vectorization_factor;
6253 SLP_TREE_NUMBER_OF_VEC_STMTS (child)
6254 = vect_get_num_vectors (vf * group_size, vector_type);
6255 /* And cost them. */
6256 vect_prologue_cost_for_slp (child, cost_vec);
6257 }
6258
6259 /* If this node or any of its children can't be vectorized, try pruning
6260 the tree here rather than felling the whole thing. */
6261 if (!res && vect_slp_convert_to_external (vinfo, node, node_instance))
6262 {
6263 /* We'll need to revisit this for invariant costing and number
6264 of vectorized stmt setting. */
6265 res = true;
6266 }
6267
6268 return res;
6269 }
6270
6271 /* Mark lanes of NODE that are live outside of the basic-block vectorized
6272 region and that can be vectorized using vectorizable_live_operation
6273 with STMT_VINFO_LIVE_P. Live operations that cannot be handled cause
6274 the scalar code computing them to be retained. */
6275
6276 static void
6277 vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo, slp_tree node,
6278 slp_instance instance,
6279 stmt_vector_for_cost *cost_vec,
6280 hash_set<stmt_vec_info> &svisited,
6281 hash_set<slp_tree> &visited)
6282 {
6283 if (visited.add (node))
6284 return;
6285
6286 unsigned i;
6287 stmt_vec_info stmt_info;
6288 stmt_vec_info last_stmt = vect_find_last_scalar_stmt_in_slp (node);
6289 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
6290 {
6291 if (svisited.contains (stmt_info))
6292 continue;
6293 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
6294 if (STMT_VINFO_IN_PATTERN_P (orig_stmt_info)
6295 && STMT_VINFO_RELATED_STMT (orig_stmt_info) != stmt_info)
6296 /* Only the pattern root stmt computes the original scalar value. */
6297 continue;
6298 bool mark_visited = true;
6299 gimple *orig_stmt = orig_stmt_info->stmt;
6300 ssa_op_iter op_iter;
6301 def_operand_p def_p;
6302 FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_stmt, op_iter, SSA_OP_DEF)
6303 {
6304 imm_use_iterator use_iter;
6305 gimple *use_stmt;
6306 stmt_vec_info use_stmt_info;
6307 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
6308 if (!is_gimple_debug (use_stmt))
6309 {
6310 use_stmt_info = bb_vinfo->lookup_stmt (use_stmt);
6311 if (!use_stmt_info
6312 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
6313 {
6314 STMT_VINFO_LIVE_P (stmt_info) = true;
6315 if (vectorizable_live_operation (bb_vinfo, stmt_info,
6316 node, instance, i,
6317 false, cost_vec))
6318 /* ??? So we know we can vectorize the live stmt
6319 from one SLP node. If we cannot do so from all
6320 or none consistently we'd have to record which
6321 SLP node (and lane) we want to use for the live
6322 operation. So make sure we can code-generate
6323 from all nodes. */
6324 mark_visited = false;
6325 else
6326 STMT_VINFO_LIVE_P (stmt_info) = false;
6327 break;
6328 }
6329 }
6330 /* We have to verify whether we can insert the lane extract
6331 before all uses. The following is a conservative approximation.
6332 We cannot put this into vectorizable_live_operation because
6333 iterating over all use stmts from inside a FOR_EACH_IMM_USE_STMT
6334 doesn't work.
6335 Note that while emitting code for loads at the first load
6336 should make this a non-problem, leaves we construct
6337 from scalars are vectorized after the last scalar def.
6338 ??? If we'd actually compute the insert location during
6339 analysis we could use sth less conservative than the last
6340 scalar stmt in the node for the dominance check. */
6341 /* ??? What remains is "live" uses in vector CTORs in the same
6342 SLP graph which is where those uses can end up code-generated
6343 right after their definition instead of close to their original
6344 use. But that would restrict us to code-generate lane-extracts
6345 from the latest stmt in a node. So we compensate for this
6346 during code-generation, simply not replacing uses for those
6347 hopefully rare cases. */
6348 if (STMT_VINFO_LIVE_P (stmt_info))
6349 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
6350 if (!is_gimple_debug (use_stmt)
6351 && (!(use_stmt_info = bb_vinfo->lookup_stmt (use_stmt))
6352 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
6353 && !vect_stmt_dominates_stmt_p (last_stmt->stmt, use_stmt))
6354 {
6355 if (dump_enabled_p ())
6356 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6357 "Cannot determine insertion place for "
6358 "lane extract\n");
6359 STMT_VINFO_LIVE_P (stmt_info) = false;
6360 mark_visited = true;
6361 }
6362 }
6363 if (mark_visited)
6364 svisited.add (stmt_info);
6365 }
6366
6367 slp_tree child;
6368 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6369 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
6370 vect_bb_slp_mark_live_stmts (bb_vinfo, child, instance,
6371 cost_vec, svisited, visited);
6372 }
6373
6374 /* Determine whether we can vectorize the reduction epilogue for INSTANCE. */
6375
6376 static bool
6377 vectorizable_bb_reduc_epilogue (slp_instance instance,
6378 stmt_vector_for_cost *cost_vec)
6379 {
6380 gassign *stmt = as_a <gassign *> (instance->root_stmts[0]->stmt);
6381 enum tree_code reduc_code = gimple_assign_rhs_code (stmt);
6382 if (reduc_code == MINUS_EXPR)
6383 reduc_code = PLUS_EXPR;
6384 internal_fn reduc_fn;
6385 tree vectype = SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (instance));
6386 if (!vectype
6387 || !reduction_fn_for_scalar_code (reduc_code, &reduc_fn)
6388 || reduc_fn == IFN_LAST
6389 || !direct_internal_fn_supported_p (reduc_fn, vectype, OPTIMIZE_FOR_BOTH)
6390 || !useless_type_conversion_p (TREE_TYPE (gimple_assign_lhs (stmt)),
6391 TREE_TYPE (vectype)))
6392 {
6393 if (dump_enabled_p ())
6394 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6395 "not vectorized: basic block reduction epilogue "
6396 "operation unsupported.\n");
6397 return false;
6398 }
6399
6400 /* There's no way to cost a horizontal vector reduction via REDUC_FN so
6401 cost log2 vector operations plus shuffles and one extraction. */
6402 unsigned steps = floor_log2 (vect_nunits_for_cost (vectype));
6403 record_stmt_cost (cost_vec, steps, vector_stmt, instance->root_stmts[0],
6404 vectype, 0, vect_body);
6405 record_stmt_cost (cost_vec, steps, vec_perm, instance->root_stmts[0],
6406 vectype, 0, vect_body);
6407 record_stmt_cost (cost_vec, 1, vec_to_scalar, instance->root_stmts[0],
6408 vectype, 0, vect_body);
6409 return true;
6410 }
6411
6412 /* Prune from ROOTS all stmts that are computed as part of lanes of NODE
6413 and recurse to children. */
6414
6415 static void
6416 vect_slp_prune_covered_roots (slp_tree node, hash_set<stmt_vec_info> &roots,
6417 hash_set<slp_tree> &visited)
6418 {
6419 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def
6420 || visited.add (node))
6421 return;
6422
6423 stmt_vec_info stmt;
6424 unsigned i;
6425 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
6426 roots.remove (vect_orig_stmt (stmt));
6427
6428 slp_tree child;
6429 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6430 if (child)
6431 vect_slp_prune_covered_roots (child, roots, visited);
6432 }
6433
6434 /* Analyze statements in SLP instances of VINFO. Return true if the
6435 operations are supported. */
6436
6437 bool
6438 vect_slp_analyze_operations (vec_info *vinfo)
6439 {
6440 slp_instance instance;
6441 int i;
6442
6443 DUMP_VECT_SCOPE ("vect_slp_analyze_operations");
6444
6445 hash_set<slp_tree> visited;
6446 for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
6447 {
6448 auto_vec<slp_tree> visited_vec;
6449 stmt_vector_for_cost cost_vec;
6450 cost_vec.create (2);
6451 if (is_a <bb_vec_info> (vinfo))
6452 vect_location = instance->location ();
6453 if (!vect_slp_analyze_node_operations (vinfo,
6454 SLP_INSTANCE_TREE (instance),
6455 instance, visited, visited_vec,
6456 &cost_vec)
6457 /* CTOR instances require vectorized defs for the SLP tree root. */
6458 || (SLP_INSTANCE_KIND (instance) == slp_inst_kind_ctor
6459 && (SLP_TREE_DEF_TYPE (SLP_INSTANCE_TREE (instance))
6460 != vect_internal_def
6461 /* Make sure we vectorized with the expected type. */
6462 || !useless_type_conversion_p
6463 (TREE_TYPE (TREE_TYPE (gimple_assign_rhs1
6464 (instance->root_stmts[0]->stmt))),
6465 TREE_TYPE (SLP_TREE_VECTYPE
6466 (SLP_INSTANCE_TREE (instance))))))
6467 /* Check we can vectorize the reduction. */
6468 || (SLP_INSTANCE_KIND (instance) == slp_inst_kind_bb_reduc
6469 && !vectorizable_bb_reduc_epilogue (instance, &cost_vec)))
6470 {
6471 slp_tree node = SLP_INSTANCE_TREE (instance);
6472 stmt_vec_info stmt_info;
6473 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
6474 stmt_info = SLP_INSTANCE_ROOT_STMTS (instance)[0];
6475 else
6476 stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
6477 if (dump_enabled_p ())
6478 dump_printf_loc (MSG_NOTE, vect_location,
6479 "removing SLP instance operations starting from: %G",
6480 stmt_info->stmt);
6481 vect_free_slp_instance (instance);
6482 vinfo->slp_instances.ordered_remove (i);
6483 cost_vec.release ();
6484 while (!visited_vec.is_empty ())
6485 visited.remove (visited_vec.pop ());
6486 }
6487 else
6488 {
6489 i++;
6490 if (loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (vinfo))
6491 {
6492 add_stmt_costs (loop_vinfo->vector_costs, &cost_vec);
6493 cost_vec.release ();
6494 }
6495 else
6496 /* For BB vectorization remember the SLP graph entry
6497 cost for later. */
6498 instance->cost_vec = cost_vec;
6499 }
6500 }
6501
6502 /* Now look for SLP instances with a root that are covered by other
6503 instances and remove them. */
6504 hash_set<stmt_vec_info> roots;
6505 for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
6506 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
6507 roots.add (SLP_INSTANCE_ROOT_STMTS (instance)[0]);
6508 if (!roots.is_empty ())
6509 {
6510 visited.empty ();
6511 for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
6512 vect_slp_prune_covered_roots (SLP_INSTANCE_TREE (instance), roots,
6513 visited);
6514 for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
6515 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ()
6516 && !roots.contains (SLP_INSTANCE_ROOT_STMTS (instance)[0]))
6517 {
6518 stmt_vec_info root = SLP_INSTANCE_ROOT_STMTS (instance)[0];
6519 if (dump_enabled_p ())
6520 dump_printf_loc (MSG_NOTE, vect_location,
6521 "removing SLP instance operations starting "
6522 "from: %G", root->stmt);
6523 vect_free_slp_instance (instance);
6524 vinfo->slp_instances.ordered_remove (i);
6525 }
6526 else
6527 ++i;
6528 }
6529
6530 /* Compute vectorizable live stmts. */
6531 if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
6532 {
6533 hash_set<stmt_vec_info> svisited;
6534 hash_set<slp_tree> visited;
6535 for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
6536 {
6537 vect_location = instance->location ();
6538 vect_bb_slp_mark_live_stmts (bb_vinfo, SLP_INSTANCE_TREE (instance),
6539 instance, &instance->cost_vec, svisited,
6540 visited);
6541 }
6542 }
6543
6544 return !vinfo->slp_instances.is_empty ();
6545 }
6546
6547 /* Get the ultimate SLP instance leader of INSTANCE from INSTANCE_LEADER,
6548 transitively closing (and thereby compressing) any chain along the way. */
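/* For example, if INSTANCE_LEADER maps A -> B, B -> C and C -> C, calling
   this on A returns C and rewrites the entries for A and B to point
   directly at C, keeping later lookups cheap.  */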
6549
6550 static slp_instance
6551 get_ultimate_leader (slp_instance instance,
6552 hash_map<slp_instance, slp_instance> &instance_leader)
6553 {
6554 auto_vec<slp_instance *, 8> chain;
6555 slp_instance *tem;
6556 while (*(tem = instance_leader.get (instance)) != instance)
6557 {
6558 chain.safe_push (tem);
6559 instance = *tem;
6560 }
6561 while (!chain.is_empty ())
6562 *chain.pop () = instance;
6563 return instance;
6564 }
6565
6566 namespace {
6567 /* Subroutine of vect_bb_partition_graph_r. Map KEY to INSTANCE in
6568 KEY_TO_INSTANCE, making INSTANCE the leader of any previous mapping
6569 for KEY. Return true if KEY was already in KEY_TO_INSTANCE.
6570
6571 INSTANCE_LEADER is as for get_ultimate_leader. */
6572
6573 template<typename T>
6574 bool
6575 vect_map_to_instance (slp_instance instance, T key,
6576 hash_map<T, slp_instance> &key_to_instance,
6577 hash_map<slp_instance, slp_instance> &instance_leader)
6578 {
6579 bool existed_p;
6580 slp_instance &key_instance = key_to_instance.get_or_insert (key, &existed_p);
6581 if (!existed_p)
6582 ;
6583 else if (key_instance != instance)
6584 {
6585 /* If we run into a previously marked key, make the current instance
6586 the leader of the key's current ultimate leader. This keeps the
6587 leader chain acyclic and works even when the current instance
6588 connects two previously independent graph parts. */
6589 slp_instance key_leader
6590 = get_ultimate_leader (key_instance, instance_leader);
6591 if (key_leader != instance)
6592 instance_leader.put (key_leader, instance);
6593 }
6594 key_instance = instance;
6595 return existed_p;
6596 }
6597 }
6598
6599 /* Worker of vect_bb_partition_graph, recurse on NODE. */
6600
6601 static void
6602 vect_bb_partition_graph_r (bb_vec_info bb_vinfo,
6603 slp_instance instance, slp_tree node,
6604 hash_map<stmt_vec_info, slp_instance> &stmt_to_instance,
6605 hash_map<slp_tree, slp_instance> &node_to_instance,
6606 hash_map<slp_instance, slp_instance> &instance_leader)
6607 {
6608 stmt_vec_info stmt_info;
6609 unsigned i;
6610
6611 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
6612 vect_map_to_instance (instance, stmt_info, stmt_to_instance,
6613 instance_leader);
6614
6615 if (vect_map_to_instance (instance, node, node_to_instance,
6616 instance_leader))
6617 return;
6618
6619 slp_tree child;
6620 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6621 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
6622 vect_bb_partition_graph_r (bb_vinfo, instance, child, stmt_to_instance,
6623 node_to_instance, instance_leader);
6624 }
6625
6626 /* Partition the SLP graph into pieces that can be costed independently. */
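/* Instances that share a scalar stmt (or an SLP node) are merged into one
   subgraph via the leader map, so each connected component of the SLP
   graph is costed as a unit.  */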
6627
6628 static void
6629 vect_bb_partition_graph (bb_vec_info bb_vinfo)
6630 {
6631 DUMP_VECT_SCOPE ("vect_bb_partition_graph");
6632
6633 /* First walk the SLP graph assigning each involved scalar stmt a
6634 corresponding SLP graph entry and, upon visiting a previously
6635 marked stmt, make the stmt's leader the current SLP graph entry. */
6636 hash_map<stmt_vec_info, slp_instance> stmt_to_instance;
6637 hash_map<slp_tree, slp_instance> node_to_instance;
6638 hash_map<slp_instance, slp_instance> instance_leader;
6639 slp_instance instance;
6640 for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
6641 {
6642 instance_leader.put (instance, instance);
6643 vect_bb_partition_graph_r (bb_vinfo,
6644 instance, SLP_INSTANCE_TREE (instance),
6645 stmt_to_instance, node_to_instance,
6646 instance_leader);
6647 }
6648
6649 /* Then collect entries to each independent subgraph. */
6650 for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
6651 {
6652 slp_instance leader = get_ultimate_leader (instance, instance_leader);
6653 leader->subgraph_entries.safe_push (instance);
6654 if (dump_enabled_p ()
6655 && leader != instance)
6656 dump_printf_loc (MSG_NOTE, vect_location,
6657 "instance %p is leader of %p\n",
6658 (void *) leader, (void *) instance);
6659 }
6660 }
6661
6662 /* Compute the set of scalar stmts participating in internal and external
6663 nodes. */
6664
6665 static void
6666 vect_slp_gather_vectorized_scalar_stmts (vec_info *vinfo, slp_tree node,
6667 hash_set<slp_tree> &visited,
6668 hash_set<stmt_vec_info> &vstmts,
6669 hash_set<stmt_vec_info> &estmts)
6670 {
6671 int i;
6672 stmt_vec_info stmt_info;
6673 slp_tree child;
6674
6675 if (visited.add (node))
6676 return;
6677
6678 if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
6679 {
6680 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
6681 vstmts.add (stmt_info);
6682
6683 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6684 if (child)
6685 vect_slp_gather_vectorized_scalar_stmts (vinfo, child, visited,
6686 vstmts, estmts);
6687 }
6688 else
6689 for (tree def : SLP_TREE_SCALAR_OPS (node))
6690 {
6691 stmt_vec_info def_stmt = vinfo->lookup_def (def);
6692 if (def_stmt)
6693 estmts.add (def_stmt);
6694 }
6695 }
6696
6697
6698 /* Compute the scalar cost of the SLP node NODE and its children
6699 and record it in COST_VEC. Do not account defs that are marked in
6700 LIFE and update LIFE according to uses of NODE. */
6701
6702 static void
6703 vect_bb_slp_scalar_cost (vec_info *vinfo,
6704 slp_tree node, vec<bool, va_heap> *life,
6705 stmt_vector_for_cost *cost_vec,
6706 hash_set<stmt_vec_info> &vectorized_scalar_stmts,
6707 hash_set<slp_tree> &visited)
6708 {
6709 unsigned i;
6710 stmt_vec_info stmt_info;
6711 slp_tree child;
6712
6713 if (visited.add (node))
6714 return;
6715
6716 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
6717 {
6718 ssa_op_iter op_iter;
6719 def_operand_p def_p;
6720
6721 if ((*life)[i])
6722 continue;
6723
6724 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
6725 gimple *orig_stmt = orig_stmt_info->stmt;
6726
6727 /* If there is a non-vectorized use of the defs then the scalar
6728 stmt is kept live in which case we do not account it or any
6729 required defs in the SLP children in the scalar cost. This
6730 way we make the vectorization more costly when compared to
6731 the scalar cost. */
6732 if (!STMT_VINFO_LIVE_P (stmt_info))
6733 {
6734 auto_vec<gimple *, 8> worklist;
6735 hash_set<gimple *> *worklist_visited = NULL;
6736 worklist.quick_push (orig_stmt);
6737 do
6738 {
6739 gimple *work_stmt = worklist.pop ();
6740 FOR_EACH_PHI_OR_STMT_DEF (def_p, work_stmt, op_iter, SSA_OP_DEF)
6741 {
6742 imm_use_iterator use_iter;
6743 gimple *use_stmt;
6744 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter,
6745 DEF_FROM_PTR (def_p))
6746 if (!is_gimple_debug (use_stmt))
6747 {
6748 stmt_vec_info use_stmt_info
6749 = vinfo->lookup_stmt (use_stmt);
6750 if (!use_stmt_info
6751 || !vectorized_scalar_stmts.contains (use_stmt_info))
6752 {
6753 if (use_stmt_info
6754 && STMT_VINFO_IN_PATTERN_P (use_stmt_info))
6755 {
6756 /* For stmts participating in patterns we have
6757 to check their uses recursively. */
6758 if (!worklist_visited)
6759 worklist_visited = new hash_set<gimple *> ();
6760 if (!worklist_visited->add (use_stmt))
6761 worklist.safe_push (use_stmt);
6762 continue;
6763 }
6764 (*life)[i] = true;
6765 goto next_lane;
6766 }
6767 }
6768 }
6769 }
6770 while (!worklist.is_empty ());
6771 next_lane:
6772 if (worklist_visited)
6773 delete worklist_visited;
6774 if ((*life)[i])
6775 continue;
6776 }
6777
6778 /* Count scalar stmts only once. */
6779 if (gimple_visited_p (orig_stmt))
6780 continue;
6781 gimple_set_visited (orig_stmt, true);
6782
6783 vect_cost_for_stmt kind;
6784 if (STMT_VINFO_DATA_REF (orig_stmt_info))
6785 {
6786 if (DR_IS_READ (STMT_VINFO_DATA_REF (orig_stmt_info)))
6787 kind = scalar_load;
6788 else
6789 kind = scalar_store;
6790 }
6791 else if (vect_nop_conversion_p (orig_stmt_info))
6792 continue;
6793 /* For single-argument PHIs assume coalescing which means zero cost
6794 for the scalar and the vector PHIs. This avoids artificially
6795 favoring the vector path (but may pessimize it in some cases). */
6796 else if (is_a <gphi *> (orig_stmt_info->stmt)
6797 && gimple_phi_num_args
6798 (as_a <gphi *> (orig_stmt_info->stmt)) == 1)
6799 continue;
6800 else
6801 kind = scalar_stmt;
6802 record_stmt_cost (cost_vec, 1, kind, orig_stmt_info,
6803 SLP_TREE_VECTYPE (node), 0, vect_body);
6804 }
6805
6806 auto_vec<bool, 20> subtree_life;
6807 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6808 {
6809 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
6810 {
6811 /* Do not directly pass LIFE to the recursive call, copy it to
6812 confine changes in the callee to the current child/subtree. */
6813 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
6814 {
6815 subtree_life.safe_grow_cleared (SLP_TREE_LANES (child), true);
6816 for (unsigned j = 0;
6817 j < SLP_TREE_LANE_PERMUTATION (node).length (); ++j)
6818 {
6819 auto perm = SLP_TREE_LANE_PERMUTATION (node)[j];
6820 if (perm.first == i)
6821 subtree_life[perm.second] = (*life)[j];
6822 }
6823 }
6824 else
6825 {
6826 gcc_assert (SLP_TREE_LANES (node) == SLP_TREE_LANES (child));
6827 subtree_life.safe_splice (*life);
6828 }
6829 vect_bb_slp_scalar_cost (vinfo, child, &subtree_life, cost_vec,
6830 vectorized_scalar_stmts, visited);
6831 subtree_life.truncate (0);
6832 }
6833 }
6834 }
6835
6836 /* Comparator for the loop-index sorted cost vectors. */
6837
6838 static int
6839 li_cost_vec_cmp (const void *a_, const void *b_)
6840 {
6841 auto *a = (const std::pair<unsigned, stmt_info_for_cost *> *)a_;
6842 auto *b = (const std::pair<unsigned, stmt_info_for_cost *> *)b_;
6843 if (a->first < b->first)
6844 return -1;
6845 else if (a->first == b->first)
6846 return 0;
6847 return 1;
6848 }
6849
6850 /* Check if vectorization of the basic block is profitable for the
6851 subgraph denoted by SLP_INSTANCES. */
6852
6853 static bool
6854 vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo,
6855 vec<slp_instance> slp_instances,
6856 loop_p orig_loop)
6857 {
6858 slp_instance instance;
6859 int i;
6860 unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
6861 unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;
6862
6863 if (dump_enabled_p ())
6864 {
6865 dump_printf_loc (MSG_NOTE, vect_location, "Costing subgraph: \n");
6866 hash_set<slp_tree> visited;
6867 FOR_EACH_VEC_ELT (slp_instances, i, instance)
6868 vect_print_slp_graph (MSG_NOTE, vect_location,
6869 SLP_INSTANCE_TREE (instance), visited);
6870 }
6871
6872 /* Compute the set of scalar stmts we know will go away 'locally' when
6873 vectorizing. This used to be tracked with just PURE_SLP_STMT but that's
6874 not accurate for nodes promoted extern late or for scalar stmts that
6875 are used both in extern defs and in vectorized defs. */
6876 hash_set<stmt_vec_info> vectorized_scalar_stmts;
6877 hash_set<stmt_vec_info> scalar_stmts_in_externs;
6878 hash_set<slp_tree> visited;
6879 FOR_EACH_VEC_ELT (slp_instances, i, instance)
6880 {
6881 vect_slp_gather_vectorized_scalar_stmts (bb_vinfo,
6882 SLP_INSTANCE_TREE (instance),
6883 visited,
6884 vectorized_scalar_stmts,
6885 scalar_stmts_in_externs);
6886 for (stmt_vec_info rstmt : SLP_INSTANCE_ROOT_STMTS (instance))
6887 vectorized_scalar_stmts.add (rstmt);
6888 }
6889 /* Scalar stmts used as defs in external nodes need to be preserved, so
6890 remove them from vectorized_scalar_stmts. */
6891 for (stmt_vec_info stmt : scalar_stmts_in_externs)
6892 vectorized_scalar_stmts.remove (stmt);
6893
6894 /* Calculate scalar cost and sum the cost for the vector stmts
6895 previously collected. */
6896 stmt_vector_for_cost scalar_costs = vNULL;
6897 stmt_vector_for_cost vector_costs = vNULL;
6898 visited.empty ();
6899 FOR_EACH_VEC_ELT (slp_instances, i, instance)
6900 {
6901 auto_vec<bool, 20> life;
6902 life.safe_grow_cleared (SLP_TREE_LANES (SLP_INSTANCE_TREE (instance)),
6903 true);
6904 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
6905 record_stmt_cost (&scalar_costs,
6906 SLP_INSTANCE_ROOT_STMTS (instance).length (),
6907 scalar_stmt,
6908 SLP_INSTANCE_ROOT_STMTS (instance)[0], 0, vect_body);
6909 vect_bb_slp_scalar_cost (bb_vinfo,
6910 SLP_INSTANCE_TREE (instance),
6911 &life, &scalar_costs, vectorized_scalar_stmts,
6912 visited);
6913 vector_costs.safe_splice (instance->cost_vec);
6914 instance->cost_vec.release ();
6915 }
6916
6917 if (dump_enabled_p ())
6918 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
6919
6920 /* When costing non-loop vectorization we need to consider each covered
6921 loop independently and make sure vectorization is profitable. For
6922 now we assume a loop may not be entered or may execute an arbitrary
6923 number of iterations (??? static information can provide more
6924 precise info here), which means we can simply cost each containing
6925 loop's stmts separately. */
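/* For example, if a subgraph spans stmts in an outer loop as well as stmts
in a loop nested within it, each loop's stmts are costed and compared
against their scalar counterparts independently; a single unprofitable
part makes the whole subgraph unprofitable. */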
6926
6927 /* First produce cost vectors sorted by loop index. */
6928 auto_vec<std::pair<unsigned, stmt_info_for_cost *> >
6929 li_scalar_costs (scalar_costs.length ());
6930 auto_vec<std::pair<unsigned, stmt_info_for_cost *> >
6931 li_vector_costs (vector_costs.length ());
6932 stmt_info_for_cost *cost;
6933 FOR_EACH_VEC_ELT (scalar_costs, i, cost)
6934 {
6935 unsigned l = gimple_bb (cost->stmt_info->stmt)->loop_father->num;
6936 li_scalar_costs.quick_push (std::make_pair (l, cost));
6937 }
6938 /* Use an arbitrary used loop as fallback in case the first vector_costs
6939 entry does not have a stmt_info associated with it. */
6940 unsigned l = li_scalar_costs[0].first;
6941 FOR_EACH_VEC_ELT (vector_costs, i, cost)
6942 {
6943 /* We inherit the loop from the previous COST; invariants, externals and
6944 extracts immediately follow the cost for the related stmt. */
6945 if (cost->stmt_info)
6946 l = gimple_bb (cost->stmt_info->stmt)->loop_father->num;
6947 li_vector_costs.quick_push (std::make_pair (l, cost));
6948 }
6949 li_scalar_costs.qsort (li_cost_vec_cmp);
6950 li_vector_costs.qsort (li_cost_vec_cmp);
6951
6952 /* Now cost the portions individually. */
6953 unsigned vi = 0;
6954 unsigned si = 0;
6955 bool profitable = true;
6956 while (si < li_scalar_costs.length ()
6957 && vi < li_vector_costs.length ())
6958 {
6959 unsigned sl = li_scalar_costs[si].first;
6960 unsigned vl = li_vector_costs[vi].first;
6961 if (sl != vl)
6962 {
6963 if (dump_enabled_p ())
6964 dump_printf_loc (MSG_NOTE, vect_location,
6965 "Scalar %d and vector %d loop part do not "
6966 "match up, skipping scalar part\n", sl, vl);
6967 /* Skip the scalar part, assuming zero cost on the vector side. */
6968 do
6969 {
6970 si++;
6971 }
6972 while (si < li_scalar_costs.length ()
6973 && li_scalar_costs[si].first == sl);
6974 continue;
6975 }
6976
6977 class vector_costs *scalar_target_cost_data = init_cost (bb_vinfo, true);
6978 do
6979 {
6980 add_stmt_cost (scalar_target_cost_data, li_scalar_costs[si].second);
6981 si++;
6982 }
6983 while (si < li_scalar_costs.length ()
6984 && li_scalar_costs[si].first == sl);
6985 unsigned dummy;
6986 finish_cost (scalar_target_cost_data, nullptr,
6987 &dummy, &scalar_cost, &dummy);
6988
6989 /* Complete the target-specific vector cost calculation. */
6990 class vector_costs *vect_target_cost_data = init_cost (bb_vinfo, false);
6991 do
6992 {
6993 add_stmt_cost (vect_target_cost_data, li_vector_costs[vi].second);
6994 vi++;
6995 }
6996 while (vi < li_vector_costs.length ()
6997 && li_vector_costs[vi].first == vl);
6998 finish_cost (vect_target_cost_data, scalar_target_cost_data,
6999 &vec_prologue_cost, &vec_inside_cost, &vec_epilogue_cost);
7000 delete scalar_target_cost_data;
7001 delete vect_target_cost_data;
7002
7003 vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;
7004
7005 if (dump_enabled_p ())
7006 {
7007 dump_printf_loc (MSG_NOTE, vect_location,
7008 "Cost model analysis for part in loop %d:\n", sl);
7009 dump_printf (MSG_NOTE, " Vector cost: %d\n",
7010 vec_inside_cost + vec_outside_cost);
7011 dump_printf (MSG_NOTE, " Scalar cost: %d\n", scalar_cost);
7012 }
7013
7014 /* Vectorization is profitable if its cost is no more than the cost of the
7015 scalar version. Note that we err on the vector side for equal cost because
7016 the cost estimate is otherwise quite pessimistic (constant uses are
7017 free on the scalar side but cost a load on the vector side for
7018 example). */
7019 if (vec_outside_cost + vec_inside_cost > scalar_cost)
7020 {
7021 profitable = false;
7022 break;
7023 }
7024 }
7025 if (profitable && vi < li_vector_costs.length ())
7026 {
7027 if (dump_enabled_p ())
7028 dump_printf_loc (MSG_NOTE, vect_location,
7029 "Excess vector cost for part in loop %d:\n",
7030 li_vector_costs[vi].first);
7031 profitable = false;
7032 }
7033
7034 /* Unset visited flag. This is delayed when the subgraph is profitable
7035 and we process the loop for remaining unvectorized if-converted code. */
7036 if (!orig_loop || !profitable)
7037 FOR_EACH_VEC_ELT (scalar_costs, i, cost)
7038 gimple_set_visited (cost->stmt_info->stmt, false);
7039
7040 scalar_costs.release ();
7041 vector_costs.release ();
7042
7043 return profitable;
7044 }
7045
7046 /* qsort comparator for lane defs. */
7047
7048 static int
7049 vld_cmp (const void *a_, const void *b_)
7050 {
7051 auto *a = (const std::pair<unsigned, tree> *)a_;
7052 auto *b = (const std::pair<unsigned, tree> *)b_;
7053 return a->first - b->first;
7054 }
7055
7056 /* Return true if USE_STMT is a vector lane insert into VEC and set
7057 *THIS_LANE to the lane number that is set. */
7058
7059 static bool
7060 vect_slp_is_lane_insert (gimple *use_stmt, tree vec, unsigned *this_lane)
7061 {
7062 gassign *use_ass = dyn_cast <gassign *> (use_stmt);
7063 if (!use_ass
7064 || gimple_assign_rhs_code (use_ass) != BIT_INSERT_EXPR
7065 || (vec
7066 ? gimple_assign_rhs1 (use_ass) != vec
7067 : ((vec = gimple_assign_rhs1 (use_ass)), false))
7068 || !useless_type_conversion_p (TREE_TYPE (TREE_TYPE (vec)),
7069 TREE_TYPE (gimple_assign_rhs2 (use_ass)))
7070 || !constant_multiple_p
7071 (tree_to_poly_uint64 (gimple_assign_rhs3 (use_ass)),
7072 tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (TREE_TYPE (vec)))),
7073 this_lane))
7074 return false;
7075 return true;
7076 }
7077
7078 /* Find any vectorizable constructors, lane-insert chains and reduction
7079 chains in the region and register them as SLP roots in BB_VINFO. */
7080
7081 static void
7082 vect_slp_check_for_roots (bb_vec_info bb_vinfo)
7083 {
7084 for (unsigned i = 0; i < bb_vinfo->bbs.length (); ++i)
7085 for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[i]);
7086 !gsi_end_p (gsi); gsi_next (&gsi))
7087 {
7088 gassign *assign = dyn_cast<gassign *> (gsi_stmt (gsi));
7089 if (!assign)
7090 continue;
7091
7092 tree rhs = gimple_assign_rhs1 (assign);
7093 enum tree_code code = gimple_assign_rhs_code (assign);
7094 use_operand_p use_p;
7095 gimple *use_stmt;
7096 if (code == CONSTRUCTOR)
7097 {
7098 if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
7099 || maybe_ne (TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)),
7100 CONSTRUCTOR_NELTS (rhs))
7101 || VECTOR_TYPE_P (TREE_TYPE (CONSTRUCTOR_ELT (rhs, 0)->value))
7102 || uniform_vector_p (rhs))
7103 continue;
7104
7105 unsigned j;
7106 tree val;
7107 FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), j, val)
7108 if (TREE_CODE (val) != SSA_NAME
7109 || !bb_vinfo->lookup_def (val))
7110 break;
7111 if (j != CONSTRUCTOR_NELTS (rhs))
7112 continue;
7113
7114 vec<stmt_vec_info> roots = vNULL;
7115 roots.safe_push (bb_vinfo->lookup_stmt (assign));
7116 vec<stmt_vec_info> stmts;
7117 stmts.create (CONSTRUCTOR_NELTS (rhs));
7118 FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), j, val)
7119 stmts.quick_push
7120 (vect_stmt_to_vectorize (bb_vinfo->lookup_def (val)));
7121 bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_ctor,
7122 stmts, roots));
7123 }
7124 else if (code == BIT_INSERT_EXPR
7125 && VECTOR_TYPE_P (TREE_TYPE (rhs))
7126 && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).is_constant ()
7127 && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).to_constant () > 1
7128 && integer_zerop (gimple_assign_rhs3 (assign))
7129 && useless_type_conversion_p
7130 (TREE_TYPE (TREE_TYPE (rhs)),
7131 TREE_TYPE (gimple_assign_rhs2 (assign)))
7132 && bb_vinfo->lookup_def (gimple_assign_rhs2 (assign)))
7133 {
7134 /* We start matching at an insert to lane zero, but since the
7135 inserts need not be ordered we have to search both
7136 the def and the use chains. */
7137 tree vectype = TREE_TYPE (rhs);
7138 unsigned nlanes = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
7139 auto_vec<std::pair<unsigned, tree> > lane_defs (nlanes);
7140 auto_sbitmap lanes (nlanes);
7141 bitmap_clear (lanes);
7142 bitmap_set_bit (lanes, 0);
7143 tree def = gimple_assign_lhs (assign);
7144 lane_defs.quick_push
7145 (std::make_pair (0, gimple_assign_rhs2 (assign)));
7146 unsigned lanes_found = 1;
7147 /* Start with the use chains; the last stmt will be the root. */
7148 stmt_vec_info last = bb_vinfo->lookup_stmt (assign);
7149 vec<stmt_vec_info> roots = vNULL;
7150 roots.safe_push (last);
7151 do
7152 {
7153 use_operand_p use_p;
7154 gimple *use_stmt;
7155 if (!single_imm_use (def, &use_p, &use_stmt))
7156 break;
7157 unsigned this_lane;
7158 if (!bb_vinfo->lookup_stmt (use_stmt)
7159 || !vect_slp_is_lane_insert (use_stmt, def, &this_lane)
7160 || !bb_vinfo->lookup_def (gimple_assign_rhs2 (use_stmt)))
7161 break;
7162 if (bitmap_bit_p (lanes, this_lane))
7163 break;
7164 lanes_found++;
7165 bitmap_set_bit (lanes, this_lane);
7166 gassign *use_ass = as_a <gassign *> (use_stmt);
7167 lane_defs.quick_push (std::make_pair
7168 (this_lane, gimple_assign_rhs2 (use_ass)));
7169 last = bb_vinfo->lookup_stmt (use_ass);
7170 roots.safe_push (last);
7171 def = gimple_assign_lhs (use_ass);
7172 }
7173 while (lanes_found < nlanes);
7174 if (roots.length () > 1)
7175 std::swap(roots[0], roots[roots.length () - 1]);
7176 if (lanes_found < nlanes)
7177 {
7178 /* Now search the def chain. */
7179 def = gimple_assign_rhs1 (assign);
7180 do
7181 {
7182 if (TREE_CODE (def) != SSA_NAME
7183 || !has_single_use (def))
7184 break;
7185 gimple *def_stmt = SSA_NAME_DEF_STMT (def);
7186 unsigned this_lane;
7187 if (!bb_vinfo->lookup_stmt (def_stmt)
7188 || !vect_slp_is_lane_insert (def_stmt,
7189 NULL_TREE, &this_lane)
7190 || !bb_vinfo->lookup_def (gimple_assign_rhs2 (def_stmt)))
7191 break;
7192 if (bitmap_bit_p (lanes, this_lane))
7193 break;
7194 lanes_found++;
7195 bitmap_set_bit (lanes, this_lane);
7196 lane_defs.quick_push (std::make_pair
7197 (this_lane,
7198 gimple_assign_rhs2 (def_stmt)));
7199 roots.safe_push (bb_vinfo->lookup_stmt (def_stmt));
7200 def = gimple_assign_rhs1 (def_stmt);
7201 }
7202 while (lanes_found < nlanes);
7203 }
7204 if (lanes_found == nlanes)
7205 {
7206 /* Sort lane_defs by lane index and register the root. */
7207 lane_defs.qsort (vld_cmp);
7208 vec<stmt_vec_info> stmts;
7209 stmts.create (nlanes);
7210 for (unsigned i = 0; i < nlanes; ++i)
7211 stmts.quick_push (bb_vinfo->lookup_def (lane_defs[i].second));
7212 bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_ctor,
7213 stmts, roots));
7214 }
7215 else
7216 roots.release ();
7217 }
7218 else if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
7219 && (associative_tree_code (code) || code == MINUS_EXPR)
7220 /* ??? This pessimizes a two-element reduction. PR54400.
7221 ??? In-order reduction could be handled if we only
7222 traverse one operand chain in vect_slp_linearize_chain. */
7223 && !needs_fold_left_reduction_p (TREE_TYPE (rhs), code)
7224 /* Ops with constants at the tail can be stripped here. */
7225 && TREE_CODE (rhs) == SSA_NAME
7226 && TREE_CODE (gimple_assign_rhs2 (assign)) == SSA_NAME
7227 /* Should be the chain end. */
7228 && (!single_imm_use (gimple_assign_lhs (assign),
7229 &use_p, &use_stmt)
7230 || !is_gimple_assign (use_stmt)
7231 || (gimple_assign_rhs_code (use_stmt) != code
7232 && ((code != PLUS_EXPR && code != MINUS_EXPR)
7233 || (gimple_assign_rhs_code (use_stmt)
7234 != (code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR))))))
7235 {
7236 /* We start the match at the end of a possible association
7237 chain. */
7238 auto_vec<chain_op_t> chain;
7239 auto_vec<std::pair<tree_code, gimple *> > worklist;
7240 auto_vec<gimple *> chain_stmts;
7241 gimple *code_stmt = NULL, *alt_code_stmt = NULL;
7242 if (code == MINUS_EXPR)
7243 code = PLUS_EXPR;
7244 internal_fn reduc_fn;
7245 if (!reduction_fn_for_scalar_code (code, &reduc_fn)
7246 || reduc_fn == IFN_LAST)
7247 continue;
7248 vect_slp_linearize_chain (bb_vinfo, worklist, chain, code, assign,
7249 /* ??? */
7250 code_stmt, alt_code_stmt, &chain_stmts);
7251 if (chain.length () > 1)
7252 {
7253 /* Sort the chain according to def_type and operation. */
7254 chain.sort (dt_sort_cmp, bb_vinfo);
7255 /* ??? Now we'd want to strip externals and constants
7256 but record those to be handled in the epilogue. */
7257 /* ??? For now do not allow mixing ops or externs/constants. */
7258 bool invalid = false;
7259 unsigned remain_cnt = 0;
7260 for (unsigned i = 0; i < chain.length (); ++i)
7261 {
7262 if (chain[i].code != code)
7263 {
7264 invalid = true;
7265 break;
7266 }
7267 if (chain[i].dt != vect_internal_def)
7268 remain_cnt++;
7269 }
7270 if (!invalid && chain.length () - remain_cnt > 1)
7271 {
7272 vec<stmt_vec_info> stmts;
7273 vec<tree> remain = vNULL;
7274 stmts.create (chain.length ());
7275 if (remain_cnt > 0)
7276 remain.create (remain_cnt);
7277 for (unsigned i = 0; i < chain.length (); ++i)
7278 {
7279 if (chain[i].dt == vect_internal_def)
7280 stmts.quick_push (bb_vinfo->lookup_def (chain[i].op));
7281 else
7282 remain.quick_push (chain[i].op);
7283 }
7284 vec<stmt_vec_info> roots;
7285 roots.create (chain_stmts.length ());
7286 for (unsigned i = 0; i < chain_stmts.length (); ++i)
7287 roots.quick_push (bb_vinfo->lookup_stmt (chain_stmts[i]));
7288 bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_bb_reduc,
7289 stmts, roots, remain));
7290 }
7291 }
7292 }
7293 }
7294 }
7295
7296 /* Walk the grouped store chains and replace entries with their
7297 pattern variant if any. */
7298
7299 static void
7300 vect_fixup_store_groups_with_patterns (vec_info *vinfo)
7301 {
7302 stmt_vec_info first_element;
7303 unsigned i;
7304
7305 FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
7306 {
7307 /* We also have CTORs in this array. */
7308 if (!STMT_VINFO_GROUPED_ACCESS (first_element))
7309 continue;
7310 if (STMT_VINFO_IN_PATTERN_P (first_element))
7311 {
7312 stmt_vec_info orig = first_element;
7313 first_element = STMT_VINFO_RELATED_STMT (first_element);
7314 DR_GROUP_FIRST_ELEMENT (first_element) = first_element;
7315 DR_GROUP_SIZE (first_element) = DR_GROUP_SIZE (orig);
7316 DR_GROUP_GAP (first_element) = DR_GROUP_GAP (orig);
7317 DR_GROUP_NEXT_ELEMENT (first_element) = DR_GROUP_NEXT_ELEMENT (orig);
7318 vinfo->grouped_stores[i] = first_element;
7319 }
7320 stmt_vec_info prev = first_element;
7321 while (DR_GROUP_NEXT_ELEMENT (prev))
7322 {
7323 stmt_vec_info elt = DR_GROUP_NEXT_ELEMENT (prev);
7324 if (STMT_VINFO_IN_PATTERN_P (elt))
7325 {
7326 stmt_vec_info orig = elt;
7327 elt = STMT_VINFO_RELATED_STMT (elt);
7328 DR_GROUP_NEXT_ELEMENT (prev) = elt;
7329 DR_GROUP_GAP (elt) = DR_GROUP_GAP (orig);
7330 DR_GROUP_NEXT_ELEMENT (elt) = DR_GROUP_NEXT_ELEMENT (orig);
7331 }
7332 DR_GROUP_FIRST_ELEMENT (elt) = first_element;
7333 prev = elt;
7334 }
7335 }
7336 }
7337
7338 /* Check if the region described by BB_VINFO can be vectorized, returning
7339 true if so. When returning false, set FATAL to true if the same failure
7340 would prevent vectorization at other vector sizes, false if it is still
7341 worth trying other sizes. N_STMTS is the number of statements in the
7342 region. */
7343
7344 static bool
7345 vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal,
7346 vec<int> *dataref_groups)
7347 {
7348 DUMP_VECT_SCOPE ("vect_slp_analyze_bb");
7349
7350 slp_instance instance;
7351 int i;
7352 poly_uint64 min_vf = 2;
7353
7354 /* The first group of checks is independent of the vector size. */
7355 fatal = true;
7356
7357 /* Analyze the data references. */
7358
7359 if (!vect_analyze_data_refs (bb_vinfo, &min_vf, NULL))
7360 {
7361 if (dump_enabled_p ())
7362 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7363 "not vectorized: unhandled data-ref in basic "
7364 "block.\n");
7365 return false;
7366 }
7367
7368 if (!vect_analyze_data_ref_accesses (bb_vinfo, dataref_groups))
7369 {
7370 if (dump_enabled_p ())
7371 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7372 "not vectorized: unhandled data access in "
7373 "basic block.\n");
7374 return false;
7375 }
7376
7377 vect_slp_check_for_roots (bb_vinfo);
7378
7379 /* If there are no grouped stores and no constructors in the region
7380 there is no need to continue with pattern recog as vect_analyze_slp
7381 will fail anyway. */
7382 if (bb_vinfo->grouped_stores.is_empty ()
7383 && bb_vinfo->roots.is_empty ())
7384 {
7385 if (dump_enabled_p ())
7386 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7387 "not vectorized: no grouped stores in "
7388 "basic block.\n");
7389 return false;
7390 }
7391
7392 /* The rest of the analysis below depends on the vector size in some way. */
7393 fatal = false;
7394
7395 vect_pattern_recog (bb_vinfo);
7396
7397 /* Update store groups from pattern processing. */
7398 vect_fixup_store_groups_with_patterns (bb_vinfo);
7399
7400 /* Check the SLP opportunities in the basic block, analyze and build SLP
7401 trees. */
7402 if (!vect_analyze_slp (bb_vinfo, n_stmts))
7403 {
7404 if (dump_enabled_p ())
7405 {
7406 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7407 "Failed to SLP the basic block.\n");
7408 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7409 "not vectorized: failed to find SLP opportunities "
7410 "in basic block.\n");
7411 }
7412 return false;
7413 }
7414
7415 /* Optimize permutations. */
7416 vect_optimize_slp (bb_vinfo);
7417
7418 /* Gather the loads reachable from the SLP graph entries. */
7419 vect_gather_slp_loads (bb_vinfo);
7420
7421 vect_record_base_alignments (bb_vinfo);
7422
7423 /* Analyze and verify the alignment of data references and the
7424 dependence in the SLP instances. */
7425 for (i = 0; BB_VINFO_SLP_INSTANCES (bb_vinfo).iterate (i, &instance); )
7426 {
7427 vect_location = instance->location ();
7428 if (! vect_slp_analyze_instance_alignment (bb_vinfo, instance)
7429 || ! vect_slp_analyze_instance_dependence (bb_vinfo, instance))
7430 {
7431 slp_tree node = SLP_INSTANCE_TREE (instance);
7432 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
7433 if (dump_enabled_p ())
7434 dump_printf_loc (MSG_NOTE, vect_location,
7435 "removing SLP instance operations starting from: %G",
7436 stmt_info->stmt);
7437 vect_free_slp_instance (instance);
7438 BB_VINFO_SLP_INSTANCES (bb_vinfo).ordered_remove (i);
7439 continue;
7440 }
7441
7442 /* Mark all the statements that we want to vectorize as pure SLP and
7443 relevant. */
7444 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
7445 vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));
7446 unsigned j;
7447 stmt_vec_info root;
7448 /* Likewise consider instance root stmts as vectorized. */
7449 FOR_EACH_VEC_ELT (SLP_INSTANCE_ROOT_STMTS (instance), j, root)
7450 STMT_SLP_TYPE (root) = pure_slp;
7451
7452 i++;
7453 }
7454 if (! BB_VINFO_SLP_INSTANCES (bb_vinfo).length ())
7455 return false;
7456
7457 if (!vect_slp_analyze_operations (bb_vinfo))
7458 {
7459 if (dump_enabled_p ())
7460 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7461 "not vectorized: bad operation in basic block.\n");
7462 return false;
7463 }
7464
7465 vect_bb_partition_graph (bb_vinfo);
7466
7467 return true;
7468 }
7469
7470 /* Subroutine of vect_slp_bb. Try to vectorize the statements for all
7471 basic blocks in BBS, returning true on success.
7472 The region has N_STMTS statements and has the datarefs given by DATAREFS. */
7473
7474 static bool
7475 vect_slp_region (vec<basic_block> bbs, vec<data_reference_p> datarefs,
7476 vec<int> *dataref_groups, unsigned int n_stmts,
7477 loop_p orig_loop)
7478 {
7479 bb_vec_info bb_vinfo;
7480 auto_vector_modes vector_modes;
7481
7482 /* Autodetect first vector size we try. */
7483 machine_mode next_vector_mode = VOIDmode;
7484 targetm.vectorize.autovectorize_vector_modes (&vector_modes, false);
7485 unsigned int mode_i = 0;
7486
7487 vec_info_shared shared;
7488
7489 machine_mode autodetected_vector_mode = VOIDmode;
7490 while (1)
7491 {
7492 bool vectorized = false;
7493 bool fatal = false;
7494 bb_vinfo = new _bb_vec_info (bbs, &shared);
7495
7496 bool first_time_p = shared.datarefs.is_empty ();
7497 BB_VINFO_DATAREFS (bb_vinfo) = datarefs;
7498 if (first_time_p)
7499 bb_vinfo->shared->save_datarefs ();
7500 else
7501 bb_vinfo->shared->check_datarefs ();
7502 bb_vinfo->vector_mode = next_vector_mode;
7503
7504 if (vect_slp_analyze_bb_1 (bb_vinfo, n_stmts, fatal, dataref_groups))
7505 {
7506 if (dump_enabled_p ())
7507 {
7508 dump_printf_loc (MSG_NOTE, vect_location,
7509 "***** Analysis succeeded with vector mode"
7510 " %s\n", GET_MODE_NAME (bb_vinfo->vector_mode));
7511 dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB part\n");
7512 }
7513
7514 bb_vinfo->shared->check_datarefs ();
7515
7516 auto_vec<slp_instance> profitable_subgraphs;
7517 for (slp_instance instance : BB_VINFO_SLP_INSTANCES (bb_vinfo))
7518 {
7519 if (instance->subgraph_entries.is_empty ())
7520 continue;
7521
7522 dump_user_location_t saved_vect_location = vect_location;
7523 vect_location = instance->location ();
7524 if (!unlimited_cost_model (NULL)
7525 && !vect_bb_vectorization_profitable_p
7526 (bb_vinfo, instance->subgraph_entries, orig_loop))
7527 {
7528 if (dump_enabled_p ())
7529 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7530 "not vectorized: vectorization is not "
7531 "profitable.\n");
7532 vect_location = saved_vect_location;
7533 continue;
7534 }
7535
7536 vect_location = saved_vect_location;
7537 if (!dbg_cnt (vect_slp))
7538 continue;
7539
7540 profitable_subgraphs.safe_push (instance);
7541 }
7542
7543 /* When we're vectorizing an if-converted loop body make sure
7544 we vectorized all if-converted code. */
7545 if (!profitable_subgraphs.is_empty ()
7546 && orig_loop)
7547 {
7548 gcc_assert (bb_vinfo->bbs.length () == 1);
7549 for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[0]);
7550 !gsi_end_p (gsi); gsi_next (&gsi))
7551 {
7552 /* The costing above left us with DCEable vectorized scalar
7553 stmts having the visited flag set on profitable
7554 subgraphs. Do the delayed clearing of the flag here. */
7555 if (gimple_visited_p (gsi_stmt (gsi)))
7556 {
7557 gimple_set_visited (gsi_stmt (gsi), false);
7558 continue;
7559 }
7560 if (flag_vect_cost_model == VECT_COST_MODEL_UNLIMITED)
7561 continue;
7562
7563 if (gassign *ass = dyn_cast <gassign *> (gsi_stmt (gsi)))
7564 if (gimple_assign_rhs_code (ass) == COND_EXPR)
7565 {
7566 if (!profitable_subgraphs.is_empty ()
7567 && dump_enabled_p ())
7568 dump_printf_loc (MSG_NOTE, vect_location,
7569 "not profitable because of "
7570 "unprofitable if-converted scalar "
7571 "code\n");
7572 profitable_subgraphs.truncate (0);
7573 }
7574 }
7575 }
7576
7577 /* Finally schedule the profitable subgraphs. */
7578 for (slp_instance instance : profitable_subgraphs)
7579 {
7580 if (!vectorized && dump_enabled_p ())
7581 dump_printf_loc (MSG_NOTE, vect_location,
7582 "Basic block will be vectorized "
7583 "using SLP\n");
7584 vectorized = true;
7585
7586 /* Dump before scheduling as store vectorization will remove
7587 the original stores and mess with the instance tree
7588 so querying its location will eventually ICE. */
7589 if (flag_checking)
7590 for (slp_instance sub : instance->subgraph_entries)
7591 gcc_assert (SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (sub)));
7592 unsigned HOST_WIDE_INT bytes;
7593 if (dump_enabled_p ())
7594 for (slp_instance sub : instance->subgraph_entries)
7595 {
7596 tree vtype = SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (sub));
7597 if (GET_MODE_SIZE (TYPE_MODE (vtype)).is_constant (&bytes))
7598 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS,
7599 sub->location (),
7600 "basic block part vectorized using %wu "
7601 "byte vectors\n", bytes);
7602 else
7603 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS,
7604 sub->location (),
7605 "basic block part vectorized using "
7606 "variable length vectors\n");
7607 }
7608
7609 dump_user_location_t saved_vect_location = vect_location;
7610 vect_location = instance->location ();
7611
7612 vect_schedule_slp (bb_vinfo, instance->subgraph_entries);
7613
7614 vect_location = saved_vect_location;
7615 }
7616 }
7617 else
7618 {
7619 if (dump_enabled_p ())
7620 dump_printf_loc (MSG_NOTE, vect_location,
7621 "***** Analysis failed with vector mode %s\n",
7622 GET_MODE_NAME (bb_vinfo->vector_mode));
7623 }
7624
7625 if (mode_i == 0)
7626 autodetected_vector_mode = bb_vinfo->vector_mode;
7627
7628 if (!fatal)
7629 while (mode_i < vector_modes.length ()
7630 && vect_chooses_same_modes_p (bb_vinfo, vector_modes[mode_i]))
7631 {
7632 if (dump_enabled_p ())
7633 dump_printf_loc (MSG_NOTE, vect_location,
7634 "***** The result for vector mode %s would"
7635 " be the same\n",
7636 GET_MODE_NAME (vector_modes[mode_i]));
7637 mode_i += 1;
7638 }
7639
7640 delete bb_vinfo;
7641
7642 if (mode_i < vector_modes.length ()
7643 && VECTOR_MODE_P (autodetected_vector_mode)
7644 && (related_vector_mode (vector_modes[mode_i],
7645 GET_MODE_INNER (autodetected_vector_mode))
7646 == autodetected_vector_mode)
7647 && (related_vector_mode (autodetected_vector_mode,
7648 GET_MODE_INNER (vector_modes[mode_i]))
7649 == vector_modes[mode_i]))
7650 {
7651 if (dump_enabled_p ())
7652 dump_printf_loc (MSG_NOTE, vect_location,
7653 "***** Skipping vector mode %s, which would"
7654 " repeat the analysis for %s\n",
7655 GET_MODE_NAME (vector_modes[mode_i]),
7656 GET_MODE_NAME (autodetected_vector_mode));
7657 mode_i += 1;
7658 }
7659
7660 if (vectorized
7661 || mode_i == vector_modes.length ()
7662 || autodetected_vector_mode == VOIDmode
7663 /* If vect_slp_analyze_bb_1 signaled that analysis for all
7664 vector sizes will fail do not bother iterating. */
7665 || fatal)
7666 return vectorized;
7667
7668 /* Try the next biggest vector size. */
7669 next_vector_mode = vector_modes[mode_i++];
7670 if (dump_enabled_p ())
7671 dump_printf_loc (MSG_NOTE, vect_location,
7672 "***** Re-trying analysis with vector mode %s\n",
7673 GET_MODE_NAME (next_vector_mode));
7674 }
7675 }
7676
7677
7678 /* Main entry for the BB vectorizer. Analyze and transform BBS, returning
7679 true if anything in the basic blocks was vectorized. */
7680
7681 static bool
7682 vect_slp_bbs (const vec<basic_block> &bbs, loop_p orig_loop)
7683 {
7684 vec<data_reference_p> datarefs = vNULL;
7685 auto_vec<int> dataref_groups;
7686 int insns = 0;
7687 int current_group = 0;
7688
7689 for (unsigned i = 0; i < bbs.length (); i++)
7690 {
7691 basic_block bb = bbs[i];
7692 for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (gsi);
7693 gsi_next (&gsi))
7694 {
7695 gimple *stmt = gsi_stmt (gsi);
7696 if (is_gimple_debug (stmt))
7697 continue;
7698
7699 insns++;
7700
7701 if (gimple_location (stmt) != UNKNOWN_LOCATION)
7702 vect_location = stmt;
7703
7704 if (!vect_find_stmt_data_reference (NULL, stmt, &datarefs,
7705 &dataref_groups, current_group))
7706 ++current_group;
7707 }
7708 /* New BBs always start a new DR group. */
7709 ++current_group;
7710 }
7711
7712 return vect_slp_region (bbs, datarefs, &dataref_groups, insns, orig_loop);
7713 }
7714
7715 /* Special entry for the BB vectorizer. Analyze and transform a single
7716 if-converted BB, with ORIG_LOOP's body being the not-if-converted
7717 representation. Returns true if anything in the basic-block was
7718 vectorized. */
7719
7720 bool
7721 vect_slp_if_converted_bb (basic_block bb, loop_p orig_loop)
7722 {
7723 auto_vec<basic_block> bbs;
7724 bbs.safe_push (bb);
7725 return vect_slp_bbs (bbs, orig_loop);
7726 }
7727
7728 /* Main entry for the BB vectorizer. Analyze and transform the basic
7729 blocks of FUN, returning true if anything in them was vectorized. */
7730
7731 bool
7732 vect_slp_function (function *fun)
7733 {
7734 bool r = false;
7735 int *rpo = XNEWVEC (int, n_basic_blocks_for_fn (fun));
7736 unsigned n = pre_and_rev_post_order_compute_fn (fun, NULL, rpo, false);
7737
7738 /* For the moment split the function into pieces to avoid making
7739 the iteration on the vector mode moot. Split at points we know
7740 we do not handle well, namely CFG merges (SLP discovery doesn't
7741 handle non-loop-header PHIs) and loop exits. Since pattern
7742 recog requires reverse iteration to visit uses before defs,
7743 simply chop the RPO into pieces. */
7744 auto_vec<basic_block> bbs;
7745 for (unsigned i = 0; i < n; i++)
7746 {
7747 basic_block bb = BASIC_BLOCK_FOR_FN (fun, rpo[i]);
7748 bool split = false;
7749
7750 /* Split when a BB is not dominated by the first block. */
7751 if (!bbs.is_empty ()
7752 && !dominated_by_p (CDI_DOMINATORS, bb, bbs[0]))
7753 {
7754 if (dump_enabled_p ())
7755 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7756 "splitting region at dominance boundary bb%d\n",
7757 bb->index);
7758 split = true;
7759 }
7760 /* Split when the loop determined by the first block
7761 is exited. This is because we eventually insert
7762 invariants at the region's start. */
7763 else if (!bbs.is_empty ()
7764 && bbs[0]->loop_father != bb->loop_father
7765 && !flow_loop_nested_p (bbs[0]->loop_father, bb->loop_father))
7766 {
7767 if (dump_enabled_p ())
7768 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7769 "splitting region at loop %d exit at bb%d\n",
7770 bbs[0]->loop_father->num, bb->index);
7771 split = true;
7772 }
7773
7774 if (split && !bbs.is_empty ())
7775 {
7776 r |= vect_slp_bbs (bbs, NULL);
7777 bbs.truncate (0);
7778 }
7779
7780 /* We need to be able to insert at the head of the region, which
7781 we cannot for a region starting with a returns-twice call. */
7782 if (bbs.is_empty ())
7783 if (gcall *first = safe_dyn_cast <gcall *> (first_stmt (bb)))
7784 if (gimple_call_flags (first) & ECF_RETURNS_TWICE)
7785 {
7786 if (dump_enabled_p ())
7787 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7788 "skipping bb%d as start of region as it "
7789 "starts with returns-twice call\n",
7790 bb->index);
7791 continue;
7792 }
7793
7794 bbs.safe_push (bb);
7795
7796 /* When a stmt ends this block and defines a value, inserting
7797 a vector containing its definition after it would require
7798 inserting on edges. Avoid this for now. */
7799 if (gimple *last = *gsi_last_bb (bb))
7800 if (gimple_get_lhs (last)
7801 && is_ctrl_altering_stmt (last))
7802 {
7803 if (dump_enabled_p ())
7804 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7805 "splitting region at control altering "
7806 "definition %G", last);
7807 r |= vect_slp_bbs (bbs, NULL);
7808 bbs.truncate (0);
7809 }
7810 }
7811
7812 if (!bbs.is_empty ())
7813 r |= vect_slp_bbs (bbs, NULL);
7814
7815 free (rpo);
7816
7817 return r;
7818 }
7819
7820 /* Build a variable-length vector in which the elements in ELTS are repeated
7821 to fill NRESULTS vectors of type VECTOR_TYPE. Store the vectors in
7822 RESULTS and add any new instructions to SEQ.
7823
7824 The approach we use is:
7825
7826 (1) Find a vector mode VM with integer elements of mode IM.
7827
7828 (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
7829 ELTS' has mode IM. This involves creating NELTS' VIEW_CONVERT_EXPRs
7830 from small vectors to IM.
7831
7832 (3) Duplicate each ELTS'[I] into a vector of mode VM.
7833
7834 (4) Use a tree of interleaving VEC_PERM_EXPRs to create VMs with the
7835 correct byte contents.
7836
7837 (5) Use VIEW_CONVERT_EXPR to cast the final VMs to the required type.
7838
7839 We try to find the largest IM for which this sequence works, in order
7840 to cut down on the number of interleaves. */
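/* As an illustrative sketch (assuming the target picks a 64-bit integer
element mode IM): for four 32-bit elements {a, b, c, d} this gives
NVECTORS == 2. Steps (2) and (3) view-convert {a, b} and {c, d} to
64-bit scalars X and Y and broadcast them into {X, X, ...} and
{Y, Y, ...}. A single interleaving VEC_PERM_EXPR in step (4) then
produces {X, Y, X, Y, ...}, which step (5) view-converts back to the
required type as {a, b, c, d, a, b, c, d, ...}. */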
7841
7842 void
7843 duplicate_and_interleave (vec_info *vinfo, gimple_seq *seq, tree vector_type,
7844 const vec<tree> &elts, unsigned int nresults,
7845 vec<tree> &results)
7846 {
7847 unsigned int nelts = elts.length ();
7848 tree element_type = TREE_TYPE (vector_type);
7849
7850 /* (1) Find a vector mode VM with integer elements of mode IM. */
7851 unsigned int nvectors = 1;
7852 tree new_vector_type;
7853 tree permutes[2];
7854 if (!can_duplicate_and_interleave_p (vinfo, nelts, element_type,
7855 &nvectors, &new_vector_type,
7856 permutes))
7857 gcc_unreachable ();
7858
7859 /* Get a vector type that holds ELTS[0:NELTS/NELTS']. */
7860 unsigned int partial_nelts = nelts / nvectors;
7861 tree partial_vector_type = build_vector_type (element_type, partial_nelts);
7862
7863 tree_vector_builder partial_elts;
7864 auto_vec<tree, 32> pieces (nvectors * 2);
7865 pieces.quick_grow_cleared (nvectors * 2);
7866 for (unsigned int i = 0; i < nvectors; ++i)
7867 {
7868 /* (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
7869 ELTS' has mode IM. */
7870 partial_elts.new_vector (partial_vector_type, partial_nelts, 1);
7871 for (unsigned int j = 0; j < partial_nelts; ++j)
7872 partial_elts.quick_push (elts[i * partial_nelts + j]);
7873 tree t = gimple_build_vector (seq, &partial_elts);
7874 t = gimple_build (seq, VIEW_CONVERT_EXPR,
7875 TREE_TYPE (new_vector_type), t);
7876
7877 /* (3) Duplicate each ELTS'[I] into a vector of mode VM. */
7878 pieces[i] = gimple_build_vector_from_val (seq, new_vector_type, t);
7879 }
7880
7881 /* (4) Use a tree of VEC_PERM_EXPRs to create a single VM with the
7882 correct byte contents.
7883
7884 Conceptually, we need to repeat the following operation log2(nvectors)
7885 times, where hi_start = nvectors / 2:
7886
7887 out[i * 2] = VEC_PERM_EXPR (in[i], in[i + hi_start], lo_permute);
7888 out[i * 2 + 1] = VEC_PERM_EXPR (in[i], in[i + hi_start], hi_permute);
7889
7890 However, if each input repeats every N elements and the VF is
7891 a multiple of N * 2, the HI result is the same as the LO result.
7892 This will be true for the first N1 iterations of the outer loop,
7893 followed by N2 iterations for which both the LO and HI results
7894 are needed. I.e.:
7895
7896 N1 + N2 = log2(nvectors)
7897
7898 Each "N1 iteration" doubles the number of redundant vectors and the
7899 effect of the process as a whole is to have a sequence of nvectors/2**N1
7900 vectors that repeats 2**N1 times. Rather than generate these redundant
7901 vectors, we halve the number of vectors for each N1 iteration. */
7902 unsigned int in_start = 0;
7903 unsigned int out_start = nvectors;
7904 unsigned int new_nvectors = nvectors;
7905 for (unsigned int in_repeat = 1; in_repeat < nvectors; in_repeat *= 2)
7906 {
7907 unsigned int hi_start = new_nvectors / 2;
7908 unsigned int out_i = 0;
7909 for (unsigned int in_i = 0; in_i < new_nvectors; ++in_i)
7910 {
7911 if ((in_i & 1) != 0
7912 && multiple_p (TYPE_VECTOR_SUBPARTS (new_vector_type),
7913 2 * in_repeat))
7914 continue;
7915
7916 tree output = make_ssa_name (new_vector_type);
7917 tree input1 = pieces[in_start + (in_i / 2)];
7918 tree input2 = pieces[in_start + (in_i / 2) + hi_start];
7919 gassign *stmt = gimple_build_assign (output, VEC_PERM_EXPR,
7920 input1, input2,
7921 permutes[in_i & 1]);
7922 gimple_seq_add_stmt (seq, stmt);
7923 pieces[out_start + out_i] = output;
7924 out_i += 1;
7925 }
7926 std::swap (in_start, out_start);
7927 new_nvectors = out_i;
7928 }
7929
7930 /* (5) Use VIEW_CONVERT_EXPR to cast the final VM to the required type. */
7931 results.reserve (nresults);
7932 for (unsigned int i = 0; i < nresults; ++i)
7933 if (i < new_nvectors)
7934 results.quick_push (gimple_build (seq, VIEW_CONVERT_EXPR, vector_type,
7935 pieces[in_start + i]));
7936 else
7937 results.quick_push (results[i - new_nvectors]);
7938 }
7939
7940
7941 /* For constant and loop invariant defs in OP_NODE this function creates
7942 vector defs that will be used in the vectorized stmts and stores them
7943 to SLP_TREE_VEC_DEFS of OP_NODE. */
7944
7945 static void
7946 vect_create_constant_vectors (vec_info *vinfo, slp_tree op_node)
7947 {
7948 unsigned HOST_WIDE_INT nunits;
7949 tree vec_cst;
7950 unsigned j, number_of_places_left_in_vector;
7951 tree vector_type;
7952 tree vop;
7953 int group_size = op_node->ops.length ();
7954 unsigned int vec_num, i;
7955 unsigned number_of_copies = 1;
7956 bool constant_p;
7957 gimple_seq ctor_seq = NULL;
7958 auto_vec<tree, 16> permute_results;
7959
7960 /* We always want SLP_TREE_VECTYPE (op_node) here correctly set. */
7961 vector_type = SLP_TREE_VECTYPE (op_node);
7962
7963 unsigned int number_of_vectors = SLP_TREE_NUMBER_OF_VEC_STMTS (op_node);
7964 SLP_TREE_VEC_DEFS (op_node).create (number_of_vectors);
7965 auto_vec<tree> voprnds (number_of_vectors);
7966
7967 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
7968 created vectors. It is greater than 1 if unrolling is performed.
7969
7970 For example, we have two scalar operands, s1 and s2 (e.g., group of
7971 strided accesses of size two), while NUNITS is four (i.e., four scalars
7972 of this type can be packed in a vector). The output vector will contain
7973 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
7974 will be 2).
7975
7976 If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
7977 containing the operands.
7978
7979 For example, NUNITS is four as before, and the group size is 8
7980 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
7981 {s5, s6, s7, s8}. */
7982
7983 /* When using duplicate_and_interleave, we just need one element for
7984 each scalar statement. */
7985 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
7986 nunits = group_size;
7987
7988 number_of_copies = nunits * number_of_vectors / group_size;
7989
7990 number_of_places_left_in_vector = nunits;
7991 constant_p = true;
7992 tree_vector_builder elts (vector_type, nunits, 1);
7993 elts.quick_grow (nunits);
7994 stmt_vec_info insert_after = NULL;
7995 for (j = 0; j < number_of_copies; j++)
7996 {
7997 tree op;
7998 for (i = group_size - 1; op_node->ops.iterate (i, &op); i--)
7999 {
8000 /* Create 'vect_ = {op0,op1,...,opn}'. */
8001 number_of_places_left_in_vector--;
8002 tree orig_op = op;
8003 if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op)))
8004 {
8005 if (CONSTANT_CLASS_P (op))
8006 {
8007 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
8008 {
8009 /* Can't use VIEW_CONVERT_EXPR for booleans because
8010 of possibly different sizes of scalar value and
8011 vector element. */
8012 if (integer_zerop (op))
8013 op = build_int_cst (TREE_TYPE (vector_type), 0);
8014 else if (integer_onep (op))
8015 op = build_all_ones_cst (TREE_TYPE (vector_type));
8016 else
8017 gcc_unreachable ();
8018 }
8019 else
8020 op = fold_unary (VIEW_CONVERT_EXPR,
8021 TREE_TYPE (vector_type), op);
8022 gcc_assert (op && CONSTANT_CLASS_P (op));
8023 }
8024 else
8025 {
8026 tree new_temp = make_ssa_name (TREE_TYPE (vector_type));
8027 gimple *init_stmt;
8028 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
8029 {
8030 tree true_val
8031 = build_all_ones_cst (TREE_TYPE (vector_type));
8032 tree false_val
8033 = build_zero_cst (TREE_TYPE (vector_type));
8034 gcc_assert (INTEGRAL_TYPE_P (TREE_TYPE (op)));
8035 init_stmt = gimple_build_assign (new_temp, COND_EXPR,
8036 op, true_val,
8037 false_val);
8038 }
8039 else
8040 {
8041 op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type),
8042 op);
8043 init_stmt
8044 = gimple_build_assign (new_temp, VIEW_CONVERT_EXPR,
8045 op);
8046 }
8047 gimple_seq_add_stmt (&ctor_seq, init_stmt);
8048 op = new_temp;
8049 }
8050 }
8051 elts[number_of_places_left_in_vector] = op;
8052 if (!CONSTANT_CLASS_P (op))
8053 constant_p = false;
8054 /* For BB vectorization we have to compute an insert location
8055 when a def is inside the analyzed region since we cannot
8056 simply insert at the BB start in this case. */
8057 stmt_vec_info opdef;
8058 if (TREE_CODE (orig_op) == SSA_NAME
8059 && !SSA_NAME_IS_DEFAULT_DEF (orig_op)
8060 && is_a <bb_vec_info> (vinfo)
8061 && (opdef = vinfo->lookup_def (orig_op)))
8062 {
8063 if (!insert_after)
8064 insert_after = opdef;
8065 else
8066 insert_after = get_later_stmt (insert_after, opdef);
8067 }
8068
8069 if (number_of_places_left_in_vector == 0)
8070 {
8071 if (constant_p
8072 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
8073 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
8074 vec_cst = gimple_build_vector (&ctor_seq, &elts);
8075 else
8076 {
8077 if (permute_results.is_empty ())
8078 duplicate_and_interleave (vinfo, &ctor_seq, vector_type,
8079 elts, number_of_vectors,
8080 permute_results);
8081 vec_cst = permute_results[number_of_vectors - j - 1];
8082 }
8083 if (!gimple_seq_empty_p (ctor_seq))
8084 {
8085 if (insert_after)
8086 {
8087 gimple_stmt_iterator gsi;
8088 if (gimple_code (insert_after->stmt) == GIMPLE_PHI)
8089 {
8090 gsi = gsi_after_labels (gimple_bb (insert_after->stmt));
8091 gsi_insert_seq_before (&gsi, ctor_seq,
8092 GSI_CONTINUE_LINKING);
8093 }
8094 else if (!stmt_ends_bb_p (insert_after->stmt))
8095 {
8096 gsi = gsi_for_stmt (insert_after->stmt);
8097 gsi_insert_seq_after (&gsi, ctor_seq,
8098 GSI_CONTINUE_LINKING);
8099 }
8100 else
8101 {
8102 /* When we want to insert after a def whose
8103 defining stmt throws, insert on the fallthru
8104 edge. */
8105 edge e = find_fallthru_edge
8106 (gimple_bb (insert_after->stmt)->succs);
8107 basic_block new_bb
8108 = gsi_insert_seq_on_edge_immediate (e, ctor_seq);
8109 gcc_assert (!new_bb);
8110 }
8111 }
8112 else
8113 vinfo->insert_seq_on_entry (NULL, ctor_seq);
8114 ctor_seq = NULL;
8115 }
8116 voprnds.quick_push (vec_cst);
8117 insert_after = NULL;
8118 number_of_places_left_in_vector = nunits;
8119 constant_p = true;
8120 elts.new_vector (vector_type, nunits, 1);
8121 elts.quick_grow (nunits);
8122 }
8123 }
8124 }
8125
8126 /* Since the vectors are created in reverse order, we should reverse
8127 them. */
8128 vec_num = voprnds.length ();
8129 for (j = vec_num; j != 0; j--)
8130 {
8131 vop = voprnds[j - 1];
8132 SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
8133 }
8134
8135 /* In case the VF is greater than the unrolling factor needed for the SLP
8136 group of stmts, NUMBER_OF_VECTORS to be created is greater than
8137 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
8138 to replicate the vectors. */
8139 while (number_of_vectors > SLP_TREE_VEC_DEFS (op_node).length ())
8140 for (i = 0; SLP_TREE_VEC_DEFS (op_node).iterate (i, &vop) && i < vec_num;
8141 i++)
8142 SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
8143 }
8144
8145 /* Get the Ith vectorized definition from SLP_NODE. */
8146
8147 tree
8148 vect_get_slp_vect_def (slp_tree slp_node, unsigned i)
8149 {
8150 return SLP_TREE_VEC_DEFS (slp_node)[i];
8151 }
8152
8153 /* Get the vectorized definitions of SLP_NODE in *VEC_DEFS. */
8154
8155 void
8156 vect_get_slp_defs (slp_tree slp_node, vec<tree> *vec_defs)
8157 {
8158 vec_defs->create (SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node));
8159 vec_defs->splice (SLP_TREE_VEC_DEFS (slp_node));
8160 }
8161
8162 /* Get N vectorized definitions for SLP_NODE. */
8163
8164 void
8165 vect_get_slp_defs (vec_info *,
8166 slp_tree slp_node, vec<vec<tree> > *vec_oprnds, unsigned n)
8167 {
8168 if (n == -1U)
8169 n = SLP_TREE_CHILDREN (slp_node).length ();
8170
8171 for (unsigned i = 0; i < n; ++i)
8172 {
8173 slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
8174 vec<tree> vec_defs = vNULL;
8175 vect_get_slp_defs (child, &vec_defs);
8176 vec_oprnds->quick_push (vec_defs);
8177 }
8178 }
8179
8180 /* A subroutine of vect_transform_slp_perm_load with two extra arguments:
8181 - PERM gives the permutation that the caller wants to use for NODE,
8182 which might be different from SLP_LOAD_PERMUTATION.
8183 - DUMP_P controls whether the function dumps information. */
8184
8185 static bool
8186 vect_transform_slp_perm_load_1 (vec_info *vinfo, slp_tree node,
8187 load_permutation_t &perm,
8188 const vec<tree> &dr_chain,
8189 gimple_stmt_iterator *gsi, poly_uint64 vf,
8190 bool analyze_only, bool dump_p,
8191 unsigned *n_perms, unsigned int *n_loads,
8192 bool dce_chain)
8193 {
8194 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
8195 int vec_index = 0;
8196 tree vectype = SLP_TREE_VECTYPE (node);
8197 unsigned int group_size = SLP_TREE_SCALAR_STMTS (node).length ();
8198 unsigned int mask_element;
8199 unsigned dr_group_size;
8200 machine_mode mode;
8201
8202 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
8203 dr_group_size = 1;
8204 else
8205 {
8206 stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
8207 dr_group_size = DR_GROUP_SIZE (stmt_info);
8208 }
8209
8210 mode = TYPE_MODE (vectype);
8211 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8212 unsigned int nstmts = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
8213
8214 /* Initialize the vect stmts of NODE to properly insert the generated
8215 stmts later. */
8216 if (! analyze_only)
8217 for (unsigned i = SLP_TREE_VEC_DEFS (node).length (); i < nstmts; i++)
8218 SLP_TREE_VEC_DEFS (node).quick_push (NULL_TREE);
8219
8220 /* Generate permutation masks for every NODE. Number of masks for each NODE
8221 is equal to GROUP_SIZE.
8222 E.g., we have a group of three nodes with three loads from the same
8223 location in each node, and the vector size is 4. I.e., we have an
8224 a0b0c0a1b1c1... sequence and we need to create the following vectors:
8225 for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
8226 for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
8227 ...
8228
8229 The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
8230 The last mask is illegal since we assume two operands for the permute
8231 operation, and the mask element values can't be outside that range.
8232 Hence, the last mask must be converted into {2,5,5,5}.
8233 For the first two permutations we need the first and the second input
8234 vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
8235 we need the second and the third vectors: {b1,c1,a2,b2} and
8236 {c2,a3,b3,c3}. */
8237
8238 int vect_stmts_counter = 0;
8239 unsigned int index = 0;
8240 int first_vec_index = -1;
8241 int second_vec_index = -1;
8242 bool noop_p = true;
8243 *n_perms = 0;
8244
8245 vec_perm_builder mask;
8246 unsigned int nelts_to_build;
8247 unsigned int nvectors_per_build;
8248 unsigned int in_nlanes;
8249 bool repeating_p = (group_size == dr_group_size
8250 && multiple_p (nunits, group_size));
8251 if (repeating_p)
8252 {
8253 /* A single vector contains a whole number of copies of the node, so:
8254 (a) all permutes can use the same mask; and
8255 (b) the permutes only need a single vector input. */
8256 mask.new_vector (nunits, group_size, 3);
8257 nelts_to_build = mask.encoded_nelts ();
8258 /* It's possible to obtain zero nstmts during analyze_only, so make
8259 it at least one to ensure the later computation for n_perms
8260 proceeds. */
8261 nvectors_per_build = nstmts > 0 ? nstmts : 1;
8262 in_nlanes = dr_group_size * 3;
8263 }
8264 else
8265 {
8266 /* We need to construct a separate mask for each vector statement. */
8267 unsigned HOST_WIDE_INT const_nunits, const_vf;
8268 if (!nunits.is_constant (&const_nunits)
8269 || !vf.is_constant (&const_vf))
8270 return false;
8271 mask.new_vector (const_nunits, const_nunits, 1);
8272 nelts_to_build = const_vf * group_size;
8273 nvectors_per_build = 1;
8274 in_nlanes = const_vf * dr_group_size;
8275 }
8276 auto_sbitmap used_in_lanes (in_nlanes);
8277 bitmap_clear (used_in_lanes);
8278 auto_bitmap used_defs;
8279
8280 unsigned int count = mask.encoded_nelts ();
8281 mask.quick_grow (count);
8282 vec_perm_indices indices;
8283
8284 for (unsigned int j = 0; j < nelts_to_build; j++)
8285 {
8286 unsigned int iter_num = j / group_size;
8287 unsigned int stmt_num = j % group_size;
8288 unsigned int i = (iter_num * dr_group_size + perm[stmt_num]);
8289 bitmap_set_bit (used_in_lanes, i);
8290 if (repeating_p)
8291 {
8292 first_vec_index = 0;
8293 mask_element = i;
8294 }
8295 else
8296 {
8297 /* Enforced before the loop when !repeating_p. */
8298 unsigned int const_nunits = nunits.to_constant ();
8299 vec_index = i / const_nunits;
8300 mask_element = i % const_nunits;
8301 if (vec_index == first_vec_index
8302 || first_vec_index == -1)
8303 {
8304 first_vec_index = vec_index;
8305 }
8306 else if (vec_index == second_vec_index
8307 || second_vec_index == -1)
8308 {
8309 second_vec_index = vec_index;
8310 mask_element += const_nunits;
8311 }
8312 else
8313 {
8314 if (dump_p)
8315 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8316 "permutation requires at "
8317 "least three vectors %G",
8318 stmt_info->stmt);
8319 gcc_assert (analyze_only);
8320 return false;
8321 }
8322
8323 gcc_assert (mask_element < 2 * const_nunits);
8324 }
8325
8326 if (mask_element != index)
8327 noop_p = false;
8328 mask[index++] = mask_element;
8329
8330 if (index == count)
8331 {
8332 if (!noop_p)
8333 {
8334 indices.new_vector (mask, second_vec_index == -1 ? 1 : 2, nunits);
8335 if (!can_vec_perm_const_p (mode, mode, indices))
8336 {
8337 if (dump_p)
8338 {
8339 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8340 "unsupported vect permute { ");
8341 for (i = 0; i < count; ++i)
8342 {
8343 dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
8344 dump_printf (MSG_MISSED_OPTIMIZATION, " ");
8345 }
8346 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
8347 }
8348 gcc_assert (analyze_only);
8349 return false;
8350 }
8351
8352 tree mask_vec = NULL_TREE;
8353 if (!analyze_only)
8354 mask_vec = vect_gen_perm_mask_checked (vectype, indices);
8355
8356 if (second_vec_index == -1)
8357 second_vec_index = first_vec_index;
8358
8359 for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
8360 {
8361 ++*n_perms;
8362 if (analyze_only)
8363 continue;
8364 /* Generate the permute statement if necessary. */
8365 tree first_vec = dr_chain[first_vec_index + ri];
8366 tree second_vec = dr_chain[second_vec_index + ri];
8367 gassign *stmt = as_a<gassign *> (stmt_info->stmt);
8368 tree perm_dest
8369 = vect_create_destination_var (gimple_assign_lhs (stmt),
8370 vectype);
8371 perm_dest = make_ssa_name (perm_dest);
8372 gimple *perm_stmt
8373 = gimple_build_assign (perm_dest, VEC_PERM_EXPR, first_vec,
8374 second_vec, mask_vec);
8375 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt,
8376 gsi);
8377 if (dce_chain)
8378 {
8379 bitmap_set_bit (used_defs, first_vec_index + ri);
8380 bitmap_set_bit (used_defs, second_vec_index + ri);
8381 }
8382
8383 /* Store the vector statement in NODE. */
8384 SLP_TREE_VEC_DEFS (node)[vect_stmts_counter++] = perm_dest;
8385 }
8386 }
8387 else if (!analyze_only)
8388 {
8389 for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
8390 {
8391 tree first_vec = dr_chain[first_vec_index + ri];
8392 /* If mask was NULL_TREE generate the requested
8393 identity transform. */
8394 if (dce_chain)
8395 bitmap_set_bit (used_defs, first_vec_index + ri);
8396
8397 /* Store the vector statement in NODE. */
8398 SLP_TREE_VEC_DEFS (node)[vect_stmts_counter++] = first_vec;
8399 }
8400 }
8401
8402 index = 0;
8403 first_vec_index = -1;
8404 second_vec_index = -1;
8405 noop_p = true;
8406 }
8407 }
8408
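/* Count how many input vectors are actually referenced. For the a/b/c
   example above (illustrative) only lanes 0, 3, 6 and 9 of the 12 input
   lanes are used, but they touch all three input vectors, so *N_LOADS
   becomes 3. */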
8409 if (n_loads)
8410 {
8411 if (repeating_p)
8412 *n_loads = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
8413 else
8414 {
8415 /* Enforced above when !repeating_p. */
8416 unsigned int const_nunits = nunits.to_constant ();
8417 *n_loads = 0;
8418 bool load_seen = false;
8419 for (unsigned i = 0; i < in_nlanes; ++i)
8420 {
8421 if (i % const_nunits == 0)
8422 {
8423 if (load_seen)
8424 *n_loads += 1;
8425 load_seen = false;
8426 }
8427 if (bitmap_bit_p (used_in_lanes, i))
8428 load_seen = true;
8429 }
8430 if (load_seen)
8431 *n_loads += 1;
8432 }
8433 }
8434
8435 if (dce_chain)
8436 for (unsigned i = 0; i < dr_chain.length (); ++i)
8437 if (!bitmap_bit_p (used_defs, i))
8438 {
8439 gimple *stmt = SSA_NAME_DEF_STMT (dr_chain[i]);
8440 gimple_stmt_iterator rgsi = gsi_for_stmt (stmt);
8441 gsi_remove (&rgsi, true);
8442 release_defs (stmt);
8443 }
8444
8445 return true;
8446 }
8447
8448 /* Generate vector permute statements from a list of loads in DR_CHAIN.
8449 If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
8450 permute statements for the SLP node NODE. Store the number of vector
8451 permute instructions in *N_PERMS and the number of vector load
8452 instructions in *N_LOADS. If DCE_CHAIN is true, remove all definitions
8453 that were not needed. */
8454
8455 bool
8456 vect_transform_slp_perm_load (vec_info *vinfo,
8457 slp_tree node, const vec<tree> &dr_chain,
8458 gimple_stmt_iterator *gsi, poly_uint64 vf,
8459 bool analyze_only, unsigned *n_perms,
8460 unsigned int *n_loads, bool dce_chain)
8461 {
8462 return vect_transform_slp_perm_load_1 (vinfo, node,
8463 SLP_TREE_LOAD_PERMUTATION (node),
8464 dr_chain, gsi, vf, analyze_only,
8465 dump_enabled_p (), n_perms, n_loads,
8466 dce_chain);
8467 }
8468
8469 /* Produce the next vector result for SLP permutation NODE by adding a vector
8470 statement at GSI. If MASK_VEC is nonnull, add:
8471
8472 <new SSA name> = VEC_PERM_EXPR <FIRST_DEF, SECOND_DEF, MASK_VEC>
8473
8474 otherwise add:
8475
8476 <new SSA name> = FIRST_DEF. */
8477
8478 static void
8479 vect_add_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
8480 slp_tree node, tree first_def, tree second_def,
8481 tree mask_vec, poly_uint64 identity_offset)
8482 {
8483 tree vectype = SLP_TREE_VECTYPE (node);
8484
8485 /* ??? We SLP match existing vector element extracts but
8486 allow punning which we need to re-instantiate at uses
8487 but have no good way of explicitly representing. */
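/* For example (illustrative), a V4SI def feeding a node with V4SF
   vectype has the same size but an incompatible type and is therefore
   view-converted to V4SF here before being used as a permute operand. */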
8488 if (operand_equal_p (TYPE_SIZE (TREE_TYPE (first_def)), TYPE_SIZE (vectype))
8489 && !types_compatible_p (TREE_TYPE (first_def), vectype))
8490 {
8491 gassign *conv_stmt
8492 = gimple_build_assign (make_ssa_name (vectype),
8493 build1 (VIEW_CONVERT_EXPR, vectype, first_def));
8494 vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi);
8495 first_def = gimple_assign_lhs (conv_stmt);
8496 }
8497 gassign *perm_stmt;
8498 tree perm_dest = make_ssa_name (vectype);
8499 if (mask_vec)
8500 {
8501 if (operand_equal_p (TYPE_SIZE (TREE_TYPE (second_def)),
8502 TYPE_SIZE (vectype))
8503 && !types_compatible_p (TREE_TYPE (second_def), vectype))
8504 {
8505 gassign *conv_stmt
8506 = gimple_build_assign (make_ssa_name (vectype),
8507 build1 (VIEW_CONVERT_EXPR,
8508 vectype, second_def));
8509 vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi);
8510 second_def = gimple_assign_lhs (conv_stmt);
8511 }
8512 perm_stmt = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
8513 first_def, second_def,
8514 mask_vec);
8515 }
8516 else if (!types_compatible_p (TREE_TYPE (first_def), vectype))
8517 {
8518 /* For identity permutes we still need to handle the case
8519 of offsetted extracts or concats. */
8520 unsigned HOST_WIDE_INT c;
8521 auto first_def_nunits
8522 = TYPE_VECTOR_SUBPARTS (TREE_TYPE (first_def));
8523 if (known_le (TYPE_VECTOR_SUBPARTS (vectype), first_def_nunits))
8524 {
8525 unsigned HOST_WIDE_INT elsz
8526 = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (TREE_TYPE (first_def))));
8527 tree lowpart = build3 (BIT_FIELD_REF, vectype, first_def,
8528 TYPE_SIZE (vectype),
8529 bitsize_int (identity_offset * elsz));
8530 perm_stmt = gimple_build_assign (perm_dest, lowpart);
8531 }
8532 else if (constant_multiple_p (TYPE_VECTOR_SUBPARTS (vectype),
8533 first_def_nunits, &c) && c == 2)
8534 {
8535 tree ctor = build_constructor_va (vectype, 2, NULL_TREE, first_def,
8536 NULL_TREE, second_def);
8537 perm_stmt = gimple_build_assign (perm_dest, ctor);
8538 }
8539 else
8540 gcc_unreachable ();
8541 }
8542 else
8543 {
8544 /* We need a copy here in case the def was external. */
8545 perm_stmt = gimple_build_assign (perm_dest, first_def);
8546 }
8547 vect_finish_stmt_generation (vinfo, NULL, perm_stmt, gsi);
8548 /* Store the vector statement in NODE. */
8549 node->push_vec_def (perm_stmt);
8550 }
8551
8552 /* Subroutine of vectorizable_slp_permutation. Check whether the target
8553 can perform permutation PERM on the (1 or 2) input nodes in CHILDREN.
8554 If GSI is nonnull, emit the permutation there.
8555
8556 When GSI is null, the only purpose of NODE is to give properties
8557 of the result, such as the vector type and number of SLP lanes.
8558 The node does not need to be a VEC_PERM_EXPR.
8559
8560 If the target supports the operation, return the number of individual
8561 VEC_PERM_EXPRs needed, otherwise return -1. Print information to the
8562 dump file if DUMP_P is true. */
8563
8564 static int
8565 vectorizable_slp_permutation_1 (vec_info *vinfo, gimple_stmt_iterator *gsi,
8566 slp_tree node, lane_permutation_t &perm,
8567 vec<slp_tree> &children, bool dump_p)
8568 {
8569 tree vectype = SLP_TREE_VECTYPE (node);
8570
8571 /* ??? We currently only support the case where all vector input
8572 types are the same, while the SLP IL should really do a concat +
8573 select and thus accept arbitrary mismatches. */
8574 slp_tree child;
8575 unsigned i;
8576 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8577 bool repeating_p = multiple_p (nunits, SLP_TREE_LANES (node));
8578 tree op_vectype = NULL_TREE;
8579 FOR_EACH_VEC_ELT (children, i, child)
8580 if (SLP_TREE_VECTYPE (child))
8581 {
8582 op_vectype = SLP_TREE_VECTYPE (child);
8583 break;
8584 }
8585 if (!op_vectype)
8586 op_vectype = vectype;
8587 FOR_EACH_VEC_ELT (children, i, child)
8588 {
8589 if ((SLP_TREE_DEF_TYPE (child) != vect_internal_def
8590 && !vect_maybe_update_slp_op_vectype (child, op_vectype))
8591 || !types_compatible_p (SLP_TREE_VECTYPE (child), op_vectype)
8592 || !types_compatible_p (TREE_TYPE (vectype), TREE_TYPE (op_vectype)))
8593 {
8594 if (dump_p)
8595 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8596 "Unsupported vector types in lane permutation\n");
8597 return -1;
8598 }
8599 if (SLP_TREE_LANES (child) != SLP_TREE_LANES (node))
8600 repeating_p = false;
8601 }
8602
8603 gcc_assert (perm.length () == SLP_TREE_LANES (node));
8604 if (dump_p)
8605 {
8606 dump_printf_loc (MSG_NOTE, vect_location,
8607 "vectorizing permutation");
8608 for (unsigned i = 0; i < perm.length (); ++i)
8609 dump_printf (MSG_NOTE, " op%u[%u]", perm[i].first, perm[i].second);
8610 if (repeating_p)
8611 dump_printf (MSG_NOTE, " (repeat %d)\n", SLP_TREE_LANES (node));
8612 dump_printf (MSG_NOTE, "\n");
8613 }
8614
8615 /* REPEATING_P is true if every output vector is guaranteed to use the
8616 same permute vector. We can handle that case for both variable-length
8617 and constant-length vectors, but we only handle other cases for
8618 constant-length vectors.
8619
8620 Set:
8621
8622 - NPATTERNS and NELTS_PER_PATTERN to the encoding of the permute
8623 mask vector that we want to build.
8624
8625 - NCOPIES to the number of copies of PERM that we need in order
8626 to build the necessary permute mask vectors.
8627
8628 - NOUTPUTS_PER_MASK to the number of output vectors we want to create
8629 for each permute mask vector. This is only relevant when GSI is
8630 nonnull. */
8631 uint64_t npatterns;
8632 unsigned nelts_per_pattern;
8633 uint64_t ncopies;
8634 unsigned noutputs_per_mask;
8635 if (repeating_p)
8636 {
8637 /* We need a single permute mask vector that has the form:
8638
8639 { X1, ..., Xn, X1 + n, ..., Xn + n, X1 + 2n, ..., Xn + 2n, ... }
8640
8641 In other words, the original n-element permute in PERM is
8642 "unrolled" to fill a full vector. The stepped vector encoding
8643 that we use for permutes requires 3n elements. */
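/* For example (illustrative), a two-lane swap { 1, 0 } on V4SI is
   encoded as { 1, 0, 3, 2, 5, 4 } (NPATTERNS == 2,
   NELTS_PER_PATTERN == 3), which materializes as the full-vector
   mask { 1, 0, 3, 2 }. */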
8644 npatterns = SLP_TREE_LANES (node);
8645 nelts_per_pattern = ncopies = 3;
8646 noutputs_per_mask = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
8647 }
8648 else
8649 {
8650 /* Calculate every element of every permute mask vector explicitly,
8651 instead of relying on the pattern described above. */
8652 if (!nunits.is_constant (&npatterns))
8653 return -1;
8654 nelts_per_pattern = ncopies = 1;
8655 if (loop_vec_info linfo = dyn_cast <loop_vec_info> (vinfo))
8656 if (!LOOP_VINFO_VECT_FACTOR (linfo).is_constant (&ncopies))
8657 return -1;
8658 noutputs_per_mask = 1;
8659 }
8660 unsigned olanes = ncopies * SLP_TREE_LANES (node);
8661 gcc_assert (repeating_p || multiple_p (olanes, nunits));
8662
8663 /* Compute the { { SLP operand, vector index}, lane } permutation sequence
8664 from the { SLP operand, scalar lane } permutation as recorded in the
8665 SLP node as an intermediate step. This part should already work
8666 with SLP children with an arbitrary number of lanes. */
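/* For example (illustrative), the blend
   [ { 0, 0 }, { 1, 1 }, { 0, 2 }, { 1, 3 } ]
   of two four-lane V4SI children (REPEATING_P, NCOPIES == 3) produces
   vops0[0][0] vops1[0][1] vops0[0][2] vops1[0][3] for the first group,
   with the two following groups continuing at lanes 4..7 and 8..11. */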
8667 auto_vec<std::pair<std::pair<unsigned, unsigned>, unsigned> > vperm;
8668 auto_vec<unsigned> active_lane;
8669 vperm.create (olanes);
8670 active_lane.safe_grow_cleared (children.length (), true);
8671 for (unsigned i = 0; i < ncopies; ++i)
8672 {
8673 for (unsigned pi = 0; pi < perm.length (); ++pi)
8674 {
8675 std::pair<unsigned, unsigned> p = perm[pi];
8676 tree vtype = SLP_TREE_VECTYPE (children[p.first]);
8677 if (repeating_p)
8678 vperm.quick_push ({{p.first, 0}, p.second + active_lane[p.first]});
8679 else
8680 {
8681 /* We checked above that the vectors are constant-length. */
8682 unsigned vnunits = TYPE_VECTOR_SUBPARTS (vtype).to_constant ();
8683 unsigned vi = (active_lane[p.first] + p.second) / vnunits;
8684 unsigned vl = (active_lane[p.first] + p.second) % vnunits;
8685 vperm.quick_push ({{p.first, vi}, vl});
8686 }
8687 }
8688 /* Advance to the next group. */
8689 for (unsigned j = 0; j < children.length (); ++j)
8690 active_lane[j] += SLP_TREE_LANES (children[j]);
8691 }
8692
8693 if (dump_p)
8694 {
8695 dump_printf_loc (MSG_NOTE, vect_location,
8696 "vectorizing permutation");
8697 for (unsigned i = 0; i < perm.length (); ++i)
8698 dump_printf (MSG_NOTE, " op%u[%u]", perm[i].first, perm[i].second);
8699 if (repeating_p)
8700 dump_printf (MSG_NOTE, " (repeat %d)\n", SLP_TREE_LANES (node));
8701 dump_printf (MSG_NOTE, "\n");
8702 dump_printf_loc (MSG_NOTE, vect_location, "as");
8703 for (unsigned i = 0; i < vperm.length (); ++i)
8704 {
8705 if (i != 0
8706 && (repeating_p
8707 ? multiple_p (i, npatterns)
8708 : multiple_p (i, TYPE_VECTOR_SUBPARTS (vectype))))
8709 dump_printf (MSG_NOTE, ",");
8710 dump_printf (MSG_NOTE, " vops%u[%u][%u]",
8711 vperm[i].first.first, vperm[i].first.second,
8712 vperm[i].second);
8713 }
8714 dump_printf (MSG_NOTE, "\n");
8715 }
8716
8717 /* We can only handle two-vector permutes; everything else should
8718 be lowered on the SLP level. The following is closely inspired
8719 by vect_transform_slp_perm_load and is supposed to eventually
8720 replace it.
8721 ??? As intermediate step do code-gen in the SLP tree representation
8722 somehow? */
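/* For the blend example above the first distinct { operand, vector }
   pair becomes the first permute input, the second pair the second
   input, and the materialized two-operand V4SI mask is { 0, 5, 2, 7 }
   (illustrative). */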
8723 std::pair<unsigned, unsigned> first_vec = std::make_pair (-1U, -1U);
8724 std::pair<unsigned, unsigned> second_vec = std::make_pair (-1U, -1U);
8725 unsigned int index = 0;
8726 poly_uint64 mask_element;
8727 vec_perm_builder mask;
8728 mask.new_vector (nunits, npatterns, nelts_per_pattern);
8729 unsigned int count = mask.encoded_nelts ();
8730 mask.quick_grow (count);
8731 vec_perm_indices indices;
8732 unsigned nperms = 0;
8733 for (unsigned i = 0; i < vperm.length (); ++i)
8734 {
8735 mask_element = vperm[i].second;
8736 if (first_vec.first == -1U
8737 || first_vec == vperm[i].first)
8738 first_vec = vperm[i].first;
8739 else if (second_vec.first == -1U
8740 || second_vec == vperm[i].first)
8741 {
8742 second_vec = vperm[i].first;
8743 mask_element += nunits;
8744 }
8745 else
8746 {
8747 if (dump_p)
8748 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8749 "permutation requires at "
8750 "least three vectors\n");
8751 gcc_assert (!gsi);
8752 return -1;
8753 }
8754
8755 mask[index++] = mask_element;
8756
8757 if (index == count)
8758 {
8759 indices.new_vector (mask, second_vec.first == -1U ? 1 : 2,
8760 TYPE_VECTOR_SUBPARTS (op_vectype));
8761 bool identity_p = (indices.series_p (0, 1, mask[0], 1)
8762 && constant_multiple_p (mask[0], nunits));
8763 machine_mode vmode = TYPE_MODE (vectype);
8764 machine_mode op_vmode = TYPE_MODE (op_vectype);
8765 unsigned HOST_WIDE_INT c;
8766 if ((!identity_p
8767 && !can_vec_perm_const_p (vmode, op_vmode, indices))
8768 || (identity_p
8769 && !known_le (nunits,
8770 TYPE_VECTOR_SUBPARTS (op_vectype))
8771 && (!constant_multiple_p (nunits,
8772 TYPE_VECTOR_SUBPARTS (op_vectype),
8773 &c) || c != 2)))
8774 {
8775 if (dump_p)
8776 {
8777 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
8778 vect_location,
8779 "unsupported vect permute { ");
8780 for (i = 0; i < count; ++i)
8781 {
8782 dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
8783 dump_printf (MSG_MISSED_OPTIMIZATION, " ");
8784 }
8785 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
8786 }
8787 gcc_assert (!gsi);
8788 return -1;
8789 }
8790
8791 if (!identity_p)
8792 nperms++;
8793 if (gsi)
8794 {
8795 if (second_vec.first == -1U)
8796 second_vec = first_vec;
8797
8798 slp_tree
8799 first_node = children[first_vec.first],
8800 second_node = children[second_vec.first];
8801
8802 tree mask_vec = NULL_TREE;
8803 if (!identity_p)
8804 mask_vec = vect_gen_perm_mask_checked (vectype, indices);
8805
8806 for (unsigned int vi = 0; vi < noutputs_per_mask; ++vi)
8807 {
8808 tree first_def
8809 = vect_get_slp_vect_def (first_node,
8810 first_vec.second + vi);
8811 tree second_def
8812 = vect_get_slp_vect_def (second_node,
8813 second_vec.second + vi);
8814 vect_add_slp_permutation (vinfo, gsi, node, first_def,
8815 second_def, mask_vec, mask[0]);
8816 }
8817 }
8818
8819 index = 0;
8820 first_vec = std::make_pair (-1U, -1U);
8821 second_vec = std::make_pair (-1U, -1U);
8822 }
8823 }
8824
8825 return nperms;
8826 }
8827
8828 /* Vectorize the SLP permutations in NODE as specified
8829 in SLP_TREE_LANE_PERMUTATION which is a vector of pairs of SLP
8830 child number and lane number.
8831 Interleaving of two two-lane two-child SLP subtrees (not supported):
8832 [ { 0, 0 }, { 1, 0 }, { 0, 1 }, { 1, 1 } ]
8833 A blend of two four-lane two-child SLP subtrees:
8834 [ { 0, 0 }, { 1, 1 }, { 0, 2 }, { 1, 3 } ]
8835 Highpart of a four-lane one-child SLP subtree (not supported):
8836 [ { 0, 2 }, { 0, 3 } ]
8837 Currently only a subset of these is supported by the code generation below. */
8838
8839 static bool
8840 vectorizable_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
8841 slp_tree node, stmt_vector_for_cost *cost_vec)
8842 {
8843 tree vectype = SLP_TREE_VECTYPE (node);
8844 lane_permutation_t &perm = SLP_TREE_LANE_PERMUTATION (node);
8845 int nperms = vectorizable_slp_permutation_1 (vinfo, gsi, node, perm,
8846 SLP_TREE_CHILDREN (node),
8847 dump_enabled_p ());
8848 if (nperms < 0)
8849 return false;
8850
8851 if (!gsi)
8852 record_stmt_cost (cost_vec, nperms, vec_perm, node, vectype, 0, vect_body);
8853
8854 return true;
8855 }
8856
8857 /* Vectorize SLP NODE. */
8858
8859 static void
8860 vect_schedule_slp_node (vec_info *vinfo,
8861 slp_tree node, slp_instance instance)
8862 {
8863 gimple_stmt_iterator si;
8864 int i;
8865 slp_tree child;
8866
8867 /* For existing vectors there's nothing to do. */
8868 if (SLP_TREE_DEF_TYPE (node) == vect_external_def
8869 && SLP_TREE_VEC_DEFS (node).exists ())
8870 return;
8871
8872 gcc_assert (SLP_TREE_VEC_DEFS (node).is_empty ());
8873
8874 /* Vectorize externals and constants. */
8875 if (SLP_TREE_DEF_TYPE (node) == vect_constant_def
8876 || SLP_TREE_DEF_TYPE (node) == vect_external_def)
8877 {
8878 /* ??? vectorizable_shift can end up using a scalar operand which is
8879 currently denoted as !SLP_TREE_VECTYPE. No need to vectorize the
8880 node in this case. */
8881 if (!SLP_TREE_VECTYPE (node))
8882 return;
8883
8884 vect_create_constant_vectors (vinfo, node);
8885 return;
8886 }
8887
8888 stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
8889
8890 gcc_assert (SLP_TREE_NUMBER_OF_VEC_STMTS (node) != 0);
8891 SLP_TREE_VEC_DEFS (node).create (SLP_TREE_NUMBER_OF_VEC_STMTS (node));
8892
8893 if (dump_enabled_p ())
8894 dump_printf_loc (MSG_NOTE, vect_location,
8895 "------>vectorizing SLP node starting from: %G",
8896 stmt_info->stmt);
8897
8898 if (STMT_VINFO_DATA_REF (stmt_info)
8899 && SLP_TREE_CODE (node) != VEC_PERM_EXPR)
8900 {
8901 /* Vectorized loads go before the first scalar load to make it
8902 ready early; vectorized stores go before the last scalar
8903 stmt, which is where all uses are ready. */
8904 stmt_vec_info last_stmt_info = NULL;
8905 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
8906 last_stmt_info = vect_find_first_scalar_stmt_in_slp (node);
8907 else /* DR_IS_WRITE */
8908 last_stmt_info = vect_find_last_scalar_stmt_in_slp (node);
8909 si = gsi_for_stmt (last_stmt_info->stmt);
8910 }
8911 else if ((STMT_VINFO_TYPE (stmt_info) == cycle_phi_info_type
8912 || STMT_VINFO_TYPE (stmt_info) == induc_vec_info_type
8913 || STMT_VINFO_TYPE (stmt_info) == phi_info_type)
8914 && SLP_TREE_CODE (node) != VEC_PERM_EXPR)
8915 {
8916 /* For PHI node vectorization we do not use the insertion iterator. */
8917 si = gsi_none ();
8918 }
8919 else
8920 {
8921 /* Emit other stmts after the children's vectorized defs, which is
8922 the earliest position possible. */
8923 gimple *last_stmt = NULL;
8924 bool seen_vector_def = false;
8925 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
8926 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
8927 {
8928 /* For fold-left reductions we are retaining the scalar
8929 reduction PHI but we still have SLP_TREE_NUM_VEC_STMTS
8930 set so the representation isn't perfect. Resort to the
8931 last scalar def here. */
8932 if (SLP_TREE_VEC_DEFS (child).is_empty ())
8933 {
8934 gcc_assert (STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (child))
8935 == cycle_phi_info_type);
8936 gphi *phi = as_a <gphi *>
8937 (vect_find_last_scalar_stmt_in_slp (child)->stmt);
8938 if (!last_stmt
8939 || vect_stmt_dominates_stmt_p (last_stmt, phi))
8940 last_stmt = phi;
8941 }
8942 /* We are emitting all vectorized stmts in the same place, so
8943 the last one emitted is the last.
8944 ??? Unless we have a load permutation applied and that
8945 happens to re-use an earlier generated load. */
8946 unsigned j;
8947 tree vdef;
8948 FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (child), j, vdef)
8949 {
8950 gimple *vstmt = SSA_NAME_DEF_STMT (vdef);
8951 if (!last_stmt
8952 || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
8953 last_stmt = vstmt;
8954 }
8955 }
8956 else if (!SLP_TREE_VECTYPE (child))
8957 {
8958 /* For externals that remain unvectorized we look at all their scalar defs. */
8959 unsigned j;
8960 tree def;
8961 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (child), j, def)
8962 if (TREE_CODE (def) == SSA_NAME
8963 && !SSA_NAME_IS_DEFAULT_DEF (def))
8964 {
8965 gimple *stmt = SSA_NAME_DEF_STMT (def);
8966 if (!last_stmt
8967 || vect_stmt_dominates_stmt_p (last_stmt, stmt))
8968 last_stmt = stmt;
8969 }
8970 }
8971 else
8972 {
8973 /* For externals we have to look at all defs since their
8974 insertion place is decided per vector. But beware
8975 of pre-existing vectors where we need to make sure
8976 we do not insert before the region boundary. */
8977 if (SLP_TREE_SCALAR_OPS (child).is_empty ()
8978 && !vinfo->lookup_def (SLP_TREE_VEC_DEFS (child)[0]))
8979 seen_vector_def = true;
8980 else
8981 {
8982 unsigned j;
8983 tree vdef;
8984 FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (child), j, vdef)
8985 if (TREE_CODE (vdef) == SSA_NAME
8986 && !SSA_NAME_IS_DEFAULT_DEF (vdef))
8987 {
8988 gimple *vstmt = SSA_NAME_DEF_STMT (vdef);
8989 if (!last_stmt
8990 || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
8991 last_stmt = vstmt;
8992 }
8993 }
8994 }
8995 /* This can happen when all children are pre-existing vectors or
8996 constants. */
8997 if (!last_stmt)
8998 last_stmt = vect_find_first_scalar_stmt_in_slp (node)->stmt;
8999 if (!last_stmt)
9000 {
9001 gcc_assert (seen_vector_def);
9002 si = gsi_after_labels (as_a <bb_vec_info> (vinfo)->bbs[0]);
9003 }
9004 else if (is_ctrl_altering_stmt (last_stmt))
9005 {
9006 /* We split regions to vectorize at control altering stmts
9007 with a definition so this must be an external which
9008 we can insert at the start of the region. */
9009 si = gsi_after_labels (as_a <bb_vec_info> (vinfo)->bbs[0]);
9010 }
9011 else if (is_a <bb_vec_info> (vinfo)
9012 && gimple_bb (last_stmt) != gimple_bb (stmt_info->stmt)
9013 && gimple_could_trap_p (stmt_info->stmt))
9014 {
9015 /* We've constrained possibly trapping operations to all come
9016 from the same basic-block; even if vectorized defs would allow
9017 earlier scheduling, we still force vectorized stmts into the
9018 original block. This is only necessary for BB vectorization since
9019 for loop vect all operations are in a single BB and scalar stmt
9020 based placement doesn't play well with epilogue vectorization. */
9021 gcc_assert (dominated_by_p (CDI_DOMINATORS,
9022 gimple_bb (stmt_info->stmt),
9023 gimple_bb (last_stmt)));
9024 si = gsi_after_labels (gimple_bb (stmt_info->stmt));
9025 }
9026 else if (is_a <gphi *> (last_stmt))
9027 si = gsi_after_labels (gimple_bb (last_stmt));
9028 else
9029 {
9030 si = gsi_for_stmt (last_stmt);
9031 gsi_next (&si);
9032 }
9033 }
9034
9035 /* Handle purely internal nodes. */
9036 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
9037 {
9038 /* ??? The transform kind is stored in STMT_VINFO_TYPE which might
9039 be shared with different SLP nodes (but usually it's the same
9040 operation apart from the case where the stmt is only there to denote
9041 the actual scalar lane defs ...). So do not call vect_transform_stmt
9042 but open-code it here (partly). */
9043 bool done = vectorizable_slp_permutation (vinfo, &si, node, NULL);
9044 gcc_assert (done);
9045 stmt_vec_info slp_stmt_info;
9046 unsigned int i;
9047 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, slp_stmt_info)
9048 if (STMT_VINFO_LIVE_P (slp_stmt_info))
9049 {
9050 done = vectorizable_live_operation (vinfo, slp_stmt_info, node,
9051 instance, i, true, NULL);
9052 gcc_assert (done);
9053 }
9054 }
9055 else
9056 vect_transform_stmt (vinfo, stmt_info, &si, node, instance);
9057 }
9058
9059 /* Replace scalar calls from SLP node NODE with setting of their lhs to zero.
9060 For loop vectorization this is done in vectorizable_call, but for SLP
9061 it needs to be deferred until the end of vect_schedule_slp, because multiple
9062 SLP instances may refer to the same scalar stmt. */
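/* For instance (illustrative), a scalar call "x_1 = sqrtf (a_2);" whose
   result is only used vectorized is rewritten to "x_1 = 0.0;", keeping
   the SSA definition of x_1 while dropping the call itself. */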
9063
9064 static void
9065 vect_remove_slp_scalar_calls (vec_info *vinfo,
9066 slp_tree node, hash_set<slp_tree> &visited)
9067 {
9068 gimple *new_stmt;
9069 gimple_stmt_iterator gsi;
9070 int i;
9071 slp_tree child;
9072 tree lhs;
9073 stmt_vec_info stmt_info;
9074
9075 if (!node || SLP_TREE_DEF_TYPE (node) != vect_internal_def)
9076 return;
9077
9078 if (visited.add (node))
9079 return;
9080
9081 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
9082 vect_remove_slp_scalar_calls (vinfo, child, visited);
9083
9084 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
9085 {
9086 gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt);
9087 if (!stmt || gimple_bb (stmt) == NULL)
9088 continue;
9089 if (is_pattern_stmt_p (stmt_info)
9090 || !PURE_SLP_STMT (stmt_info))
9091 continue;
9092 lhs = gimple_call_lhs (stmt);
9093 new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
9094 gsi = gsi_for_stmt (stmt);
9095 vinfo->replace_stmt (&gsi, stmt_info, new_stmt);
9096 SSA_NAME_DEF_STMT (gimple_assign_lhs (new_stmt)) = new_stmt;
9097 }
9098 }
9099
9100 static void
9101 vect_remove_slp_scalar_calls (vec_info *vinfo, slp_tree node)
9102 {
9103 hash_set<slp_tree> visited;
9104 vect_remove_slp_scalar_calls (vinfo, node, visited);
9105 }
9106
9107 /* Vectorize the instance root. */
9108
9109 void
9110 vectorize_slp_instance_root_stmt (slp_tree node, slp_instance instance)
9111 {
9112 gassign *rstmt = NULL;
9113
9114 if (instance->kind == slp_inst_kind_ctor)
9115 {
9116 if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) == 1)
9117 {
9118 tree vect_lhs = SLP_TREE_VEC_DEFS (node)[0];
9119 tree root_lhs = gimple_get_lhs (instance->root_stmts[0]->stmt);
9120 if (!useless_type_conversion_p (TREE_TYPE (root_lhs),
9121 TREE_TYPE (vect_lhs)))
9122 vect_lhs = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (root_lhs),
9123 vect_lhs);
9124 rstmt = gimple_build_assign (root_lhs, vect_lhs);
9125 }
9126 else if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) > 1)
9127 {
9128 int nelts = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
9129 tree child_def;
9130 int j;
9131 vec<constructor_elt, va_gc> *v;
9132 vec_alloc (v, nelts);
9133
9134 /* A CTOR can handle V16HI composition from VNx8HI so we
9135 do not need to convert vector elements if the types
9136 do not match. */
9137 FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (node), j, child_def)
9138 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, child_def);
9139 tree lhs = gimple_get_lhs (instance->root_stmts[0]->stmt);
9140 tree rtype
9141 = TREE_TYPE (gimple_assign_rhs1 (instance->root_stmts[0]->stmt));
9142 tree r_constructor = build_constructor (rtype, v);
9143 rstmt = gimple_build_assign (lhs, r_constructor);
9144 }
9145 }
9146 else if (instance->kind == slp_inst_kind_bb_reduc)
9147 {
9148 /* Largely inspired by reduction chain epilogue handling in
9149 vect_create_epilog_for_reduction. */
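/* Roughly (illustrative), for a signed integer PLUS reduction of two
   vector defs v0 and v1 this emits
   u0 = VIEW_CONVERT <unsigned vectype> (v0);
   u1 = VIEW_CONVERT <unsigned vectype> (v1);
   s_1 = u0 + u1;
   s_2 = .REDUC_PLUS (s_1);
   result = (signed element type) s_2;
   punning to unsigned to avoid introducing undefined overflow. */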
9150 vec<tree> vec_defs = vNULL;
9151 vect_get_slp_defs (node, &vec_defs);
9152 enum tree_code reduc_code
9153 = gimple_assign_rhs_code (instance->root_stmts[0]->stmt);
9154 /* ??? We actually have to reflect signs somewhere. */
9155 if (reduc_code == MINUS_EXPR)
9156 reduc_code = PLUS_EXPR;
9157 gimple_seq epilogue = NULL;
9158 /* We may end up with more than one vector result; reduce them
9159 to one vector. */
9160 tree vec_def = vec_defs[0];
9161 tree vectype = TREE_TYPE (vec_def);
9162 tree compute_vectype = vectype;
9163 bool pun_for_overflow_p = (ANY_INTEGRAL_TYPE_P (vectype)
9164 && TYPE_OVERFLOW_UNDEFINED (vectype)
9165 && operation_can_overflow (reduc_code));
9166 if (pun_for_overflow_p)
9167 {
9168 compute_vectype = unsigned_type_for (vectype);
9169 vec_def = gimple_build (&epilogue, VIEW_CONVERT_EXPR,
9170 compute_vectype, vec_def);
9171 }
9172 for (unsigned i = 1; i < vec_defs.length (); ++i)
9173 {
9174 tree def = vec_defs[i];
9175 if (pun_for_overflow_p)
9176 def = gimple_build (&epilogue, VIEW_CONVERT_EXPR,
9177 compute_vectype, def);
9178 vec_def = gimple_build (&epilogue, reduc_code, compute_vectype,
9179 vec_def, def);
9180 }
9181 vec_defs.release ();
9182 /* ??? Support other schemes than direct internal fn. */
9183 internal_fn reduc_fn;
9184 if (!reduction_fn_for_scalar_code (reduc_code, &reduc_fn)
9185 || reduc_fn == IFN_LAST)
9186 gcc_unreachable ();
9187 tree scalar_def = gimple_build (&epilogue, as_combined_fn (reduc_fn),
9188 TREE_TYPE (compute_vectype), vec_def);
9189 if (!SLP_INSTANCE_REMAIN_DEFS (instance).is_empty ())
9190 {
9191 tree rem_def = NULL_TREE;
9192 for (auto def : SLP_INSTANCE_REMAIN_DEFS (instance))
9193 {
9194 def = gimple_convert (&epilogue, TREE_TYPE (scalar_def), def);
9195 if (!rem_def)
9196 rem_def = def;
9197 else
9198 rem_def = gimple_build (&epilogue, reduc_code,
9199 TREE_TYPE (scalar_def),
9200 rem_def, def);
9201 }
9202 scalar_def = gimple_build (&epilogue, reduc_code,
9203 TREE_TYPE (scalar_def),
9204 scalar_def, rem_def);
9205 }
9206 scalar_def = gimple_convert (&epilogue,
9207 TREE_TYPE (vectype), scalar_def);
9208 gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmts[0]->stmt);
9209 gsi_insert_seq_before (&rgsi, epilogue, GSI_SAME_STMT);
9210 gimple_assign_set_rhs_from_tree (&rgsi, scalar_def);
9211 update_stmt (gsi_stmt (rgsi));
9212 return;
9213 }
9214 else
9215 gcc_unreachable ();
9216
9217 gcc_assert (rstmt);
9218
9219 gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmts[0]->stmt);
9220 gsi_replace (&rgsi, rstmt, true);
9221 }
9222
9223 struct slp_scc_info
9224 {
9225 bool on_stack;
9226 int dfs;
9227 int lowlink;
9228 };
9229
9230 /* Schedule the SLP INSTANCE doing a DFS walk and collecting SCCs. */
9231
9232 static void
9233 vect_schedule_scc (vec_info *vinfo, slp_tree node, slp_instance instance,
9234 hash_map<slp_tree, slp_scc_info> &scc_info,
9235 int &maxdfs, vec<slp_tree> &stack)
9236 {
9237 bool existed_p;
9238 slp_scc_info *info = &scc_info.get_or_insert (node, &existed_p);
9239 gcc_assert (!existed_p);
9240 info->dfs = maxdfs;
9241 info->lowlink = maxdfs;
9242 maxdfs++;
9243
9244 /* Leaf. */
9245 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
9246 {
9247 info->on_stack = false;
9248 vect_schedule_slp_node (vinfo, node, instance);
9249 return;
9250 }
9251
9252 info->on_stack = true;
9253 stack.safe_push (node);
9254
9255 unsigned i;
9256 slp_tree child;
9257 /* DFS recurse. */
9258 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
9259 {
9260 if (!child)
9261 continue;
9262 slp_scc_info *child_info = scc_info.get (child);
9263 if (!child_info)
9264 {
9265 vect_schedule_scc (vinfo, child, instance, scc_info, maxdfs, stack);
9266 /* Recursion might have re-allocated the hash map entries, invalidating INFO. */
9267 info = scc_info.get (node);
9268 child_info = scc_info.get (child);
9269 info->lowlink = MIN (info->lowlink, child_info->lowlink);
9270 }
9271 else if (child_info->on_stack)
9272 info->lowlink = MIN (info->lowlink, child_info->dfs);
9273 }
9274 if (info->lowlink != info->dfs)
9275 return;
9276
9277 auto_vec<slp_tree, 4> phis_to_fixup;
9278
9279 /* Singleton. */
9280 if (stack.last () == node)
9281 {
9282 stack.pop ();
9283 info->on_stack = false;
9284 vect_schedule_slp_node (vinfo, node, instance);
9285 if (SLP_TREE_CODE (node) != VEC_PERM_EXPR
9286 && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (node)->stmt))
9287 phis_to_fixup.quick_push (node);
9288 }
9289 else
9290 {
9291 /* SCC. */
9292 int last_idx = stack.length () - 1;
9293 while (stack[last_idx] != node)
9294 last_idx--;
9295 /* We can break the cycle at PHIs which have at least one child
9296 already code generated. Then we could re-start the DFS walk until
9297 all nodes in the SCC are covered (we might have new entries
9298 for only back-reachable nodes). But it's simpler to just
9299 iterate and schedule those that are ready. */
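/* For example (illustrative), in a reduction cycle PHI <-> add the PHI
   is ready first because its preheader child is already scheduled and
   thus off the stack; once the PHI is code generated and popped the add
   becomes ready, and the PHI's backedge argument is filled in by the
   fixup below. */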
9300 unsigned todo = stack.length () - last_idx;
9301 do
9302 {
9303 for (int idx = stack.length () - 1; idx >= last_idx; --idx)
9304 {
9305 slp_tree entry = stack[idx];
9306 if (!entry)
9307 continue;
9308 bool phi = (SLP_TREE_CODE (entry) != VEC_PERM_EXPR
9309 && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (entry)->stmt));
9310 bool ready = !phi;
9311 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (entry), i, child)
9312 if (!child)
9313 {
9314 gcc_assert (phi);
9315 ready = true;
9316 break;
9317 }
9318 else if (scc_info.get (child)->on_stack)
9319 {
9320 if (!phi)
9321 {
9322 ready = false;
9323 break;
9324 }
9325 }
9326 else
9327 {
9328 if (phi)
9329 {
9330 ready = true;
9331 break;
9332 }
9333 }
9334 if (ready)
9335 {
9336 vect_schedule_slp_node (vinfo, entry, instance);
9337 scc_info.get (entry)->on_stack = false;
9338 stack[idx] = NULL;
9339 todo--;
9340 if (phi)
9341 phis_to_fixup.safe_push (entry);
9342 }
9343 }
9344 }
9345 while (todo != 0);
9346
9347 /* Pop the SCC. */
9348 stack.truncate (last_idx);
9349 }
9350
9351 /* Now fixup the backedge def of the vectorized PHIs in this SCC. */
9352 slp_tree phi_node;
9353 FOR_EACH_VEC_ELT (phis_to_fixup, i, phi_node)
9354 {
9355 gphi *phi = as_a <gphi *> (SLP_TREE_REPRESENTATIVE (phi_node)->stmt);
9356 edge_iterator ei;
9357 edge e;
9358 FOR_EACH_EDGE (e, ei, gimple_bb (phi)->preds)
9359 {
9360 unsigned dest_idx = e->dest_idx;
9361 child = SLP_TREE_CHILDREN (phi_node)[dest_idx];
9362 if (!child || SLP_TREE_DEF_TYPE (child) != vect_internal_def)
9363 continue;
9364 unsigned n = SLP_TREE_VEC_DEFS (phi_node).length ();
9365 /* Simply fill all args. */
9366 if (STMT_VINFO_DEF_TYPE (SLP_TREE_REPRESENTATIVE (phi_node))
9367 != vect_first_order_recurrence)
9368 for (unsigned i = 0; i < n; ++i)
9369 {
9370 tree phidef = SLP_TREE_VEC_DEFS (phi_node)[i];
9371 gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (phidef));
9372 add_phi_arg (phi, vect_get_slp_vect_def (child, i),
9373 e, gimple_phi_arg_location (phi, dest_idx));
9374 }
9375 else
9376 {
9377 /* Unless it is a first-order recurrence, which needs
9378 args filled in for both the PHI node and the permutes. */
9379 gimple *perm
9380 = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (phi_node)[0]);
9381 gimple *rphi = SSA_NAME_DEF_STMT (gimple_assign_rhs1 (perm));
9382 add_phi_arg (as_a <gphi *> (rphi),
9383 vect_get_slp_vect_def (child, n - 1),
9384 e, gimple_phi_arg_location (phi, dest_idx));
9385 for (unsigned i = 0; i < n; ++i)
9386 {
9387 gimple *perm
9388 = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (phi_node)[i]);
9389 if (i > 0)
9390 gimple_assign_set_rhs1 (perm,
9391 vect_get_slp_vect_def (child, i - 1));
9392 gimple_assign_set_rhs2 (perm,
9393 vect_get_slp_vect_def (child, i));
9394 update_stmt (perm);
9395 }
9396 }
9397 }
9398 }
9399 }
9400
9401 /* Generate vector code for SLP_INSTANCES in the loop/basic block. */
9402
9403 void
9404 vect_schedule_slp (vec_info *vinfo, const vec<slp_instance> &slp_instances)
9405 {
9406 slp_instance instance;
9407 unsigned int i;
9408
9409 hash_map<slp_tree, slp_scc_info> scc_info;
9410 int maxdfs = 0;
9411 FOR_EACH_VEC_ELT (slp_instances, i, instance)
9412 {
9413 slp_tree node = SLP_INSTANCE_TREE (instance);
9414 if (dump_enabled_p ())
9415 {
9416 dump_printf_loc (MSG_NOTE, vect_location,
9417 "Vectorizing SLP tree:\n");
9418 /* ??? Dump all? */
9419 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
9420 dump_printf_loc (MSG_NOTE, vect_location, "Root stmt: %G",
9421 SLP_INSTANCE_ROOT_STMTS (instance)[0]->stmt);
9422 vect_print_slp_graph (MSG_NOTE, vect_location,
9423 SLP_INSTANCE_TREE (instance));
9424 }
9425 /* Schedule the tree of INSTANCE, scheduling SCCs in a way that
9426 makes a PHI the node breaking the cycle. */
9427 auto_vec<slp_tree> stack;
9428 if (!scc_info.get (node))
9429 vect_schedule_scc (vinfo, node, instance, scc_info, maxdfs, stack);
9430
9431 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
9432 vectorize_slp_instance_root_stmt (node, instance);
9433
9434 if (dump_enabled_p ())
9435 dump_printf_loc (MSG_NOTE, vect_location,
9436 "vectorizing stmts using SLP.\n");
9437 }
9438
9439 FOR_EACH_VEC_ELT (slp_instances, i, instance)
9440 {
9441 slp_tree root = SLP_INSTANCE_TREE (instance);
9442 stmt_vec_info store_info;
9443 unsigned int j;
9444
9445 /* Remove scalar call stmts. Do not do this for basic-block
9446 vectorization as not all uses may be vectorized.
9447 ??? Why should this be necessary? DCE should be able to
9448 remove the stmts itself.
9449 ??? For BB vectorization we can as well remove scalar
9450 stmts starting from the SLP tree root if they have no
9451 uses. */
9452 if (is_a <loop_vec_info> (vinfo))
9453 vect_remove_slp_scalar_calls (vinfo, root);
9454
9455 /* Remove the original scalar stmts of vectorized stores. */
9456 for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store_info); j++)
9457 {
9458 if (!STMT_VINFO_DATA_REF (store_info)
9459 || !DR_IS_WRITE (STMT_VINFO_DATA_REF (store_info)))
9460 break;
9461
9462 store_info = vect_orig_stmt (store_info);
9463 /* Free the attached stmt_vec_info and remove the stmt. */
9464 vinfo->remove_stmt (store_info);
9465
9466 /* Invalidate SLP_TREE_REPRESENTATIVE in case we released it
9467 so that we do not crash in vect_free_slp_tree later. */
9468 if (SLP_TREE_REPRESENTATIVE (root) == store_info)
9469 SLP_TREE_REPRESENTATIVE (root) = NULL;
9470 }
9471 }
9472 }