gcc/tree-vect-loop.cc

   1 /* Loop Vectorization
   2    Copyright (C) 2003-2024 Free Software Foundation, Inc.
   3    Contributed by Dorit Naishlos <dorit@il.ibm.com> and
   4    Ira Rosen <irar@il.ibm.com>
   5
   6 This file is part of GCC.
   7
   8 GCC is free software; you can redistribute it and/or modify it under
   9 the terms of the GNU General Public License as published by the Free
  10 Software Foundation; either version 3, or (at your option) any later
  11 version.
  12
  13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  16 for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with GCC; see the file COPYING3.  If not see
  20 <http://www.gnu.org/licenses/>.  */
  21
  22 #define INCLUDE_ALGORITHM
  23 #include "config.h"
  24 #include "system.h"
  25 #include "coretypes.h"
  26 #include "backend.h"
  27 #include "target.h"
  28 #include "rtl.h"
  29 #include "tree.h"
  30 #include "gimple.h"
  31 #include "cfghooks.h"
  32 #include "tree-pass.h"
  33 #include "ssa.h"
  34 #include "optabs-tree.h"
  35 #include "memmodel.h"
  36 #include "optabs.h"
  37 #include "diagnostic-core.h"
  38 #include "fold-const.h"
  39 #include "stor-layout.h"
  40 #include "cfganal.h"
  41 #include "gimplify.h"
  42 #include "gimple-iterator.h"
  43 #include "gimplify-me.h"
  44 #include "tree-ssa-loop-ivopts.h"
  45 #include "tree-ssa-loop-manip.h"
  46 #include "tree-ssa-loop-niter.h"
  47 #include "tree-ssa-loop.h"
  48 #include "cfgloop.h"
  49 #include "tree-scalar-evolution.h"
  50 #include "tree-vectorizer.h"
  51 #include "gimple-fold.h"
  52 #include "cgraph.h"
  53 #include "tree-cfg.h"
  54 #include "tree-if-conv.h"
  55 #include "internal-fn.h"
  56 #include "tree-vector-builder.h"
  57 #include "vec-perm-indices.h"
  58 #include "tree-eh.h"
  59 #include "case-cfn-macros.h"
  60 #include "langhooks.h"
  61
  62 /* Loop Vectorization Pass.
  63
  64    This pass tries to vectorize loops.
  65
  66    For example, the vectorizer transforms the following simple loop:
  67
  68         short a[N]; short b[N]; short c[N]; int i;
  69
  70         for (i=0; i<N; i++){
  71           a[i] = b[i] + c[i];
  72         }
  73
  74    as if it was manually vectorized by rewriting the source code into:
  75
  76         typedef int __attribute__((mode(V8HI))) v8hi;
  77         short a[N];  short b[N]; short c[N];   int i;
  78         v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
  79         v8hi va, vb, vc;
  80
  81         for (i=0; i<N/8; i++){
  82           vb = pb[i];
  83           vc = pc[i];
  84           va = vb + vc;
  85           pa[i] = va;
  86         }
  87
  88         The main entry to this pass is vectorize_loops(), in which
  89    the vectorizer applies a set of analyses on a given set of loops,
  90    followed by the actual vectorization transformation for the loops that
  91    had successfully passed the analysis phase.
  92         Throughout this pass we make a distinction between two types of
  93    data: scalars (which are represented by SSA_NAMES), and memory references
  94    ("data-refs").  These two types of data require different handling both
  95    during analysis and transformation. The types of data-refs that the
   96    vectorizer currently supports are ARRAY_REFS whose base is an array DECL
  97    (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
  98    accesses are required to have a simple (consecutive) access pattern.
  99
 100    Analysis phase:
 101    ===============
 102         The driver for the analysis phase is vect_analyze_loop().
 103    It applies a set of analyses, some of which rely on the scalar evolution
 104    analyzer (scev) developed by Sebastian Pop.
 105
 106         During the analysis phase the vectorizer records some information
 107    per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
 108    loop, as well as general information about the loop as a whole, which is
 109    recorded in a "loop_vec_info" struct attached to each loop.
 110
 111    Transformation phase:
 112    =====================
 113         The loop transformation phase scans all the stmts in the loop, and
 114    creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
 115    the loop that needs to be vectorized.  It inserts the vector code sequence
 116    just before the scalar stmt S, and records a pointer to the vector code
 117    in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
 118    attached to S).  This pointer will be used for the vectorization of following
 119    stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
 120    otherwise, we rely on dead code elimination for removing it.
 121
 122         For example, say stmt S1 was vectorized into stmt VS1:
 123
 124    VS1: vb = px[i];
 125    S1:  b = x[i];    STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 126    S2:  a = b;
 127
 128    To vectorize stmt S2, the vectorizer first finds the stmt that defines
 129    the operand 'b' (S1), and gets the relevant vector def 'vb' from the
 130    vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)).  The
 131    resulting sequence would be:
 132
 133    VS1: vb = px[i];
 134    S1:  b = x[i];       STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 135    VS2: va = vb;
 136    S2:  a = b;          STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
 137
 138         Operands that are not SSA_NAMEs, are data-refs that appear in
 139    load/store operations (like 'x[i]' in S1), and are handled differently.
 140
 141    Target modeling:
 142    =================
 143         Currently the only target specific information that is used is the
 144    size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
 145    Targets that can support different sizes of vectors, for now will need
 146    to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".  More
 147    flexibility will be added in the future.
 148
  149         Since we only vectorize operations whose vector form can be
  150    expressed using existing tree codes, to verify that an operation is
  151    supported, the vectorizer checks the relevant optab at the relevant
  152    machine_mode (e.g., optab_handler (add_optab, V8HImode)).  If
 153    the value found is CODE_FOR_nothing, then there's no target support, and
 154    we can't vectorize the stmt.
 155
 156    For additional information on this project see:
 157    http://gcc.gnu.org/projects/tree-ssa/vectorization.html
 158 */
 159
 160 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *,
 161                                                 unsigned *);
 162 static stmt_vec_info vect_is_simple_reduction (loop_vec_info, stmt_vec_info,
 163                                                bool *, bool *, bool);
 164
  165 /* Subroutine of vect_determine_vf_for_stmt that handles only one
  166    statement.  VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
  167    may already be set for general statements (not just data refs).
         A statement that is neither relevant nor live, or that is a clobber,
         is skipped and reported as success.  Otherwise a failure from
         vect_get_vector_types_for_stmt is returned as is; on success the
         statement vectype (if any) is recorded in STMT_VINFO_VECTYPE and the
         nunits vectype (if any) is handed to vect_update_max_nunits together
         with VF.  */
  168
  169 static opt_result
  170 vect_determine_vf_for_stmt_1 (vec_info *vinfo, stmt_vec_info stmt_info,
  171                               bool vectype_maybe_set_p,
  172                               poly_uint64 *vf)
  173 {
  174   gimple *stmt = stmt_info->stmt;
  175
       /* Statements that are neither relevant nor live, and clobbers, are
          skipped without touching STMT_VINFO_VECTYPE or *VF.  */
  176   if ((!STMT_VINFO_RELEVANT_P (stmt_info)
  177        && !STMT_VINFO_LIVE_P (stmt_info))
  178       || gimple_clobber_p (stmt))
  179     {
  180       if (dump_enabled_p ())
  181         dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
  182       return opt_result::success ();
  183     }
  184
       /* Ask for both the vectype of the statement itself and the vectype
          that is used for the VF update below; either may come back null.  */
  185   tree stmt_vectype, nunits_vectype;
  186   opt_result res = vect_get_vector_types_for_stmt (vinfo, stmt_info,
  187                                                    &stmt_vectype,
  188                                                    &nunits_vectype);
  189   if (!res)
  190     return res;
  191
  192   if (stmt_vectype)
  193     {
  194       if (STMT_VINFO_VECTYPE (stmt_info))
  195         /* The only case when a vectype had been already set is for stmts
  196            that contain a data ref, or for "pattern-stmts" (stmts generated
  197            by the vectorizer to represent/replace a certain idiom).  */
             /* A vectype that is already present must also agree with the one
                just computed.  */
  198         gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
  199                      || vectype_maybe_set_p)
  200                     && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
  201       else
  202         STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
  203     }
  204
       /* Only statements that yielded a nunits vectype contribute to *VF.  */
  205   if (nunits_vectype)
  206     vect_update_max_nunits (vf, nunits_vectype);
  207
  208   return opt_result::success ();
  209 }
 210
  211 /* Subroutine of vect_determine_vectorization_factor.  Set the vector
  212    types of STMT_INFO and all attached pattern statements and update
  213    the vectorization factor VF accordingly.  Return opt_result::success ()
  214    on success, or the failing opt_result of the first statement for which
         vect_determine_vf_for_stmt_1 failed.  */
  215
  216 static opt_result
  217 vect_determine_vf_for_stmt (vec_info *vinfo,
  218                             stmt_vec_info stmt_info, poly_uint64 *vf)
  219 {
  220   if (dump_enabled_p ())
  221     dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: %G",
  222                      stmt_info->stmt);
       /* The original statement is processed with VECTYPE_MAYBE_SET_P false;
          the pattern-related statements below are processed with it true.  */
  223   opt_result res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, false, vf);
  224   if (!res)
  225     return res;
  226
  227   if (STMT_VINFO_IN_PATTERN_P (stmt_info)
  228       && STMT_VINFO_RELATED_STMT (stmt_info))
  229     {
           /* Read the pattern def sequence from the original statement before
              STMT_INFO is redirected to the related (pattern) statement.  */
  230       gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
  231       stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
  232
  233       /* If a pattern statement has def stmts, analyze them too.  */
  234       for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
  235            !gsi_end_p (si); gsi_next (&si))
  236         {
  237           stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
  238           if (dump_enabled_p ())
  239             dump_printf_loc (MSG_NOTE, vect_location,
  240                              "==> examining pattern def stmt: %G",
  241                              def_stmt_info->stmt);
  242           res = vect_determine_vf_for_stmt_1 (vinfo, def_stmt_info, true, vf);
  243           if (!res)
  244             return res;
  245         }
  246
           /* Finally handle the pattern statement itself.  */
  247       if (dump_enabled_p ())
  248         dump_printf_loc (MSG_NOTE, vect_location,
  249                          "==> examining pattern statement: %G",
  250                          stmt_info->stmt);
  251       res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, true, vf);
  252       if (!res)
  253         return res;
  254     }
  255
  256   return opt_result::success ();
  257 }
 258
  259 /* Function vect_determine_vectorization_factor
  260
  261    Determine the vectorization factor (VF).  VF is the number of data elements
  262    that are operated upon in parallel in a single iteration of the vectorized
  263    loop.  For example, when vectorizing a loop that operates on 4byte elements,
  264    on a target with vector size (VS) 16byte, the VF is set to 4, since 4
  265    elements can fit in a single vector register.
  266
  267    We currently support vectorization of loops in which all types operated upon
  268    are of the same size.  Therefore this function currently sets VF according to
  269    the size of the types operated upon, and fails if there are multiple sizes
  270    in the loop.
  271
  272    VF is also the factor by which the loop iterations are strip-mined, e.g.:
  273    original loop:
  274         for (i=0; i<N; i++){
  275           a[i] = b[i] + c[i];
  276         }
  277
  278    vectorized loop:
  279         for (i=0; i<N; i+=VF){
  280           a[i:VF] = b[i:VF] + c[i:VF];
  281         }

         The result is stored in LOOP_VINFO_VECT_FACTOR (LOOP_VINFO).  The
         function fails when a relevant or live PHI has no vectype for its
         scalar type, when vect_determine_vf_for_stmt fails for a statement,
         or when the final VF is known to be <= 1.

         NOTE(review): the "same size" paragraph above looks dated.  Nothing
         visible in this function rejects mixed sizes; each vectype is simply
         passed to vect_update_max_nunits.  Confirm and refresh the text.
  282 */
  283
  284 static opt_result
  285 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
  286 {
  287   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  288   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
  289   unsigned nbbs = loop->num_nodes;
  290   poly_uint64 vectorization_factor = 1;
  291   tree scalar_type = NULL_TREE;
  292   gphi *phi;
  293   tree vectype;
  294   stmt_vec_info stmt_info;
  295   unsigned i;
  296
  297   DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");
  298
       /* Walk every basic block of the loop: PHIs first, then the other
          statements of the block.  */
  299   for (i = 0; i < nbbs; i++)
  300     {
  301       basic_block bb = bbs[i];
  302
  303       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
  304            gsi_next (&si))
  305         {
  306           phi = si.phi ();
  307           stmt_info = loop_vinfo->lookup_stmt (phi);
  308           if (dump_enabled_p ())
  309             dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: %G",
  310                              (gimple *) phi);
  311
  312           gcc_assert (stmt_info);
  313
               /* Only relevant or live PHIs get a vectype and contribute to
                  the vectorization factor.  */
  314           if (STMT_VINFO_RELEVANT_P (stmt_info)
  315               || STMT_VINFO_LIVE_P (stmt_info))
  316             {
  317               gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
  318               scalar_type = TREE_TYPE (PHI_RESULT (phi));
  319
  320               if (dump_enabled_p ())
  321                 dump_printf_loc (MSG_NOTE, vect_location,
  322                                  "get vectype for scalar type:  %T\n",
  323                                  scalar_type);
  324
  325               vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
  326               if (!vectype)
  327                 return opt_result::failure_at (phi,
  328                                                "not vectorized: unsupported "
  329                                                "data-type %T\n",
  330                                                scalar_type);
  331               STMT_VINFO_VECTYPE (stmt_info) = vectype;
  332
  333               if (dump_enabled_p ())
  334                 dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n",
  335                                  vectype);
  336
  337               if (dump_enabled_p ())
  338                 {
  339                   dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
  340                   dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
  341                   dump_printf (MSG_NOTE, "\n");
  342                 }
  343
  344               vect_update_max_nunits (&vectorization_factor, vectype);
  345             }
  346         }
  347
           /* Non-PHI statements (debug statements excluded) are delegated to
              vect_determine_vf_for_stmt, which also covers pattern stmts.  */
  348       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
  349            gsi_next (&si))
  350         {
  351           if (is_gimple_debug (gsi_stmt (si)))
  352             continue;
  353           stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
  354           opt_result res
  355             = vect_determine_vf_for_stmt (loop_vinfo,
  356                                           stmt_info, &vectorization_factor);
  357           if (!res)
  358             return res;
  359         }
  360     }
  361
  362   /* TODO: Analyze cost. Decide if worth while to vectorize.  */
  363   if (dump_enabled_p ())
  364     {
  365       dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
  366       dump_dec (MSG_NOTE, vectorization_factor);
  367       dump_printf (MSG_NOTE, "\n");
  368     }
  369
       /* VF starts at 1, so a value known to be <= 1 here means no statement
          raised it; that is reported as a failure.  */
  370   if (known_le (vectorization_factor, 1U))
  371     return opt_result::failure_at (vect_location,
  372                                    "not vectorized: unsupported data-type\n");
  373   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
  374   return opt_result::success ();
  375 }
 376
 377
  378 /* Function vect_is_simple_iv_evolution.
  379
  380    FORNOW: A simple evolution of an induction variable in the loop is
  381    considered a polynomial evolution.

         ACCESS_FN is examined with respect to the loop numbered LOOP_NB.
         Return false if it has no evolution part in that loop or if that
         part is itself a chrec.  Otherwise store the unshared initial
         condition in *INIT and the evolution part in *STEP, and return true
         only if the step has one of the accepted forms checked below.  Note
         that *INIT and *STEP are written before the step is validated, so
         they are also set when the function returns false with
         "step unknown".  */
  382
  383 static bool
  384 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
  385                              tree * step)
  386 {
  387   tree init_expr;
  388   tree step_expr;
  389   tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
  390   basic_block bb;
  391
  392   /* When there is no evolution in this loop, the evolution function
  393      is not "simple".  */
  394   if (evolution_part == NULL_TREE)
  395     return false;
  396
  397   /* When the evolution is a polynomial of degree >= 2
  398      the evolution function is not "simple".  */
  399   if (tree_is_chrec (evolution_part))
  400     return false;
  401
  402   step_expr = evolution_part;
  403   init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
  404
  405   if (dump_enabled_p ())
  406     dump_printf_loc (MSG_NOTE, vect_location, "step: %T,  init: %T\n",
  407                      step_expr, init_expr);
  408
  409   *init = init_expr;
  410   *step = step_expr;
  411
       /* The step is accepted when it is
            - an INTEGER_CST, or
            - an SSA_NAME whose defining statement has no basic block or a
              basic block outside loop LOOP_NB, and whose type is integral or
              (with flag_associative_math) scalar floating point, or
            - a REAL_CST with flag_associative_math.
          Anything else is rejected.  BB is assigned inside the condition.  */
  412   if (TREE_CODE (step_expr) != INTEGER_CST
  413       && (TREE_CODE (step_expr) != SSA_NAME
  414           || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
  415               && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
  416           || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
  417               && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
  418                   || !flag_associative_math)))
  419       && (TREE_CODE (step_expr) != REAL_CST
  420           || !flag_associative_math))
  421     {
  422       if (dump_enabled_p ())
  423         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
  424                          "step unknown.\n");
  425       return false;
  426     }
  427
  428   return true;
  429 }
 430
  431 /* Function vect_is_nonlinear_iv_evolution
  432
  433    Only support nonlinear induction for integer type
  434    1. neg
  435    2. mul by constant
  436    3. lshift/rshift by constant.
  437
  438    For neg induction, return a fake step as integer -1.

         LOOP_PHI_NODE must have exactly two arguments; its preheader argument
         becomes *INIT and its latch argument must be an SSA name defined by
         an assignment of one of the forms above whose first operand is the
         PHI result.  On success the evolution type, base and step are also
         recorded in STMT_INFO and true is returned.  *INIT is written as soon
         as the integer-type check passes, so it may be set even when the
         function later returns false.  */
  439 static bool
  440 vect_is_nonlinear_iv_evolution (class loop* loop, stmt_vec_info stmt_info,
  441                                 gphi* loop_phi_node, tree *init, tree *step)
  442 {
  443   tree init_expr, ev_expr, result, op1, op2;
  444   gimple* def;
  445
  446   if (gimple_phi_num_args (loop_phi_node) != 2)
  447     return false;
  448
  449   init_expr = PHI_ARG_DEF_FROM_EDGE (loop_phi_node, loop_preheader_edge (loop));
  450   ev_expr = PHI_ARG_DEF_FROM_EDGE (loop_phi_node, loop_latch_edge (loop));
  451
  452   /* Support nonlinear induction only for integer type.  */
  453   if (!INTEGRAL_TYPE_P (TREE_TYPE (init_expr)))
  454     return false;
  455
  456   *init = init_expr;
  457   result = PHI_RESULT (loop_phi_node);
  458
       /* The middle operand only assigns DEF: the comma expression always
          yields false, so evaluation continues with the is_gimple_assign
          test on the freshly assigned DEF.  */
  459   if (TREE_CODE (ev_expr) != SSA_NAME
  460       || ((def = SSA_NAME_DEF_STMT (ev_expr)), false)
  461       || !is_gimple_assign (def))
  462     return false;
  463
  464   enum tree_code t_code = gimple_assign_rhs_code (def);
  465   switch (t_code)
  466     {
  467     case NEGATE_EXPR:
           /* The negated operand must be the PHI result itself.  */
  468       if (gimple_assign_rhs1 (def) != result)
  469         return false;
  470       *step = build_int_cst (TREE_TYPE (init_expr), -1);
  471       STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_neg;
  472       break;
  473
  474     case RSHIFT_EXPR:
  475     case LSHIFT_EXPR:
  476     case MULT_EXPR:
           /* Require the form RESULT <op> INTEGER_CST; the constant is the
              step.  */
  477       op1 = gimple_assign_rhs1 (def);
  478       op2 = gimple_assign_rhs2 (def);
  479       if (TREE_CODE (op2) != INTEGER_CST
  480           || op1 != result)
  481         return false;
  482       *step = op2;
  483       if (t_code == LSHIFT_EXPR)
  484         STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_shl;
  485       else if (t_code == RSHIFT_EXPR)
  486         STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_shr;
  487       /* Only MULT_EXPR is left at this point; NEGATE_EXPR has its own case
              above and is recorded as vect_step_op_neg.  */
  488       else
  489         STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_mul;
  490       break;
  491
  492     default:
  493       return false;
  494     }
  495
       /* Record base and step on STMT_INFO.  */
  496   STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_info) = *init;
  497   STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info) = *step;
  498
  499   return true;
  500 }
 501
  502 /* Return true if PHI is the inner PHI in
  503    what we are assuming is a double reduction.  For example, given
  504    a structure like this:
  505
  506       outer1:
  507         x_1 = PHI <x_4(outer2), ...>;
  508         ...
  509
  510       inner:
  511         x_2 = PHI <x_1(outer1), ...>;
  512         ...
  513         x_3 = ...;
  514         ...
  515
  516       outer2:
  517         x_4 = PHI <x_3(inner)>;
  518         ...
  519
  520    outer loop analysis would treat x_1 as a double reduction phi and
  521    this function would then return true for x_2.

         The test itself: some argument of PHI has a definition known to
         LOOP_VINFO whose def type is vect_double_reduction_def.  */
  522
  523 static bool
  524 vect_inner_phi_in_double_reduction_p (loop_vec_info loop_vinfo, gphi *phi)
  525 {
  526   use_operand_p use_p;
  527   ssa_op_iter op_iter;
       /* Arguments for which lookup_def returns nothing are ignored.  */
  528   FOR_EACH_PHI_ARG (use_p, phi, op_iter, SSA_OP_USE)
  529     if (stmt_vec_info def_info = loop_vinfo->lookup_def (USE_FROM_PTR (use_p)))
  530       if (STMT_VINFO_DEF_TYPE (def_info) == vect_double_reduction_def)
  531         return true;
  532   return false;
  533 }
 534
  535 /* Returns true if Phi is a first-order recurrence. A first-order
  536    recurrence is a non-reduction recurrence relation in which the value of
  537    the recurrence in the current loop iteration equals a value defined in
  538    the previous iteration.

         The checks performed are: LOOP is the loop LOOP_VINFO describes; the
         latch argument of PHI is a non-default SSA name defined by a non-PHI
         statement inside LOOP; every non-debug use of the PHI result is in a
         statement other than that defining statement and is dominated by it
         (per vect_stmt_dominates_stmt_p); and a vector type exists for the
         type of the PHI result.  */
  539
  540 static bool
  541 vect_phi_first_order_recurrence_p (loop_vec_info loop_vinfo, class loop *loop,
  542                                    gphi *phi)
  543 {
  544   /* A nested cycle isn't vectorizable as first order recurrence.  */
  545   if (LOOP_VINFO_LOOP (loop_vinfo) != loop)
  546     return false;
  547
  548   /* Ensure the loop latch definition is from within the loop.  */
  549   edge latch = loop_latch_edge (loop);
  550   tree ldef = PHI_ARG_DEF_FROM_EDGE (phi, latch);
  551   if (TREE_CODE (ldef) != SSA_NAME
  552       || SSA_NAME_IS_DEFAULT_DEF (ldef)
  553       || is_a <gphi *> (SSA_NAME_DEF_STMT (ldef))
  554       || !flow_bb_inside_loop_p (loop, gimple_bb (SSA_NAME_DEF_STMT (ldef))))
  555     return false;
  556
  557   tree def = gimple_phi_result (phi);
  558
  559   /* Ensure every use_stmt of the phi node is dominated by the latch
  560      definition.  */
       /* A use inside the latch definition statement itself also disqualifies
          the PHI; debug uses are ignored.  */
  561   imm_use_iterator imm_iter;
  562   use_operand_p use_p;
  563   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, def)
  564     if (!is_gimple_debug (USE_STMT (use_p))
  565         && (SSA_NAME_DEF_STMT (ldef) == USE_STMT (use_p)
  566             || !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (ldef),
  567                                             USE_STMT (use_p))))
  568       return false;
  569
  570   /* First-order recurrence autovectorization needs shuffle vector.  */
       /* NOTE(review): only the existence of a vector type is tested here;
          no permute/shuffle capability is queried in this function.  */
  571   tree scalar_type = TREE_TYPE (def);
  572   tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
  573   if (!vectype)
  574     return false;
  575
  576   return true;
  577 }
 578
  579 /* Function vect_analyze_scalar_cycles_1.
  580
  581    Examine the cross iteration def-use cycles of scalar variables
  582    in LOOP.  LOOP_VINFO represents the loop that is now being
  583    considered for vectorization (can be LOOP, or an outer-loop
  584    enclosing LOOP).  SLP indicates there will be some subsequent
  585    slp analyses or not.

         The header PHIs of LOOP are classified in two passes.  The first pass
         marks inductions (vect_induction_def) and queues every other
         non-virtual PHI.  The second pass classifies the queued PHIs as
         double reduction, nested cycle, reduction or first-order recurrence;
         PHIs matching none of these keep vect_unknown_def_type.  */
  586
  587 static void
  588 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, class loop *loop,
  589                               bool slp)
  590 {
  591   basic_block bb = loop->header;
  592   tree init, step;
  593   auto_vec<stmt_vec_info, 64> worklist;
  594   gphi_iterator gsi;
  595   bool double_reduc, reduc_chain;
  596
  597   DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
  598
  599   /* First - identify all inductions.  Reduction detection assumes that all the
  600      inductions have been identified, therefore, this order must not be
  601      changed.  */
  602   for (gsi = gsi_start_phis  (bb); !gsi_end_p (gsi); gsi_next (&gsi))
  603     {
  604       gphi *phi = gsi.phi ();
  605       tree access_fn = NULL;
  606       tree def = PHI_RESULT (phi);
  607       stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);
  608
  609       if (dump_enabled_p ())
  610         dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
  611                          (gimple *) phi);
  612
  613       /* Skip virtual phi's.  The data dependences that are associated with
  614          virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.  */
  615       if (virtual_operand_p (def))
  616         continue;
  617
  618       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
  619
  620       /* Analyze the evolution function.  */
  621       access_fn = analyze_scalar_evolution (loop, def);
  622       if (access_fn)
  623         {
  624           STRIP_NOPS (access_fn);
  625           if (dump_enabled_p ())
  626             dump_printf_loc (MSG_NOTE, vect_location,
  627                              "Access function of PHI: %T\n", access_fn);
               /* Record base and evolution part now; the asserts further down
                  require both to be non-null for anything that ends up
                  classified as an induction.  */
  628           STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
  629             = initial_condition_in_loop_num (access_fn, loop->num);
  630           STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
  631             = evolution_part_in_loop_num (access_fn, loop->num);
  632         }
  633
           /* Queue the PHI for the second pass when it is not a simple
              (linear) induction and also not a nonlinear induction of the
              loop being vectorized.  STEP is only read after
              vect_is_simple_iv_evolution has returned true, thanks to the
              short-circuit evaluation order.  */
  634       if ((!access_fn
  635            || vect_inner_phi_in_double_reduction_p (loop_vinfo, phi)
  636            || !vect_is_simple_iv_evolution (loop->num, access_fn,
  637                                             &init, &step)
  638            || (LOOP_VINFO_LOOP (loop_vinfo) != loop
  639                && TREE_CODE (step) != INTEGER_CST))
  640           /* Only handle nonlinear iv for same loop.  */
  641           && (LOOP_VINFO_LOOP (loop_vinfo) != loop
  642               || !vect_is_nonlinear_iv_evolution (loop, stmt_vinfo,
  643                                                   phi, &init, &step)))
  644         {
  645           worklist.safe_push (stmt_vinfo);
  646           continue;
  647         }
  648
  649       gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
  650                   != NULL_TREE);
  651       gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
  652
  653       if (dump_enabled_p ())
  654         dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
  655       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
  656     }
  657
  658
  659   /* Second - identify all reductions and nested cycles.  */
  660   while (worklist.length () > 0)
  661     {
  662       stmt_vec_info stmt_vinfo = worklist.pop ();
  663       gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
  664       tree def = PHI_RESULT (phi);
  665
  666       if (dump_enabled_p ())
  667         dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
  668                          (gimple *) phi);
  669
  670       gcc_assert (!virtual_operand_p (def)
  671                   && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
  672
  673       stmt_vec_info reduc_stmt_info
  674         = vect_is_simple_reduction (loop_vinfo, stmt_vinfo, &double_reduc,
  675                                     &reduc_chain, slp);
  676       if (reduc_stmt_info)
  677         {
               /* Link the PHI and the statement returned by
                  vect_is_simple_reduction to each other.  */
  678           STMT_VINFO_REDUC_DEF (stmt_vinfo) = reduc_stmt_info;
  679           STMT_VINFO_REDUC_DEF (reduc_stmt_info) = stmt_vinfo;
  680           if (double_reduc)
  681             {
  682               if (dump_enabled_p ())
  683                 dump_printf_loc (MSG_NOTE, vect_location,
  684                                  "Detected double reduction.\n");
  685
  686               STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
  687               STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_double_reduction_def;
  688             }
  689           else
  690             {
                   /* A cycle in a loop other than the one being vectorized is
                      a nested cycle; only the PHI's def type is set for it.  */
  691               if (loop != LOOP_VINFO_LOOP (loop_vinfo))
  692                 {
  693                   if (dump_enabled_p ())
  694                     dump_printf_loc (MSG_NOTE, vect_location,
  695                                      "Detected vectorizable nested cycle.\n");
  696
  697                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
  698                 }
  699               else
  700                 {
  701                   if (dump_enabled_p ())
  702                     dump_printf_loc (MSG_NOTE, vect_location,
  703                                      "Detected reduction.\n");
  704
  705                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
  706                   STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
  707                   /* Store the reduction cycles for possible vectorization in
  708                      loop-aware SLP if it was not detected as reduction
  709                      chain.  */
  710                   if (! reduc_chain)
  711                     LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push
  712                       (reduc_stmt_info);
  713                 }
  714             }
  715         }
           /* Not a reduction: try first-order recurrence before giving up.  */
  716       else if (vect_phi_first_order_recurrence_p (loop_vinfo, loop, phi))
  717         STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_first_order_recurrence;
  718       else
  719         if (dump_enabled_p ())
  720           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
  721                            "Unknown def-use cycle pattern.\n");
  722     }
  723 }
 724
 725
  726 /* Function vect_analyze_scalar_cycles.
  727
  728    Examine the cross iteration def-use cycles of scalar variables, by
  729    analyzing the loop-header PHIs of scalar variables.  Classify each
  730    cycle as one of the following: invariant, induction, reduction, unknown.
  731    We do that for the loop represented by LOOP_VINFO, and also to its
  732    inner-loop, if exists.
  733    Examples for scalar cycles:
  734
  735    Example1: reduction:
  736
  737               loop1:
  738               for (i=0; i<N; i++)
  739                  sum += a[i];
  740
  741    Example2: induction:
  742
  743               loop2:
  744               for (i=0; i<N; i++)
  745                  a[i] = i;

         The classification itself is done by vect_analyze_scalar_cycles_1,
         which is called once for the loop of LOOP_VINFO and once more for
         its inner loop when there is one.  SLP is passed through unchanged
         to both calls.  */
  746
  747 static void
  748 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo, bool slp)
  749 {
  750   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  751
  752   vect_analyze_scalar_cycles_1 (loop_vinfo, loop, slp);
  753
  754   /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
  755      Reductions in such inner-loop therefore have different properties than
  756      the reductions in the nest that gets vectorized:
  757      1. When vectorized, they are executed in the same order as in the original
  758         scalar loop, so we can't change the order of computation when
  759         vectorizing them.
  760      2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
  761         current checks are too strict.  */
  762
       /* Only loop->inner itself is analyzed here.  */
  763   if (loop->inner)
  764     vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner, slp);
  765 }
 766
  767 /* Transfer group and reduction information from STMT_INFO to its
  768    pattern stmt.  STMT_INFO must be in a reduction group (its
         REDUC_GROUP_FIRST_ELEMENT is set) and its related statement must not
         be in one yet.  The chain starting at STMT_INFO is walked via
         REDUC_GROUP_NEXT_ELEMENT and an equivalent chain is built over the
         related statements, headed by the related statement of STMT_INFO.  */
  769
  770 static void
  771 vect_fixup_reduc_chain (stmt_vec_info stmt_info)
  772 {
  773   stmt_vec_info firstp = STMT_VINFO_RELATED_STMT (stmt_info);
  774   stmt_vec_info stmtp;
  775   gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp)
  776               && REDUC_GROUP_FIRST_ELEMENT (stmt_info));
       /* The group size is copied to the new head only.  */
  777   REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info);
  778   do
  779     {
  780       stmtp = STMT_VINFO_RELATED_STMT (stmt_info);
  781       gcc_checking_assert (STMT_VINFO_DEF_TYPE (stmtp)
  782                            == STMT_VINFO_DEF_TYPE (stmt_info));
  783       REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp;
           /* Advance along the original chain, then link the current related
              statement to the related statement of the next member (if
              any).  */
  784       stmt_info = REDUC_GROUP_NEXT_ELEMENT (stmt_info);
  785       if (stmt_info)
  786         REDUC_GROUP_NEXT_ELEMENT (stmtp)
  787           = STMT_VINFO_RELATED_STMT (stmt_info);
  788     }
  789   while (stmt_info);
  790 }
 791
 792 /* Fixup scalar cycles that now have their stmts detected as patterns:
 793    regroup each reduction chain on its pattern stmts or dissolve it.  */
 794
 795 static void
 796 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
 797 {
 798   stmt_vec_info head;
 799   unsigned i;
 800
 801   FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, head)
 802     {
 803       /* A chain is well-formed if all members agree with the head on
 804          being in a pattern and every stmt to vectorize, including the
 805          head's, has STMT_VINFO_REDUC_IDX set.  */
 806       bool well_formed
 807         = STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (head)) != -1;
 808       for (stmt_vec_info member = REDUC_GROUP_NEXT_ELEMENT (head);
 809            member && well_formed;
 810            member = REDUC_GROUP_NEXT_ELEMENT (member))
 811         if ((STMT_VINFO_IN_PATTERN_P (member)
 812              != STMT_VINFO_IN_PATTERN_P (head))
 813             || STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (member)) == -1)
 814           well_formed = false;
 815
 816       if (well_formed)
 817         {
 818           /* Group the pattern stmts instead; a chain of non-pattern
 819              stmts needs no adjustment.  */
 820           if (STMT_VINFO_IN_PATTERN_P (head))
 821             {
 822               vect_fixup_reduc_chain (head);
 823               LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
 824                 = STMT_VINFO_RELATED_STMT (head);
 825             }
 826           continue;
 827         }
 828
 829       /* Dissolve the chain: unlink every member, remembering the last
 830          one, hand the cycle to the regular reductions and revisit slot I.  */
 831       stmt_vec_info tail = NULL;
 832       for (stmt_vec_info member = head; member;)
 833         {
 834           stmt_vec_info following = REDUC_GROUP_NEXT_ELEMENT (member);
 835           REDUC_GROUP_FIRST_ELEMENT (member) = NULL;
 836           REDUC_GROUP_NEXT_ELEMENT (member) = NULL;
 837           tail = member;
 838           member = following;
 839         }
 840       STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize (head)) = vect_internal_def;
 841       loop_vinfo->reductions.safe_push (vect_stmt_to_vectorize (tail));
 842       LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).unordered_remove (i);
 843       --i;
 844     }
 845 }
 846
 847 /* Function vect_get_loop_niters.
 848
 849    Determine how many iterations LOOP is executed according to its
 850    MAIN_EXIT and place it in NUMBER_OF_ITERATIONS.  Place the number of
 851    latch iterations in NUMBER_OF_ITERATIONSM1.  Place the condition under
 852    which the niter information holds in ASSUMPTIONS.  Both counts are left
 853    as chrec_dont_know when the main exit cannot be analyzed.
 854    Return the exit conditions of all exits of LOOP that have one.  */
 855
 856
 857 static vec<gcond *>
 858 vect_get_loop_niters (class loop *loop, const_edge main_exit, tree *assumptions,
 859                       tree *number_of_iterations, tree *number_of_iterationsm1)
 860 {
 861   auto_vec<edge> exits = get_loop_exit_edges (loop);
 862   vec<gcond *> conds;
 863   conds.create (exits.length ());
 864   class tree_niter_desc niter_desc;
 865   tree niter_assumptions, niter, may_be_zero;
 866   /* Defaults that remain in place if nothing can be determined.  */
 867   *assumptions = boolean_true_node;
 868   *number_of_iterationsm1 = chrec_dont_know;
 869   *number_of_iterations = chrec_dont_know;
 870
 871   DUMP_VECT_SCOPE ("get_loop_niters");
 872   /* Without exits there is nothing to analyze.  */
 873   if (exits.is_empty ())
 874     return conds;
 875
 876   if (dump_enabled_p ())
 877     dump_printf_loc (MSG_NOTE, vect_location, "Loop has %d exits.\n",
 878                      exits.length ());
 879
 880   edge exit;
 881   unsigned int i;
 882   FOR_EACH_VEC_ELT (exits, i, exit)
 883     {
 884       gcond *cond = get_loop_exit_condition (exit);
 885       if (cond)
 886         conds.safe_push (cond);
 887
 888       if (dump_enabled_p ())
 889         dump_printf_loc (MSG_NOTE, vect_location, "Analyzing exit %d...\n", i);
 890       /* The iteration count is taken from the main exit only.  */
 891       if (exit != main_exit)
 892         continue;
 893
 894       may_be_zero = NULL_TREE;
 895       if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
 896           || chrec_contains_undetermined (niter_desc.niter))
 897         continue;
 898
 899       niter_assumptions = niter_desc.assumptions;
 900       may_be_zero = niter_desc.may_be_zero;
 901       niter = niter_desc.niter;
 902
 903       if (may_be_zero && integer_zerop (may_be_zero))
 904         may_be_zero = NULL_TREE;
 905
 906       if (may_be_zero)
 907         {
 908           if (COMPARISON_CLASS_P (may_be_zero))
 909             {
 910               /* Try to combine may_be_zero with assumptions, this can simplify
 911                  computation of niter expression.  */
 912               if (niter_assumptions && !integer_nonzerop (niter_assumptions))
 913                 niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
 914                                                  niter_assumptions,
 915                                                  fold_build1 (TRUTH_NOT_EXPR,
 916                                                               boolean_type_node,
 917                                                               may_be_zero));
 918               else
 919                 niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
 920                                      build_int_cst (TREE_TYPE (niter), 0),
 921                                      rewrite_to_non_trapping_overflow (niter));
 922
 923               may_be_zero = NULL_TREE;
 924             }
 925           else if (integer_nonzerop (may_be_zero))
 926             {
 927               *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
 928               *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
 929               continue;
 930             }
 931           else
 932             continue;
 933        }
 934
 935       /* Loop assumptions are based off the normal exit.  */
 936       *assumptions = niter_assumptions;
 937       *number_of_iterationsm1 = niter;
 938
 939       /* We want the number of loop header executions which is the number
 940          of latch executions plus one.
 941          ???  For UINT_MAX latch executions this number overflows to zero
 942          for loops like do { n++; } while (n != 0);  */
 943       if (niter && !chrec_contains_undetermined (niter))
 944         {
 945           niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter),
 946                                unshare_expr (niter),
 947                                build_int_cst (TREE_TYPE (niter), 1));
 948           if (TREE_CODE (niter) == INTEGER_CST
 949               && TREE_CODE (*number_of_iterationsm1) != INTEGER_CST)
 950             {
 951               /* If we manage to fold niter + 1 into INTEGER_CST even when
 952                  niter is some complex expression, ensure back
 953                  *number_of_iterationsm1 is an INTEGER_CST as well.  See
 954                  PR113210.  */
 955               *number_of_iterationsm1
 956                 = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), niter,
 957                                build_minus_one_cst (TREE_TYPE (niter)));
 958             }
 959         }
 960       *number_of_iterations = niter;
 961     }
 962
 963   if (dump_enabled_p ())
 964     dump_printf_loc (MSG_NOTE, vect_location, "All loop exits successfully analyzed.\n");
 965
 966   return conds;
 967 }
 968
 969 /* Determine the main loop exit for the vectorizer.  */
 970
 971 edge
 972 vec_init_loop_exit_info (class loop *loop)
 973 {
 974   /* Before we begin we must first determine which exit is the main one and
 975      which are auxiliary exits.  A single exit is trivially the main one.  */
 976   auto_vec<edge> exits = get_loop_exit_edges (loop);
 977   if (exits.length () == 1)
 978     return exits[0];
 979
 980   /* If we have multiple exits we only support counting IV at the moment.
 981      Analyze all exits and return the last one we can analyze.  */
 982   class tree_niter_desc niter_desc;
 983   edge best = NULL;
 984   for (edge exit : exits)
 985     {
 986       if (!get_loop_exit_condition (exit))
 987         continue;
 988       if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc,
 989                                                   NULL)
 990           || chrec_contains_undetermined (niter_desc.niter))
 991         continue;
 992
 993       /* As we are handling may_be_zero that's not false by rewriting niter
 994          to may_be_zero ? 0 : niter we require an empty latch.  */
 995       tree may_be_zero = niter_desc.may_be_zero;
 996       bool usable = integer_zerop (may_be_zero);
 997       if (!usable)
 998         usable = (single_pred_p (loop->latch)
 999                   && exit->src == single_pred (loop->latch)
1000                   && (integer_nonzerop (may_be_zero)
1001                       || COMPARISON_CLASS_P (may_be_zero)));
1002       /* A later pick must have its source dominated by the previous one's.  */
1003       if (usable
1004           && (!best || dominated_by_p (CDI_DOMINATORS, exit->src, best->src)))
1005         best = exit;
1006     }
1007
1008   return best;
1009 }
1010
1011 /* Predicate for the dfs order traversal of the loop bbs: return true
1012    if BB belongs to the loop passed in through DATA.  */
1013
1014 static bool
1015 bb_in_loop_p (const_basic_block bb, const void *data)
1016 {
1017   /* DATA is the opaque callback argument; it carries the loop.  */
1018   const class loop *const containing_loop
1019     = static_cast<const class loop *> (data);
1020
1021   return flow_bb_inside_loop_p (containing_loop, bb);
1022 }
1023
1024
1025 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
1026    stmt_vec_info structs for all its stmts.  SHARED goes to the base class.  */
1027
1028 _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
1029   : vec_info (vec_info::loop, shared),
1030     loop (loop_in),
1031     bbs (XCNEWVEC (basic_block, loop->num_nodes)),
1032     num_itersm1 (NULL_TREE),
1033     num_iters (NULL_TREE),
1034     num_iters_unchanged (NULL_TREE),
1035     num_iters_assumptions (NULL_TREE),
1036     vector_costs (nullptr),
1037     scalar_costs (nullptr),
1038     th (0),
1039     versioning_threshold (0),
1040     vectorization_factor (0),
1041     main_loop_edge (nullptr),
1042     skip_main_loop_edge (nullptr),
1043     skip_this_loop_edge (nullptr),
1044     reusable_accumulators (),
1045     suggested_unroll_factor (1),
1046     max_vectorization_factor (0),
1047     mask_skip_niters (NULL_TREE),
1048     rgroup_compare_type (NULL_TREE),
1049     simd_if_cond (NULL_TREE),
1050     partial_vector_style (vect_partial_vectors_none),
1051     unaligned_dr (NULL),
1052     peeling_for_alignment (0),
1053     ptr_mask (0),
1054     ivexpr_map (NULL),
1055     scan_map (NULL),
1056     slp_unrolling_factor (1),
1057     inner_loop_cost_factor (param_vect_inner_loop_cost_factor),
1058     vectorizable (false),
1059     can_use_partial_vectors_p (param_vect_partial_vector_usage != 0),
1060     using_partial_vectors_p (false),
1061     using_decrementing_iv_p (false),
1062     using_select_vl_p (false),
1063     epil_using_partial_vectors_p (false),
1064     partial_load_store_bias (0),
1065     peeling_for_gaps (false),
1066     peeling_for_niter (false),
1067     early_breaks (false),
1068     no_data_dependencies (false),
1069     has_mask_store (false),
1070     scalar_loop_scaling (profile_probability::uninitialized ()),
1071     scalar_loop (NULL),
1072     orig_loop_info (NULL),
1073     vec_loop_iv_exit (NULL),
1074     vec_epilogue_loop_iv_exit (NULL),
1075     scalar_loop_iv_exit (NULL)
1076 {
1077   /* CHECKME: We want to visit all BBs before their successors (except for
1078      latch blocks, for which this assertion wouldn't hold).  In the simple
1079      case of the loop forms we allow, a dfs order of the BBs would be the same
1080      as reversed postorder traversal, so we are safe.  */
1081
1082   unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
1083                                           bbs, loop->num_nodes, loop);
1084   gcc_assert (nbbs == loop->num_nodes);
1085   /* Reset the uid of all PHIs and stmts; register all but debug stmts.  */
1086   for (unsigned int i = 0; i < nbbs; i++)
1087     {
1088       basic_block bb = bbs[i];
1089       gimple_stmt_iterator si;
1090
1091       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1092         {
1093           gimple *phi = gsi_stmt (si);
1094           gimple_set_uid (phi, 0);
1095           add_stmt (phi);
1096         }
1097
1098       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1099         {
1100           gimple *stmt = gsi_stmt (si);
1101           gimple_set_uid (stmt, 0);
1102           if (is_gimple_debug (stmt))
1103             continue;
1104           add_stmt (stmt);
1105           /* If .GOMP_SIMD_LANE call for the current loop has 3 arguments, the
1106              third argument is the #pragma omp simd if (x) condition, when 0,
1107              loop shouldn't be vectorized, when non-zero constant, it should
1108              be vectorized normally, otherwise versioned with vectorized loop
1109              done if the condition is non-zero at runtime.  */
1110           if (loop_in->simduid
1111               && is_gimple_call (stmt)
1112               && gimple_call_internal_p (stmt)
1113               && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
1114               && gimple_call_num_args (stmt) >= 3
1115               && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
1116               && (loop_in->simduid
1117                   == SSA_NAME_VAR (gimple_call_arg (stmt, 0))))
1118             {
1119               tree arg = gimple_call_arg (stmt, 2);
1120               if (integer_zerop (arg) || TREE_CODE (arg) == SSA_NAME)
1121                 simd_if_cond = arg;
1122               else
1123                 gcc_assert (integer_nonzerop (arg));
1124             }
1125         }
1126     }
1127
1128   epilogue_vinfos.create (6);
1129 }
1130
1131 /* Free all levels of rgroup CONTROLS: first the control vector owned by
1132    each rgroup, then the vector of rgroups itself.  */
1133
1134 void
1135 release_vec_loop_controls (vec<rgroup_controls> *controls)
1136 {
1137   /* The inner vectors have to go first, while CONTROLS is still valid.  */
1138   for (rgroup_controls &group : *controls)
1139     group.controls.release ();
1140   controls->release ();
1141 }
1142
1143 /* Free all memory used by the _loop_vec_info, as well as all the
1144    stmt_vec_info structs of all the stmts in the loop.  */
1145
1146 _loop_vec_info::~_loop_vec_info ()
1147 {
1148   free (bbs);
1149   /* Release the rgroup controls of both the masks and the lengths.  */
1150   release_vec_loop_controls (&masks.rgc_vec);
1151   release_vec_loop_controls (&lens);
1152   delete ivexpr_map;
1153   delete scan_map;
1154   epilogue_vinfos.release ();
1155   delete scalar_costs;
1156   delete vector_costs;
1157
1158   /* When we release an epilogue vinfo that we do not intend to use
1159      avoid clearing AUX of the main loop which should continue to
1160      point to the main loop vinfo since otherwise we'll leak that.  */
1161   if (loop->aux == this)
1162     loop->aux = NULL;
1163 }
1164
1165 /* Return an invariant or register for EXPR and emit necessary
1166    computations in the LOOP_VINFO loop preheader, reusing cached results.  */
1167
1168 tree
1169 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
1170 {
1171   /* Registers and invariants can be used as they are.  */
1172   if (is_gimple_reg (expr) || is_gimple_min_invariant (expr))
1173     return expr;
1174
1175   if (loop_vinfo->ivexpr_map == NULL)
1176     loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
1177
1178   tree &slot = loop_vinfo->ivexpr_map->get_or_insert (expr);
1179   if (slot)
1180     return slot;
1181
1182   /* No cached value yet: gimplify an unshared copy and put the stmts
1183      this needs, if any, on the preheader edge.  */
1184   gimple_seq seq = NULL;
1185   slot = force_gimple_operand (unshare_expr (expr), &seq, true, NULL_TREE);
1186   if (seq)
1187     gsi_insert_seq_on_edge_immediate
1188       (loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo)), seq);
1189   return slot;
1190 }
1191
1192 /* Return true if CMP_TYPE can serve as the comparison type for producing
1193    every mask that LOOP_VINFO needs; rgroups without a type are skipped.  */
1194
1195 static bool
1196 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
1197 {
1198   for (const rgroup_controls &group : LOOP_VINFO_MASKS (loop_vinfo).rgc_vec)
1199     {
1200       if (group.type == NULL_TREE)
1201         continue;
1202       if (!direct_internal_fn_supported_p (IFN_WHILE_ULT, cmp_type,
1203                                            group.type, OPTIMIZE_FOR_SPEED))
1204         return false;
1205     }
1206   return true;
1207 }
1208
1209 /* Return the largest number of scalars per iteration over all mask
1210    rgroups of LOOP_VINFO; the result is never smaller than 1.  */
1211
1212 static unsigned int
1213 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
1214 {
1215   unsigned int largest = 1;
1216   for (const rgroup_controls &group : LOOP_VINFO_MASKS (loop_vinfo).rgc_vec)
1217     if (group.max_nscalars_per_iter > largest)
1218       largest = group.max_nscalars_per_iter;
1219
1220   return largest;
1221 }
1222
1223 /* Calculate the minimum precision necessary to represent:
1224
1225       MAX_NITERS * FACTOR
1226
1227    as an unsigned integer, where MAX_NITERS is the maximum number of
1228    loop header iterations for the original scalar form of LOOP_VINFO.  */
1229
1230 static unsigned
1231 vect_min_prec_for_max_niters (loop_vec_info loop_vinfo, unsigned int factor)
1232 {
1233   /* Begin with the maximum number of iterations that the type of the
1234      iteration counter can represent.  */
1235   tree counter_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
1236   widest_int niters_bound
1237     = wi::to_widest (TYPE_MAX_VALUE (counter_type)) + 1;
1238
1239   /* Refine the estimate: header executions are at most one more than
1240      the maximum number of back edge executions, when that is known.  */
1241   widest_int latch_bound;
1242   if (max_loop_iterations (LOOP_VINFO_LOOP (loop_vinfo), &latch_bound))
1243     niters_bound = wi::smin (niters_bound, latch_bound + 1);
1244
1245   /* Work out how many bits we need to represent the limit.  */
1246   return wi::min_precision (niters_bound * factor, UNSIGNED);
1247 }
1248
1249 /* True if the loop needs peeling or partial vectors when vectorized.  */
1250
1251 static bool
1252 vect_need_peeling_or_partial_vectors_p (loop_vec_info loop_vinfo)
1253 {
1254   unsigned HOST_WIDE_INT const_vf;
1255   HOST_WIDE_INT max_niter
1256     = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1257   /* Use the cost threshold of the original loop if ours is zero.  */
1258   unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
1259   if (!th && LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
1260     th = LOOP_VINFO_COST_MODEL_THRESHOLD (LOOP_VINFO_ORIG_LOOP_INFO
1261                                           (loop_vinfo));
1262   /* Constant niters and non-negative alignment peeling: check exactly.  */
1263   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1264       && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
1265     {
1266       /* Work out the (constant) number of iterations that need to be
1267          peeled for reasons other than niters.  */
1268       unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1269       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
1270         peel_niter += 1;
1271       if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
1272                        LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1273         return true;
1274     }
1275   else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1276       /* ??? When peeling for gaps but not alignment, we could
1277          try to check whether the (variable) niters is known to be
1278          VF * N + 1.  That's something of a niche case though.  */
1279       || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1280       || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
1281       || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
1282            < (unsigned) exact_log2 (const_vf))
1283           /* In case of versioning, check if the maximum number of
1284              iterations is greater than th.  If they are identical,
1285              the epilogue is unnecessary.  */
1286           && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
1287               || ((unsigned HOST_WIDE_INT) max_niter
1288                   /* We'd like to use LOOP_VINFO_VERSIONING_THRESHOLD
1289                      but that's only computed later based on our result.
1290                      The following is the most conservative approximation.  */
1291                   > (std::max ((unsigned HOST_WIDE_INT) th,
1292                                const_vf) / const_vf) * const_vf))))
1293     return true;
1294
1295   return false;
1296 }
1297
1298 /* Each statement in LOOP_VINFO can be masked where necessary.  Check
1299    whether we can actually generate the masks required.  Return true if so,
1300    storing the comparison type in LOOP_VINFO_RGROUP_COMPARE_TYPE and the
1301    type of the scalar IV in LOOP_VINFO_RGROUP_IV_TYPE.  */
1302
1303 static bool
1304 vect_verify_full_masking (loop_vec_info loop_vinfo)
1305 {
1306   vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
1307
1308   /* Use a normal loop if there are no statements that need masking.
1309      This only happens in rare degenerate cases: it means that the loop
1310      has no loads, no stores, and no live-out values.  */
1311   if (masks->is_empty ())
1312     return false;
1313
1314   /* Produce the rgroup controls; the rgroup for masks that need NVECTORS
1315      vectors is stored at index NVECTORS - 1.  */
1316   for (const auto &mask : masks->mask_set)
1317     {
1318       tree vectype = mask.first;
1319       unsigned nvectors = mask.second;
1320       if (masks->rgc_vec.length () < nvectors)
1321         masks->rgc_vec.safe_grow_cleared (nvectors, true);
1322       rgroup_controls &group = masks->rgc_vec[nvectors - 1];
1323
1324       /* The number of scalars per iteration and the number of vectors are
1325          both compile-time constants.  */
1326       unsigned int nscalars_per_iter
1327         = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
1328                      LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
1329       /* Keep the mask type of the vectype with most scalars per iteration.  */
1330       if (group.max_nscalars_per_iter >= nscalars_per_iter)
1331         continue;
1332       group.max_nscalars_per_iter = nscalars_per_iter;
1333       group.type = truth_type_for (vectype);
1334       group.factor = 1;
1335     }
1336
1337   unsigned int max_nscalars_per_iter
1338     = vect_get_max_nscalars_per_iter (loop_vinfo);
1339
1340   /* Work out how many bits we need to represent the limit.  */
1341   unsigned int min_ni_width
1342     = vect_min_prec_for_max_niters (loop_vinfo, max_nscalars_per_iter);
1343
1344   /* IV_PRECISION stays at UINT_MAX if the IV limit is reported as -1.  */
1345   widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
1346   unsigned int iv_precision = UINT_MAX;
1347   if (iv_limit != -1)
1348     iv_precision = wi::min_precision (iv_limit * max_nscalars_per_iter,
1349                                       UNSIGNED);
1350
1351   /* Find a scalar mode for which WHILE_ULT is supported.
1352
1353      Although we could stop as soon as we find a valid mode, there are at
1354      least two reasons why that's not always the best choice:
1355
1356      - An IV that's Pmode or wider is more likely to be reusable in address
1357        calculations than an IV that's narrower than Pmode.
1358
1359      - Doing the comparison in IV_PRECISION or wider allows a natural
1360        0-based IV, whereas using a narrower comparison type requires
1361        mitigations against wrap-around.
1362
1363      Conversely, if the IV limit is variable, doing the comparison in a
1364      wider type than the original type can introduce unnecessary
1365      extensions, so picking the widest valid mode is not always a good
1366      choice either.
1367
1368      Here we prefer the first IV type that's Pmode or wider, and the first
1369      comparison type that's IV_PRECISION or wider.  (The comparison type
1370      must be no wider than the IV type, to avoid extensions in the vector
1371      loop.)
1372
1373      ??? We might want to try continuing beyond Pmode for ILP32 targets
1374      if CMP_BITS < IV_PRECISION.  */
1375   tree cmp_type = NULL_TREE;
1376   tree iv_type = NULL_TREE;
1377   opt_scalar_int_mode cmp_mode_iter;
1378   FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1379     {
1380       unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1381       if (cmp_bits < min_ni_width
1382           || !targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1383         continue;
1384
1385       tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1386       if (!this_type
1387           || !can_produce_all_loop_masks_p (loop_vinfo, this_type))
1388         continue;
1389
1390       iv_type = this_type;
1391       if (!cmp_type || iv_precision > TYPE_PRECISION (cmp_type))
1392         cmp_type = this_type;
1393       if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1394         break;
1395     }
1396
1397   if (!cmp_type)
1398     {
1399       masks->rgc_vec.release ();
1400       return false;
1401     }
1402
1403   LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = cmp_type;
1404   LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1405   LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
1406     = vect_partial_vectors_while_ult;
1407   return true;
1408 }
1409
1410 /* Each statement in LOOP_VINFO can be masked where necessary.  Check
1411    whether we can actually generate AVX512 style masks.  Return true if so,
1412    storing the type of the scalar IV in LOOP_VINFO_RGROUP_IV_TYPE.  */
1413
1414 static bool
1415 vect_verify_full_masking_avx512 (loop_vec_info loop_vinfo)
1416 {
1417   /* Produce differently organized rgc_vec and differently check
1418      we can produce masks.  */
1419   vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
1420
1421   /* Use a normal loop if there are no statements that need masking.
1422      This only happens in rare degenerate cases: it means that the loop
1423      has no loads, no stores, and no live-out values.  */
1424   if (masks->is_empty ())
1425     return false;
1426
1427   /* For the decrementing IV we need to represent all values in
1428      [0, niter + niter_skip] where niter_skip is the elements we
1429      skip in the first iteration for prologue peeling.  */
1430   widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
1431   unsigned int iv_precision = UINT_MAX;
1432   if (iv_limit != -1)
1433     iv_precision = wi::min_precision (iv_limit, UNSIGNED);
1434
1435   /* First compute the type for the IV we use to track the remaining
1436      scalar iterations: the first supported integer mode with at least
1437      IV_PRECISION bits for which a type can be built.  */
1438   tree iv_type = NULL_TREE;
1439   opt_scalar_int_mode cmp_mode_iter;
1440   FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1441     {
1442       unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1443       if (cmp_bits < iv_precision
1444           || !targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1445         continue;
1446       iv_type = build_nonstandard_integer_type (cmp_bits, true);
1447       if (iv_type)
1448         break;
1449     }
1450   if (!iv_type)
1451     return false;
1452
1453   /* Produce the rgroup controls.  */
1454   for (const auto &mask : masks->mask_set)
1455     {
1456       tree vectype = mask.first;
1457       unsigned nvectors = mask.second;
1458
1459       /* The number of scalars per iteration and the number of vectors are
1460          both compile-time constants.  */
1461       unsigned int nscalars_per_iter
1462         = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
1463                      LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
1464
1465       /* We index the rgroup_controls vector with nscalars_per_iter
1466          which we keep constant and instead have a varying nvectors,
1467          remembering the vector mask with the fewest nV.  */
1468       if (masks->rgc_vec.length () < nscalars_per_iter)
1469         masks->rgc_vec.safe_grow_cleared (nscalars_per_iter, true);
1470       rgroup_controls &group = masks->rgc_vec[nscalars_per_iter - 1];
1471       if (group.type && group.factor <= nvectors)
1472         continue;
1473
1474       group.type = truth_type_for (vectype);
1475       group.compare_type = NULL_TREE;
1476       group.max_nscalars_per_iter = nscalars_per_iter;
1477       group.factor = nvectors;
1478       group.bias_adjusted_ctrl = NULL_TREE;
1479     }
1480
1481   /* There is no fixed compare type we are going to use but we have to
1482      be able to get at one for each mask group.  */
1483   unsigned int min_ni_width
1484     = wi::min_precision (vect_max_vf (loop_vinfo), UNSIGNED);
1485
1486   bool all_groups_ok = true;
1487   for (auto &group : masks->rgc_vec)
1488     {
1489       tree mask_type = group.type;
1490       if (!mask_type)
1491         continue;
1492
1493       /* For now vect_get_loop_mask only supports integer mode masks
1494          when we need to split it.  */
1495       if (GET_MODE_CLASS (TYPE_MODE (mask_type)) != MODE_INT
1496           || TYPE_PRECISION (TREE_TYPE (mask_type)) != 1)
1497         {
1498           all_groups_ok = false;
1499           break;
1500         }
1501
1502       /* If iv_type is usable as compare type use that - we can elide the
1503          saturation in that case.   */
1504       if (TYPE_PRECISION (iv_type) >= min_ni_width)
1505         {
1506           tree cmp_vectype
1507             = build_vector_type (iv_type, TYPE_VECTOR_SUBPARTS (mask_type));
1508           if (expand_vec_cmp_expr_p (cmp_vectype, mask_type, LT_EXPR))
1509             group.compare_type = cmp_vectype;
1510         }
1511
1512       /* Otherwise take the first supported integer mode of sufficient
1513          width whose vector type can be compared into MASK_TYPE.  */
1514       if (!group.compare_type)
1515         FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1516           {
1517             unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1518             if (cmp_bits < min_ni_width
1519                 || !targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1520               continue;
1521             tree cmp_type = build_nonstandard_integer_type (cmp_bits, true);
1522             if (!cmp_type)
1523               continue;
1524             /* Check whether we can produce the mask with cmp_type.  */
1525             tree cmp_vectype
1526               = build_vector_type (cmp_type, TYPE_VECTOR_SUBPARTS (mask_type));
1527             if (expand_vec_cmp_expr_p (cmp_vectype, mask_type, LT_EXPR))
1528               {
1529                 group.compare_type = cmp_vectype;
1530                 break;
1531               }
1532           }
1533       if (!group.compare_type)
1534         {
1535           all_groups_ok = false;
1536           break;
1537         }
1538     }
1539   if (!all_groups_ok)
1540     {
1541       release_vec_loop_controls (&masks->rgc_vec);
1542       return false;
1543     }
1544
1545   LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = error_mark_node;
1546   LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1547   LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_avx512;
1548   return true;
1549 }
1550
1551 /* Check whether we can use vector access with length based on precision
1552    comparison.  So far, to keep it simple, we only allow the case that the
1553    precision of the target supported length is larger than the precision
1554    required by loop niters.  */
1555
1556 static bool
1557 vect_verify_loop_lens (loop_vec_info loop_vinfo)
1558 {
1559   if (LOOP_VINFO_LENS (loop_vinfo).is_empty ())
1560     return false;
1561
1562   machine_mode len_load_mode, len_store_mode;
1563   if (!get_len_load_store_mode (loop_vinfo->vector_mode, true)
1564          .exists (&len_load_mode))
1565     return false;
1566   if (!get_len_load_store_mode (loop_vinfo->vector_mode, false)
1567          .exists (&len_store_mode))
1568     return false;
1569
1570   signed char partial_load_bias = internal_len_load_store_bias
1571     (IFN_LEN_LOAD, len_load_mode);
1572
1573   signed char partial_store_bias = internal_len_load_store_bias
1574     (IFN_LEN_STORE, len_store_mode);
1575
1576   gcc_assert (partial_load_bias == partial_store_bias);
1577
1578   if (partial_load_bias == VECT_PARTIAL_BIAS_UNSUPPORTED)
1579     return false;
1580
1581   /* If the backend requires a bias of -1 for LEN_LOAD, we must not emit
1582      len_loads with a length of zero.  In order to avoid that we prohibit
1583      more than one loop length here.  */
1584   if (partial_load_bias == -1
1585       && LOOP_VINFO_LENS (loop_vinfo).length () > 1)
1586     return false;
1587
1588   LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) = partial_load_bias;
1589
1590   unsigned int max_nitems_per_iter = 1;
1591   unsigned int i;
1592   rgroup_controls *rgl;
1593   /* Find the maximum number of items per iteration for every rgroup.  */
1594   FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), i, rgl)
1595     {
1596       unsigned nitems_per_iter = rgl->max_nscalars_per_iter * rgl->factor;
1597       max_nitems_per_iter = MAX (max_nitems_per_iter, nitems_per_iter);
1598     }
1599
1600   /* Work out how many bits we need to represent the length limit.  */
1601   unsigned int min_ni_prec
1602     = vect_min_prec_for_max_niters (loop_vinfo, max_nitems_per_iter);
1603
1604   /* Now use the maximum of below precisions for one suitable IV type:
1605      - the IV's natural precision
1606      - the precision needed to hold: the maximum number of scalar
1607        iterations multiplied by the scale factor (min_ni_prec above)
1608      - the Pmode precision
1609
1610      If min_ni_prec is less than the precision of the current niters,
1611      we perfer to still use the niters type.  Prefer to use Pmode and
1612      wider IV to avoid narrow conversions.  */
1613
1614   unsigned int ni_prec
1615     = TYPE_PRECISION (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)));
1616   min_ni_prec = MAX (min_ni_prec, ni_prec);
1617   min_ni_prec = MAX (min_ni_prec, GET_MODE_BITSIZE (Pmode));
1618
1619   tree iv_type = NULL_TREE;
1620   opt_scalar_int_mode tmode_iter;
1621   FOR_EACH_MODE_IN_CLASS (tmode_iter, MODE_INT)
1622     {
1623       scalar_mode tmode = tmode_iter.require ();
1624       unsigned int tbits = GET_MODE_BITSIZE (tmode);
1625
1626       /* ??? Do we really want to construct one IV whose precision exceeds
1627          BITS_PER_WORD?  */
1628       if (tbits > BITS_PER_WORD)
1629         break;
1630
1631       /* Find the first available standard integral type.  */
1632       if (tbits >= min_ni_prec && targetm.scalar_mode_supported_p (tmode))
1633         {
1634           iv_type = build_nonstandard_integer_type (tbits, true);
1635           break;
1636         }
1637     }
1638
1639   if (!iv_type)
1640     {
1641       if (dump_enabled_p ())
1642         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1643                          "can't vectorize with length-based partial vectors"
1644                          " because there is no suitable iv type.\n");
1645       return false;
1646     }
1647
1648   LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = iv_type;
1649   LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1650   LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_len;
1651
1652   return true;
1653 }
1654
1655 /* Calculate the cost of one scalar iteration of the loop.  */
1656 static void
1657 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1658 {
1659   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1660   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1661   int nbbs = loop->num_nodes, factor;
1662   int innerloop_iters, i;
1663
1664   DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");
1665
1666   /* Gather costs for statements in the scalar loop.  */
1667
1668   /* FORNOW.  */
1669   innerloop_iters = 1;
1670   if (loop->inner)
1671     innerloop_iters = LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo);
1672
1673   for (i = 0; i < nbbs; i++)
1674     {
1675       gimple_stmt_iterator si;
1676       basic_block bb = bbs[i];
1677
1678       if (bb->loop_father == loop->inner)
1679         factor = innerloop_iters;
1680       else
1681         factor = 1;
1682
1683       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1684         {
1685           gimple *stmt = gsi_stmt (si);
1686           stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
1687
1688           if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1689             continue;
1690
1691           /* Skip stmts that are not vectorized inside the loop.  */
1692           stmt_vec_info vstmt_info = vect_stmt_to_vectorize (stmt_info);
1693           if (!STMT_VINFO_RELEVANT_P (vstmt_info)
1694               && (!STMT_VINFO_LIVE_P (vstmt_info)
1695                   || !VECTORIZABLE_CYCLE_DEF
1696                         (STMT_VINFO_DEF_TYPE (vstmt_info))))
1697             continue;
1698
1699           vect_cost_for_stmt kind;
1700           if (STMT_VINFO_DATA_REF (stmt_info))
1701             {
1702               if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1703                kind = scalar_load;
1704              else
1705                kind = scalar_store;
1706             }
1707           else if (vect_nop_conversion_p (stmt_info))
1708             continue;
1709           else
1710             kind = scalar_stmt;
1711
1712           /* We are using vect_prologue here to avoid scaling twice
1713              by the inner loop factor.  */
1714           record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1715                             factor, kind, stmt_info, 0, vect_prologue);
1716         }
1717     }
1718
1719   /* Now accumulate cost.  */
1720   loop_vinfo->scalar_costs = init_cost (loop_vinfo, true);
1721   add_stmt_costs (loop_vinfo->scalar_costs,
1722                   &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo));
1723   loop_vinfo->scalar_costs->finish_cost (nullptr);
1724 }
1725
1726 /* Function vect_analyze_loop_form.
1727
1728    Verify that certain CFG restrictions hold, including:
1729    - the loop has a pre-header
1730    - the loop has a single entry
1731    - nested loops can have only a single exit.
1732    - the loop exit condition is simple enough
1733    - the number of iterations can be analyzed, i.e, a countable loop.  The
1734      niter could be analyzed under some assumptions.  */
1735
1736 opt_result
1737 vect_analyze_loop_form (class loop *loop, vect_loop_form_info *info)
1738 {
1739   DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1740
1741   edge exit_e = vec_init_loop_exit_info (loop);
1742   if (!exit_e)
1743     return opt_result::failure_at (vect_location,
1744                                    "not vectorized:"
1745                                    " could not determine main exit from"
1746                                    " loop with multiple exits.\n");
1747   info->loop_exit = exit_e;
1748   if (dump_enabled_p ())
1749       dump_printf_loc (MSG_NOTE, vect_location,
1750                        "using as main loop exit: %d -> %d [AUX: %p]\n",
1751                        exit_e->src->index, exit_e->dest->index, exit_e->aux);
1752
1753   /* Check if we have any control flow that doesn't leave the loop.  */
1754   class loop *v_loop = loop->inner ? loop->inner : loop;
1755   basic_block *bbs = get_loop_body (v_loop);
1756   for (unsigned i = 0; i < v_loop->num_nodes; i++)
1757     if (EDGE_COUNT (bbs[i]->succs) != 1
1758         && (EDGE_COUNT (bbs[i]->succs) != 2
1759             || !loop_exits_from_bb_p (bbs[i]->loop_father, bbs[i])))
1760       {
1761         free (bbs);
1762         return opt_result::failure_at (vect_location,
1763                                        "not vectorized:"
1764                                        " unsupported control flow in loop.\n");
1765       }
1766   free (bbs);
1767
1768   /* Different restrictions apply when we are considering an inner-most loop,
1769      vs. an outer (nested) loop.
1770      (FORNOW. May want to relax some of these restrictions in the future).  */
1771
1772   info->inner_loop_cond = NULL;
1773   if (!loop->inner)
1774     {
1775       /* Inner-most loop.  */
1776
1777       if (empty_block_p (loop->header))
1778         return opt_result::failure_at (vect_location,
1779                                        "not vectorized: empty loop.\n");
1780     }
1781   else
1782     {
1783       class loop *innerloop = loop->inner;
1784       edge entryedge;
1785
1786       /* Nested loop. We currently require that the loop is doubly-nested,
1787          contains a single inner loop with a single exit to the block
1788          with the single exit condition in the outer loop.
1789          Vectorizable outer-loops look like this:
1790
1791                         (pre-header)
1792                            |
1793                           header <---+
1794                            |         |
1795                           inner-loop |
1796                            |         |
1797                           tail ------+
1798                            |
1799                         (exit-bb)
1800
1801          The inner-loop also has the properties expected of inner-most loops
1802          as described above.  */
1803
1804       if ((loop->inner)->inner || (loop->inner)->next)
1805         return opt_result::failure_at (vect_location,
1806                                        "not vectorized:"
1807                                        " multiple nested loops.\n");
1808
1809       entryedge = loop_preheader_edge (innerloop);
1810       if (entryedge->src != loop->header
1811           || !single_exit (innerloop)
1812           || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1813         return opt_result::failure_at (vect_location,
1814                                        "not vectorized:"
1815                                        " unsupported outerloop form.\n");
1816
1817       /* Analyze the inner-loop.  */
1818       vect_loop_form_info inner;
1819       opt_result res = vect_analyze_loop_form (loop->inner, &inner);
1820       if (!res)
1821         {
1822           if (dump_enabled_p ())
1823             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1824                              "not vectorized: Bad inner loop.\n");
1825           return res;
1826         }
1827
1828       /* Don't support analyzing niter under assumptions for inner
1829          loop.  */
1830       if (!integer_onep (inner.assumptions))
1831         return opt_result::failure_at (vect_location,
1832                                        "not vectorized: Bad inner loop.\n");
1833
1834       if (!expr_invariant_in_loop_p (loop, inner.number_of_iterations))
1835         return opt_result::failure_at (vect_location,
1836                                        "not vectorized: inner-loop count not"
1837                                        " invariant.\n");
1838
1839       if (dump_enabled_p ())
1840         dump_printf_loc (MSG_NOTE, vect_location,
1841                          "Considering outer-loop vectorization.\n");
1842       info->inner_loop_cond = inner.conds[0];
1843     }
1844
1845   if (EDGE_COUNT (loop->header->preds) != 2)
1846     return opt_result::failure_at (vect_location,
1847                                    "not vectorized:"
1848                                    " too many incoming edges.\n");
1849
1850   /* We assume that the latch is empty.  */
1851   if (!empty_block_p (loop->latch)
1852       || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1853     return opt_result::failure_at (vect_location,
1854                                    "not vectorized: latch block not empty.\n");
1855
1856   /* Make sure there is no abnormal exit.  */
1857   auto_vec<edge> exits = get_loop_exit_edges (loop);
1858   for (edge e : exits)
1859     {
1860       if (e->flags & EDGE_ABNORMAL)
1861         return opt_result::failure_at (vect_location,
1862                                        "not vectorized:"
1863                                        " abnormal loop exit edge.\n");
1864     }
1865
1866   info->conds
1867     = vect_get_loop_niters (loop, exit_e, &info->assumptions,
1868                             &info->number_of_iterations,
1869                             &info->number_of_iterationsm1);
1870   if (info->conds.is_empty ())
1871     return opt_result::failure_at
1872       (vect_location,
1873        "not vectorized: complicated exit condition.\n");
1874
1875   /* Determine what the primary and alternate exit conds are.  */
1876   for (unsigned i = 0; i < info->conds.length (); i++)
1877     {
1878       gcond *cond = info->conds[i];
1879       if (exit_e->src == gimple_bb (cond))
1880         std::swap (info->conds[0], info->conds[i]);
1881     }
1882
1883   if (integer_zerop (info->assumptions)
1884       || !info->number_of_iterations
1885       || chrec_contains_undetermined (info->number_of_iterations))
1886     return opt_result::failure_at
1887       (info->conds[0],
1888        "not vectorized: number of iterations cannot be computed.\n");
1889
1890   if (integer_zerop (info->number_of_iterations))
1891     return opt_result::failure_at
1892       (info->conds[0],
1893        "not vectorized: number of iterations = 0.\n");
1894
1895   if (!(tree_fits_shwi_p (info->number_of_iterations)
1896         && tree_to_shwi (info->number_of_iterations) > 0))
1897     {
1898       if (dump_enabled_p ())
1899         {
1900           dump_printf_loc (MSG_NOTE, vect_location,
1901                            "Symbolic number of iterations is ");
1902           dump_generic_expr (MSG_NOTE, TDF_DETAILS, info->number_of_iterations);
1903           dump_printf (MSG_NOTE, "\n");
1904         }
1905     }
1906
1907   return opt_result::success ();
1908 }
1909
1910 /* Create a loop_vec_info for LOOP with SHARED and the
1911    vect_analyze_loop_form result.  */
1912
1913 loop_vec_info
1914 vect_create_loop_vinfo (class loop *loop, vec_info_shared *shared,
1915                         const vect_loop_form_info *info,
1916                         loop_vec_info main_loop_info)
1917 {
1918   loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
1919   LOOP_VINFO_NITERSM1 (loop_vinfo) = info->number_of_iterationsm1;
1920   LOOP_VINFO_NITERS (loop_vinfo) = info->number_of_iterations;
1921   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = info->number_of_iterations;
1922   LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = main_loop_info;
1923   /* Also record the assumptions for versioning.  */
1924   if (!integer_onep (info->assumptions) && !main_loop_info)
1925     LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = info->assumptions;
1926
1927   for (gcond *cond : info->conds)
1928     {
1929       stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (cond);
1930       STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type;
1931       /* Mark the statement as a condition.  */
1932       STMT_VINFO_DEF_TYPE (loop_cond_info) = vect_condition_def;
1933     }
1934
1935   for (unsigned i = 1; i < info->conds.length (); i ++)
1936     LOOP_VINFO_LOOP_CONDS (loop_vinfo).safe_push (info->conds[i]);
1937   LOOP_VINFO_LOOP_IV_COND (loop_vinfo) = info->conds[0];
1938
1939   LOOP_VINFO_IV_EXIT (loop_vinfo) = info->loop_exit;
1940
1941   /* Check to see if we're vectorizing multiple exits.  */
1942   LOOP_VINFO_EARLY_BREAKS (loop_vinfo)
1943     = !LOOP_VINFO_LOOP_CONDS (loop_vinfo).is_empty ();
1944
1945   if (info->inner_loop_cond)
1946     {
1947       stmt_vec_info inner_loop_cond_info
1948         = loop_vinfo->lookup_stmt (info->inner_loop_cond);
1949       STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type;
1950       /* If we have an estimate on the number of iterations of the inner
1951          loop use that to limit the scale for costing, otherwise use
1952          --param vect-inner-loop-cost-factor literally.  */
1953       widest_int nit;
1954       if (estimated_stmt_executions (loop->inner, &nit))
1955         LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo)
1956           = wi::smin (nit, param_vect_inner_loop_cost_factor).to_uhwi ();
1957     }
1958
1959   return loop_vinfo;
1960 }
1961
1962
1963
1964 /* Scan the loop stmts and dependent on whether there are any (non-)SLP
1965    statements update the vectorization factor.  */
1966
1967 static void
1968 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1969 {
1970   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1971   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1972   int nbbs = loop->num_nodes;
1973   poly_uint64 vectorization_factor;
1974   int i;
1975
1976   DUMP_VECT_SCOPE ("vect_update_vf_for_slp");
1977
1978   vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1979   gcc_assert (known_ne (vectorization_factor, 0U));
1980
1981   /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1982      vectorization factor of the loop is the unrolling factor required by
1983      the SLP instances.  If that unrolling factor is 1, we say, that we
1984      perform pure SLP on loop - cross iteration parallelism is not
1985      exploited.  */
1986   bool only_slp_in_loop = true;
1987   for (i = 0; i < nbbs; i++)
1988     {
1989       basic_block bb = bbs[i];
1990       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1991            gsi_next (&si))
1992         {
1993           stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (si.phi ());
1994           if (!stmt_info)
1995             continue;
1996           if ((STMT_VINFO_RELEVANT_P (stmt_info)
1997                || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1998               && !PURE_SLP_STMT (stmt_info))
1999             /* STMT needs both SLP and loop-based vectorization.  */
2000             only_slp_in_loop = false;
2001         }
2002       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
2003            gsi_next (&si))
2004         {
2005           if (is_gimple_debug (gsi_stmt (si)))
2006             continue;
2007           stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2008           stmt_info = vect_stmt_to_vectorize (stmt_info);
2009           if ((STMT_VINFO_RELEVANT_P (stmt_info)
2010                || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
2011               && !PURE_SLP_STMT (stmt_info))
2012             /* STMT needs both SLP and loop-based vectorization.  */
2013             only_slp_in_loop = false;
2014         }
2015     }
2016
2017   if (only_slp_in_loop)
2018     {
2019       if (dump_enabled_p ())
2020         dump_printf_loc (MSG_NOTE, vect_location,
2021                          "Loop contains only SLP stmts\n");
2022       vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
2023     }
2024   else
2025     {
2026       if (dump_enabled_p ())
2027         dump_printf_loc (MSG_NOTE, vect_location,
2028                          "Loop contains SLP and non-SLP stmts\n");
2029       /* Both the vectorization factor and unroll factor have the form
2030          GET_MODE_SIZE (loop_vinfo->vector_mode) * X for some rational X,
2031          so they must have a common multiple.  */
2032       vectorization_factor
2033         = force_common_multiple (vectorization_factor,
2034                                  LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
2035     }
2036
2037   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
2038   if (dump_enabled_p ())
2039     {
2040       dump_printf_loc (MSG_NOTE, vect_location,
2041                        "Updating vectorization factor to ");
2042       dump_dec (MSG_NOTE, vectorization_factor);
2043       dump_printf (MSG_NOTE, ".\n");
2044     }
2045 }
2046
2047 /* Return true if STMT_INFO describes a double reduction phi and if
2048    the other phi in the reduction is also relevant for vectorization.
2049    This rejects cases such as:
2050
2051       outer1:
2052         x_1 = PHI <x_3(outer2), ...>;
2053         ...
2054
2055       inner:
2056         x_2 = ...;
2057         ...
2058
2059       outer2:
2060         x_3 = PHI <x_2(inner)>;
2061
2062    if nothing in x_2 or elsewhere makes x_1 relevant.  */
2063
2064 static bool
2065 vect_active_double_reduction_p (stmt_vec_info stmt_info)
2066 {
2067   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
2068     return false;
2069
2070   return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info));
2071 }
2072
2073 /* Function vect_analyze_loop_operations.
2074
2075    Scan the loop stmts and make sure they are all vectorizable.  */
2076
2077 static opt_result
2078 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
2079 {
2080   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2081   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
2082   int nbbs = loop->num_nodes;
2083   int i;
2084   stmt_vec_info stmt_info;
2085   bool need_to_vectorize = false;
2086   bool ok;
2087
2088   DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
2089
2090   auto_vec<stmt_info_for_cost> cost_vec;
2091
2092   for (i = 0; i < nbbs; i++)
2093     {
2094       basic_block bb = bbs[i];
2095
2096       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
2097            gsi_next (&si))
2098         {
2099           gphi *phi = si.phi ();
2100           ok = true;
2101
2102           stmt_info = loop_vinfo->lookup_stmt (phi);
2103           if (dump_enabled_p ())
2104             dump_printf_loc (MSG_NOTE, vect_location, "examining phi: %G",
2105                              (gimple *) phi);
2106           if (virtual_operand_p (gimple_phi_result (phi)))
2107             continue;
2108
2109           /* Inner-loop loop-closed exit phi in outer-loop vectorization
2110              (i.e., a phi in the tail of the outer-loop).  */
2111           if (! is_loop_header_bb_p (bb))
2112             {
2113               /* FORNOW: we currently don't support the case that these phis
2114                  are not used in the outerloop (unless it is double reduction,
2115                  i.e., this phi is vect_reduction_def), cause this case
2116                  requires to actually do something here.  */
2117               if (STMT_VINFO_LIVE_P (stmt_info)
2118                   && !vect_active_double_reduction_p (stmt_info))
2119                 return opt_result::failure_at (phi,
2120                                                "Unsupported loop-closed phi"
2121                                                " in outer-loop.\n");
2122
2123               /* If PHI is used in the outer loop, we check that its operand
2124                  is defined in the inner loop.  */
2125               if (STMT_VINFO_RELEVANT_P (stmt_info))
2126                 {
2127                   tree phi_op;
2128
2129                   if (gimple_phi_num_args (phi) != 1)
2130                     return opt_result::failure_at (phi, "unsupported phi");
2131
2132                   phi_op = PHI_ARG_DEF (phi, 0);
2133                   stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op);
2134                   if (!op_def_info)
2135                     return opt_result::failure_at (phi, "unsupported phi\n");
2136
2137                   if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer
2138                       && (STMT_VINFO_RELEVANT (op_def_info)
2139                           != vect_used_in_outer_by_reduction))
2140                     return opt_result::failure_at (phi, "unsupported phi\n");
2141
2142                   if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
2143                        || (STMT_VINFO_DEF_TYPE (stmt_info)
2144                            == vect_double_reduction_def))
2145                       && !vectorizable_lc_phi (loop_vinfo,
2146                                                stmt_info, NULL, NULL))
2147                     return opt_result::failure_at (phi, "unsupported phi\n");
2148                 }
2149
2150               continue;
2151             }
2152
2153           gcc_assert (stmt_info);
2154
2155           if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
2156                || STMT_VINFO_LIVE_P (stmt_info))
2157               && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def
2158               && STMT_VINFO_DEF_TYPE (stmt_info) != vect_first_order_recurrence)
2159             /* A scalar-dependence cycle that we don't support.  */
2160             return opt_result::failure_at (phi,
2161                                            "not vectorized:"
2162                                            " scalar dependence cycle.\n");
2163
2164           if (STMT_VINFO_RELEVANT_P (stmt_info))
2165             {
2166               need_to_vectorize = true;
2167               if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
2168                   && ! PURE_SLP_STMT (stmt_info))
2169                 ok = vectorizable_induction (loop_vinfo,
2170                                              stmt_info, NULL, NULL,
2171                                              &cost_vec);
2172               else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
2173                         || (STMT_VINFO_DEF_TYPE (stmt_info)
2174                             == vect_double_reduction_def)
2175                         || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
2176                        && ! PURE_SLP_STMT (stmt_info))
2177                 ok = vectorizable_reduction (loop_vinfo,
2178                                              stmt_info, NULL, NULL, &cost_vec);
2179               else if ((STMT_VINFO_DEF_TYPE (stmt_info)
2180                         == vect_first_order_recurrence)
2181                        && ! PURE_SLP_STMT (stmt_info))
2182                 ok = vectorizable_recurr (loop_vinfo, stmt_info, NULL, NULL,
2183                                            &cost_vec);
2184             }
2185
2186           /* SLP PHIs are tested by vect_slp_analyze_node_operations.  */
2187           if (ok
2188               && STMT_VINFO_LIVE_P (stmt_info)
2189               && !PURE_SLP_STMT (stmt_info))
2190             ok = vectorizable_live_operation (loop_vinfo, stmt_info, NULL, NULL,
2191                                               -1, false, &cost_vec);
2192
2193           if (!ok)
2194             return opt_result::failure_at (phi,
2195                                            "not vectorized: relevant phi not "
2196                                            "supported: %G",
2197                                            static_cast <gimple *> (phi));
2198         }
2199
2200       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
2201            gsi_next (&si))
2202         {
2203           gimple *stmt = gsi_stmt (si);
2204           if (!gimple_clobber_p (stmt)
2205               && !is_gimple_debug (stmt))
2206             {
2207               opt_result res
2208                 = vect_analyze_stmt (loop_vinfo,
2209                                      loop_vinfo->lookup_stmt (stmt),
2210                                      &need_to_vectorize,
2211                                      NULL, NULL, &cost_vec);
2212               if (!res)
2213                 return res;
2214             }
2215         }
2216     } /* bbs */
2217
2218   add_stmt_costs (loop_vinfo->vector_costs, &cost_vec);
2219
2220   /* All operations in the loop are either irrelevant (deal with loop
2221      control, or dead), or only used outside the loop and can be moved
2222      out of the loop (e.g. invariants, inductions).  The loop can be
2223      optimized away by scalar optimizations.  We're better off not
2224      touching this loop.  */
2225   if (!need_to_vectorize)
2226     {
2227       if (dump_enabled_p ())
2228         dump_printf_loc (MSG_NOTE, vect_location,
2229                          "All the computation can be taken out of the loop.\n");
2230       return opt_result::failure_at
2231         (vect_location,
2232          "not vectorized: redundant loop. no profit to vectorize.\n");
2233     }
2234
2235   return opt_result::success ();
2236 }
2237
2238 /* Return true if we know that the iteration count is smaller than the
2239    vectorization factor.  Return false if it isn't, or if we can't be sure
2240    either way.  */
2241
2242 static bool
2243 vect_known_niters_smaller_than_vf (loop_vec_info loop_vinfo)
2244 {
2245   unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2246
2247   HOST_WIDE_INT max_niter;
2248   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2249     max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
2250   else
2251     max_niter = max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
2252
2253   if (max_niter != -1 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
2254     return true;
2255
2256   return false;
2257 }
2258
2259 /* Analyze the cost of the loop described by LOOP_VINFO.  Decide if it
2260    is worthwhile to vectorize.  Return 1 if definitely yes, 0 if
2261    definitely no, or -1 if it's worth retrying.  */
2262
2263 static int
2264 vect_analyze_loop_costing (loop_vec_info loop_vinfo,
2265                            unsigned *suggested_unroll_factor)
2266 {
2267   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2268   unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2269
2270   /* Only loops that can handle partially-populated vectors can have iteration
2271      counts less than the vectorization factor.  */
2272   if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2273       && vect_known_niters_smaller_than_vf (loop_vinfo))
2274     {
2275       if (dump_enabled_p ())
2276         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2277                          "not vectorized: iteration count smaller than "
2278                          "vectorization factor.\n");
2279       return 0;
2280     }
2281
2282   /* If we know the number of iterations we can do better, for the
2283      epilogue we can also decide whether the main loop leaves us
2284      with enough iterations, prefering a smaller vector epilog then
2285      also possibly used for the case we skip the vector loop.  */
2286   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2287     {
2288       widest_int scalar_niters
2289         = wi::to_widest (LOOP_VINFO_NITERSM1 (loop_vinfo)) + 1;
2290       if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2291         {
2292           loop_vec_info orig_loop_vinfo
2293             = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2294           unsigned lowest_vf
2295             = constant_lower_bound (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo));
2296           int prolog_peeling = 0;
2297           if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2298             prolog_peeling = LOOP_VINFO_PEELING_FOR_ALIGNMENT (orig_loop_vinfo);
2299           if (prolog_peeling >= 0
2300               && known_eq (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo),
2301                            lowest_vf))
2302             {
2303               unsigned gap
2304                 = LOOP_VINFO_PEELING_FOR_GAPS (orig_loop_vinfo) ? 1 : 0;
2305               scalar_niters = ((scalar_niters - gap - prolog_peeling)
2306                                % lowest_vf + gap);
2307             }
2308         }
2309       /* Reject vectorizing for a single scalar iteration, even if
2310          we could in principle implement that using partial vectors.  */
2311       unsigned peeling_gap = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo);
2312       if (scalar_niters <= peeling_gap + 1)
2313         {
2314           if (dump_enabled_p ())
2315             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2316                              "not vectorized: loop only has a single "
2317                              "scalar iteration.\n");
2318           return 0;
2319         }
2320
2321       if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2322         {
2323           /* Check that the loop processes at least one full vector.  */
2324           poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2325           if (known_lt (scalar_niters, vf))
2326             {
2327               if (dump_enabled_p ())
2328                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2329                                  "loop does not have enough iterations "
2330                                  "to support vectorization.\n");
2331               return 0;
2332             }
2333
2334           /* If we need to peel an extra epilogue iteration to handle data
2335              accesses with gaps, check that there are enough scalar iterations
2336              available.
2337
2338              The check above is redundant with this one when peeling for gaps,
2339              but the distinction is useful for diagnostics.  */
2340           if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2341               && known_le (scalar_niters, vf))
2342             {
2343               if (dump_enabled_p ())
2344                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2345                                  "loop does not have enough iterations "
2346                                  "to support peeling for gaps.\n");
2347               return 0;
2348             }
2349         }
2350     }
2351
2352   /* If using the "very cheap" model. reject cases in which we'd keep
2353      a copy of the scalar code (even if we might be able to vectorize it).  */
2354   if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
2355       && (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2356           || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2357           || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)))
2358     {
2359       if (dump_enabled_p ())
2360         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2361                          "some scalar iterations would need to be peeled\n");
2362       return 0;
2363     }
2364
2365   int min_profitable_iters, min_profitable_estimate;
2366   vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
2367                                       &min_profitable_estimate,
2368                                       suggested_unroll_factor);
2369
2370   if (min_profitable_iters < 0)
2371     {
2372       if (dump_enabled_p ())
2373         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2374                          "not vectorized: vectorization not profitable.\n");
2375       if (dump_enabled_p ())
2376         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2377                          "not vectorized: vector version will never be "
2378                          "profitable.\n");
2379       return -1;
2380     }
2381
2382   int min_scalar_loop_bound = (param_min_vect_loop_bound
2383                                * assumed_vf);
2384
2385   /* Use the cost model only if it is more conservative than user specified
2386      threshold.  */
2387   unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
2388                                     min_profitable_iters);
2389
2390   LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
2391
2392   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2393       && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
2394     {
2395       if (dump_enabled_p ())
2396         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2397                          "not vectorized: vectorization not profitable.\n");
2398       if (dump_enabled_p ())
2399         dump_printf_loc (MSG_NOTE, vect_location,
2400                          "not vectorized: iteration count smaller than user "
2401                          "specified loop bound parameter or minimum profitable "
2402                          "iterations (whichever is more conservative).\n");
2403       return 0;
2404     }
2405
2406   /* The static profitablity threshold min_profitable_estimate includes
2407      the cost of having to check at runtime whether the scalar loop
2408      should be used instead.  If it turns out that we don't need or want
2409      such a check, the threshold we should use for the static estimate
2410      is simply the point at which the vector loop becomes more profitable
2411      than the scalar loop.  */
2412   if (min_profitable_estimate > min_profitable_iters
2413       && !LOOP_REQUIRES_VERSIONING (loop_vinfo)
2414       && !LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2415       && !LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2416       && !vect_apply_runtime_profitability_check_p (loop_vinfo))
2417     {
2418       if (dump_enabled_p ())
2419         dump_printf_loc (MSG_NOTE, vect_location, "no need for a runtime"
2420                          " choice between the scalar and vector loops\n");
2421       min_profitable_estimate = min_profitable_iters;
2422     }
2423
2424   /* If the vector loop needs multiple iterations to be beneficial then
2425      things are probably too close to call, and the conservative thing
2426      would be to stick with the scalar code.  */
2427   if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
2428       && min_profitable_estimate > (int) vect_vf_for_cost (loop_vinfo))
2429     {
2430       if (dump_enabled_p ())
2431         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2432                          "one iteration of the vector loop would be"
2433                          " more expensive than the equivalent number of"
2434                          " iterations of the scalar loop\n");
2435       return 0;
2436     }
2437
2438   HOST_WIDE_INT estimated_niter;
2439
2440   /* If we are vectorizing an epilogue then we know the maximum number of
2441      scalar iterations it will cover is at least one lower than the
2442      vectorization factor of the main loop.  */
2443   if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2444     estimated_niter
2445       = vect_vf_for_cost (LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo)) - 1;
2446   else
2447     {
2448       estimated_niter = estimated_stmt_executions_int (loop);
2449       if (estimated_niter == -1)
2450         estimated_niter = likely_max_stmt_executions_int (loop);
2451     }
2452   if (estimated_niter != -1
2453       && ((unsigned HOST_WIDE_INT) estimated_niter
2454           < MAX (th, (unsigned) min_profitable_estimate)))
2455     {
2456       if (dump_enabled_p ())
2457         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2458                          "not vectorized: estimated iteration count too "
2459                          "small.\n");
2460       if (dump_enabled_p ())
2461         dump_printf_loc (MSG_NOTE, vect_location,
2462                          "not vectorized: estimated iteration count smaller "
2463                          "than specified loop bound parameter or minimum "
2464                          "profitable iterations (whichever is more "
2465                          "conservative).\n");
2466       return -1;
2467     }
2468
2469   return 1;
2470 }
2471
2472 static opt_result
2473 vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
2474                            vec<data_reference_p> *datarefs,
2475                            unsigned int *n_stmts)
2476 {
2477   *n_stmts = 0;
2478   for (unsigned i = 0; i < loop->num_nodes; i++)
2479     for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
2480          !gsi_end_p (gsi); gsi_next (&gsi))
2481       {
2482         gimple *stmt = gsi_stmt (gsi);
2483         if (is_gimple_debug (stmt))
2484           continue;
2485         ++(*n_stmts);
2486         opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs,
2487                                                         NULL, 0);
2488         if (!res)
2489           {
2490             if (is_gimple_call (stmt) && loop->safelen)
2491               {
2492                 tree fndecl = gimple_call_fndecl (stmt), op;
2493                 if (fndecl == NULL_TREE
2494                     && gimple_call_internal_p (stmt, IFN_MASK_CALL))
2495                   {
2496                     fndecl = gimple_call_arg (stmt, 0);
2497                     gcc_checking_assert (TREE_CODE (fndecl) == ADDR_EXPR);
2498                     fndecl = TREE_OPERAND (fndecl, 0);
2499                     gcc_checking_assert (TREE_CODE (fndecl) == FUNCTION_DECL);
2500                   }
2501                 if (fndecl != NULL_TREE)
2502                   {
2503                     cgraph_node *node = cgraph_node::get (fndecl);
2504                     if (node != NULL && node->simd_clones != NULL)
2505                       {
2506                         unsigned int j, n = gimple_call_num_args (stmt);
2507                         for (j = 0; j < n; j++)
2508                           {
2509                             op = gimple_call_arg (stmt, j);
2510                             if (DECL_P (op)
2511                                 || (REFERENCE_CLASS_P (op)
2512                                     && get_base_address (op)))
2513                               break;
2514                           }
2515                         op = gimple_call_lhs (stmt);
2516                         /* Ignore #pragma omp declare simd functions
2517                            if they don't have data references in the
2518                            call stmt itself.  */
2519                         if (j == n
2520                             && !(op
2521                                  && (DECL_P (op)
2522                                      || (REFERENCE_CLASS_P (op)
2523                                          && get_base_address (op)))))
2524                           continue;
2525                       }
2526                   }
2527               }
2528             return res;
2529           }
2530         /* If dependence analysis will give up due to the limit on the
2531            number of datarefs stop here and fail fatally.  */
2532         if (datarefs->length ()
2533             > (unsigned)param_loop_max_datarefs_for_datadeps)
2534           return opt_result::failure_at (stmt, "exceeded param "
2535                                          "loop-max-datarefs-for-datadeps\n");
2536       }
2537   return opt_result::success ();
2538 }
2539
2540 /* Look for SLP-only access groups and turn each individual access into its own
2541    group.  */
2542 static void
2543 vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo)
2544 {
2545   unsigned int i;
2546   struct data_reference *dr;
2547
2548   DUMP_VECT_SCOPE ("vect_dissolve_slp_only_groups");
2549
2550   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2551   FOR_EACH_VEC_ELT (datarefs, i, dr)
2552     {
2553       gcc_assert (DR_REF (dr));
2554       stmt_vec_info stmt_info
2555         = vect_stmt_to_vectorize (loop_vinfo->lookup_stmt (DR_STMT (dr)));
2556
2557       /* Check if the load is a part of an interleaving chain.  */
2558       if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2559         {
2560           stmt_vec_info first_element = DR_GROUP_FIRST_ELEMENT (stmt_info);
2561           dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_element);
2562           unsigned int group_size = DR_GROUP_SIZE (first_element);
2563
2564           /* Check if SLP-only groups.  */
2565           if (!STMT_SLP_TYPE (stmt_info)
2566               && STMT_VINFO_SLP_VECT_ONLY (first_element))
2567             {
2568               /* Dissolve the group.  */
2569               STMT_VINFO_SLP_VECT_ONLY (first_element) = false;
2570
2571               stmt_vec_info vinfo = first_element;
2572               while (vinfo)
2573                 {
2574                   stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (vinfo);
2575                   DR_GROUP_FIRST_ELEMENT (vinfo) = vinfo;
2576                   DR_GROUP_NEXT_ELEMENT (vinfo) = NULL;
2577                   DR_GROUP_SIZE (vinfo) = 1;
2578                   if (STMT_VINFO_STRIDED_P (first_element)
2579                       /* We cannot handle stores with gaps.  */
2580                       || DR_IS_WRITE (dr_info->dr))
2581                     {
2582                       STMT_VINFO_STRIDED_P (vinfo) = true;
2583                       DR_GROUP_GAP (vinfo) = 0;
2584                     }
2585                   else
2586                     DR_GROUP_GAP (vinfo) = group_size - 1;
2587                   /* Duplicate and adjust alignment info, it needs to
2588                      be present on each group leader, see dr_misalignment.  */
2589                   if (vinfo != first_element)
2590                     {
2591                       dr_vec_info *dr_info2 = STMT_VINFO_DR_INFO (vinfo);
2592                       dr_info2->target_alignment = dr_info->target_alignment;
2593                       int misalignment = dr_info->misalignment;
2594                       if (misalignment != DR_MISALIGNMENT_UNKNOWN)
2595                         {
2596                           HOST_WIDE_INT diff
2597                             = (TREE_INT_CST_LOW (DR_INIT (dr_info2->dr))
2598                                - TREE_INT_CST_LOW (DR_INIT (dr_info->dr)));
2599                           unsigned HOST_WIDE_INT align_c
2600                             = dr_info->target_alignment.to_constant ();
2601                           misalignment = (misalignment + diff) % align_c;
2602                         }
2603                       dr_info2->misalignment = misalignment;
2604                     }
2605                   vinfo = next;
2606                 }
2607             }
2608         }
2609     }
2610 }
2611
2612 /* Determine if operating on full vectors for LOOP_VINFO might leave
2613    some scalar iterations still to do.  If so, decide how we should
2614    handle those scalar iterations.  The possibilities are:
2615
2616    (1) Make LOOP_VINFO operate on partial vectors instead of full vectors.
2617        In this case:
2618
2619          LOOP_VINFO_USING_PARTIAL_VECTORS_P == true
2620          LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2621          LOOP_VINFO_PEELING_FOR_NITER == false
2622
2623    (2) Make LOOP_VINFO operate on full vectors and use an epilogue loop
2624        to handle the remaining scalar iterations.  In this case:
2625
2626          LOOP_VINFO_USING_PARTIAL_VECTORS_P == false
2627          LOOP_VINFO_PEELING_FOR_NITER == true
2628
2629        There are two choices:
2630
2631        (2a) Consider vectorizing the epilogue loop at the same VF as the
2632             main loop, but using partial vectors instead of full vectors.
2633             In this case:
2634
2635               LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == true
2636
2637        (2b) Consider vectorizing the epilogue loop at lower VFs only.
2638             In this case:
2639
2640               LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2641  */
2642
2643 opt_result
2644 vect_determine_partial_vectors_and_peeling (loop_vec_info loop_vinfo)
2645 {
2646   /* Determine whether there would be any scalar iterations left over.  */
2647   bool need_peeling_or_partial_vectors_p
2648     = vect_need_peeling_or_partial_vectors_p (loop_vinfo);
2649
2650   /* Decide whether to vectorize the loop with partial vectors.  */
2651   LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2652   LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2653   if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2654       && need_peeling_or_partial_vectors_p)
2655     {
2656       /* For partial-vector-usage=1, try to push the handling of partial
2657          vectors to the epilogue, with the main loop continuing to operate
2658          on full vectors.
2659
2660          If we are unrolling we also do not want to use partial vectors. This
2661          is to avoid the overhead of generating multiple masks and also to
2662          avoid having to execute entire iterations of FALSE masked instructions
2663          when dealing with one or less full iterations.
2664
2665          ??? We could then end up failing to use partial vectors if we
2666          decide to peel iterations into a prologue, and if the main loop
2667          then ends up processing fewer than VF iterations.  */
2668       if ((param_vect_partial_vector_usage == 1
2669            || loop_vinfo->suggested_unroll_factor > 1)
2670           && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2671           && !vect_known_niters_smaller_than_vf (loop_vinfo))
2672         LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2673       else
2674         LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2675     }
2676
2677   if (dump_enabled_p ())
2678     dump_printf_loc (MSG_NOTE, vect_location,
2679                      "operating on %s vectors%s.\n",
2680                      LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2681                      ? "partial" : "full",
2682                      LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2683                      ? " for epilogue loop" : "");
2684
2685   LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2686     = (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2687        && need_peeling_or_partial_vectors_p);
2688
2689   /* We set LOOP_VINFO_USING_SELECT_VL_P as true before loop vectorization
2690      analysis that we don't know whether the loop is vectorized by partial
2691      vectors (More details see tree-vect-loop-manip.cc).
2692
2693      However, SELECT_VL vectorizaton style should only applied on partial
2694      vectorization since SELECT_VL is the GIMPLE IR that calculates the
2695      number of elements to be process for each iteration.
2696
2697      After loop vectorization analysis, Clear LOOP_VINFO_USING_SELECT_VL_P
2698      if it is not partial vectorized loop.  */
2699   if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2700     LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo) = false;
2701
2702   return opt_result::success ();
2703 }
2704
2705 /* Function vect_analyze_loop_2.
2706
2707    Apply a set of analyses on LOOP specified by LOOP_VINFO, the different
2708    analyses will record information in some members of LOOP_VINFO.  FATAL
2709    indicates if some analysis meets fatal error.  If one non-NULL pointer
2710    SUGGESTED_UNROLL_FACTOR is provided, it's intent to be filled with one
2711    worked out suggested unroll factor, while one NULL pointer shows it's
2712    going to apply the suggested unroll factor.  SLP_DONE_FOR_SUGGESTED_UF
2713    is to hold the slp decision when the suggested unroll factor is worked
2714    out.  */
2715 static opt_result
2716 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal,
2717                      unsigned *suggested_unroll_factor,
2718                      bool& slp_done_for_suggested_uf)
2719 {
2720   opt_result ok = opt_result::success ();
2721   int res;
2722   unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
2723   poly_uint64 min_vf = 2;
2724   loop_vec_info orig_loop_vinfo = NULL;
2725
2726   /* If we are dealing with an epilogue then orig_loop_vinfo points to the
2727      loop_vec_info of the first vectorized loop.  */
2728   if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2729     orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2730   else
2731     orig_loop_vinfo = loop_vinfo;
2732   gcc_assert (orig_loop_vinfo);
2733
2734   /* The first group of checks is independent of the vector size.  */
2735   fatal = true;
2736
2737   if (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)
2738       && integer_zerop (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)))
2739     return opt_result::failure_at (vect_location,
2740                                    "not vectorized: simd if(0)\n");
2741
2742   /* Find all data references in the loop (which correspond to vdefs/vuses)
2743      and analyze their evolution in the loop.  */
2744
2745   loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
2746
2747   /* Gather the data references and count stmts in the loop.  */
2748   if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
2749     {
2750       opt_result res
2751         = vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
2752                                      &LOOP_VINFO_DATAREFS (loop_vinfo),
2753                                      &LOOP_VINFO_N_STMTS (loop_vinfo));
2754       if (!res)
2755         {
2756           if (dump_enabled_p ())
2757             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2758                              "not vectorized: loop contains function "
2759                              "calls or data references that cannot "
2760                              "be analyzed\n");
2761           return res;
2762         }
2763       loop_vinfo->shared->save_datarefs ();
2764     }
2765   else
2766     loop_vinfo->shared->check_datarefs ();
2767
2768   /* Analyze the data references and also adjust the minimal
2769      vectorization factor according to the loads and stores.  */
2770
2771   ok = vect_analyze_data_refs (loop_vinfo, &min_vf, &fatal);
2772   if (!ok)
2773     {
2774       if (dump_enabled_p ())
2775         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2776                          "bad data references.\n");
2777       return ok;
2778     }
2779
2780   /* Check if we are applying unroll factor now.  */
2781   bool applying_suggested_uf = loop_vinfo->suggested_unroll_factor > 1;
2782   gcc_assert (!applying_suggested_uf || !suggested_unroll_factor);
2783
2784   /* If the slp decision is false when suggested unroll factor is worked
2785      out, and we are applying suggested unroll factor, we can simply skip
2786      all slp related analyses this time.  */
2787   bool slp = !applying_suggested_uf || slp_done_for_suggested_uf;
2788
2789   /* Classify all cross-iteration scalar data-flow cycles.
2790      Cross-iteration cycles caused by virtual phis are analyzed separately.  */
2791   vect_analyze_scalar_cycles (loop_vinfo, slp);
2792
2793   vect_pattern_recog (loop_vinfo);
2794
2795   vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
2796
2797   /* Analyze the access patterns of the data-refs in the loop (consecutive,
2798      complex, etc.). FORNOW: Only handle consecutive access pattern.  */
2799
2800   ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
2801   if (!ok)
2802     {
2803       if (dump_enabled_p ())
2804         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2805                          "bad data access.\n");
2806       return ok;
2807     }
2808
2809   /* Data-flow analysis to detect stmts that do not need to be vectorized.  */
2810
2811   ok = vect_mark_stmts_to_be_vectorized (loop_vinfo, &fatal);
2812   if (!ok)
2813     {
2814       if (dump_enabled_p ())
2815         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2816                          "unexpected pattern.\n");
2817       return ok;
2818     }
2819
2820   /* While the rest of the analysis below depends on it in some way.  */
2821   fatal = false;
2822
2823   /* Analyze data dependences between the data-refs in the loop
2824      and adjust the maximum vectorization factor according to
2825      the dependences.
2826      FORNOW: fail at the first data dependence that we encounter.  */
2827
2828   ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
2829   if (!ok)
2830     {
2831       if (dump_enabled_p ())
2832         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2833                          "bad data dependence.\n");
2834       return ok;
2835     }
2836   if (max_vf != MAX_VECTORIZATION_FACTOR
2837       && maybe_lt (max_vf, min_vf))
2838     return opt_result::failure_at (vect_location, "bad data dependence.\n");
2839   LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
2840
2841   ok = vect_determine_vectorization_factor (loop_vinfo);
2842   if (!ok)
2843     {
2844       if (dump_enabled_p ())
2845         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2846                          "can't determine vectorization factor.\n");
2847       return ok;
2848     }
2849
2850   /* Compute the scalar iteration cost.  */
2851   vect_compute_single_scalar_iteration_cost (loop_vinfo);
2852
2853   poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2854
2855   if (slp)
2856     {
2857       /* Check the SLP opportunities in the loop, analyze and build
2858          SLP trees.  */
2859       ok = vect_analyze_slp (loop_vinfo, LOOP_VINFO_N_STMTS (loop_vinfo));
2860       if (!ok)
2861         return ok;
2862
2863       /* If there are any SLP instances mark them as pure_slp.  */
2864       slp = vect_make_slp_decision (loop_vinfo);
2865       if (slp)
2866         {
2867           /* Find stmts that need to be both vectorized and SLPed.  */
2868           vect_detect_hybrid_slp (loop_vinfo);
2869
2870           /* Update the vectorization factor based on the SLP decision.  */
2871           vect_update_vf_for_slp (loop_vinfo);
2872
2873           /* Optimize the SLP graph with the vectorization factor fixed.  */
2874           vect_optimize_slp (loop_vinfo);
2875
2876           /* Gather the loads reachable from the SLP graph entries.  */
2877           vect_gather_slp_loads (loop_vinfo);
2878         }
2879     }
2880
2881   bool saved_can_use_partial_vectors_p
2882     = LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo);
2883
2884   /* We don't expect to have to roll back to anything other than an empty
2885      set of rgroups.  */
2886   gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
2887
2888   /* This is the point where we can re-start analysis with SLP forced off.  */
2889 start_over:
2890
2891   /* Apply the suggested unrolling factor, this was determined by the backend
2892      during finish_cost the first time we ran the analyzis for this
2893      vector mode.  */
2894   if (applying_suggested_uf)
2895     LOOP_VINFO_VECT_FACTOR (loop_vinfo) *= loop_vinfo->suggested_unroll_factor;
2896
2897   /* Now the vectorization factor is final.  */
2898   poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2899   gcc_assert (known_ne (vectorization_factor, 0U));
2900
2901   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
2902     {
2903       dump_printf_loc (MSG_NOTE, vect_location,
2904                        "vectorization_factor = ");
2905       dump_dec (MSG_NOTE, vectorization_factor);
2906       dump_printf (MSG_NOTE, ", niters = %wd\n",
2907                    LOOP_VINFO_INT_NITERS (loop_vinfo));
2908     }
2909
2910   if (max_vf != MAX_VECTORIZATION_FACTOR
2911       && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2912     return opt_result::failure_at (vect_location, "bad data dependence.\n");
2913
2914   loop_vinfo->vector_costs = init_cost (loop_vinfo, false);
2915
2916   /* Analyze the alignment of the data-refs in the loop.
2917      Fail if a data reference is found that cannot be vectorized.  */
2918
2919   ok = vect_analyze_data_refs_alignment (loop_vinfo);
2920   if (!ok)
2921     {
2922       if (dump_enabled_p ())
2923         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2924                          "bad data alignment.\n");
2925       return ok;
2926     }
2927
2928   /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2929      It is important to call pruning after vect_analyze_data_ref_accesses,
2930      since we use grouping information gathered by interleaving analysis.  */
2931   ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2932   if (!ok)
2933     return ok;
2934
2935   /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2936      vectorization, since we do not want to add extra peeling or
2937      add versioning for alignment.  */
2938   if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2939     /* This pass will decide on using loop versioning and/or loop peeling in
2940        order to enhance the alignment of data references in the loop.  */
2941     ok = vect_enhance_data_refs_alignment (loop_vinfo);
2942   if (!ok)
2943     return ok;
2944
2945   if (slp)
2946     {
2947       /* Analyze operations in the SLP instances.  Note this may
2948          remove unsupported SLP instances which makes the above
2949          SLP kind detection invalid.  */
2950       unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2951       vect_slp_analyze_operations (loop_vinfo);
2952       if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2953         {
2954           ok = opt_result::failure_at (vect_location,
2955                                        "unsupported SLP instances\n");
2956           goto again;
2957         }
2958
2959       /* Check whether any load in ALL SLP instances is possibly permuted.  */
2960       slp_tree load_node, slp_root;
2961       unsigned i, x;
2962       slp_instance instance;
2963       bool can_use_lanes = true;
2964       FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), x, instance)
2965         {
2966           slp_root = SLP_INSTANCE_TREE (instance);
2967           int group_size = SLP_TREE_LANES (slp_root);
2968           tree vectype = SLP_TREE_VECTYPE (slp_root);
2969           bool loads_permuted = false;
2970           FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
2971             {
2972               if (!SLP_TREE_LOAD_PERMUTATION (load_node).exists ())
2973                 continue;
2974               unsigned j;
2975               stmt_vec_info load_info;
2976               FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load_info)
2977                 if (SLP_TREE_LOAD_PERMUTATION (load_node)[j] != j)
2978                   {
2979                     loads_permuted = true;
2980                     break;
2981                   }
2982             }
2983
2984           /* If the loads and stores can be handled with load/store-lane
2985              instructions record it and move on to the next instance.  */
2986           if (loads_permuted
2987               && SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
2988               && vect_store_lanes_supported (vectype, group_size, false)
2989                    != IFN_LAST)
2990             {
2991               FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
2992                 if (STMT_VINFO_GROUPED_ACCESS
2993                       (SLP_TREE_REPRESENTATIVE (load_node)))
2994                   {
2995                     stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT
2996                         (SLP_TREE_REPRESENTATIVE (load_node));
2997                     /* Use SLP for strided accesses (or if we can't
2998                        load-lanes).  */
2999                     if (STMT_VINFO_STRIDED_P (stmt_vinfo)
3000                         || vect_load_lanes_supported
3001                              (STMT_VINFO_VECTYPE (stmt_vinfo),
3002                               DR_GROUP_SIZE (stmt_vinfo), false) == IFN_LAST)
3003                       break;
3004                   }
3005
3006               can_use_lanes
3007                 = can_use_lanes && i == SLP_INSTANCE_LOADS (instance).length ();
3008
3009               if (can_use_lanes && dump_enabled_p ())
3010                 dump_printf_loc (MSG_NOTE, vect_location,
3011                                  "SLP instance %p can use load/store-lanes\n",
3012                                  (void *) instance);
3013             }
3014           else
3015             {
3016               can_use_lanes = false;
3017               break;
3018             }
3019         }
3020
3021       /* If all SLP instances can use load/store-lanes abort SLP and try again
3022          with SLP disabled.  */
3023       if (can_use_lanes)
3024         {
3025           ok = opt_result::failure_at (vect_location,
3026                                        "Built SLP cancelled: can use "
3027                                        "load/store-lanes\n");
3028           if (dump_enabled_p ())
3029             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3030                              "Built SLP cancelled: all SLP instances support "
3031                              "load/store-lanes\n");
3032           goto again;
3033         }
3034     }
3035
3036   /* Dissolve SLP-only groups.  */
3037   vect_dissolve_slp_only_groups (loop_vinfo);
3038
3039   /* Scan all the remaining operations in the loop that are not subject
3040      to SLP and make sure they are vectorizable.  */
3041   ok = vect_analyze_loop_operations (loop_vinfo);
3042   if (!ok)
3043     {
3044       if (dump_enabled_p ())
3045         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3046                          "bad operation or unsupported loop bound.\n");
3047       return ok;
3048     }
3049
3050   /* For now, we don't expect to mix both masking and length approaches for one
3051      loop, disable it if both are recorded.  */
3052   if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
3053       && !LOOP_VINFO_MASKS (loop_vinfo).is_empty ()
3054       && !LOOP_VINFO_LENS (loop_vinfo).is_empty ())
3055     {
3056       if (dump_enabled_p ())
3057         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3058                          "can't vectorize a loop with partial vectors"
3059                          " because we don't expect to mix different"
3060                          " approaches with partial vectors for the"
3061                          " same loop.\n");
3062       LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
3063     }
3064
3065   /* If we still have the option of using partial vectors,
3066      check whether we can generate the necessary loop controls.  */
3067   if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
3068     {
3069       if (!LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
3070         {
3071           if (!vect_verify_full_masking (loop_vinfo)
3072               && !vect_verify_full_masking_avx512 (loop_vinfo))
3073             LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
3074         }
3075       else /* !LOOP_VINFO_LENS (loop_vinfo).is_empty () */
3076         if (!vect_verify_loop_lens (loop_vinfo))
3077           LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
3078     }
3079
3080   /* If we're vectorizing a loop that uses length "controls" and
3081      can iterate more than once, we apply decrementing IV approach
3082      in loop control.  */
3083   if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
3084       && LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) == vect_partial_vectors_len
3085       && LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) == 0
3086       && !(LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3087            && known_le (LOOP_VINFO_INT_NITERS (loop_vinfo),
3088                         LOOP_VINFO_VECT_FACTOR (loop_vinfo))))
3089     LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo) = true;
3090
3091   /* If a loop uses length controls and has a decrementing loop control IV,
3092      we will normally pass that IV through a MIN_EXPR to calculate the
3093      basis for the length controls.  E.g. in a loop that processes one
3094      element per scalar iteration, the number of elements would be
3095      MIN_EXPR <N, VF>, where N is the number of scalar iterations left.
3096
3097      This MIN_EXPR approach allows us to use pointer IVs with an invariant
3098      step, since only the final iteration of the vector loop can have
3099      inactive lanes.
3100
3101      However, some targets have a dedicated instruction for calculating the
3102      preferred length, given the total number of elements that still need to
3103      be processed.  This is encapsulated in the SELECT_VL internal function.
3104
3105      If the target supports SELECT_VL, we can use it instead of MIN_EXPR
3106      to determine the basis for the length controls.  However, unlike the
3107      MIN_EXPR calculation, the SELECT_VL calculation can decide to make
3108      lanes inactive in any iteration of the vector loop, not just the last
3109      iteration.  This SELECT_VL approach therefore requires us to use pointer
3110      IVs with variable steps.
3111
3112      Once we've decided how many elements should be processed by one
3113      iteration of the vector loop, we need to populate the rgroup controls.
3114      If a loop has multiple rgroups, we need to make sure that those rgroups
3115      "line up" (that is, they must be consistent about which elements are
3116      active and which aren't).  This is done by vect_adjust_loop_lens_control.
3117
3118      In principle, it would be possible to use vect_adjust_loop_lens_control
3119      on either the result of a MIN_EXPR or the result of a SELECT_VL.
3120      However:
3121
3122      (1) In practice, it only makes sense to use SELECT_VL when a vector
3123          operation will be controlled directly by the result.  It is not
3124          worth using SELECT_VL if it would only be the input to other
3125          calculations.
3126
3127      (2) If we use SELECT_VL for an rgroup that has N controls, each associated
3128          pointer IV will need N updates by a variable amount (N-1 updates
3129          within the iteration and 1 update to move to the next iteration).
3130
3131      Because of this, we prefer to use the MIN_EXPR approach whenever there
3132      is more than one length control.
3133
3134      In addition, SELECT_VL always operates to a granularity of 1 unit.
3135      If we wanted to use it to control an SLP operation on N consecutive
3136      elements, we would need to make the SELECT_VL inputs measure scalar
3137      iterations (rather than elements) and then multiply the SELECT_VL
3138      result by N.  But using SELECT_VL this way is inefficient because
3139      of (1) above.
3140
3141      2. We don't apply SELECT_VL on single-rgroup when both (1) and (2) are
3142         satisfied:
3143
3144      (1). LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) is true.
3145      (2). LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant () is true.
3146
3147      Since SELECT_VL (variable step) will make SCEV analysis fail, we would
3148      then lose the benefits of the subsequent unroll optimizations.  We prefer
3149      using the MIN_EXPR approach in this situation.  */
3150   if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo))
3151     {
3152       tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
3153       if (direct_internal_fn_supported_p (IFN_SELECT_VL, iv_type,
3154                                           OPTIMIZE_FOR_SPEED)
3155           && LOOP_VINFO_LENS (loop_vinfo).length () == 1
3156           && LOOP_VINFO_LENS (loop_vinfo)[0].factor == 1 && !slp
3157           && (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3158               || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant ()))
3159         LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo) = true;
3160     }
3161
3162   /* Decide whether this loop_vinfo should use partial vectors or peeling,
3163      assuming that the loop will be used as a main loop.  We will redo
3164      this analysis later if we instead decide to use the loop as an
3165      epilogue loop.  */
3166   ok = vect_determine_partial_vectors_and_peeling (loop_vinfo);
3167   if (!ok)
3168     return ok;
3169
3170   /* If we're vectorizing an epilogue loop, the vectorized loop either needs
3171      to be able to handle fewer than VF scalars, or needs to have a lower VF
3172      than the main loop.  */
3173   if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
3174       && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
3175     {
3176       poly_uint64 unscaled_vf
3177         = exact_div (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo),
3178                      orig_loop_vinfo->suggested_unroll_factor);
3179       if (maybe_ge (LOOP_VINFO_VECT_FACTOR (loop_vinfo), unscaled_vf))
3180         return opt_result::failure_at (vect_location,
3181                                        "Vectorization factor too high for"
3182                                        " epilogue loop.\n");
3183     }
3184
3185   /* Check the costings of the loop make vectorizing worthwhile.  */
3186   res = vect_analyze_loop_costing (loop_vinfo, suggested_unroll_factor);
3187   if (res < 0)
3188     {
3189       ok = opt_result::failure_at (vect_location,
3190                                    "Loop costings may not be worthwhile.\n");
3191       goto again;
3192     }
3193   if (!res)
3194     return opt_result::failure_at (vect_location,
3195                                    "Loop costings not worthwhile.\n");
3196
3197   /* If an epilogue loop is required make sure we can create one.  */
3198   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
3199       || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
3200       || LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
3201     {
3202       if (dump_enabled_p ())
3203         dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
3204       if (!vect_can_advance_ivs_p (loop_vinfo)
3205           || !slpeel_can_duplicate_loop_p (loop,
3206                                            LOOP_VINFO_IV_EXIT (loop_vinfo),
3207                                            LOOP_VINFO_IV_EXIT (loop_vinfo)))
3208         {
3209           ok = opt_result::failure_at (vect_location,
3210                                        "not vectorized: can't create required "
3211                                        "epilog loop\n");
3212           goto again;
3213         }
3214     }
3215
3216   /* During peeling, we need to check if number of loop iterations is
3217      enough for both peeled prolog loop and vector loop.  This check
3218      can be merged along with threshold check of loop versioning, so
3219      increase threshold for this case if necessary.
3220
3221      If we are analyzing an epilogue we still want to check what its
3222      versioning threshold would be.  If we decide to vectorize the epilogues we
3223      will want to use the lowest versioning threshold of all epilogues and main
3224      loop.  This will enable us to enter a vectorized epilogue even when
3225      versioning the loop.  We can't simply check whether the epilogue requires
3226      versioning though since we may have skipped some versioning checks when
3227      analyzing the epilogue.  For instance, checks for alias versioning will be
3228      skipped when dealing with epilogues as we assume we already checked them
3229      for the main loop.  So instead we always check the 'orig_loop_vinfo'.  */
3230   if (LOOP_REQUIRES_VERSIONING (orig_loop_vinfo))
3231     {
3232       poly_uint64 niters_th = 0;
3233       unsigned int th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
3234
3235       if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
3236         {
3237           /* Niters for peeled prolog loop.  */
3238           if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3239             {
3240               dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
3241               tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
3242               niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
3243             }
3244           else
3245             niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3246         }
3247
3248       /* Niters for at least one iteration of vectorized loop.  */
3249       if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
3250         niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3251       /* One additional iteration because of peeling for gap.  */
3252       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
3253         niters_th += 1;
3254
3255       /*  Use the same condition as vect_transform_loop to decide when to use
3256           the cost to determine a versioning threshold.  */
3257       if (vect_apply_runtime_profitability_check_p (loop_vinfo)
3258           && ordered_p (th, niters_th))
3259         niters_th = ordered_max (poly_uint64 (th), niters_th);
3260
3261       LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
3262     }
3263
3264   gcc_assert (known_eq (vectorization_factor,
3265                         LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
3266
3267   slp_done_for_suggested_uf = slp;
3268
3269   /* Ok to vectorize!  */
3270   LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
3271   return opt_result::success ();
3272
3273 again:
3274   /* Ensure that "ok" is false (with an opt_problem if dumping is enabled).  */
3275   gcc_assert (!ok);
3276
3277   /* Try again with SLP forced off but if we didn't do any SLP there is
3278      no point in re-trying.  */
3279   if (!slp)
3280     return ok;
3281
3282   /* If the slp decision is true when suggested unroll factor is worked
3283      out, and we are applying suggested unroll factor, we don't need to
3284      re-try any more.  */
3285   if (applying_suggested_uf && slp_done_for_suggested_uf)
3286     return ok;
3287
3288   /* If there are reduction chains re-trying will fail anyway.  */
3289   if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
3290     return ok;
3291
3292   /* Likewise if the grouped loads or stores in the SLP cannot be handled
3293      via interleaving or lane instructions.  */
3294   slp_instance instance;
3295   slp_tree node;
3296   unsigned i, j;
3297   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
3298     {
3299       stmt_vec_info vinfo;
3300       vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
3301       if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
3302         continue;
3303       vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
3304       unsigned int size = DR_GROUP_SIZE (vinfo);
3305       tree vectype = STMT_VINFO_VECTYPE (vinfo);
3306       if (vect_store_lanes_supported (vectype, size, false) == IFN_LAST
3307          && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
3308          && ! vect_grouped_store_supported (vectype, size))
3309         return opt_result::failure_at (vinfo->stmt,
3310                                        "unsupported grouped store\n");
3311       FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
3312         {
3313           vinfo = SLP_TREE_REPRESENTATIVE (node);
3314           if (STMT_VINFO_GROUPED_ACCESS (vinfo))
3315             {
3316               vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
3317               bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
3318               size = DR_GROUP_SIZE (vinfo);
3319               vectype = STMT_VINFO_VECTYPE (vinfo);
3320               if (vect_load_lanes_supported (vectype, size, false) == IFN_LAST
3321                   && ! vect_grouped_load_supported (vectype, single_element_p,
3322                                                     size))
3323                 return opt_result::failure_at (vinfo->stmt,
3324                                                "unsupported grouped load\n");
3325             }
3326         }
3327     }
3328
3329   if (dump_enabled_p ())
3330     dump_printf_loc (MSG_NOTE, vect_location,
3331                      "re-trying with SLP disabled\n");
3332
3333   /* Roll back state appropriately.  No SLP this time.  */
3334   slp = false;
3335   /* Restore vectorization factor as it were without SLP.  */
3336   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
3337   /* Free the SLP instances.  */
3338   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
3339     vect_free_slp_instance (instance);
3340   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
3341   /* Reset SLP type to loop_vect on all stmts.  */
3342   for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
3343     {
3344       basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
3345       for (gimple_stmt_iterator si = gsi_start_phis (bb);
3346            !gsi_end_p (si); gsi_next (&si))
3347         {
3348           stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
3349           STMT_SLP_TYPE (stmt_info) = loop_vect;
3350           if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
3351               || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
3352             {
3353               /* vectorizable_reduction adjusts reduction stmt def-types,
3354                  restore them to that of the PHI.  */
3355               STMT_VINFO_DEF_TYPE (STMT_VINFO_REDUC_DEF (stmt_info))
3356                 = STMT_VINFO_DEF_TYPE (stmt_info);
3357               STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize
3358                                         (STMT_VINFO_REDUC_DEF (stmt_info)))
3359                 = STMT_VINFO_DEF_TYPE (stmt_info);
3360             }
3361         }
3362       for (gimple_stmt_iterator si = gsi_start_bb (bb);
3363            !gsi_end_p (si); gsi_next (&si))
3364         {
3365           if (is_gimple_debug (gsi_stmt (si)))
3366             continue;
3367           stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
3368           STMT_SLP_TYPE (stmt_info) = loop_vect;
3369           if (STMT_VINFO_IN_PATTERN_P (stmt_info))
3370             {
3371               stmt_vec_info pattern_stmt_info
3372                 = STMT_VINFO_RELATED_STMT (stmt_info);
3373               if (STMT_VINFO_SLP_VECT_ONLY_PATTERN (pattern_stmt_info))
3374                 STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
3375
3376               gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
3377               STMT_SLP_TYPE (pattern_stmt_info) = loop_vect;
3378               for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
3379                    !gsi_end_p (pi); gsi_next (&pi))
3380                 STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi)))
3381                   = loop_vect;
3382             }
3383         }
3384     }
3385   /* Free optimized alias test DDRS.  */
3386   LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
3387   LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
3388   LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
3389   /* Reset target cost data.  */
3390   delete loop_vinfo->vector_costs;
3391   loop_vinfo->vector_costs = nullptr;
3392   /* Reset accumulated rgroup information.  */
3393   LOOP_VINFO_MASKS (loop_vinfo).mask_set.empty ();
3394   release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo).rgc_vec);
3395   release_vec_loop_controls (&LOOP_VINFO_LENS (loop_vinfo));
3396   /* Reset assorted flags.  */
3397   LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
3398   LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
3399   LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
3400   LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
3401   LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
3402     = saved_can_use_partial_vectors_p;
3403
3404   goto start_over;
3405 }
3406
3407 /* Return true if vectorizing a loop using NEW_LOOP_VINFO appears
3408    to be better than vectorizing it using OLD_LOOP_VINFO.  Assume that
3409    OLD_LOOP_VINFO is better unless something specifically indicates
3410    otherwise.
3411
3412    Note that this deliberately isn't a partial order.  */
3413
3414 static bool
3415 vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo,
3416                           loop_vec_info old_loop_vinfo)
3417 {
3418   struct loop *loop = LOOP_VINFO_LOOP (new_loop_vinfo);
3419   gcc_assert (LOOP_VINFO_LOOP (old_loop_vinfo) == loop);
3420
3421   poly_int64 new_vf = LOOP_VINFO_VECT_FACTOR (new_loop_vinfo);
3422   poly_int64 old_vf = LOOP_VINFO_VECT_FACTOR (old_loop_vinfo);
3423
3424   /* Always prefer a VF of loop->simdlen over any other VF.  */
3425   if (loop->simdlen)
3426     {
3427       bool new_simdlen_p = known_eq (new_vf, loop->simdlen);
3428       bool old_simdlen_p = known_eq (old_vf, loop->simdlen);
3429       if (new_simdlen_p != old_simdlen_p)
3430         return new_simdlen_p;
3431     }
3432
3433   const auto *old_costs = old_loop_vinfo->vector_costs;
3434   const auto *new_costs = new_loop_vinfo->vector_costs;
3435   if (loop_vec_info main_loop = LOOP_VINFO_ORIG_LOOP_INFO (old_loop_vinfo))
3436     return new_costs->better_epilogue_loop_than_p (old_costs, main_loop);
3437
3438   return new_costs->better_main_loop_than_p (old_costs);
3439 }
3440
3441 /* Decide whether to replace OLD_LOOP_VINFO with NEW_LOOP_VINFO.  Return
3442    true if we should.  */
3443
3444 static bool
3445 vect_joust_loop_vinfos (loop_vec_info new_loop_vinfo,
3446                         loop_vec_info old_loop_vinfo)
3447 {
3448   if (!vect_better_loop_vinfo_p (new_loop_vinfo, old_loop_vinfo))
3449     return false;
3450
3451   if (dump_enabled_p ())
3452     dump_printf_loc (MSG_NOTE, vect_location,
3453                      "***** Preferring vector mode %s to vector mode %s\n",
3454                      GET_MODE_NAME (new_loop_vinfo->vector_mode),
3455                      GET_MODE_NAME (old_loop_vinfo->vector_mode));
3456   return true;
3457 }
3458
3459 /* Analyze LOOP with VECTOR_MODES[MODE_I] and as epilogue if MAIN_LOOP_VINFO is
3460    not NULL.  Set AUTODETECTED_VECTOR_MODE if VOIDmode and advance
3461    MODE_I to the next mode useful to analyze.
3462    Return the loop_vinfo on success and wrapped null on failure.  */
3463
3464 static opt_loop_vec_info
3465 vect_analyze_loop_1 (class loop *loop, vec_info_shared *shared,
3466                      const vect_loop_form_info *loop_form_info,
3467                      loop_vec_info main_loop_vinfo,
3468                      const vector_modes &vector_modes, unsigned &mode_i,
3469                      machine_mode &autodetected_vector_mode,
3470                      bool &fatal)
3471 {
3472   loop_vec_info loop_vinfo
3473     = vect_create_loop_vinfo (loop, shared, loop_form_info, main_loop_vinfo);
3474
3475   machine_mode vector_mode = vector_modes[mode_i];
3476   loop_vinfo->vector_mode = vector_mode;
3477   unsigned int suggested_unroll_factor = 1;
3478   bool slp_done_for_suggested_uf = false;
3479
3480   /* Run the main analysis.  */
3481   opt_result res = vect_analyze_loop_2 (loop_vinfo, fatal,
3482                                         &suggested_unroll_factor,
3483                                         slp_done_for_suggested_uf);
3484   if (dump_enabled_p ())
3485     dump_printf_loc (MSG_NOTE, vect_location,
3486                      "***** Analysis %s with vector mode %s\n",
3487                      res ? "succeeded" : " failed",
3488                      GET_MODE_NAME (loop_vinfo->vector_mode));
3489
3490   if (res && !main_loop_vinfo && suggested_unroll_factor > 1)
3491     {
3492       if (dump_enabled_p ())
3493         dump_printf_loc (MSG_NOTE, vect_location,
3494                          "***** Re-trying analysis for unrolling"
3495                          " with unroll factor %d and slp %s.\n",
3496                          suggested_unroll_factor,
3497                          slp_done_for_suggested_uf ? "on" : "off");
3498       loop_vec_info unroll_vinfo
3499         = vect_create_loop_vinfo (loop, shared, loop_form_info, main_loop_vinfo);
3500       unroll_vinfo->vector_mode = vector_mode;
3501       unroll_vinfo->suggested_unroll_factor = suggested_unroll_factor;
3502       opt_result new_res = vect_analyze_loop_2 (unroll_vinfo, fatal, NULL,
3503                                                 slp_done_for_suggested_uf);
3504       if (new_res)
3505         {
3506           delete loop_vinfo;
3507           loop_vinfo = unroll_vinfo;
3508         }
3509       else
3510         delete unroll_vinfo;
3511     }
3512
3513   /* Remember the autodetected vector mode.  */
3514   if (vector_mode == VOIDmode)
3515     autodetected_vector_mode = loop_vinfo->vector_mode;
3516
3517   /* Advance mode_i, first skipping modes that would result in the
3518      same analysis result.  */
3519   while (mode_i + 1 < vector_modes.length ()
3520          && vect_chooses_same_modes_p (loop_vinfo,
3521                                        vector_modes[mode_i + 1]))
3522     {
3523       if (dump_enabled_p ())
3524         dump_printf_loc (MSG_NOTE, vect_location,
3525                          "***** The result for vector mode %s would"
3526                          " be the same\n",
3527                          GET_MODE_NAME (vector_modes[mode_i + 1]));
3528       mode_i += 1;
3529     }
3530   if (mode_i + 1 < vector_modes.length ()
3531       && VECTOR_MODE_P (autodetected_vector_mode)
3532       && (related_vector_mode (vector_modes[mode_i + 1],
3533                                GET_MODE_INNER (autodetected_vector_mode))
3534           == autodetected_vector_mode)
3535       && (related_vector_mode (autodetected_vector_mode,
3536                                GET_MODE_INNER (vector_modes[mode_i + 1]))
3537           == vector_modes[mode_i + 1]))
3538     {
3539       if (dump_enabled_p ())
3540         dump_printf_loc (MSG_NOTE, vect_location,
3541                          "***** Skipping vector mode %s, which would"
3542                          " repeat the analysis for %s\n",
3543                          GET_MODE_NAME (vector_modes[mode_i + 1]),
3544                          GET_MODE_NAME (autodetected_vector_mode));
3545       mode_i += 1;
3546     }
3547   mode_i++;
3548
3549   if (!res)
3550     {
3551       delete loop_vinfo;
3552       if (fatal)
3553         gcc_checking_assert (main_loop_vinfo == NULL);
3554       return opt_loop_vec_info::propagate_failure (res);
3555     }
3556
3557   return opt_loop_vec_info::success (loop_vinfo);
3558 }
3559
3560 /* Function vect_analyze_loop.
3561
3562    Apply a set of analyses on LOOP, and create a loop_vec_info struct
3563    for it.  The different analyses will record information in the
3564    loop_vec_info struct.  */
3565 opt_loop_vec_info
3566 vect_analyze_loop (class loop *loop, vec_info_shared *shared)
3567 {
3568   DUMP_VECT_SCOPE ("analyze_loop_nest");
3569
3570   if (loop_outer (loop)
3571       && loop_vec_info_for_loop (loop_outer (loop))
3572       && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
3573     return opt_loop_vec_info::failure_at (vect_location,
3574                                           "outer-loop already vectorized.\n");
3575
3576   if (!find_loop_nest (loop, &shared->loop_nest))
3577     return opt_loop_vec_info::failure_at
3578       (vect_location,
3579        "not vectorized: loop nest containing two or more consecutive inner"
3580        " loops cannot be vectorized\n");
3581
3582   /* Analyze the loop form.  */
3583   vect_loop_form_info loop_form_info;
3584   opt_result res = vect_analyze_loop_form (loop, &loop_form_info);
3585   if (!res)
3586     {
3587       if (dump_enabled_p ())
3588         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3589                          "bad loop form.\n");
3590       return opt_loop_vec_info::propagate_failure (res);
3591     }
3592   if (!integer_onep (loop_form_info.assumptions))
3593     {
3594       /* We consider to vectorize this loop by versioning it under
3595          some assumptions.  In order to do this, we need to clear
3596          existing information computed by scev and niter analyzer.  */
3597       scev_reset_htab ();
3598       free_numbers_of_iterations_estimates (loop);
3599       /* Also set flag for this loop so that following scev and niter
3600          analysis are done under the assumptions.  */
3601       loop_constraint_set (loop, LOOP_C_FINITE);
3602     }
3603   else
3604     /* Clear the existing niter information to make sure the nonwrapping flag
3605        will be calculated and set appropriately.  */
3606     free_numbers_of_iterations_estimates (loop);
3607
3608   auto_vector_modes vector_modes;
3609   /* Autodetect first vector size we try.  */
3610   vector_modes.safe_push (VOIDmode);
3611   unsigned int autovec_flags
3612     = targetm.vectorize.autovectorize_vector_modes (&vector_modes,
3613                                                     loop->simdlen != 0);
3614   bool pick_lowest_cost_p = ((autovec_flags & VECT_COMPARE_COSTS)
3615                              && !unlimited_cost_model (loop));
3616   machine_mode autodetected_vector_mode = VOIDmode;
3617   opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL);
3618   unsigned int mode_i = 0;
3619   unsigned HOST_WIDE_INT simdlen = loop->simdlen;
3620
3621   /* Keep track of the VF for each mode.  Initialize all to 0 which indicates
3622      a mode has not been analyzed.  */
3623   auto_vec<poly_uint64, 8> cached_vf_per_mode;
3624   for (unsigned i = 0; i < vector_modes.length (); ++i)
3625     cached_vf_per_mode.safe_push (0);
3626
3627   /* First determine the main loop vectorization mode, either the first
3628      one that works, starting with auto-detecting the vector mode and then
3629      following the targets order of preference, or the one with the
3630      lowest cost if pick_lowest_cost_p.  */
3631   while (1)
3632     {
3633       bool fatal;
3634       unsigned int last_mode_i = mode_i;
3635       /* Set cached VF to -1 prior to analysis, which indicates a mode has
3636          failed.  */
3637       cached_vf_per_mode[last_mode_i] = -1;
3638       opt_loop_vec_info loop_vinfo
3639         = vect_analyze_loop_1 (loop, shared, &loop_form_info,
3640                                NULL, vector_modes, mode_i,
3641                                autodetected_vector_mode, fatal);
3642       if (fatal)
3643         break;
3644
3645       if (loop_vinfo)
3646         {
3647           /*  Analysis has been successful so update the VF value.  The
3648               VF should always be a multiple of unroll_factor and we want to
3649               capture the original VF here.  */
3650           cached_vf_per_mode[last_mode_i]
3651             = exact_div (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
3652                          loop_vinfo->suggested_unroll_factor);
3653           /* Once we hit the desired simdlen for the first time,
3654              discard any previous attempts.  */
3655           if (simdlen
3656               && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), simdlen))
3657             {
3658               delete first_loop_vinfo;
3659               first_loop_vinfo = opt_loop_vec_info::success (NULL);
3660               simdlen = 0;
3661             }
3662           else if (pick_lowest_cost_p
3663                    && first_loop_vinfo
3664                    && vect_joust_loop_vinfos (loop_vinfo, first_loop_vinfo))
3665             {
3666               /* Pick loop_vinfo over first_loop_vinfo.  */
3667               delete first_loop_vinfo;
3668               first_loop_vinfo = opt_loop_vec_info::success (NULL);
3669             }
3670           if (first_loop_vinfo == NULL)
3671             first_loop_vinfo = loop_vinfo;
3672           else
3673             {
3674               delete loop_vinfo;
3675               loop_vinfo = opt_loop_vec_info::success (NULL);
3676             }
3677
3678           /* Commit to first_loop_vinfo if we have no reason to try
3679              alternatives.  */
3680           if (!simdlen && !pick_lowest_cost_p)
3681             break;
3682         }
3683       if (mode_i == vector_modes.length ()
3684           || autodetected_vector_mode == VOIDmode)
3685         break;
3686
3687       /* Try the next biggest vector size.  */
3688       if (dump_enabled_p ())
3689         dump_printf_loc (MSG_NOTE, vect_location,
3690                          "***** Re-trying analysis with vector mode %s\n",
3691                          GET_MODE_NAME (vector_modes[mode_i]));
3692     }
3693   if (!first_loop_vinfo)
3694     return opt_loop_vec_info::propagate_failure (res);
3695
3696   if (dump_enabled_p ())
3697     dump_printf_loc (MSG_NOTE, vect_location,
3698                      "***** Choosing vector mode %s\n",
3699                      GET_MODE_NAME (first_loop_vinfo->vector_mode));
3700
3701   /* Only vectorize epilogues if PARAM_VECT_EPILOGUES_NOMASK is
3702      enabled, SIMDUID is not set, it is the innermost loop and we have
3703      either already found the loop's SIMDLEN or there was no SIMDLEN to
3704      begin with.
3705      TODO: Enable epilogue vectorization for loops with SIMDUID set.  */
3706   bool vect_epilogues = (!simdlen
3707                          && loop->inner == NULL
3708                          && param_vect_epilogues_nomask
3709                          && LOOP_VINFO_PEELING_FOR_NITER (first_loop_vinfo)
3710                            /* No code motion support for multiple epilogues so for now
3711                               not supported when multiple exits.  */
3712                          && !LOOP_VINFO_EARLY_BREAKS (first_loop_vinfo)
3713                          && !loop->simduid);
3714   if (!vect_epilogues)
3715     return first_loop_vinfo;
3716
3717   /* Now analyze first_loop_vinfo for epilogue vectorization.  */
3718   poly_uint64 lowest_th = LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo);
3719
3720   /* For epilogues start the analysis from the first mode.  The motivation
3721      behind starting from the beginning comes from cases where the VECTOR_MODES
3722      array may contain length-agnostic and length-specific modes.  Their
3723      ordering is not guaranteed, so we could end up picking a mode for the main
3724      loop that is after the epilogue's optimal mode.  */
3725   vector_modes[0] = autodetected_vector_mode;
3726   mode_i = 0;
3727
3728   bool supports_partial_vectors =
3729     partial_vectors_supported_p () && param_vect_partial_vector_usage != 0;
3730   poly_uint64 first_vinfo_vf = LOOP_VINFO_VECT_FACTOR (first_loop_vinfo);
3731
3732   while (1)
3733     {
3734       /* If the target does not support partial vectors we can shorten the
3735          number of modes to analyze for the epilogue as we know we can't pick a
3736          mode that would lead to a VF at least as big as the
3737          FIRST_VINFO_VF.  */
3738       if (!supports_partial_vectors
3739           && maybe_ge (cached_vf_per_mode[mode_i], first_vinfo_vf))
3740         {
3741           mode_i++;
3742           if (mode_i == vector_modes.length ())
3743             break;
3744           continue;
3745         }
3746
3747       if (dump_enabled_p ())
3748         dump_printf_loc (MSG_NOTE, vect_location,
3749                          "***** Re-trying epilogue analysis with vector "
3750                          "mode %s\n", GET_MODE_NAME (vector_modes[mode_i]));
3751
3752       bool fatal;
3753       opt_loop_vec_info loop_vinfo
3754         = vect_analyze_loop_1 (loop, shared, &loop_form_info,
3755                                first_loop_vinfo,
3756                                vector_modes, mode_i,
3757                                autodetected_vector_mode, fatal);
3758       if (fatal)
3759         break;
3760
3761       if (loop_vinfo)
3762         {
3763           if (pick_lowest_cost_p)
3764             {
3765               /* Keep trying to roll back vectorization attempts while the
3766                  loop_vec_infos they produced were worse than this one.  */
3767               vec<loop_vec_info> &vinfos = first_loop_vinfo->epilogue_vinfos;
3768               while (!vinfos.is_empty ()
3769                      && vect_joust_loop_vinfos (loop_vinfo, vinfos.last ()))
3770                 {
3771                   gcc_assert (vect_epilogues);
3772                   delete vinfos.pop ();
3773                 }
3774             }
3775           /* For now only allow one epilogue loop.  */
3776           if (first_loop_vinfo->epilogue_vinfos.is_empty ())
3777             {
3778               first_loop_vinfo->epilogue_vinfos.safe_push (loop_vinfo);
3779               poly_uint64 th = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
3780               gcc_assert (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
3781                           || maybe_ne (lowest_th, 0U));
3782               /* Keep track of the known smallest versioning
3783                  threshold.  */
3784               if (ordered_p (lowest_th, th))
3785                 lowest_th = ordered_min (lowest_th, th);
3786             }
3787           else
3788             {
3789               delete loop_vinfo;
3790               loop_vinfo = opt_loop_vec_info::success (NULL);
3791             }
3792
3793           /* For now only allow one epilogue loop, but allow
3794              pick_lowest_cost_p to replace it, so commit to the
3795              first epilogue if we have no reason to try alternatives.  */
3796           if (!pick_lowest_cost_p)
3797             break;
3798         }
3799
3800       if (mode_i == vector_modes.length ())
3801         break;
3802
3803     }
3804
3805   if (!first_loop_vinfo->epilogue_vinfos.is_empty ())
3806     {
3807       LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo) = lowest_th;
3808       if (dump_enabled_p ())
3809         dump_printf_loc (MSG_NOTE, vect_location,
3810                          "***** Choosing epilogue vector mode %s\n",
3811                          GET_MODE_NAME
3812                            (first_loop_vinfo->epilogue_vinfos[0]->vector_mode));
3813     }
3814
3815   return first_loop_vinfo;
3816 }
3817
3818 /* Return true if there is an in-order reduction function for CODE, storing
3819    it in *REDUC_FN if so.  */
3820
3821 static bool
3822 fold_left_reduction_fn (code_helper code, internal_fn *reduc_fn)
3823 {
3824   /* We support MINUS_EXPR by negating the operand.  This also preserves an
3825      initial -0.0 since -0.0 - 0.0 (neutral op for MINUS_EXPR) == -0.0 +
3826      (-0.0) = -0.0.  */
3827   if (code == PLUS_EXPR || code == MINUS_EXPR)
3828     {
3829       *reduc_fn = IFN_FOLD_LEFT_PLUS;
3830       return true;
3831     }
3832   return false;
3833 }
3834
3835 /* Function reduction_fn_for_scalar_code
3836
3837    Input:
3838    CODE - tree_code of a reduction operations.
3839
3840    Output:
3841    REDUC_FN - the corresponding internal function to be used to reduce the
3842       vector of partial results into a single scalar result, or IFN_LAST
3843       if the operation is a supported reduction operation, but does not have
3844       such an internal function.
3845
3846    Return FALSE if CODE currently cannot be vectorized as reduction.  */
3847
3848 bool
3849 reduction_fn_for_scalar_code (code_helper code, internal_fn *reduc_fn)
3850 {
3851   if (code.is_tree_code ())
3852     switch (tree_code (code))
3853       {
3854       case MAX_EXPR:
3855         *reduc_fn = IFN_REDUC_MAX;
3856         return true;
3857
3858       case MIN_EXPR:
3859         *reduc_fn = IFN_REDUC_MIN;
3860         return true;
3861
3862       case PLUS_EXPR:
3863         *reduc_fn = IFN_REDUC_PLUS;
3864         return true;
3865
3866       case BIT_AND_EXPR:
3867         *reduc_fn = IFN_REDUC_AND;
3868         return true;
3869
3870       case BIT_IOR_EXPR:
3871         *reduc_fn = IFN_REDUC_IOR;
3872         return true;
3873
3874       case BIT_XOR_EXPR:
3875         *reduc_fn = IFN_REDUC_XOR;
3876         return true;
3877
3878       case MULT_EXPR:
3879       case MINUS_EXPR:
3880         *reduc_fn = IFN_LAST;
3881         return true;
3882
3883       default:
3884         return false;
3885       }
3886   else
3887     switch (combined_fn (code))
3888       {
3889       CASE_CFN_FMAX:
3890         *reduc_fn = IFN_REDUC_FMAX;
3891         return true;
3892
3893       CASE_CFN_FMIN:
3894         *reduc_fn = IFN_REDUC_FMIN;
3895         return true;
3896
3897       default:
3898         return false;
3899       }
3900 }
3901
3902 /* If there is a neutral value X such that a reduction would not be affected
3903    by the introduction of additional X elements, return that X, otherwise
3904    return null.  CODE is the code of the reduction and SCALAR_TYPE is type
3905    of the scalar elements.  If the reduction has just a single initial value
3906    then INITIAL_VALUE is that value, otherwise it is null.
3907    If AS_INITIAL is TRUE the value is supposed to be used as initial value.
3908    In that case no signed zero is returned.  */
3909
3910 tree
3911 neutral_op_for_reduction (tree scalar_type, code_helper code,
3912                           tree initial_value, bool as_initial)
3913 {
3914   if (code.is_tree_code ())
3915     switch (tree_code (code))
3916       {
3917       case DOT_PROD_EXPR:
3918       case SAD_EXPR:
3919       case MINUS_EXPR:
3920       case BIT_IOR_EXPR:
3921       case BIT_XOR_EXPR:
3922         return build_zero_cst (scalar_type);
3923       case WIDEN_SUM_EXPR:
3924       case PLUS_EXPR:
3925         if (!as_initial && HONOR_SIGNED_ZEROS (scalar_type))
3926           return build_real (scalar_type, dconstm0);
3927         else
3928           return build_zero_cst (scalar_type);
3929
3930       case MULT_EXPR:
3931         return build_one_cst (scalar_type);
3932
3933       case BIT_AND_EXPR:
3934         return build_all_ones_cst (scalar_type);
3935
3936       case MAX_EXPR:
3937       case MIN_EXPR:
3938         return initial_value;
3939
3940       default:
3941         return NULL_TREE;
3942       }
3943   else
3944     switch (combined_fn (code))
3945       {
3946       CASE_CFN_FMIN:
3947       CASE_CFN_FMAX:
3948         return initial_value;
3949
3950       default:
3951         return NULL_TREE;
3952       }
3953 }
3954
3955 /* Error reporting helper for vect_is_simple_reduction below.  GIMPLE statement
3956    STMT is printed with a message MSG. */
3957
3958 static void
3959 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
3960 {
3961   dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt);
3962 }
3963
3964 /* Return true if we need an in-order reduction for operation CODE
3965    on type TYPE.  NEED_WRAPPING_INTEGRAL_OVERFLOW is true if integer
3966    overflow must wrap.  */
3967
3968 bool
3969 needs_fold_left_reduction_p (tree type, code_helper code)
3970 {
3971   /* CHECKME: check for !flag_finite_math_only too?  */
3972   if (SCALAR_FLOAT_TYPE_P (type))
3973     {
3974       if (code.is_tree_code ())
3975         switch (tree_code (code))
3976           {
3977           case MIN_EXPR:
3978           case MAX_EXPR:
3979             return false;
3980
3981           default:
3982             return !flag_associative_math;
3983           }
3984       else
3985         switch (combined_fn (code))
3986           {
3987           CASE_CFN_FMIN:
3988           CASE_CFN_FMAX:
3989             return false;
3990
3991           default:
3992             return !flag_associative_math;
3993           }
3994     }
3995
3996   if (INTEGRAL_TYPE_P (type))
3997     return (!code.is_tree_code ()
3998             || !operation_no_trapping_overflow (type, tree_code (code)));
3999
4000   if (SAT_FIXED_POINT_TYPE_P (type))
4001     return true;
4002
4003   return false;
4004 }
4005
4006 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG
4007    has a handled computation expression.  Store the main reduction
4008    operation in *CODE.
        PATH receives the chain of (operand iterator, use) pairs walked
        from the use of LOOP_ARG on PHI back to a use of the PHI result;
        it is printed under TDF_DETAILS using LOC.  Besides an invalid
        statement on the path, the function also returns false when the
        walk finds no path, when the reduction value ends up negated
        along the path, or when no operation other than nop-conversions
        is found (*CODE is then still ERROR_MARK).  */
4009
4010 static bool
4011 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
4012                       tree loop_arg, code_helper *code,
4013                       vec<std::pair<ssa_op_iter, use_operand_p> > &path)
4014 {
       /* SSA names already entered by the walk below.  */
4015   auto_bitmap visited;
4016   tree lookfor = PHI_RESULT (phi);
4017   ssa_op_iter curri;
       /* Position CURR on the PHI argument that is LOOP_ARG.  */
4018   use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
4019   while (USE_FROM_PTR (curr) != loop_arg)
4020     curr = op_iter_next_use (&curri);
       /* NOTE(review): presumably this marks the PHI iterator as exhausted
          so that backtracking to this entry never tries the remaining PHI
          arguments -- confirm against the ssa_op_iter implementation.  */
4021   curri.i = curri.numops;
       /* Depth-first walk over SSA use operands, with PATH as the stack of
          the uses currently being followed, until a use of LOOKFOR (the
          PHI result) is reached or the stack runs empty.  */
4022   do
4023     {
4024       path.safe_push (std::make_pair (curri, curr));
4025       tree use = USE_FROM_PTR (curr);
4026       if (use == lookfor)
4027         break;
4028       gimple *def = SSA_NAME_DEF_STMT (use);
           /* Dead end: USE has no real definition or is defined outside
              LOOP, so backtrack.  */
4029       if (gimple_nop_p (def)
4030           || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
4031         {
4032 pop:
               /* Pop stack entries until one of them still has an unvisited
                  SSA operand to continue with.  */
4033           do
4034             {
4035               std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
4036               curri = x.first;
4037               curr = x.second;
4038               do
4039                 curr = op_iter_next_use (&curri);
4040               /* Skip already visited or non-SSA operands (from iterating
4041                  over PHI args).  */
4042               while (curr != NULL_USE_OPERAND_P
4043                      && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
4044                          || ! bitmap_set_bit (visited,
4045                                               SSA_NAME_VERSION
4046                                                 (USE_FROM_PTR (curr)))));
4047             }
4048           while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
               /* The stack ran empty without reaching LOOKFOR; PATH is left
                  empty, which is reported as failure below.  */
4049           if (curr == NULL_USE_OPERAND_P)
4050             break;
4051         }
4052       else
4053         {
               /* Descend into the operands of the defining statement,
                  starting with its first unvisited SSA operand.  */
4054           if (gimple_code (def) == GIMPLE_PHI)
4055             curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
4056           else
4057             curr = op_iter_init_use (&curri, def, SSA_OP_USE);
4058           while (curr != NULL_USE_OPERAND_P
4059                  && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
4060                      || ! bitmap_set_bit (visited,
4061                                           SSA_NAME_VERSION
4062                                             (USE_FROM_PTR (curr)))))
4063             curr = op_iter_next_use (&curri);
4064           if (curr == NULL_USE_OPERAND_P)
4065             goto pop;
4066         }
4067     }
4068   while (1);
4069   if (dump_file && (dump_flags & TDF_DETAILS))
4070     {
4071       dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
4072       unsigned i;
4073       std::pair<ssa_op_iter, use_operand_p> *x;
4074       FOR_EACH_VEC_ELT (path, i, x)
4075         dump_printf (MSG_NOTE, "%T ", USE_FROM_PTR (x->second));
4076       dump_printf (MSG_NOTE, "\n");
4077     }
4078
4079   /* Check whether the reduction path detected is valid.  */
4080   bool fail = path.length () == 0;
       /* NEG tracks whether the reduction value is currently negated;
          SIGN is the signedness of the first non-conversion operation.  */
4081   bool neg = false;
4082   int sign = -1;
4083   *code = ERROR_MARK;
       /* Entry 0 is the LOOP_ARG use on PHI itself, so the statements to
          validate start at index 1.  */
4084   for (unsigned i = 1; i < path.length (); ++i)
4085     {
4086       gimple *use_stmt = USE_STMT (path[i].second);
4087       gimple_match_op op;
4088       if (!gimple_extract_op (use_stmt, &op))
4089         {
4090           fail = true;
4091           break;
4092         }
           /* OPI becomes the index of the path operand within OP; it stays
              at op.num_ops when that operand cannot be located.  */
4093       unsigned int opi = op.num_ops;
4094       if (gassign *assign = dyn_cast<gassign *> (use_stmt))
4095         {
4096           /* The following make sure we can compute the operand index
4097              easily plus it mostly disallows chaining via COND_EXPR condition
4098              operands.  */
4099           for (opi = 0; opi < op.num_ops; ++opi)
4100             if (gimple_assign_rhs1_ptr (assign) + opi == path[i].second->use)
4101               break;
4102         }
4103       else if (gcall *call = dyn_cast<gcall *> (use_stmt))
4104         {
4105           for (opi = 0; opi < op.num_ops; ++opi)
4106             if (gimple_call_arg_ptr (call, opi) == path[i].second->use)
4107               break;
4108         }
4109       if (opi == op.num_ops)
4110         {
4111           fail = true;
4112           break;
4113         }
4114       op.code = canonicalize_code (op.code, op.type);
           /* Subtractions are treated as additions for the purpose of
              matching *CODE.  */
4115       if (op.code == MINUS_EXPR)
4116         {
4117           op.code = PLUS_EXPR;
4118           /* Track whether we negate the reduction value each iteration.  */
4119           if (op.ops[1] == op.ops[opi])
4120             neg = ! neg;
4121         }
4122       else if (op.code == IFN_COND_SUB)
4123         {
4124           op.code = IFN_COND_ADD;
4125           /* Track whether we negate the reduction value each iteration.  */
4126           if (op.ops[2] == op.ops[opi])
4127             neg = ! neg;
4128         }
           /* Nop-conversions are skipped; the first other operation defines
              *CODE and every later one has to agree with it.  */
4129       if (CONVERT_EXPR_CODE_P (op.code)
4130           && tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0])))
4131         ;
4132       else if (*code == ERROR_MARK)
4133         {
4134           *code = op.code;
4135           sign = TYPE_SIGN (op.type);
4136         }
4137       else if (op.code != *code)
4138         {
4139           fail = true;
4140           break;
4141         }
4142       else if ((op.code == MIN_EXPR
4143                 || op.code == MAX_EXPR)
4144                && sign != TYPE_SIGN (op.type))
4145         {
4146           fail = true;
4147           break;
4148         }
4149       /* Check there's only a single stmt the op is used on.  For the
4150          not value-changing tail and the last stmt allow out-of-loop uses.
4151          ???  We could relax this and handle arbitrary live stmts by
4152          forcing a scalar epilogue for example.  */
4153       imm_use_iterator imm_iter;
4154       use_operand_p use_p;
4155       gimple *op_use_stmt;
4156       unsigned cnt = 0;
4157       bool cond_fn_p = op.code.is_internal_fn ()
4158         && (conditional_internal_fn_code (internal_fn (op.code))
4159             != ERROR_MARK);
4160
4161       FOR_EACH_IMM_USE_STMT (op_use_stmt, imm_iter, op.ops[opi])
4162         {
4163         /* In case of a COND_OP (mask, op1, op2, op1) reduction we might have
4164            op1 twice (once as definition, once as else) in the same operation.
4165            Allow this.  */
4166           if (cond_fn_p && op_use_stmt == use_stmt)
4167             {
4168               gcall *call = as_a<gcall *> (use_stmt);
4169               unsigned else_pos
4170                 = internal_fn_else_index (internal_fn (op.code));
4171
4172               for (unsigned int j = 0; j < gimple_call_num_args (call); ++j)
4173                 {
4174                   if (j == else_pos)
4175                     continue;
4176                   if (gimple_call_arg (call, j) == op.ops[opi])
4177                     cnt++;
4178                 }
4179             }
4180           else if (!is_gimple_debug (op_use_stmt)
4181                    && (*code != ERROR_MARK
4182                        || flow_bb_inside_loop_p (loop,
4183                                                  gimple_bb (op_use_stmt))))
4184             FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
4185               cnt++;
4186         }
4187
4188       if (cnt != 1)
4189         {
4190           fail = true;
4191           break;
4192         }
4193     }
4194   return ! fail && ! neg && *code != ERROR_MARK;
4195 }
4196
4197 bool
4198 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
4199                       tree loop_arg, enum tree_code code)
4200 {
4201   auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
4202   code_helper code_;
4203   return (check_reduction_path (loc, loop, phi, loop_arg, &code_, path)
4204           && code_ == code);
4205 }
4206
4207
4208
4209 /* Function vect_is_simple_reduction
4210
4211    (1) Detect a cross-iteration def-use cycle that represents a simple
4212    reduction computation.  We look for the following pattern:
4213
4214    loop_header:
4215      a1 = phi < a0, a2 >
4216      a3 = ...
4217      a2 = operation (a3, a1)
4218
4219    or
4220
4221    a3 = ...
4222    loop_header:
4223      a1 = phi < a0, a2 >
4224      a2 = operation (a3, a1)
4225
4226    such that:
4227    1. operation is commutative and associative and it is safe to
4228       change the order of the computation
4229    2. no uses for a2 in the loop (a2 is used out of the loop)
4230    3. no uses of a1 in the loop besides the reduction operation
4231    4. no uses of a1 outside the loop.
4232
4233    Conditions 1,4 are tested here.
4234    Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
4235
4236    (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
4237    nested cycles.
4238
4239    (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
4240    reductions:
4241
4242      a1 = phi < a0, a2 >
4243      inner loop (def of a3)
4244      a2 = phi < a3 >
4245
4246    (4) Detect condition expressions, ie:
4247      for (int i = 0; i < N; i++)
4248        if (a[i] < val)
4249         ret_val = a[i];
4250
        LOOP_INFO describes the loop being vectorized and PHI_INFO the
        candidate reduction PHI.  On success return the stmt_vec_info of
        the statement defining the PHI's latch argument, otherwise return
        NULL.  *DOUBLE_REDUC is set to true when a double reduction is
        detected.  *REDUC_CHAIN_P is set to true when SLP is true and a
        chain of more than one reduction statement has been recorded in
        LOOP_VINFO_REDUCTION_CHAINS.  As a side effect
        STMT_VINFO_REDUC_TYPE of PHI_INFO is updated, and for a detected
        reduction path so are STMT_VINFO_REDUC_CODE of PHI_INFO and
        STMT_VINFO_REDUC_IDX of the statements on the path.

4251 */
4252
4253 static stmt_vec_info
4254 vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
4255                           bool *double_reduc, bool *reduc_chain_p, bool slp)
4256 {
4257   gphi *phi = as_a <gphi *> (phi_info->stmt);
4258   gimple *phi_use_stmt = NULL;
4259   imm_use_iterator imm_iter;
4260   use_operand_p use_p;
4261
4262   *double_reduc = false;
4263   *reduc_chain_p = false;
4264   STMT_VINFO_REDUC_TYPE (phi_info) = TREE_CODE_REDUCTION;
4265
4266   tree phi_name = PHI_RESULT (phi);
4267   /* ???  If there are no uses of the PHI result the inner loop reduction
4268      won't be detected as possibly double-reduction by vectorizable_reduction
4269      because that tries to walk the PHI arg from the preheader edge which
4270      can be constant.  See PR60382.  */
4271   if (has_zero_uses (phi_name))
4272     return NULL;
4273   class loop *loop = (gimple_bb (phi))->loop_father;
       /* Count the non-debug statements using the PHI result, rejecting any
          use outside LOOP; PHI_USE_STMT ends up as the last such statement
          seen.  */
4274   unsigned nphi_def_loop_uses = 0;
4275   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
4276     {
4277       gimple *use_stmt = USE_STMT (use_p);
4278       if (is_gimple_debug (use_stmt))
4279         continue;
4280
4281       if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
4282         {
4283           if (dump_enabled_p ())
4284             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4285                              "intermediate value used outside loop.\n");
4286
4287           return NULL;
4288         }
4289
4290       /* In case of a COND_OP (mask, op1, op2, op1) reduction we might have
4291          op1 twice (once as definition, once as else) in the same operation.
4292          Only count it as one. */
4293       if (use_stmt != phi_use_stmt)
4294         {
4295           nphi_def_loop_uses++;
4296           phi_use_stmt = use_stmt;
4297         }
4298     }
4299
4300   tree latch_def = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
4301   if (TREE_CODE (latch_def) != SSA_NAME)
4302     {
4303       if (dump_enabled_p ())
4304         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4305                          "reduction: not ssa_name: %T\n", latch_def);
4306       return NULL;
4307     }
4308
       /* The latch value must be defined by a statement known to LOOP_INFO
          and located inside LOOP.  */
4309   stmt_vec_info def_stmt_info = loop_info->lookup_def (latch_def);
4310   if (!def_stmt_info
4311       || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)))
4312     return NULL;
4313
4314   bool nested_in_vect_loop
4315     = flow_loop_nested_p (LOOP_VINFO_LOOP (loop_info), loop);
       /* Classify the uses of the latch value: count those inside LOOP and
          collect the users outside of it in LCPHIS.  */
4316   unsigned nlatch_def_loop_uses = 0;
4317   auto_vec<gphi *, 3> lcphis;
4318   bool inner_loop_of_double_reduc = false;
4319   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, latch_def)
4320     {
4321       gimple *use_stmt = USE_STMT (use_p);
4322       if (is_gimple_debug (use_stmt))
4323         continue;
4324       if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
4325         nlatch_def_loop_uses++;
4326       else
4327         {
4328           /* We can have more than one loop-closed PHI.  */
4329           lcphis.safe_push (as_a <gphi *> (use_stmt));
4330           if (nested_in_vect_loop
4331               && (STMT_VINFO_DEF_TYPE (loop_info->lookup_stmt (use_stmt))
4332                   == vect_double_reduction_def))
4333             inner_loop_of_double_reduc = true;
4334         }
4335     }
4336
4337   /* If we are vectorizing an inner reduction we are executing that
4338      in the original order only in case we are not dealing with a
4339      double reduction.  */
4340   if (nested_in_vect_loop && !inner_loop_of_double_reduc)
4341     {
4342       if (dump_enabled_p ())
4343         report_vect_op (MSG_NOTE, def_stmt_info->stmt,
4344                         "detected nested cycle: ");
4345       return def_stmt_info;
4346     }
4347
4348   /* When the inner loop of a double reduction ends up with more than
4349      one loop-closed PHI we have failed to classify alternate such
4350      PHIs as double reduction, leading to wrong code.  See PR103237.  */
4351   if (inner_loop_of_double_reduc && lcphis.length () != 1)
4352     {
4353       if (dump_enabled_p ())
4354         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4355                          "unhandle double reduction\n");
4356       return NULL;
4357     }
4358
4359   /* If this isn't a nested cycle or if the nested cycle reduction value
4360      is used outside of the inner loop we cannot handle uses of the reduction
4361      value.  */
4362   if (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1)
4363     {
4364       if (dump_enabled_p ())
4365         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4366                          "reduction used in loop.\n");
4367       return NULL;
4368     }
4369
4370   /* If DEF_STMT is a phi node itself, we expect it to have a single argument
4371      defined in the inner loop.  */
4372   if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
4373     {
4374       tree op1 = PHI_ARG_DEF (def_stmt, 0);
4375       if (gimple_phi_num_args (def_stmt) != 1
4376           || TREE_CODE (op1) != SSA_NAME)
4377         {
4378           if (dump_enabled_p ())
4379             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4380                              "unsupported phi node definition.\n");
4381
4382           return NULL;
4383         }
4384
4385       /* Verify there is an inner cycle composed of the PHI phi_use_stmt
4386          and the latch definition op1.  */
4387       gimple *def1 = SSA_NAME_DEF_STMT (op1);
4388       if (gimple_bb (def1)
4389           && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
4390           && loop->inner
4391           && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
4392           && (is_gimple_assign (def1) || is_gimple_call (def1))
4393           && is_a <gphi *> (phi_use_stmt)
4394           && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt))
4395           && (op1 == PHI_ARG_DEF_FROM_EDGE (phi_use_stmt,
4396                                             loop_latch_edge (loop->inner))))
4397         {
4398           if (dump_enabled_p ())
4399             report_vect_op (MSG_NOTE, def_stmt,
4400                             "detected double reduction: ");
4401
4402           *double_reduc = true;
4403           return def_stmt_info;
4404         }
4405
4406       return NULL;
4407     }
4408
4409   /* Look for the expression computing latch_def from the loop PHI result.  */
4410   auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
4411   code_helper code;
4412   if (check_reduction_path (vect_location, loop, phi, latch_def, &code,
4413                             path))
4414     {
4415       STMT_VINFO_REDUC_CODE (phi_info) = code;
4416       if (code == COND_EXPR && !nested_in_vect_loop)
4417         STMT_VINFO_REDUC_TYPE (phi_info) = COND_REDUCTION;
4418
4419       /* Fill in STMT_VINFO_REDUC_IDX and gather stmts for an SLP
4420          reduction chain for which the additional restriction is that
4421          all operations in the chain are the same.  */
4422       auto_vec<stmt_vec_info, 8> reduc_chain;
4423       unsigned i;
4424       bool is_slp_reduc = !nested_in_vect_loop && code != COND_EXPR;
           /* Visit the path from its last entry down to index 1; entry 0 is
              not visited.  */
4425       for (i = path.length () - 1; i >= 1; --i)
4426         {
4427           gimple *stmt = USE_STMT (path[i].second);
4428           stmt_vec_info stmt_info = loop_info->lookup_stmt (stmt);
4429           gimple_match_op op;
4430           if (!gimple_extract_op (stmt, &op))
4431             gcc_unreachable ();
               /* The reduction operand index is the offset of the path use
                  from rhs1 for assignments and from argument 0 for calls.  */
4432           if (gassign *assign = dyn_cast<gassign *> (stmt))
4433             STMT_VINFO_REDUC_IDX (stmt_info)
4434               = path[i].second->use - gimple_assign_rhs1_ptr (assign);
4435           else
4436             {
4437               gcall *call = as_a<gcall *> (stmt);
4438               STMT_VINFO_REDUC_IDX (stmt_info)
4439                 = path[i].second->use - gimple_call_arg_ptr (call, 0);
4440             }
4441           bool leading_conversion = (CONVERT_EXPR_CODE_P (op.code)
4442                                      && (i == 1 || i == path.length () - 1));
4443           if ((op.code != code && !leading_conversion)
4444               /* We can only handle the final value in epilogue
4445                  generation for reduction chains.  */
4446               || (i != 1 && !has_single_use (gimple_get_lhs (stmt))))
4447             is_slp_reduc = false;
4448           /* For reduction chains we support a trailing/leading
4449              conversions.  We do not store those in the actual chain.  */
4450           if (leading_conversion)
4451             continue;
4452           reduc_chain.safe_push (stmt_info);
4453         }
4454       if (slp && is_slp_reduc && reduc_chain.length () > 1)
4455         {
               /* Link the chain elements: each one points to the first
                  element and to its successor, the last one to NULL.  */
4456           for (unsigned i = 0; i < reduc_chain.length () - 1; ++i)
4457             {
4458               REDUC_GROUP_FIRST_ELEMENT (reduc_chain[i]) = reduc_chain[0];
4459               REDUC_GROUP_NEXT_ELEMENT (reduc_chain[i]) = reduc_chain[i+1];
4460             }
4461           REDUC_GROUP_FIRST_ELEMENT (reduc_chain.last ()) = reduc_chain[0];
4462           REDUC_GROUP_NEXT_ELEMENT (reduc_chain.last ()) = NULL;
4463
4464           /* Save the chain for further analysis in SLP detection.  */
4465           LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (reduc_chain[0]);
4466           REDUC_GROUP_SIZE (reduc_chain[0]) = reduc_chain.length ();
4467
4468           *reduc_chain_p = true;
4469           if (dump_enabled_p ())
4470             dump_printf_loc (MSG_NOTE, vect_location,
4471                             "reduction: detected reduction chain\n");
4472         }
4473       else if (dump_enabled_p ())
4474         dump_printf_loc (MSG_NOTE, vect_location,
4475                          "reduction: detected reduction\n");
4476
4477       return def_stmt_info;
4478     }
4479
4480   if (dump_enabled_p ())
4481     dump_printf_loc (MSG_NOTE, vect_location,
4482                      "reduction: unknown pattern\n");
4483
4484   return NULL;
4485 }
4486
4487 /* Estimate the number of peeled epilogue iterations for LOOP_VINFO.
4488    PEEL_ITERS_PROLOGUE is the number of peeled prologue iterations,
4489    or -1 if not known.  */
4490
4491 static int
4492 vect_get_peel_iters_epilogue (loop_vec_info loop_vinfo, int peel_iters_prologue)
4493 {
4494   int assumed_vf = vect_vf_for_cost (loop_vinfo);
4495   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) || peel_iters_prologue == -1)
4496     {
4497       if (dump_enabled_p ())
4498         dump_printf_loc (MSG_NOTE, vect_location,
4499                          "cost model: epilogue peel iters set to vf/2 "
4500                          "because loop iterations are unknown .\n");
4501       return assumed_vf / 2;
4502     }
4503   else
4504     {
4505       int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
4506       peel_iters_prologue = MIN (niters, peel_iters_prologue);
4507       int peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
4508       /* If we need to peel for gaps, but no peeling is required, we have to
4509          peel VF iterations.  */
4510       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !peel_iters_epilogue)
4511         peel_iters_epilogue = assumed_vf;
4512       return peel_iters_epilogue;
4513     }
4514 }
4515
4516 /* Compute the cost of peeling the loop PEEL_ITERS_PROLOGUE times and store
4517    the matching epilogue iteration count in *PEEL_ITERS_EPILOGUE.  */
4518 int
4519 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
4520                              int *peel_iters_epilogue,
4521                              stmt_vector_for_cost *scalar_cost_vec,
4522                              stmt_vector_for_cost *prologue_cost_vec,
4523                              stmt_vector_for_cost *epilogue_cost_vec)
4524 {
4525   const int epilogue_iters
4526     = vect_get_peel_iters_epilogue (loop_vinfo, peel_iters_prologue);
4527   *peel_iters_epilogue = epilogue_iters;
4528
4529   int total = 0;
4530   /* With a known peel count but an unknown scalar trip count, count one
4531      taken branch for each peeled loop.  */
4532   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
4533     {
4534       if (peel_iters_prologue > 0)
4535         total += record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
4536                                    vect_prologue);
4537       if (epilogue_iters > 0)
4538         total += record_stmt_cost (epilogue_cost_vec, 1, cond_branch_taken,
4539                                    vect_epilogue);
4540     }
4541
4542   /* Scale every scalar statement cost by the number of peeled iterations.  */
4543   unsigned int idx;
4544   stmt_info_for_cost *cost;
4545   if (peel_iters_prologue != 0)
4546     FOR_EACH_VEC_ELT (*scalar_cost_vec, idx, cost)
4547       total += record_stmt_cost (prologue_cost_vec,
4548                                  cost->count * peel_iters_prologue, cost->kind,
4549                                  cost->stmt_info, cost->misalign, vect_prologue);
4550   if (epilogue_iters != 0)
4551     FOR_EACH_VEC_ELT (*scalar_cost_vec, idx, cost)
4552       total += record_stmt_cost (epilogue_cost_vec,
4553                                  cost->count * epilogue_iters, cost->kind,
4554                                  cost->stmt_info, cost->misalign, vect_epilogue);
4555
4556   return total;
4557 }
4558
4559 /* Function vect_estimate_min_profitable_iters
4560
4561    Return the number of iterations required for the vector version of the
4562    loop to be profitable relative to the cost of the scalar version of the
4563    loop.  Both results are 0 when the cost model is disabled.
4564
4565    *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
4566    of iterations for vectorization.  -1 value means loop vectorization
4567    is not profitable.  This returned value may be used for dynamic
4568    profitability check.  *RET_MIN_PROFITABLE_ESTIMATE is a profitability
4569    threshold to be used for static check against estimated number of
4570    iterations.  SUGGESTED_UNROLL_FACTOR is passed on to finish_cost and may
4571    be reset to 1 when the unrolled VF could exceed LOOP_VINFO_MAX_VECT_FACTOR.  */
4572
4573 static void
4574 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
4575                                     int *ret_min_profitable_niters,
4576                                     int *ret_min_profitable_estimate,
4577                                     unsigned *suggested_unroll_factor)
4578 {
4579   int min_profitable_iters;
4580   int min_profitable_estimate;
4581   int peel_iters_prologue;
4582   int peel_iters_epilogue;
4583   unsigned vec_inside_cost = 0;
4584   int vec_outside_cost = 0;
4585   unsigned vec_prologue_cost = 0;
4586   unsigned vec_epilogue_cost = 0;
4587   int scalar_single_iter_cost = 0;
4588   int scalar_outside_cost = 0;
4589   int assumed_vf = vect_vf_for_cost (loop_vinfo);
4590   int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);  /* < 0: unknown.  */
4591   vector_costs *target_cost_data = loop_vinfo->vector_costs;
4592
4593   /* Cost model disabled.  */
4594   if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
4595     {
4596       if (dump_enabled_p ())
4597         dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
4598       *ret_min_profitable_niters = 0;
4599       *ret_min_profitable_estimate = 0;
4600       return;
4601     }
4602
4603   /* Requires loop versioning tests to handle misalignment.  */
4604   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
4605     {
4606       /*  FIXME: Make cost depend on complexity of individual check.  */
4607       unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
4608       (void) add_stmt_cost (target_cost_data, len, scalar_stmt, vect_prologue);
4609       if (dump_enabled_p ())
4610         dump_printf (MSG_NOTE,
4611                      "cost model: Adding cost of checks for loop "
4612                      "versioning to treat misalignment.\n");
4613     }
4614
4615   /* Requires loop versioning with alias checks.  */
4616   if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
4617     {
4618       /*  FIXME: Make cost depend on complexity of individual check.  */
4619       unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
4620       (void) add_stmt_cost (target_cost_data, len, scalar_stmt, vect_prologue);
4621       len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
4622       if (len)
4623         /* Count LEN - 1 ANDs and LEN comparisons.  */
4624         (void) add_stmt_cost (target_cost_data, len * 2 - 1,
4625                               scalar_stmt, vect_prologue);
4626       len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
4627       if (len)
4628         {
4629           /* Count LEN - 1 ANDs and LEN comparisons.  */
4630           unsigned int nstmts = len * 2 - 1;
4631           /* +1 for each bias that needs adding.  */
4632           for (unsigned int i = 0; i < len; ++i)
4633             if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
4634               nstmts += 1;
4635           (void) add_stmt_cost (target_cost_data, nstmts,
4636                                 scalar_stmt, vect_prologue);
4637         }
4638       if (dump_enabled_p ())
4639         dump_printf (MSG_NOTE,
4640                      "cost model: Adding cost of checks for loop "
4641                      "versioning aliasing.\n");
4642     }
4643
4644   /* Requires loop versioning with niter checks.  */
4645   if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
4646     {
4647       /*  FIXME: Make cost depend on complexity of individual check.  */
4648       (void) add_stmt_cost (target_cost_data, 1, vector_stmt,
4649                             NULL, NULL, NULL_TREE, 0, vect_prologue);
4650       if (dump_enabled_p ())
4651         dump_printf (MSG_NOTE,
4652                      "cost model: Adding cost of checks for loop "
4653                      "versioning niters.\n");
4654     }
4655
4656   if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
4657     (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4658                           vect_prologue);
4659
4660   /* Count statements in scalar loop.  Using this as scalar cost for a single
4661      iteration for now.
4662
4663      TODO: Add outer loop support.
4664
4665      TODO: Consider assigning different costs to different scalar
4666      statements.  */
4667
4668   scalar_single_iter_cost = loop_vinfo->scalar_costs->total_cost ();
4669
4670   /* Add additional cost for the peeled instructions in prologue and epilogue
4671      loop.  (For fully-masked loops there will be no peeling.)
4672
4673      FORNOW: If we don't know the value of peel_iters for prologue or epilogue
4674      at compile-time - we assume it's vf/2 (the worst would be vf-1).
4675
4676      TODO: Build an expression that represents peel_iters for prologue and
4677      epilogue to be used in a run-time test.  */
4678
4679   bool prologue_need_br_taken_cost = false;
4680   bool prologue_need_br_not_taken_cost = false;
4681
4682   /* Calculate peel_iters_prologue.  */
4683   if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
4684     peel_iters_prologue = 0;
4685   else if (npeel < 0)
4686     {
4687       peel_iters_prologue = assumed_vf / 2;
4688       if (dump_enabled_p ())
4689         dump_printf (MSG_NOTE, "cost model: "
4690                      "prologue peel iters set to vf/2.\n");
4691
4692       /* If peeled iterations are unknown, count a taken branch and a not taken
4693          branch per peeled loop.  Even if scalar loop iterations are known,
4694          vector iterations are not known since peeled prologue iterations are
4695          not known.  Hence guards remain the same.  */
4696       prologue_need_br_taken_cost = true;
4697       prologue_need_br_not_taken_cost = true;
4698     }
4699   else
4700     {
4701       peel_iters_prologue = npeel;
4702       if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_prologue > 0)
4703         /* If peeled iterations are known but number of scalar loop
4704            iterations are unknown, count a taken branch per peeled loop.  */
4705         prologue_need_br_taken_cost = true;
4706     }
4707
4708   bool epilogue_need_br_taken_cost = false;
4709   bool epilogue_need_br_not_taken_cost = false;
4710
4711   /* Calculate peel_iters_epilogue.  */
4712   if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4713     /* We need to peel exactly one iteration for gaps.  */
4714     peel_iters_epilogue = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
4715   else if (npeel < 0)
4716     {
4717       /* If peeling for alignment is unknown, loop bound of main loop
4718          becomes unknown.  */
4719       peel_iters_epilogue = assumed_vf / 2;
4720       if (dump_enabled_p ())
4721         dump_printf (MSG_NOTE, "cost model: "
4722                      "epilogue peel iters set to vf/2 because "
4723                      "peeling for alignment is unknown.\n");
4724
4725       /* See the same reason above in peel_iters_prologue calculation.  */
4726       epilogue_need_br_taken_cost = true;
4727       epilogue_need_br_not_taken_cost = true;
4728     }
4729   else
4730     {
4731       peel_iters_epilogue = vect_get_peel_iters_epilogue (loop_vinfo, npeel);
4732       if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_epilogue > 0)
4733         /* If peeled iterations are known but number of scalar loop
4734            iterations are unknown, count a taken branch per peeled loop.  */
4735         epilogue_need_br_taken_cost = true;
4736     }
4737
4738   stmt_info_for_cost *si;
4739   int j;
4740   /* Add costs associated with peel_iters_prologue.  */
4741   if (peel_iters_prologue)
4742     FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4743       {
4744         (void) add_stmt_cost (target_cost_data,
4745                               si->count * peel_iters_prologue, si->kind,
4746                               si->stmt_info, si->node, si->vectype,
4747                               si->misalign, vect_prologue);
4748       }
4749
4750   /* Add costs associated with peel_iters_epilogue.  */
4751   if (peel_iters_epilogue)
4752     FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4753       {
4754         (void) add_stmt_cost (target_cost_data,
4755                               si->count * peel_iters_epilogue, si->kind,
4756                               si->stmt_info, si->node, si->vectype,
4757                               si->misalign, vect_epilogue);
4758       }
4759
4760   /* Add possible cond_branch_taken/cond_branch_not_taken cost.  */
4761
4762   if (prologue_need_br_taken_cost)
4763     (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4764                           vect_prologue);
4765
4766   if (prologue_need_br_not_taken_cost)
4767     (void) add_stmt_cost (target_cost_data, 1,
4768                           cond_branch_not_taken, vect_prologue);
4769
4770   if (epilogue_need_br_taken_cost)
4771     (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4772                           vect_epilogue);
4773
4774   if (epilogue_need_br_not_taken_cost)
4775     (void) add_stmt_cost (target_cost_data, 1,
4776                           cond_branch_not_taken, vect_epilogue);
4777
4778   /* Take care of special costs for rgroup controls of partial vectors.  */
4779   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
4780       && (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
4781           == vect_partial_vectors_avx512))
4782     {
4783       /* Calculate how many masks we need to generate.  */
4784       unsigned int num_masks = 0;
4785       bool need_saturation = false;
4786       for (auto rgm : LOOP_VINFO_MASKS (loop_vinfo).rgc_vec)
4787         if (rgm.type)
4788           {
4789             unsigned nvectors = rgm.factor;
4790             num_masks += nvectors;
4791             if (TYPE_PRECISION (TREE_TYPE (rgm.compare_type))
4792                 < TYPE_PRECISION (LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo)))
4793               need_saturation = true;
4794           }
4795
4796       /* ???  The target isn't able to identify the costs below as
4797          producing masks so it cannot penalize cases where we'd run
4798          out of mask registers for example.  */
4799
4800       /* ???  We are also failing to account for smaller vector masks
4801          we generate by splitting larger masks in vect_get_loop_mask.  */
4802
4803       /* In the worst case, we need to generate each mask in the prologue
4804          and in the loop body.  We need one splat per group and one
4805          compare per mask.
4806
4807          Sometimes the prologue mask will fold to a constant,
4808          so the actual prologue cost might be smaller.  However, it's
4809          simpler and safer to use the worst-case cost; if this ends up
4810          being the tie-breaker between vectorizing or not, then it's
4811          probably better not to vectorize.  */
4812       (void) add_stmt_cost (target_cost_data,
4813                             num_masks
4814                             + LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.length (),
4815                             vector_stmt, NULL, NULL, NULL_TREE, 0,
4816                             vect_prologue);
4817       (void) add_stmt_cost (target_cost_data,
4818                             num_masks
4819                             + LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.length (),
4820                             vector_stmt, NULL, NULL, NULL_TREE, 0, vect_body);
4821
4822       /* When we need saturation we need it both in the prologue and
4823          the loop body.  */
4824       if (need_saturation)
4825         {
4826           (void) add_stmt_cost (target_cost_data, 1, scalar_stmt,
4827                                 NULL, NULL, NULL_TREE, 0, vect_prologue);
4828           (void) add_stmt_cost (target_cost_data, 1, scalar_stmt,
4829                                 NULL, NULL, NULL_TREE, 0, vect_body);
4830         }
4831     }
4832   else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
4833            && (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
4834                == vect_partial_vectors_while_ult))
4835     {
4836       /* Calculate how many masks we need to generate.  */
4837       unsigned int num_masks = 0;
4838       rgroup_controls *rgm;
4839       unsigned int num_vectors_m1;
4840       FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec,
4841                         num_vectors_m1, rgm)
4842         if (rgm->type)
4843           num_masks += num_vectors_m1 + 1;
4844       gcc_assert (num_masks > 0);
4845
4846       /* In the worst case, we need to generate each mask in the prologue
4847          and in the loop body.  One of the loop body mask instructions
4848          replaces the comparison in the scalar loop, and since we don't
4849          count the scalar comparison against the scalar body, we shouldn't
4850          count that vector instruction against the vector body either.
4851
4852          Sometimes we can use unpacks instead of generating prologue
4853          masks and sometimes the prologue mask will fold to a constant,
4854          so the actual prologue cost might be smaller.  However, it's
4855          simpler and safer to use the worst-case cost; if this ends up
4856          being the tie-breaker between vectorizing or not, then it's
4857          probably better not to vectorize.  */
4858       (void) add_stmt_cost (target_cost_data, num_masks,
4859                             vector_stmt, NULL, NULL, NULL_TREE, 0,
4860                             vect_prologue);
4861       (void) add_stmt_cost (target_cost_data, num_masks - 1,
4862                             vector_stmt, NULL, NULL, NULL_TREE, 0,
4863                             vect_body);
4864     }
4865   else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
4866     {
4867       /* Referring to the functions vect_set_loop_condition_partial_vectors
4868          and vect_set_loop_controls_directly, we need to generate each
4869          length in the prologue and in the loop body if required.  Although
4870          there are some possible optimizations, we consider the worst case
4871          here.  */
4872
4873       bool niters_known_p = LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo);
4874       signed char partial_load_store_bias
4875         = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
4876       bool need_iterate_p
4877         = (!LOOP_VINFO_EPILOGUE_P (loop_vinfo)
4878            && !vect_known_niters_smaller_than_vf (loop_vinfo));
4879
4880       /* Calculate how many statements to be added.  */
4881       unsigned int prologue_stmts = 0;
4882       unsigned int body_stmts = 0;
4883
4884       rgroup_controls *rgc;
4885       unsigned int num_vectors_m1;
4886       FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), num_vectors_m1, rgc)
4887         if (rgc->type)
4888           {
4889             /* May need one SHIFT for nitems_total computation.  */
4890             unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
4891             if (nitems != 1 && !niters_known_p)
4892               prologue_stmts += 1;
4893
4894             /* May need one MAX and one MINUS for wrap around.  */
4895             if (vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc))
4896               prologue_stmts += 2;
4897
4898             /* Need one MAX and one MINUS for each batch limit except for
4899                the 1st one.  */
4900             prologue_stmts += num_vectors_m1 * 2;
4901
4902             unsigned int num_vectors = num_vectors_m1 + 1;
4903
4904             /* Need to set up lengths in prologue, only one MIN required
4905                for each since start index is zero.  */
4906             prologue_stmts += num_vectors;
4907
4908             /* If we have a non-zero partial load bias, we need one PLUS
4909                to adjust the load length.  */
4910             if (partial_load_store_bias != 0)
4911               body_stmts += 1;
4912
4913             unsigned int length_update_cost = 0;
4914             if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo))
4915               /* For decrement IV style, each only needs a single SELECT_VL
4916                  or MIN from the beginning to calculate the number of elements
4917                  that need to be processed in the current iteration.  */
4918               length_update_cost = 1;
4919             else
4920               /* For increment IV style, each may need two MINs and one MINUS to
4921                  update lengths in body for next iteration.  */
4922               length_update_cost = 3;
4923
4924             if (need_iterate_p)
4925               body_stmts += length_update_cost * num_vectors;
4926           }
4927
4928       (void) add_stmt_cost (target_cost_data, prologue_stmts,
4929                             scalar_stmt, vect_prologue);
4930       (void) add_stmt_cost (target_cost_data, body_stmts,
4931                             scalar_stmt, vect_body);
4932     }
4933
4934   /* FORNOW: The scalar outside cost is incremented in one of the
4935      following ways:
4936
4937      1. The vectorizer checks for alignment and aliasing and generates
4938      a condition that allows dynamic vectorization.  A cost model
4939      check is ANDED with the versioning condition.  Hence scalar code
4940      path now has the added cost of the versioning check.
4941
4942        if (cost > th & versioning_check)
4943          jmp to vector code
4944
4945      Hence run-time scalar is incremented by not-taken branch cost.
4946
4947      2. The vectorizer then checks if a prologue is required.  If the
4948      cost model check was not done before during versioning, it has to
4949      be done before the prologue check.
4950
4951        if (cost <= th)
4952          prologue = scalar_iters
4953        if (prologue == 0)
4954          jmp to vector code
4955        else
4956          execute prologue
4957        if (prologue == num_iters)
4958          go to exit
4959
4960      Hence the run-time scalar cost is incremented by a taken branch,
4961      plus a not-taken branch, plus a taken branch cost.
4962
4963      3. The vectorizer then checks if an epilogue is required.  If the
4964      cost model check was not done before during prologue check, it
4965      has to be done with the epilogue check.
4966
4967        if (prologue == 0)
4968          jmp to vector code
4969        else
4970          execute prologue
4971        if (prologue == num_iters)
4972          go to exit
4973        vector code:
4974          if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
4975            jmp to epilogue
4976
4977      Hence the run-time scalar cost should be incremented by 2 taken
4978      branches.
4979
4980      TODO: The back end may reorder the BBs differently and reverse
4981      conditions/branch directions.  Change the estimates below to
4982      something more reasonable.  */
4983
4984   /* If the number of iterations is known and we do not do versioning, we can
4985      decide whether to vectorize at compile time.  Hence the scalar version
4986      does not carry cost model guard costs.  */
4987   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
4988       || LOOP_REQUIRES_VERSIONING (loop_vinfo))
4989     {
4990       /* Cost model check occurs at versioning.  */
4991       if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
4992         scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
4993       else
4994         {
4995           /* Cost model check occurs at prologue generation.  */
4996           if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
4997             scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
4998               + vect_get_stmt_cost (cond_branch_not_taken);
4999           /* Cost model check occurs at epilogue generation.  */
5000           else
5001             scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
5002         }
5003     }
5004
5005   /* Complete the target-specific cost calculations.  */
5006   finish_cost (loop_vinfo->vector_costs, loop_vinfo->scalar_costs,
5007                &vec_prologue_cost, &vec_inside_cost, &vec_epilogue_cost,
5008                suggested_unroll_factor);
5009
5010   if (suggested_unroll_factor && *suggested_unroll_factor > 1
5011       && LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) != MAX_VECTORIZATION_FACTOR
5012       && !known_le (LOOP_VINFO_VECT_FACTOR (loop_vinfo) *
5013                     *suggested_unroll_factor,
5014                     LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo)))
5015     {
5016       if (dump_enabled_p ())
5017         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5018                          "can't unroll as unrolled vectorization factor larger"
5019                          " than maximum vectorization factor: "
5020                          HOST_WIDE_INT_PRINT_UNSIGNED "\n",
5021                          LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo));
5022       *suggested_unroll_factor = 1;
5023     }
5024
5025   vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
5026
5027   if (dump_enabled_p ())
5028     {
5029       dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
5030       dump_printf (MSG_NOTE, "  Vector inside of loop cost: %d\n",
5031                    vec_inside_cost);
5032       dump_printf (MSG_NOTE, "  Vector prologue cost: %d\n",
5033                    vec_prologue_cost);
5034       dump_printf (MSG_NOTE, "  Vector epilogue cost: %d\n",
5035                    vec_epilogue_cost);
5036       dump_printf (MSG_NOTE, "  Scalar iteration cost: %d\n",
5037                    scalar_single_iter_cost);
5038       dump_printf (MSG_NOTE, "  Scalar outside cost: %d\n",
5039                    scalar_outside_cost);
5040       dump_printf (MSG_NOTE, "  Vector outside cost: %d\n",
5041                    vec_outside_cost);
5042       dump_printf (MSG_NOTE, "  prologue iterations: %d\n",
5043                    peel_iters_prologue);
5044       dump_printf (MSG_NOTE, "  epilogue iterations: %d\n",
5045                    peel_iters_epilogue);
5046     }
5047
5048   /* Calculate number of iterations required to make the vector version
5049      profitable, relative to the loop bodies only.  The following condition
5050      must hold true:
5051      SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC
5052      where
5053      SIC = scalar iteration cost, VIC = vector iteration cost,
5054      VOC = vector outside cost, VF = vectorization factor,
5055      NPEEL = prologue iterations + epilogue iterations,
5056      SOC = scalar outside cost for run time cost model check.  */
5057
5058   int saving_per_viter = (scalar_single_iter_cost * assumed_vf
5059                           - vec_inside_cost);  /* SIC * VF - VIC.  */
5060   if (saving_per_viter <= 0)
5061     {
5062       if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
5063         warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
5064                     "vectorization did not happen for a simd loop");
5065
5066       if (dump_enabled_p ())
5067         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5068                          "cost model: the vector iteration cost = %d "
5069                          "divided by the scalar iteration cost = %d "
5070                          "is greater or equal to the vectorization factor = %d"
5071                          ".\n",
5072                          vec_inside_cost, scalar_single_iter_cost, assumed_vf);
5073       *ret_min_profitable_niters = -1;  /* -1: not profitable.  */
5074       *ret_min_profitable_estimate = -1;
5075       return;
5076     }
5077
5078   /* ??? The "if" arm is written to handle all cases; see below for what
5079      we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P.  */
5080   if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
5081     {
5082       /* Rewriting the condition above in terms of the number of
5083          vector iterations (vniters) rather than the number of
5084          scalar iterations (niters) gives:
5085
5086          SIC * (vniters * VF + NPEEL) + SOC > VIC * vniters + VOC
5087
5088          <==> vniters * (SIC * VF - VIC) > VOC - SIC * NPEEL - SOC
5089
5090          For integer N, X and Y when X > 0:
5091
5092          N * X > Y <==> N >= (Y /[floor] X) + 1.  */
5093       int outside_overhead = (vec_outside_cost
5094                               - scalar_single_iter_cost * peel_iters_prologue
5095                               - scalar_single_iter_cost * peel_iters_epilogue
5096                               - scalar_outside_cost);
5097       /* We're only interested in cases that require at least one
5098          vector iteration.  */
5099       int min_vec_niters = 1;
5100       if (outside_overhead > 0)
5101         min_vec_niters = outside_overhead / saving_per_viter + 1;
5102
5103       if (dump_enabled_p ())
5104         dump_printf (MSG_NOTE, "  Minimum number of vector iterations: %d\n",
5105                      min_vec_niters);
5106
5107       if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))  /* Same test as the enclosing "if".  */
5108         {
5109           /* Now that we know the minimum number of vector iterations,
5110              find the minimum niters for which the scalar cost is larger:
5111
5112              SIC * niters > VIC * vniters + VOC - SOC
5113
5114              We know that the minimum niters is no more than
5115              vniters * VF + NPEEL, but it might be (and often is) less
5116              than that if a partial vector iteration is cheaper than the
5117              equivalent scalar code.  */
5118           int threshold = (vec_inside_cost * min_vec_niters
5119                            + vec_outside_cost
5120                            - scalar_outside_cost);
5121           if (threshold <= 0)
5122             min_profitable_iters = 1;
5123           else
5124             min_profitable_iters = threshold / scalar_single_iter_cost + 1;
5125         }
5126       else
5127         /* Convert the number of vector iterations into a number of
5128            scalar iterations.  */
5129         min_profitable_iters = (min_vec_niters * assumed_vf
5130                                 + peel_iters_prologue
5131                                 + peel_iters_epilogue);
5132     }
5133   else
5134     {
5135       min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
5136                               * assumed_vf
5137                               - vec_inside_cost * peel_iters_prologue
5138                               - vec_inside_cost * peel_iters_epilogue);
5139       if (min_profitable_iters <= 0)
5140         min_profitable_iters = 0;
5141       else
5142         {
5143           min_profitable_iters /= saving_per_viter;
5144
5145           if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
5146               <= (((int) vec_inside_cost * min_profitable_iters)
5147                   + (((int) vec_outside_cost - scalar_outside_cost)
5148                      * assumed_vf)))
5149             min_profitable_iters++;
5150         }
5151     }
5152
5153   if (dump_enabled_p ())
5154     dump_printf (MSG_NOTE,
5155                  "  Calculated minimum iters for profitability: %d\n",
5156                  min_profitable_iters);
5157
5158   if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
5159       && min_profitable_iters < (assumed_vf + peel_iters_prologue))
5160     /* We want the vectorized loop to execute at least once.  */
5161     min_profitable_iters = assumed_vf + peel_iters_prologue;
5162   else if (min_profitable_iters < peel_iters_prologue)
5163     /* For LOOP_VINFO_USING_PARTIAL_VECTORS_P, we need to ensure the
5164        vectorized loop executes at least once.  */
5165     min_profitable_iters = peel_iters_prologue;
5166
5167   if (dump_enabled_p ())
5168     dump_printf_loc (MSG_NOTE, vect_location,
5169                      "  Runtime profitability threshold = %d\n",
5170                      min_profitable_iters);
5171
5172   *ret_min_profitable_niters = min_profitable_iters;
5173
5174   /* Calculate number of iterations required to make the vector version
5175      profitable, relative to the loop bodies only.
5176
5177      Non-vectorized variant is SIC * niters and it must win over vector
5178      variant on the expected loop trip count.  The following condition must hold true:
5179      SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC  */
5180
5181   if (vec_outside_cost <= 0)
5182     min_profitable_estimate = 0;
5183   /* ??? This "else if" arm is written to handle all cases; see below for
5184      what we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P.  */
5185   else if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
5186     {
5187       /* This is a repeat of the code above, but with + SOC rather
5188          than - SOC.  */
5189       int outside_overhead = (vec_outside_cost
5190                               - scalar_single_iter_cost * peel_iters_prologue
5191                               - scalar_single_iter_cost * peel_iters_epilogue
5192                               + scalar_outside_cost);
5193       int min_vec_niters = 1;
5194       if (outside_overhead > 0)
5195         min_vec_niters = outside_overhead / saving_per_viter + 1;
5196
5197       if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))  /* Same test as the enclosing "else if".  */
5198         {
5199           int threshold = (vec_inside_cost * min_vec_niters
5200                            + vec_outside_cost
5201                            + scalar_outside_cost);
5202           min_profitable_estimate = threshold / scalar_single_iter_cost + 1;
5203         }
5204       else
5205         min_profitable_estimate = (min_vec_niters * assumed_vf
5206                                    + peel_iters_prologue
5207                                    + peel_iters_epilogue);
5208     }
5209   else
5210     {
5211       min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
5212                                  * assumed_vf
5213                                  - vec_inside_cost * peel_iters_prologue
5214                                  - vec_inside_cost * peel_iters_epilogue)
5215                                  / ((scalar_single_iter_cost * assumed_vf)
5216                                    - vec_inside_cost);
5217     }
5218   min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
5219   if (dump_enabled_p ())
5220     dump_printf_loc (MSG_NOTE, vect_location,
5221                      "  Static estimate profitability threshold = %d\n",
5222                      min_profitable_estimate);
5223
5224   *ret_min_profitable_estimate = min_profitable_estimate;
5225 }
5226
5227 /* Re-initializes SEL and writes into it a mask for a vec_perm, equivalent to
5228    a vec_shr by OFFSET vector elements (not bits) for a vector with NELT elements.  */
5229 static void
5230 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
5231                               vec_perm_builder *sel)
5232 {
5233   /* The encoding is a single stepped pattern.  Any wrap-around is handled
5234      by vec_perm_indices.  */
5235   sel->new_vector (nelt, 1, 3);
5236   for (unsigned int i = 0; i < 3; i++)
5237     sel->quick_push (i + offset);  /* Pushes OFFSET, OFFSET + 1, OFFSET + 2.  */
5238 }
5239
5240 /* Return true if the target supports whole-vector shifts for vectors of mode
5241    MODE.  This is the case if _either_ the platform handles vec_shr_optab, _or_
5242    it supports vec_perm_const with masks for shifts by NELT / 2, NELT / 4, ..., 1.  */
5243 static bool
5244 have_whole_vector_shift (machine_mode mode)
5245 {
5246   if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
5247     return true;  /* A vec_shr handler exists for MODE.  */
5248
5249   /* Variable-length vectors should be handled via the optab.  */
5250   unsigned int nelt;
5251   if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5252     return false;
5253
5254   vec_perm_builder sel;
5255   vec_perm_indices indices;
5256   for (unsigned int i = nelt / 2; i >= 1; i /= 2)
5257     {
5258       calc_vec_perm_mask_for_shift (i, nelt, &sel);
5259       indices.new_vector (sel, 2, nelt);
5260       if (!can_vec_perm_const_p (mode, mode, indices, false))
5261         return false;  /* The mask for a shift by I is not supported.  */
5262     }
5263   return true;
5264 }
5265
5266 /* Return true if (a) STMT_INFO is a DOT_PROD_EXPR reduction whose
5267    multiplication operands have differing signs and (b) we intend
5268    to emulate the operation using a series of signed DOT_PROD_EXPRs.
5269    See vect_emulate_mixed_dot_prod for the actual sequence used.  */
5270
5271 static bool
5272 vect_is_emulated_mixed_dot_prod (loop_vec_info loop_vinfo,
5273                                  stmt_vec_info stmt_info)
5274 {
5275   gassign *assign = dyn_cast<gassign *> (stmt_info->stmt);
5276   if (!assign || gimple_assign_rhs_code (assign) != DOT_PROD_EXPR)
5277     return false;  /* Not a DOT_PROD_EXPR assignment.  */
5278
5279   tree rhs1 = gimple_assign_rhs1 (assign);
5280   tree rhs2 = gimple_assign_rhs2 (assign);
5281   if (TYPE_SIGN (TREE_TYPE (rhs1)) == TYPE_SIGN (TREE_TYPE (rhs2)))
5282     return false;  /* Both multiplication operands have the same sign.  */
5283
5284   stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
5285   gcc_assert (reduc_info->is_reduc_info);
5286   return !directly_supported_p (DOT_PROD_EXPR,
5287                                 STMT_VINFO_REDUC_VECTYPE_IN (reduc_info),
5288                                 optab_vector_mixed_sign);  /* Emulate only if not directly supported.  */
5289 }
5290
5291 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
5292    functions. Design better to avoid maintenance issues.  */
5293
5294 /* Function vect_model_reduction_cost.
5295
5296    Models cost for a reduction operation, including the vector ops
5297    generated within the strip-mine loop in some cases, the initial definition
5298    before the loop, and the epilogue code.  Costs are recorded in COST_VEC.  */
5299
5300 static void
5301 vect_model_reduction_cost (loop_vec_info loop_vinfo,
5302                            stmt_vec_info stmt_info, internal_fn reduc_fn,
5303                            vect_reduction_type reduction_type,
5304                            int ncopies, stmt_vector_for_cost *cost_vec)
5305 {
5306   int prologue_cost = 0, epilogue_cost = 0, inside_cost = 0;  /* Totals; only read by the dump below.  */
5307   tree vectype;
5308   machine_mode mode;
5309   class loop *loop = NULL;
5310
5311   if (loop_vinfo)
5312     loop = LOOP_VINFO_LOOP (loop_vinfo);
5313
5314   /* Condition reductions generate two reductions in the loop.  */
5315   if (reduction_type == COND_REDUCTION)
5316     ncopies *= 2;
5317
5318   vectype = STMT_VINFO_VECTYPE (stmt_info);
5319   mode = TYPE_MODE (vectype);
5320   stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
5321
5322   gimple_match_op op;
5323   if (!gimple_extract_op (orig_stmt_info->stmt, &op))
5324     gcc_unreachable ();
5325
5326   bool emulated_mixed_dot_prod
5327     = vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info);
5328   if (reduction_type == EXTRACT_LAST_REDUCTION)
5329     /* No extra instructions are needed in the prologue.  The loop body
5330        operations are costed in vectorizable_condition.  */
5331     inside_cost = 0;
5332   else if (reduction_type == FOLD_LEFT_REDUCTION)
5333     {
5334       /* No extra instructions needed in the prologue.  */
5335       prologue_cost = 0;
5336
5337       if (reduc_fn != IFN_LAST)
5338         /* Count one reduction-like operation per vector.  */
5339         inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
5340                                         stmt_info, 0, vect_body);
5341       else
5342         {
5343           /* Use NELEMENTS extracts and NELEMENTS scalar ops.  */
5344           unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
5345           inside_cost = record_stmt_cost (cost_vec, nelements,
5346                                           vec_to_scalar, stmt_info, 0,
5347                                           vect_body);
5348           inside_cost += record_stmt_cost (cost_vec, nelements,
5349                                            scalar_stmt, stmt_info, 0,
5350                                            vect_body);
5351         }
5352     }
5353   else
5354     {
5355       /* Add in the cost of the initial definitions.  */
5356       int prologue_stmts;
5357       if (reduction_type == COND_REDUCTION)
5358         /* For cond reductions we have four vectors: initial index, step,
5359            initial result of the data reduction, initial value of the index
5360            reduction.  */
5361         prologue_stmts = 4;
5362       else if (emulated_mixed_dot_prod)
5363         /* We need the initial reduction value and two invariants:
5364            one that contains the minimum signed value and one that
5365            contains half of its negative.  */
5366         prologue_stmts = 3;
5367       else
5368         prologue_stmts = 1;
5369       prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
5370                                          scalar_to_vec, stmt_info, 0,
5371                                          vect_prologue);
5372     }
5373
5374   /* Determine cost of epilogue code.
5375
5376      We have a reduction operator that will reduce the vector in one statement.
5377      Also requires scalar extract.  */
5378
5379   if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
5380     {
5381       if (reduc_fn != IFN_LAST)
5382         {
5383           if (reduction_type == COND_REDUCTION)
5384             {
5385               /* An EQ stmt and a COND_EXPR stmt.  */
5386               epilogue_cost += record_stmt_cost (cost_vec, 2,
5387                                                  vector_stmt, stmt_info, 0,
5388                                                  vect_epilogue);
5389               /* Reduction of the max index and a reduction of the found
5390                  values.  */
5391               epilogue_cost += record_stmt_cost (cost_vec, 2,
5392                                                  vec_to_scalar, stmt_info, 0,
5393                                                  vect_epilogue);
5394               /* A broadcast of the max value.  */
5395               epilogue_cost += record_stmt_cost (cost_vec, 1,
5396                                                  scalar_to_vec, stmt_info, 0,
5397                                                  vect_epilogue);
5398             }
5399           else
5400             {
5401               epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
5402                                                  stmt_info, 0, vect_epilogue);
5403               epilogue_cost += record_stmt_cost (cost_vec, 1,
5404                                                  vec_to_scalar, stmt_info, 0,
5405                                                  vect_epilogue);
5406             }
5407         }
5408       else if (reduction_type == COND_REDUCTION)
5409         {
5410           unsigned estimated_nunits = vect_nunits_for_cost (vectype);
5411           /* Extraction of scalar elements.  */
5412           epilogue_cost += record_stmt_cost (cost_vec,
5413                                              2 * estimated_nunits,
5414                                              vec_to_scalar, stmt_info, 0,
5415                                              vect_epilogue);
5416           /* Scalar max reductions via COND_EXPR / MAX_EXPR.  */
5417           epilogue_cost += record_stmt_cost (cost_vec,
5418                                              2 * estimated_nunits - 3,
5419                                              scalar_stmt, stmt_info, 0,
5420                                              vect_epilogue);
5421         }
5422       else if (reduction_type == EXTRACT_LAST_REDUCTION
5423                || reduction_type == FOLD_LEFT_REDUCTION)
5424         /* No extra instructions needed in the epilogue.  */
5425         ;
5426       else
5427         {
5428           int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
5429           tree bitsize = TYPE_SIZE (op.type);
5430           int element_bitsize = tree_to_uhwi (bitsize);
5431           int nelements = vec_size_in_bits / element_bitsize;
5432
5433           if (op.code == COND_EXPR)
5434             op.code = MAX_EXPR;  /* Target support is then queried for MAX_EXPR below.  */
5435
5436           /* We have a whole vector shift available.  */
5437           if (VECTOR_MODE_P (mode)
5438               && directly_supported_p (op.code, vectype)
5439               && have_whole_vector_shift (mode))
5440             {
5441               /* Final reduction via vector shifts and the reduction operator.
5442                  Also requires scalar extract.  */
5443               epilogue_cost += record_stmt_cost (cost_vec,
5444                                                  exact_log2 (nelements) * 2,
5445                                                  vector_stmt, stmt_info, 0,
5446                                                  vect_epilogue);
5447               epilogue_cost += record_stmt_cost (cost_vec, 1,
5448                                                  vec_to_scalar, stmt_info, 0,
5449                                                  vect_epilogue);
5450             }
5451           else
5452             /* Use extracts and reduction op for final reduction.  For N
5453                elements, we have N extracts and N-1 reduction ops.  */
5454             epilogue_cost += record_stmt_cost (cost_vec,
5455                                                nelements + nelements - 1,
5456                                                vector_stmt, stmt_info, 0,
5457                                                vect_epilogue);
5458         }
5459     }
5460
5461   if (dump_enabled_p ())
5462     dump_printf (MSG_NOTE,
5463                  "vect_model_reduction_cost: inside_cost = %d, "
5464                  "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
5465                  prologue_cost, epilogue_cost);
5466 }
5467
5468 /* SEQ is a sequence of instructions that initialize the reduction described by
5469    REDUC_INFO.  Emit them in the skip guard block or on LOOP_VINFO's preheader edge.  */
5470
5471 static void
5472 vect_emit_reduction_init_stmts (loop_vec_info loop_vinfo,
5473                                 stmt_vec_info reduc_info, gimple *seq)
5474 {
5475   if (reduc_info->reused_accumulator)
5476     {
5477       /* When reusing an accumulator from the main loop, we only need
5478          initialization instructions if the main loop can be skipped.
5479          In that case, emit the initialization instructions at the end
5480          of the guard block that does the skip.  */
5481       edge skip_edge = loop_vinfo->skip_main_loop_edge;
5482       gcc_assert (skip_edge);
5483       gimple_stmt_iterator gsi = gsi_last_bb (skip_edge->src);
5484       gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);  /* SEQ goes before the block's last stmt.  */
5485     }
5486   else
5487     {
5488       /* The normal case: emit the initialization instructions on the
5489          preheader edge.  */
5490       class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5491       gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), seq);
5492     }
5493 }
5494
5495 /* Function get_initial_def_for_reduction
5496
5497    Input:
5498    REDUC_INFO - the info_for_reduction
5499    INIT_VAL - the initial value of the reduction variable
5500    NEUTRAL_OP - a value that has no effect on the reduction, as per
5501                 neutral_op_for_reduction
5502
5503    Output:
5504    Return a vector variable, initialized according to the reduction that
5505         REDUC_INFO describes.  This vector will be used as the initial value
5506         of the vector of partial results.
5507
5508    The value we need is a vector in which element 0 has value INIT_VAL
5509    and every other element has value NEUTRAL_OP.  */
5510
5511 static tree
5512 get_initial_def_for_reduction (loop_vec_info loop_vinfo,
5513                                stmt_vec_info reduc_info,
5514                                tree init_val, tree neutral_op)
5515 {
5516   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5517   tree scalar_type = TREE_TYPE (init_val);
5518   tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
5519   tree init_def;
5520   gimple_seq stmts = NULL;
5521
5522   gcc_assert (vectype);
5523
5524   gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
5525               || SCALAR_FLOAT_TYPE_P (scalar_type));
5526
5527   gcc_assert (nested_in_vect_loop_p (loop, reduc_info)
5528               || loop == (gimple_bb (reduc_info->stmt))->loop_father);
5529
5530   if (operand_equal_p (init_val, neutral_op))
5531     {
5532       /* If both elements are equal then the vector described above is
5533          just a splat.  */
5534       neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op);
5535       init_def = gimple_build_vector_from_val (&stmts, vectype, neutral_op);
5536     }
5537   else
5538     {
5539       neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op);
5540       init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
5541       if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
5542         {
5543           /* Construct a splat of NEUTRAL_OP and insert INIT_VAL into
5544              element 0.  */
5545           init_def = gimple_build_vector_from_val (&stmts, vectype,
5546                                                    neutral_op);
5547           init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
5548                                    vectype, init_def, init_val);
5549         }
5550       else
5551         {
5552           /* Build {INIT_VAL, NEUTRAL_OP, NEUTRAL_OP, ...}.  */
5553           tree_vector_builder elts (vectype, 1, 2);
5554           elts.quick_push (init_val);
5555           elts.quick_push (neutral_op);
5556           init_def = gimple_build_vector (&stmts, &elts);
5557         }
5558     }
5559
5560   if (stmts)  /* Emit whatever statements were queued in STMTS.  */
5561     vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, stmts);
5562   return init_def;
5563 }
5564
5565 /* Get at the initial defs for the reduction PHIs for REDUC_INFO,
5566    which performs a reduction involving GROUP_SIZE scalar statements.
5567    NUMBER_OF_VECTORS is the number of vector defs to create; they are stored
5568    in *VEC_OPRNDS.  If NEUTRAL_OP is nonnull, introducing extra elements of
5569    that value will not change the result.  */
5570
5571 static void
5572 get_initial_defs_for_reduction (loop_vec_info loop_vinfo,
5573                                 stmt_vec_info reduc_info,
5574                                 vec<tree> *vec_oprnds,
5575                                 unsigned int number_of_vectors,
5576                                 unsigned int group_size, tree neutral_op)
5577 {
5578   vec<tree> &initial_values = reduc_info->reduc_initial_values;
5579   unsigned HOST_WIDE_INT nunits;
5580   unsigned j, number_of_places_left_in_vector;
5581   tree vector_type = STMT_VINFO_VECTYPE (reduc_info);
5582   unsigned int i;
5583
5584   gcc_assert (group_size == initial_values.length () || neutral_op);
5585
5586   /* NUMBER_OF_COPIES is the number of times we need to use the same values in
5587      created vectors. It is greater than 1 if unrolling is performed.
5588
5589      For example, we have two scalar operands, s1 and s2 (e.g., group of
5590      strided accesses of size two), while NUNITS is four (i.e., four scalars
5591      of this type can be packed in a vector).  The output vector will contain
5592      two copies of each scalar operand: {s1, s2, s1, s2}.  (NUMBER_OF_COPIES
5593      will be 2).
5594
5595      If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
5596      vectors containing the operands.
5597
5598      For example, NUNITS is four as before, and the group size is 8
5599      (s1, s2, ..., s8).  We will create two vectors {s1, s2, s3, s4} and
5600      {s5, s6, s7, s8}.  */
5601
5602   if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
5603     nunits = group_size;  /* Variable-length vectors: fill GROUP_SIZE elements per round.  */
5604
5605   number_of_places_left_in_vector = nunits;
5606   bool constant_p = true;
5607   tree_vector_builder elts (vector_type, nunits, 1);
5608   elts.quick_grow (nunits);
5609   gimple_seq ctor_seq = NULL;
5610   for (j = 0; j < nunits * number_of_vectors; ++j)
5611     {
5612       tree op;
5613       i = j % group_size;
5614
5615       /* Get the def before the loop.  In reduction chain we have only
5616          one initial value.  Else we have as many as PHIs in the group.  */
5617       if (i >= initial_values.length () || (j > i && neutral_op))
5618         op = neutral_op;
5619       else
5620         op = initial_values[i];
5621
5622       /* Create 'vect_ = {op0,op1,...,opn}'.  */
5623       number_of_places_left_in_vector--;
5624       elts[nunits - number_of_places_left_in_vector - 1] = op;
5625       if (!CONSTANT_CLASS_P (op))
5626         constant_p = false;
5627
5628       if (number_of_places_left_in_vector == 0)
5629         {
5630           tree init;
5631           if (constant_p && !neutral_op
5632               ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
5633               : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
5634             /* Build the vector directly from ELTS.  */
5635             init = gimple_build_vector (&ctor_seq, &elts);
5636           else if (neutral_op)
5637             {
5638               /* Build a vector of the neutral value and shift the
5639                  other elements into place.  */
5640               init = gimple_build_vector_from_val (&ctor_seq, vector_type,
5641                                                    neutral_op);
5642               int k = nunits;
5643               while (k > 0 && elts[k - 1] == neutral_op)
5644                 k -= 1;  /* Skip trailing elements already equal to NEUTRAL_OP.  */
5645               while (k > 0)
5646                 {
5647                   k -= 1;
5648                   init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
5649                                        vector_type, init, elts[k]);
5650                 }
5651             }
5652           else
5653             {
5654               /* First time round, duplicate ELTS to fill the
5655                  required number of vectors.  */
5656               duplicate_and_interleave (loop_vinfo, &ctor_seq, vector_type,
5657                                         elts, number_of_vectors, *vec_oprnds);
5658               break;
5659             }
5660           vec_oprnds->quick_push (init);
5661
5662           number_of_places_left_in_vector = nunits;
5663           elts.new_vector (vector_type, nunits, 1);
5664           elts.quick_grow (nunits);
5665           constant_p = true;
5666         }
5667     }
5668   if (ctor_seq != NULL)  /* Emit the statements queued while building the vectors.  */
5669     vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, ctor_seq);
5670 }
5671
5672 /* For a statement STMT_INFO taking part in a reduction operation return
5673    the stmt_vec_info the meta information is stored on; VINFO is used for def lookup.  */
5674
5675 stmt_vec_info
5676 info_for_reduction (vec_info *vinfo, stmt_vec_info stmt_info)
5677 {
5678   stmt_info = vect_orig_stmt (stmt_info);
5679   gcc_assert (STMT_VINFO_REDUC_DEF (stmt_info));
5680   if (!is_a <gphi *> (stmt_info->stmt)
5681       || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
5682     stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);  /* Not a cycle PHI itself: follow REDUC_DEF.  */
5683   gphi *phi = as_a <gphi *> (stmt_info->stmt);
5684   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
5685     {
5686       if (gimple_phi_num_args (phi) == 1)
5687         stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);  /* Single-argument PHI: follow REDUC_DEF again.  */
5688     }
5689   else if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
5690     {
5691       stmt_vec_info info = vinfo->lookup_def (vect_phi_initial_value (phi));
5692       if (info && STMT_VINFO_DEF_TYPE (info) == vect_double_reduction_def)
5693         stmt_info = info;  /* The PHI's initial value is defined by a double reduction.  */
5694     }
5695   return stmt_info;
5696 }
5697
5698 /* See if LOOP_VINFO is an epilogue loop whose main loop had a reduction that
5699    REDUC_INFO can build on.  Adjust REDUC_INFO and return true if so, otherwise
5700    return false.  */
5701
5702 static bool
5703 vect_find_reusable_accumulator (loop_vec_info loop_vinfo,
5704                                 stmt_vec_info reduc_info)
5705 {
5706   loop_vec_info main_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
5707   if (!main_loop_vinfo)
5708     return false;  /* No main loop info to build on.  */
5709
5710   if (STMT_VINFO_REDUC_TYPE (reduc_info) != TREE_CODE_REDUCTION)
5711     return false;  /* Only TREE_CODE_REDUCTION is handled here.  */
5712
5713   unsigned int num_phis = reduc_info->reduc_initial_values.length ();
5714   auto_vec<tree, 16> main_loop_results (num_phis);
5715   auto_vec<tree, 16> initial_values (num_phis);
5716   if (edge main_loop_edge = loop_vinfo->main_loop_edge)
5717     {
5718       /* The epilogue loop can be entered either from the main loop or
5719          from an earlier guard block.  */
5720       edge skip_edge = loop_vinfo->skip_main_loop_edge;
5721       for (tree incoming_value : reduc_info->reduc_initial_values)
5722         {
5723           /* Look for:
5724
5725                INCOMING_VALUE = phi<MAIN_LOOP_RESULT(main loop),
5726                                     INITIAL_VALUE(guard block)>.  */
5727           gcc_assert (TREE_CODE (incoming_value) == SSA_NAME);
5728
5729           gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (incoming_value));
5730           gcc_assert (gimple_bb (phi) == main_loop_edge->dest);
5731
5732           tree from_main_loop = PHI_ARG_DEF_FROM_EDGE (phi, main_loop_edge);
5733           tree from_skip = PHI_ARG_DEF_FROM_EDGE (phi, skip_edge);
5734
5735           main_loop_results.quick_push (from_main_loop);
5736           initial_values.quick_push (from_skip);
5737         }
5738     }
5739   else
5740     /* The main loop dominates the epilogue loop.  */
5741     main_loop_results.splice (reduc_info->reduc_initial_values);
5742
5743   /* See if the main loop has the kind of accumulator we need.  */
5744   vect_reusable_accumulator *accumulator
5745     = main_loop_vinfo->reusable_accumulators.get (main_loop_results[0]);
5746   if (!accumulator
5747       || num_phis != accumulator->reduc_info->reduc_scalar_results.length ()
5748       || !std::equal (main_loop_results.begin (), main_loop_results.end (),
5749                       accumulator->reduc_info->reduc_scalar_results.begin ()))
5750     return false;
5751
5752   /* Handle the case where we can reduce wider vectors to narrower ones.  */
5753   tree vectype = STMT_VINFO_VECTYPE (reduc_info);
5754   tree old_vectype = TREE_TYPE (accumulator->reduc_input);
5755   unsigned HOST_WIDE_INT m;
5756   if (!constant_multiple_p (TYPE_VECTOR_SUBPARTS (old_vectype),
5757                             TYPE_VECTOR_SUBPARTS (vectype), &m))
5758     return false;  /* Old element count is not a constant multiple of the new one.  */
5759   /* Check the intermediate vector types and operations are available.  */
5760   tree prev_vectype = old_vectype;
5761   poly_uint64 intermediate_nunits = TYPE_VECTOR_SUBPARTS (old_vectype);
5762   while (known_gt (intermediate_nunits, TYPE_VECTOR_SUBPARTS (vectype)))
5763     {
5764       intermediate_nunits = exact_div (intermediate_nunits, 2);
5765       tree intermediate_vectype = get_related_vectype_for_scalar_type
5766         (TYPE_MODE (vectype), TREE_TYPE (vectype), intermediate_nunits);
5767       if (!intermediate_vectype
5768           || !directly_supported_p (STMT_VINFO_REDUC_CODE (reduc_info),
5769                                     intermediate_vectype)
5770           || !can_vec_extract (TYPE_MODE (prev_vectype),
5771                                TYPE_MODE (intermediate_vectype)))
5772         return false;
5773       prev_vectype = intermediate_vectype;
5774     }
5775
5776   /* Non-SLP reductions might apply an adjustment after the reduction
5777      operation, in order to simplify the initialization of the accumulator.
5778      If the epilogue loop carries on from where the main loop left off,
5779      it should apply the same adjustment to the final reduction result.
5780
5781      If the epilogue loop can also be entered directly (rather than via
5782      the main loop), we need to be able to handle that case in the same way,
5783      with the same adjustment.  (In principle we could add a PHI node
5784      to select the correct adjustment, but in practice that shouldn't be
5785      necessary.)  */
5786   tree main_adjustment
5787     = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (accumulator->reduc_info);
5788   if (loop_vinfo->main_loop_edge && main_adjustment)
5789     {
5790       gcc_assert (num_phis == 1);
5791       tree initial_value = initial_values[0];
5792       /* Check that we can use INITIAL_VALUE as the adjustment and
5793          initialize the accumulator with a neutral value instead.  */
5794       if (!operand_equal_p (initial_value, main_adjustment))
5795         return false;
5796       code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
5797       initial_values[0] = neutral_op_for_reduction (TREE_TYPE (initial_value),
5798                                                     code, initial_value);
5799     }
5800   STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info) = main_adjustment;
5801   reduc_info->reduc_initial_values.truncate (0);  /* Replace the initial values with INITIAL_VALUES.  */
5802   reduc_info->reduc_initial_values.splice (initial_values);
5803   reduc_info->reused_accumulator = accumulator;
5804   return true;
5805 }
5806
5807 /* Reduce the vector VEC_DEF down to VECTYPE with reduction operation
5808    CODE, appending the generated stmts to SEQ.  Returns a vector def of VECTYPE.  */
5809
5810 static tree
5811 vect_create_partial_epilog (tree vec_def, tree vectype, code_helper code,
5812                             gimple_seq *seq)
5813 {
5814   unsigned nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (vec_def)).to_constant ();
5815   unsigned nunits1 = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5816   tree stype = TREE_TYPE (vectype);
5817   tree new_temp = vec_def;
5818   while (nunits > nunits1)
5819     {
5820       nunits /= 2;  /* Each round halves the element count.  */
5821       tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5822                                                            stype, nunits);
5823       unsigned int bitsize = tree_to_uhwi (TYPE_SIZE (vectype1));
5824
5825       /* The target has to make sure we support lowpart/highpart
5826          extraction, either via direct vector extract or through
5827          an integer mode punning.  */
5828       tree dst1, dst2;
5829       gimple *epilog_stmt;
5830       if (convert_optab_handler (vec_extract_optab,
5831                                  TYPE_MODE (TREE_TYPE (new_temp)),
5832                                  TYPE_MODE (vectype1))
5833           != CODE_FOR_nothing)
5834         {
5835           /* The target can extract a VECTYPE1 sub-vector directly; take
5836              the two halves of NEW_TEMP with BIT_FIELD_REFs.  */
5837           dst1 = make_ssa_name (vectype1);
5838           epilog_stmt
5839               = gimple_build_assign (dst1, BIT_FIELD_REF,
5840                                      build3 (BIT_FIELD_REF, vectype1,
5841                                              new_temp, TYPE_SIZE (vectype1),
5842                                              bitsize_int (0)));  /* Bits [0, BITSIZE).  */
5843           gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5844           dst2 =  make_ssa_name (vectype1);
5845           epilog_stmt
5846               = gimple_build_assign (dst2, BIT_FIELD_REF,
5847                                      build3 (BIT_FIELD_REF, vectype1,
5848                                              new_temp, TYPE_SIZE (vectype1),
5849                                              bitsize_int (bitsize)));  /* Bits [BITSIZE, 2 * BITSIZE).  */
5850           gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5851         }
5852       else
5853         {
5854           /* Extract via punning to appropriately sized integer mode
5855              vector.  */
5856           tree eltype = build_nonstandard_integer_type (bitsize, 1);
5857           tree etype = build_vector_type (eltype, 2);
5858           gcc_assert (convert_optab_handler (vec_extract_optab,
5859                                              TYPE_MODE (etype),
5860                                              TYPE_MODE (eltype))
5861                       != CODE_FOR_nothing);
5862           tree tem = make_ssa_name (etype);
5863           epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5864                                              build1 (VIEW_CONVERT_EXPR,
5865                                                      etype, new_temp));
5866           gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5867           new_temp = tem;  /* NEW_TEMP is now the two-element integer view.  */
5868           tem = make_ssa_name (eltype);
5869           epilog_stmt
5870               = gimple_build_assign (tem, BIT_FIELD_REF,
5871                                      build3 (BIT_FIELD_REF, eltype,
5872                                              new_temp, TYPE_SIZE (eltype),
5873                                              bitsize_int (0)));
5874           gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5875           dst1 = make_ssa_name (vectype1);
5876           epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5877                                              build1 (VIEW_CONVERT_EXPR,
5878                                                      vectype1, tem));
5879           gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5880           tem = make_ssa_name (eltype);
5881           epilog_stmt
5882               = gimple_build_assign (tem, BIT_FIELD_REF,
5883                                      build3 (BIT_FIELD_REF, eltype,
5884                                              new_temp, TYPE_SIZE (eltype),
5885                                              bitsize_int (bitsize)));
5886           gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5887           dst2 =  make_ssa_name (vectype1);
5888           epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5889                                              build1 (VIEW_CONVERT_EXPR,
5890                                                      vectype1, tem));
5891           gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5892         }
5893
5894       new_temp = gimple_build (seq, code, vectype1, dst1, dst2);  /* Combine the two parts with CODE.  */
5895     }
5896
5897   return new_temp;
5898 }
5899
5900 /* Function vect_create_epilog_for_reduction
5901
5902    Create code at the loop-epilog to finalize the result of a reduction
5903    computation.
5904
5905    LOOP_VINFO is the loop_vec_info of the loop being vectorized; the
5906      reduction meta-information for STMT_INFO is looked up through it.
5907    STMT_INFO is the scalar reduction stmt that is being vectorized.
5908    SLP_NODE is an SLP node containing a group of reduction statements.  The
5909      first one in this group is STMT_INFO.
5910    SLP_NODE_INSTANCE is the SLP node instance containing SLP_NODE.
5911    LOOP_EXIT is the edge to update in the merge block.  In the case of a single
5912      exit this edge is always the main loop exit.
5913
5914    This function:
5915    1. Completes the reduction def-use cycles.
5916    2. "Reduces" each vector of partial results VECT_DEFS into a single result,
5917       by calling the function specified by REDUC_FN if available, or by
5918       other means (whole-vector shifts or a scalar loop).
5919       The function also creates a new phi node at the loop exit to preserve
5920       loop-closed form, as illustrated below.
5921
5922      The flow at the entry to this function:
5923
5924         loop:
5925           vec_def = phi <vec_init, null>        # REDUCTION_PHI
5926           VECT_DEF = vector_stmt                # vectorized form of STMT_INFO
5927           s_loop = scalar_stmt                  # (scalar) STMT_INFO
5928         loop_exit:
5929           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
5930           use <s_out0>
5931           use <s_out0>
5932
5933      The above is transformed by this function into:
5934
5935         loop:
5936           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
5937           VECT_DEF = vector_stmt                # vectorized form of STMT_INFO
5938           s_loop = scalar_stmt                  # (scalar) STMT_INFO
5939         loop_exit:
5940           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
5941           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
5942           v_out2 = reduce <v_out1>
5943           s_out3 = extract_field <v_out2, 0>
5944           s_out4 = adjust_result <s_out3>
5945           use <s_out4>
5946           use <s_out4>
5947 */
5948
5949 static void
5950 vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
5951                                   stmt_vec_info stmt_info,
5952                                   slp_tree slp_node,
5953                                   slp_instance slp_node_instance,
5954                                   edge loop_exit)
5955 {
5956   stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
5957   gcc_assert (reduc_info->is_reduc_info);
5958   /* For double reductions we need to get at the inner loop reduction
5959      stmt which has the meta info attached.  Our stmt_info is that of the
5960      loop-closed PHI of the inner loop which we remember as
5961      def for the reduction PHI generation.  */
5962   bool double_reduc = false;
5963   stmt_vec_info rdef_info = stmt_info;
5964   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
5965     {
5966       gcc_assert (!slp_node);
5967       double_reduc = true;
5968       stmt_info = loop_vinfo->lookup_def (gimple_phi_arg_def
5969                                             (stmt_info->stmt, 0));
5970       stmt_info = vect_stmt_to_vectorize (stmt_info);
5971     }
5972   code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
5973   internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
5974   tree vectype;
5975   machine_mode mode;
5976   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
5977   basic_block exit_bb;
5978   tree scalar_dest;
5979   tree scalar_type;
5980   gimple *new_phi = NULL, *phi = NULL;
5981   gimple_stmt_iterator exit_gsi;
5982   tree new_temp = NULL_TREE, new_name, new_scalar_dest;
5983   gimple *epilog_stmt = NULL;
5984   gimple *exit_phi;
5985   tree bitsize;
5986   tree def;
5987   tree orig_name, scalar_result;
5988   imm_use_iterator imm_iter, phi_imm_iter;
5989   use_operand_p use_p, phi_use_p;
5990   gimple *use_stmt;
5991   auto_vec<tree> reduc_inputs;
5992   int j, i;
5993   vec<tree> &scalar_results = reduc_info->reduc_scalar_results;
5994   unsigned int group_size = 1, k;
5995   /* SLP reduction without reduction chain, e.g.,
5996      # a1 = phi <a2, a0>
5997      # b1 = phi <b2, b0>
5998      a2 = operation (a1)
5999      b2 = operation (b1)  */
6000   bool slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info));
6001   bool direct_slp_reduc;
6002   tree induction_index = NULL_TREE;
6003
6004   if (slp_node)
6005     group_size = SLP_TREE_LANES (slp_node);
6006
6007   if (nested_in_vect_loop_p (loop, stmt_info))
6008     {
6009       outer_loop = loop;
6010       loop = loop->inner;
6011       gcc_assert (!slp_node && double_reduc);
6012     }
6013
6014   vectype = STMT_VINFO_REDUC_VECTYPE (reduc_info);
6015   gcc_assert (vectype);
6016   mode = TYPE_MODE (vectype);
6017
6018   tree induc_val = NULL_TREE;
6019   tree adjustment_def = NULL;
6020   if (slp_node)
6021     ;
6022   else
6023     {
6024       /* Optimize: for induction condition reduction, if we can't use zero
6025          for induc_val, use initial_def.  */
6026       if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
6027         induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
6028       else if (double_reduc)
6029         ;
6030       else
6031         adjustment_def = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info);
6032     }
6033
6034   stmt_vec_info single_live_out_stmt[] = { stmt_info };
6035   array_slice<const stmt_vec_info> live_out_stmts = single_live_out_stmt;
6036   if (slp_reduc)
6037     /* All statements produce live-out values.  */
6038     live_out_stmts = SLP_TREE_SCALAR_STMTS (slp_node);
6039
6040   unsigned vec_num;
6041   int ncopies;
6042   if (slp_node)
6043     {
6044       vec_num = SLP_TREE_VEC_DEFS (slp_node_instance->reduc_phis).length ();
6045       ncopies = 1;
6046     }
6047   else
6048     {
6049       vec_num = 1;
6050       ncopies = STMT_VINFO_VEC_STMTS (reduc_info).length ();
6051     }
6052
6053   /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
6054      which is updated with the current index of the loop for every match of
6055      the original loop's cond_expr (VEC_STMT).  This results in a vector
6056      containing the last time the condition passed for that vector lane.
6057      The first match will be a 1 to allow 0 to be used for non-matching
6058      indexes.  If there are no matches at all then the vector will be all
6059      zeroes.
6060
6061      PR92772: This algorithm is broken for architectures that support
6062      masked vectors, but do not provide fold_extract_last.  */
6063   if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
6064     {
6065       auto_vec<std::pair<tree, bool>, 2> ccompares;
6066       stmt_vec_info cond_info = STMT_VINFO_REDUC_DEF (reduc_info);
6067       cond_info = vect_stmt_to_vectorize (cond_info);
6068       while (cond_info != reduc_info)
6069         {
6070           if (gimple_assign_rhs_code (cond_info->stmt) == COND_EXPR)
6071             {
6072               gimple *vec_stmt = STMT_VINFO_VEC_STMTS (cond_info)[0];
6073               gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
6074               ccompares.safe_push
6075                 (std::make_pair (unshare_expr (gimple_assign_rhs1 (vec_stmt)),
6076                                  STMT_VINFO_REDUC_IDX (cond_info) == 2));
6077             }
6078           cond_info
6079             = loop_vinfo->lookup_def (gimple_op (cond_info->stmt,
6080                                                  1 + STMT_VINFO_REDUC_IDX
6081                                                         (cond_info)));
6082           cond_info = vect_stmt_to_vectorize (cond_info);
6083         }
6084       gcc_assert (ccompares.length () != 0);
6085
6086       tree indx_before_incr, indx_after_incr;
6087       poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
6088       int scalar_precision
6089         = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
6090       tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
6091       tree cr_index_vector_type = get_related_vectype_for_scalar_type
6092         (TYPE_MODE (vectype), cr_index_scalar_type,
6093          TYPE_VECTOR_SUBPARTS (vectype));
6094
6095       /* First we create a simple vector induction variable which starts
6096          with the values {1,2,3,...} (SERIES_VECT) and increments by the
6097          vector size (STEP).  */
6098
6099       /* Create a {1,2,3,...} vector.  */
6100       tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
6101
6102       /* Create a vector of the step value.  */
6103       tree step = build_int_cst (cr_index_scalar_type, nunits_out);
6104       tree vec_step = build_vector_from_val (cr_index_vector_type, step);
6105
6106       /* Create an induction variable.  */
6107       gimple_stmt_iterator incr_gsi;
6108       bool insert_after;
6109       vect_iv_increment_position (loop_exit, &incr_gsi, &insert_after);
6110       create_iv (series_vect, PLUS_EXPR, vec_step, NULL_TREE, loop, &incr_gsi,
6111                  insert_after, &indx_before_incr, &indx_after_incr);
6112
6113       /* Next create a new phi node vector (NEW_PHI_TREE) which starts
6114          filled with zeros (VEC_ZERO).  */
6115
6116       /* Create a vector of 0s.  */
6117       tree zero = build_zero_cst (cr_index_scalar_type);
6118       tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
6119
6120       /* Create a vector phi node.  */
6121       tree new_phi_tree = make_ssa_name (cr_index_vector_type);
6122       new_phi = create_phi_node (new_phi_tree, loop->header);
6123       add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
6124                    loop_preheader_edge (loop), UNKNOWN_LOCATION);
6125
6126       /* Now take the condition from the loop's original cond_exprs
6127          and produce a new cond_exprs (INDEX_COND_EXPR) which for
6128          every match uses values from the induction variable
6129          (INDEX_BEFORE_INCR) otherwise uses values from the phi node
6130          (NEW_PHI_TREE).
6131          Finally, we update the phi (NEW_PHI_TREE) to take the value of
6132          the new cond_expr (INDEX_COND_EXPR).  */
6133       gimple_seq stmts = NULL;
6134       for (int i = ccompares.length () - 1; i != -1; --i)
6135         {
6136           tree ccompare = ccompares[i].first;
6137           if (ccompares[i].second)
6138             new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
6139                                          cr_index_vector_type,
6140                                          ccompare,
6141                                          indx_before_incr, new_phi_tree);
6142           else
6143             new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
6144                                          cr_index_vector_type,
6145                                          ccompare,
6146                                          new_phi_tree, indx_before_incr);
6147         }
6148       gsi_insert_seq_before (&incr_gsi, stmts, GSI_SAME_STMT);
6149
6150       /* Update the phi with the vec cond.  */
6151       induction_index = new_phi_tree;
6152       add_phi_arg (as_a <gphi *> (new_phi), induction_index,
6153                    loop_latch_edge (loop), UNKNOWN_LOCATION);
6154     }
6155
6156   /* 2. Create epilog code.
6157         The reduction epilog code operates across the elements of the vector
6158         of partial results computed by the vectorized loop.
6159         The reduction epilog code consists of:
6160
6161         step 1: compute the scalar result in a vector (v_out2)
6162         step 2: extract the scalar result (s_out3) from the vector (v_out2)
6163         step 3: adjust the scalar result (s_out3) if needed.
6164
6165         Step 1 can be accomplished using one of the following three schemes:
6166           (scheme 1) using reduc_fn, if available.
6167           (scheme 2) using whole-vector shifts, if available.
6168           (scheme 3) using a scalar loop. In this case steps 1+2 above are
6169                      combined.
6170
6171           The overall epilog code looks like this:
6172
6173           s_out0 = phi <s_loop>         # original EXIT_PHI
6174           v_out1 = phi <VECT_DEF>       # NEW_EXIT_PHI
6175           v_out2 = reduce <v_out1>              # step 1
6176           s_out3 = extract_field <v_out2, 0>    # step 2
6177           s_out4 = adjust_result <s_out3>       # step 3
6178
6179           (step 3 is optional, and steps 1 and 2 may be combined).
6180           Lastly, the uses of s_out0 are replaced by s_out4.  */
6181
6182
6183   /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
6184          v_out1 = phi <VECT_DEF>
6185          Store their results, converted to VECTYPE, in REDUC_INPUTS.  */
6186   if (double_reduc)
6187     loop = outer_loop;
6188   /* We need to reduce values in all exits.  */
6189   exit_bb = loop_exit->dest;
6190   exit_gsi = gsi_after_labels (exit_bb);
6191   reduc_inputs.create (slp_node ? vec_num : ncopies);
6192   for (unsigned i = 0; i < vec_num; i++)
6193     {
6194       gimple_seq stmts = NULL;
6195       if (slp_node)
6196         def = vect_get_slp_vect_def (slp_node, i);
6197       else
6198         def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[0]);
6199       for (j = 0; j < ncopies; j++)
6200         {
6201           tree new_def = copy_ssa_name (def);
6202           phi = create_phi_node (new_def, exit_bb);
6203           if (j)
6204             def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[j]);
6205           if (LOOP_VINFO_IV_EXIT (loop_vinfo) == loop_exit)
6206             SET_PHI_ARG_DEF (phi, loop_exit->dest_idx, def);
6207           else
6208             {
6209               for (unsigned k = 0; k < gimple_phi_num_args (phi); k++)
6210                 SET_PHI_ARG_DEF (phi, k, def);
6211             }
6212           new_def = gimple_convert (&stmts, vectype, new_def);
6213           reduc_inputs.quick_push (new_def);
6214         }
6215       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6216     }
6217
6218   /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
6219          (i.e. when reduc_fn is not available) and in the final adjustment
6220          code (if needed).  Also get the original scalar reduction variable as
6221          defined in the loop.  In case STMT is a "pattern-stmt" (i.e. - it
6222          represents a reduction pattern), the tree-code and scalar-def are
6223          taken from the original stmt that the pattern-stmt (STMT) replaces.
6224          Otherwise (it is a regular reduction) - the tree-code and scalar-def
6225          are taken from STMT.  */
6226
6227   stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
6228   if (orig_stmt_info != stmt_info)
6229     {
6230       /* Reduction pattern  */
6231       gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6232       gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
6233     }
6234
6235   scalar_dest = gimple_get_lhs (orig_stmt_info->stmt);
6236   scalar_type = TREE_TYPE (scalar_dest);
6237   scalar_results.truncate (0);
6238   scalar_results.reserve_exact (group_size);
6239   new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
6240   bitsize = TYPE_SIZE (scalar_type);
6241
6242   /* True if we should implement SLP_REDUC using native reduction operations
6243      instead of scalar operations.  */
6244   direct_slp_reduc = (reduc_fn != IFN_LAST
6245                       && slp_reduc
6246                       && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
6247
6248   /* In case of reduction chain, e.g.,
6249      # a1 = phi <a3, a0>
6250      a2 = operation (a1)
6251      a3 = operation (a2),
6252
6253      we may end up with more than one vector result.  Here we reduce them
6254      to one vector.
6255
6256      The same is true for a SLP reduction, e.g.,
6257      # a1 = phi <a2, a0>
6258      # b1 = phi <b2, b0>
6259      a2 = operation (a1)
6260      b2 = operation (b1),
6261
6262      where we can end up with more than one vector as well.  We can
6263      easily accumulate vectors when the number of vector elements is
6264      a multiple of the SLP group size.
6265
6266      The same is true if we couldn't use a single defuse cycle.  */
6267   if (REDUC_GROUP_FIRST_ELEMENT (stmt_info)
6268       || direct_slp_reduc
6269       || (slp_reduc
6270           && constant_multiple_p (TYPE_VECTOR_SUBPARTS (vectype), group_size))
6271       || ncopies > 1)
6272     {
6273       gimple_seq stmts = NULL;
6274       tree single_input = reduc_inputs[0];
6275       for (k = 1; k < reduc_inputs.length (); k++)
6276         single_input = gimple_build (&stmts, code, vectype,
6277                                      single_input, reduc_inputs[k]);
6278       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6279
6280       reduc_inputs.truncate (0);
6281       reduc_inputs.safe_push (single_input);
6282     }
6283
6284   tree orig_reduc_input = reduc_inputs[0];
6285
6286   /* If this loop is an epilogue loop that can be skipped after the
6287      main loop, we can only share a reduction operation between the
6288      main loop and the epilogue if we put it at the target of the
6289      skip edge.
6290
6291      We can still reuse accumulators if this check fails.  Doing so has
6292      the minor(?) benefit of making the epilogue loop's scalar result
6293      independent of the main loop's scalar result.  */
6294   bool unify_with_main_loop_p = false;
6295   if (reduc_info->reused_accumulator
6296       && loop_vinfo->skip_this_loop_edge
6297       && single_succ_p (exit_bb)
6298       && single_succ (exit_bb) == loop_vinfo->skip_this_loop_edge->dest)
6299     {
6300       unify_with_main_loop_p = true;
6301
6302       basic_block reduc_block = loop_vinfo->skip_this_loop_edge->dest;
6303       reduc_inputs[0] = make_ssa_name (vectype);
6304       gphi *new_phi = create_phi_node (reduc_inputs[0], reduc_block);
6305       add_phi_arg (new_phi, orig_reduc_input, single_succ_edge (exit_bb),
6306                    UNKNOWN_LOCATION);
6307       add_phi_arg (new_phi, reduc_info->reused_accumulator->reduc_input,
6308                    loop_vinfo->skip_this_loop_edge, UNKNOWN_LOCATION);
6309       exit_gsi = gsi_after_labels (reduc_block);
6310     }
6311
6312   /* Shouldn't be used beyond this point.  */
6313   exit_bb = nullptr;
6314
6315   if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
6316       && reduc_fn != IFN_LAST)
6317     {
6318       /* For condition reductions, we have a vector (REDUC_INPUTS 0) containing
6319          various data values where the condition matched and another vector
6320          (INDUCTION_INDEX) containing all the indexes of those matches.  We
6321          need to extract the last matching index (which will be the index with
6322          highest value) and use this to index into the data vector.
6323          For the case where there were no matches, the data vector will contain
6324          all default values and the index vector will be all zeros.  */
6325
6326       /* Get various versions of the type of the vector of indexes.  */
6327       tree index_vec_type = TREE_TYPE (induction_index);
6328       gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
6329       tree index_scalar_type = TREE_TYPE (index_vec_type);
6330       tree index_vec_cmp_type = truth_type_for (index_vec_type);
6331
6332       /* Get an unsigned integer version of the type of the data vector.  */
6333       int scalar_precision
6334         = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
6335       tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
6336       tree vectype_unsigned = get_same_sized_vectype (scalar_type_unsigned,
6337                                                 vectype);
6338
6339       /* First we need to create a vector (ZERO_VEC) of zeros and another
6340          vector (MAX_INDEX_VEC) filled with the last matching index, which we
6341          can create using a MAX reduction and then expanding.
6342          In the case where the loop never made any matches, the max index will
6343          be zero.  */
6344
6345       /* Vector of {0, 0, 0,...}.  */
6346       tree zero_vec = build_zero_cst (vectype);
6347
6348       /* Find maximum value from the vector of found indexes.  */
6349       tree max_index = make_ssa_name (index_scalar_type);
6350       gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
6351                                                           1, induction_index);
6352       gimple_call_set_lhs (max_index_stmt, max_index);
6353       gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
6354
6355       /* Vector of {max_index, max_index, max_index,...}.  */
6356       tree max_index_vec = make_ssa_name (index_vec_type);
6357       tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
6358                                                       max_index);
6359       gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
6360                                                         max_index_vec_rhs);
6361       gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
6362
6363       /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
6364          with the vector (INDUCTION_INDEX) of found indexes, choosing values
6365          from the data vector (REDUC_INPUTS 0) for matches, 0 (ZERO_VEC)
6366          otherwise.  Only one value should match, resulting in a vector
6367          (VEC_COND) with one data value and the rest zeros.
6368          In the case where the loop never made any matches, every index will
6369          match, resulting in a vector with all data values (which will all be
6370          the default value).  */
6371
6372       /* Compare the max index vector to the vector of found indexes to find
6373          the position of the max value.  */
6374       tree vec_compare = make_ssa_name (index_vec_cmp_type);
6375       gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
6376                                                       induction_index,
6377                                                       max_index_vec);
6378       gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
6379
6380       /* Use the compare to choose either values from the data vector or
6381          zero.  */
6382       tree vec_cond = make_ssa_name (vectype);
6383       gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
6384                                                    vec_compare,
6385                                                    reduc_inputs[0],
6386                                                    zero_vec);
6387       gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
6388
6389       /* Finally we need to extract the data value from the vector (VEC_COND)
6390          into a scalar (MATCHED_DATA_REDUC).  Logically we want to do an OR
6391          reduction, but because this doesn't exist, we can use a MAX reduction
6392          instead.  The data value might be signed or a float so we need to cast
6393          it first.
6394          In the case where the loop never made any matches, the data values are
6395          all identical, and so will reduce down correctly.  */
6396
6397       /* Make the matched data values unsigned.  */
6398       tree vec_cond_cast = make_ssa_name (vectype_unsigned);
6399       tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
6400                                        vec_cond);
6401       gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
6402                                                         VIEW_CONVERT_EXPR,
6403                                                         vec_cond_cast_rhs);
6404       gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
6405
6406       /* Reduce down to a scalar value.  */
6407       tree data_reduc = make_ssa_name (scalar_type_unsigned);
6408       gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
6409                                                            1, vec_cond_cast);
6410       gimple_call_set_lhs (data_reduc_stmt, data_reduc);
6411       gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
6412
6413       /* Convert the reduced value back to the result type and set as the
6414          result.  */
6415       gimple_seq stmts = NULL;
6416       new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
6417                                data_reduc);
6418       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6419       scalar_results.safe_push (new_temp);
6420     }
6421   else if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
6422            && reduc_fn == IFN_LAST)
6423     {
6424       /* Condition reduction without supported IFN_REDUC_MAX.  Generate
6425          idx = 0;
6426          idx_val = induction_index[0];
6427          val = data_reduc[0];
6428          for (idx = 0, val = init, i = 0; i < nelts; ++i)
6429            if (induction_index[i] > idx_val)
6430              val = data_reduc[i], idx_val = induction_index[i];
6431          return val;  */
6432
6433       tree data_eltype = TREE_TYPE (vectype);
6434       tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
6435       unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
6436       poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
6437       /* Enforced by vectorizable_reduction, which ensures we have target
6438          support before allowing a conditional reduction on variable-length
6439          vectors.  */
6440       unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
6441       tree idx_val = NULL_TREE, val = NULL_TREE;
6442       for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
6443         {
6444           tree old_idx_val = idx_val;
6445           tree old_val = val;
6446           idx_val = make_ssa_name (idx_eltype);
6447           epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
6448                                              build3 (BIT_FIELD_REF, idx_eltype,
6449                                                      induction_index,
6450                                                      bitsize_int (el_size),
6451                                                      bitsize_int (off)));
6452           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6453           val = make_ssa_name (data_eltype);
6454           epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
6455                                              build3 (BIT_FIELD_REF,
6456                                                      data_eltype,
6457                                                      reduc_inputs[0],
6458                                                      bitsize_int (el_size),
6459                                                      bitsize_int (off)));
6460           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6461           if (off != 0)
6462             {
6463               tree new_idx_val = idx_val;
6464               if (off != v_size - el_size)
6465                 {
6466                   new_idx_val = make_ssa_name (idx_eltype);
6467                   epilog_stmt = gimple_build_assign (new_idx_val,
6468                                                      MAX_EXPR, idx_val,
6469                                                      old_idx_val);
6470                   gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6471                 }
6472               tree cond = make_ssa_name (boolean_type_node);
6473               epilog_stmt = gimple_build_assign (cond, GT_EXPR,
6474                                                  idx_val, old_idx_val);
6475               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6476               tree new_val = make_ssa_name (data_eltype);
6477               epilog_stmt = gimple_build_assign (new_val, COND_EXPR,
6478                                                  cond, val, old_val);
6479               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6480               idx_val = new_idx_val;
6481               val = new_val;
6482             }
6483         }
6484       /* Convert the reduced value back to the result type and set as the
6485          result.  */
6486       gimple_seq stmts = NULL;
6487       val = gimple_convert (&stmts, scalar_type, val);
6488       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6489       scalar_results.safe_push (val);
6490     }
6491
6492   /* 2.3 Create the reduction code, using one of the three schemes described
6493          above. In SLP we simply need to extract all the elements from the
6494          vector (without reducing them), so we use scalar shifts.  */
6495   else if (reduc_fn != IFN_LAST && !slp_reduc)
6496     {
6497       tree tmp;
6498       tree vec_elem_type;
6499
6500       /* Case 1:  Create:
6501          v_out2 = reduc_expr <v_out1>  */
6502
6503       if (dump_enabled_p ())
6504         dump_printf_loc (MSG_NOTE, vect_location,
6505                          "Reduce using direct vector reduction.\n");
6506
6507       gimple_seq stmts = NULL;
6508       vec_elem_type = TREE_TYPE (vectype);
6509       new_temp = gimple_build (&stmts, as_combined_fn (reduc_fn),
6510                                vec_elem_type, reduc_inputs[0]);
6511       new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6512       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6513
6514       if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
6515           && induc_val)
6516         {
6517           /* Earlier we set the initial value to be a vector of induc_val
6518              values.  Check the result and if it is induc_val then replace
6519              with the original initial value, unless induc_val is
6520              the same as initial_def already.  */
6521           tree zcompare = make_ssa_name (boolean_type_node);
6522           epilog_stmt = gimple_build_assign (zcompare, EQ_EXPR,
6523                                              new_temp, induc_val);
6524           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6525           tree initial_def = reduc_info->reduc_initial_values[0];
6526           tmp = make_ssa_name (new_scalar_dest);
6527           epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
6528                                              initial_def, new_temp);
6529           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6530           new_temp = tmp;
6531         }
6532
6533       scalar_results.safe_push (new_temp);
6534     }
6535   else if (direct_slp_reduc)
6536     {
6537       /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
6538          with the elements for other SLP statements replaced with the
6539          neutral value.  We can then do a normal reduction on each vector.  */
6540
6541       /* Enforced by vectorizable_reduction.  */
6542       gcc_assert (reduc_inputs.length () == 1);
6543       gcc_assert (pow2p_hwi (group_size));
6544
6545       gimple_seq seq = NULL;
6546
6547       /* Build a vector {0, 1, 2, ...}, with the same number of elements
6548          and the same element size as VECTYPE.  */
6549       tree index = build_index_vector (vectype, 0, 1);
6550       tree index_type = TREE_TYPE (index);
6551       tree index_elt_type = TREE_TYPE (index_type);
6552       tree mask_type = truth_type_for (index_type);
6553
6554       /* Create a vector that, for each element, identifies which of
6555          the REDUC_GROUP_SIZE results should use it.  */
6556       tree index_mask = build_int_cst (index_elt_type, group_size - 1);
6557       index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
6558                             build_vector_from_val (index_type, index_mask));
6559
6560       /* Get a neutral vector value.  This is simply a splat of the neutral
6561          scalar value if we have one, otherwise the initial scalar value
6562          is itself a neutral value.  */
6563       tree vector_identity = NULL_TREE;
6564       tree neutral_op = NULL_TREE;
6565       if (slp_node)
6566         {
6567           tree initial_value = NULL_TREE;
6568           if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6569             initial_value = reduc_info->reduc_initial_values[0];
6570           neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype), code,
6571                                                  initial_value, false);
6572         }
6573       if (neutral_op)
6574         vector_identity = gimple_build_vector_from_val (&seq, vectype,
6575                                                         neutral_op);
6576       for (unsigned int i = 0; i < group_size; ++i)
6577         {
6578           /* If there's no universal neutral value, we can use the
6579              initial scalar value from the original PHI.  This is used
6580              for MIN and MAX reduction, for example.  */
6581           if (!neutral_op)
6582             {
6583               tree scalar_value = reduc_info->reduc_initial_values[i];
6584               scalar_value = gimple_convert (&seq, TREE_TYPE (vectype),
6585                                              scalar_value);
6586               vector_identity = gimple_build_vector_from_val (&seq, vectype,
6587                                                               scalar_value);
6588             }
6589
6590           /* Calculate the equivalent of:
6591
6592              sel[j] = (index[j] == i);
6593
6594              which selects the elements of REDUC_INPUTS[0] that should
6595              be included in the result.  */
6596           tree compare_val = build_int_cst (index_elt_type, i);
6597           compare_val = build_vector_from_val (index_type, compare_val);
6598           tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
6599                                    index, compare_val);
6600
6601           /* Calculate the equivalent of:
6602
6603              vec = sel ? reduc_inputs[0] : vector_identity;
6604
6605              VEC is now suitable for a full vector reduction.  */
6606           tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
6607                                    sel, reduc_inputs[0], vector_identity);
6608
6609           /* Do the reduction and convert it to the appropriate type.  */
6610           tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
6611                                       TREE_TYPE (vectype), vec);
6612           scalar = gimple_convert (&seq, scalar_type, scalar);
6613           scalar_results.safe_push (scalar);
6614         }
6615       gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
6616     }
6617   else
6618     {
6619       bool reduce_with_shift;
6620       tree vec_temp;
6621
6622       gcc_assert (slp_reduc || reduc_inputs.length () == 1);
6623
6624       /* See if the target wants to do the final (shift) reduction
6625          in a vector mode of smaller size and first reduce upper/lower
6626          halves against each other.  */
6627       enum machine_mode mode1 = mode;
6628       tree stype = TREE_TYPE (vectype);
6629       unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
6630       unsigned nunits1 = nunits;
6631       if ((mode1 = targetm.vectorize.split_reduction (mode)) != mode
6632           && reduc_inputs.length () == 1)
6633         {
6634           nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
6635           /* For SLP reductions we have to make sure lanes match up, but
6636              since we're doing individual element final reduction reducing
6637              vector width here is even more important.
6638              ???  We can also separate lanes with permutes, for the common
6639              case of power-of-two group-size odd/even extracts would work.  */
6640           if (slp_reduc && nunits != nunits1)
6641             {
6642               nunits1 = least_common_multiple (nunits1, group_size);
6643               gcc_assert (exact_log2 (nunits1) != -1 && nunits1 <= nunits);
6644             }
6645         }
6646       if (!slp_reduc
6647           && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
6648         nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
6649
6650       tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
6651                                                            stype, nunits1);
6652       reduce_with_shift = have_whole_vector_shift (mode1);
6653       if (!VECTOR_MODE_P (mode1)
6654           || !directly_supported_p (code, vectype1))
6655         reduce_with_shift = false;
6656
6657       /* First reduce the vector to the desired vector size we should
6658          do shift reduction on by combining upper and lower halves.  */
6659       gimple_seq stmts = NULL;
6660       new_temp = vect_create_partial_epilog (reduc_inputs[0], vectype1,
6661                                              code, &stmts);
6662       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6663       reduc_inputs[0] = new_temp;
6664
6665       if (reduce_with_shift && !slp_reduc)
6666         {
6667           int element_bitsize = tree_to_uhwi (bitsize);
6668           /* Enforced by vectorizable_reduction, which disallows SLP reductions
6669              for variable-length vectors and also requires direct target support
6670              for loop reductions.  */
6671           int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
6672           int nelements = vec_size_in_bits / element_bitsize;
6673           vec_perm_builder sel;
6674           vec_perm_indices indices;
6675
6676           int elt_offset;
6677
6678           tree zero_vec = build_zero_cst (vectype1);
6679           /* Case 2: Create:
6680              for (offset = nelements/2; offset >= 1; offset/=2)
6681                 {
6682                   Create:  va' = vec_shift <va, offset>
6683                   Create:  va = vop <va, va'>
6684                 }  */
6685
6686           tree rhs;
6687
6688           if (dump_enabled_p ())
6689             dump_printf_loc (MSG_NOTE, vect_location,
6690                              "Reduce using vector shifts\n");
6691
6692           gimple_seq stmts = NULL;
6693           new_temp = gimple_convert (&stmts, vectype1, new_temp);
6694           for (elt_offset = nelements / 2;
6695                elt_offset >= 1;
6696                elt_offset /= 2)
6697             {
6698               calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
6699               indices.new_vector (sel, 2, nelements);
6700               tree mask = vect_gen_perm_mask_any (vectype1, indices);
6701               new_name = gimple_build (&stmts, VEC_PERM_EXPR, vectype1,
6702                                        new_temp, zero_vec, mask);
6703               new_temp = gimple_build (&stmts, code,
6704                                        vectype1, new_name, new_temp);
6705             }
6706           gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6707
6708           /* 2.4  Extract the final scalar result.  Create:
6709              s_out3 = extract_field <v_out2, bitpos>  */
6710
6711           if (dump_enabled_p ())
6712             dump_printf_loc (MSG_NOTE, vect_location,
6713                              "extract scalar result\n");
6714
6715           rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
6716                         bitsize, bitsize_zero_node);
6717           epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
6718           new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
6719           gimple_assign_set_lhs (epilog_stmt, new_temp);
6720           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6721           scalar_results.safe_push (new_temp);
6722         }
6723       else
6724         {
6725           /* Case 3: Create:
6726              s = extract_field <v_out2, 0>
6727              for (offset = element_size;
6728                   offset < vector_size;
6729                   offset += element_size;)
6730                {
6731                  Create:  s' = extract_field <v_out2, offset>
6732                  Create:  s = op <s, s'>  // For non SLP cases
6733                }  */
6734
6735           if (dump_enabled_p ())
6736             dump_printf_loc (MSG_NOTE, vect_location,
6737                              "Reduce using scalar code.\n");
6738
6739           int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
6740           int element_bitsize = tree_to_uhwi (bitsize);
6741           tree compute_type = TREE_TYPE (vectype);
6742           gimple_seq stmts = NULL;
6743           FOR_EACH_VEC_ELT (reduc_inputs, i, vec_temp)
6744             {
6745               int bit_offset;
6746               new_temp = gimple_build (&stmts, BIT_FIELD_REF, compute_type,
6747                                        vec_temp, bitsize, bitsize_zero_node);
6748
6749               /* In SLP we don't need to apply reduction operation, so we just
6750                  collect s' values in SCALAR_RESULTS.  */
6751               if (slp_reduc)
6752                 scalar_results.safe_push (new_temp);
6753
6754               for (bit_offset = element_bitsize;
6755                    bit_offset < vec_size_in_bits;
6756                    bit_offset += element_bitsize)
6757                 {
6758                   tree bitpos = bitsize_int (bit_offset);
6759                   new_name = gimple_build (&stmts, BIT_FIELD_REF,
6760                                            compute_type, vec_temp,
6761                                            bitsize, bitpos);
6762                   if (slp_reduc)
6763                     {
6764                       /* In SLP we don't need to apply reduction operation, so
6765                          we just collect s' values in SCALAR_RESULTS.  */
6766                       new_temp = new_name;
6767                       scalar_results.safe_push (new_name);
6768                     }
6769                   else
6770                     new_temp = gimple_build (&stmts, code, compute_type,
6771                                              new_name, new_temp);
6772                 }
6773             }
6774
6775           /* The only case where we need to reduce scalar results in SLP, is
6776              unrolling.  If the size of SCALAR_RESULTS is greater than
6777              REDUC_GROUP_SIZE, we reduce them combining elements modulo
6778              REDUC_GROUP_SIZE.  */
6779           if (slp_reduc)
6780             {
6781               tree res, first_res, new_res;
6782
6783               /* Reduce multiple scalar results in case of SLP unrolling.  */
6784               for (j = group_size; scalar_results.iterate (j, &res);
6785                    j++)
6786                 {
6787                   first_res = scalar_results[j % group_size];
6788                   new_res = gimple_build (&stmts, code, compute_type,
6789                                           first_res, res);
6790                   scalar_results[j % group_size] = new_res;
6791                 }
6792               scalar_results.truncate (group_size);
6793               for (k = 0; k < group_size; k++)
6794                 scalar_results[k] = gimple_convert (&stmts, scalar_type,
6795                                                     scalar_results[k]);
6796             }
6797           else
6798             {
6799               /* Not SLP - we have one scalar to keep in SCALAR_RESULTS.  */
6800               new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6801               scalar_results.safe_push (new_temp);
6802             }
6803
6804           gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6805         }
6806
6807       if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
6808           && induc_val)
6809         {
6810           /* Earlier we set the initial value to be a vector of induc_val
6811              values.  Check the result and if it is induc_val then replace
6812              with the original initial value, unless induc_val is
6813              the same as initial_def already.  */
6814           tree zcompare = make_ssa_name (boolean_type_node);
6815           epilog_stmt = gimple_build_assign (zcompare, EQ_EXPR, new_temp,
6816                                              induc_val);
6817           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6818           tree initial_def = reduc_info->reduc_initial_values[0];
6819           tree tmp = make_ssa_name (new_scalar_dest);
6820           epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
6821                                              initial_def, new_temp);
6822           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6823           scalar_results[0] = tmp;
6824         }
6825     }
6826
6827   /* 2.5 Adjust the final result by the initial value of the reduction
6828          variable. (When such adjustment is not needed, then
6829          'adjustment_def' is zero).  For example, if code is PLUS we create:
6830          new_temp = loop_exit_def + adjustment_def  */
6831
6832   if (adjustment_def)
6833     {
6834       gcc_assert (!slp_reduc);
6835       gimple_seq stmts = NULL;
6836       if (double_reduc)
6837         {
6838           gcc_assert (VECTOR_TYPE_P (TREE_TYPE (adjustment_def)));
6839           adjustment_def = gimple_convert (&stmts, vectype, adjustment_def);
6840           new_temp = gimple_build (&stmts, code, vectype,
6841                                    reduc_inputs[0], adjustment_def);
6842         }
6843       else
6844         {
6845           new_temp = scalar_results[0];
6846           gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
6847           adjustment_def = gimple_convert (&stmts, TREE_TYPE (vectype),
6848                                            adjustment_def);
6849           new_temp = gimple_convert (&stmts, TREE_TYPE (vectype), new_temp);
6850           new_temp = gimple_build (&stmts, code, TREE_TYPE (vectype),
6851                                    new_temp, adjustment_def);
6852           new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6853         }
6854
6855       epilog_stmt = gimple_seq_last_stmt (stmts);
6856       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6857       scalar_results[0] = new_temp;
6858     }
6859
6860   /* Record this operation if it could be reused by the epilogue loop.  */
6861   if (STMT_VINFO_REDUC_TYPE (reduc_info) == TREE_CODE_REDUCTION
6862       && reduc_inputs.length () == 1)
6863     loop_vinfo->reusable_accumulators.put (scalar_results[0],
6864                                            { orig_reduc_input, reduc_info });
6865
6866   if (double_reduc)
6867     loop = outer_loop;
6868
6869   /* 2.6  Handle the loop-exit phis.  Replace the uses of scalar loop-exit
6870           phis with new adjusted scalar results, i.e., replace use <s_out0>
6871           with use <s_out4>.
6872
6873      Transform:
6874         loop_exit:
6875           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
6876           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
6877           v_out2 = reduce <v_out1>
6878           s_out3 = extract_field <v_out2, 0>
6879           s_out4 = adjust_result <s_out3>
6880           use <s_out0>
6881           use <s_out0>
6882
6883      into:
6884
6885         loop_exit:
6886           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
6887           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
6888           v_out2 = reduce <v_out1>
6889           s_out3 = extract_field <v_out2, 0>
6890           s_out4 = adjust_result <s_out3>
6891           use <s_out4>
6892           use <s_out4> */
6893
6894   gcc_assert (live_out_stmts.size () == scalar_results.length ());
6895   auto_vec<gimple *> phis;
6896   for (k = 0; k < live_out_stmts.size (); k++)
6897     {
6898       stmt_vec_info scalar_stmt_info = vect_orig_stmt (live_out_stmts[k]);
6899       scalar_dest = gimple_get_lhs (scalar_stmt_info->stmt);
6900
6901       /* Find the loop-closed-use at the loop exit of the original scalar
6902          result.  (The reduction result is expected to have two immediate uses,
6903          one at the latch block, and one at the loop exit).  For double
6904          reductions we are looking for exit phis of the outer loop.  */
6905       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
6906         {
6907           if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
6908             {
6909               if (!is_gimple_debug (USE_STMT (use_p))
6910                   && gimple_bb (USE_STMT (use_p)) == loop_exit->dest)
6911                 phis.safe_push (USE_STMT (use_p));
6912             }
6913           else
6914             {
6915               if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
6916                 {
6917                   tree phi_res = PHI_RESULT (USE_STMT (use_p));
6918
6919                   FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
6920                     {
6921                       if (!flow_bb_inside_loop_p (loop,
6922                                              gimple_bb (USE_STMT (phi_use_p)))
6923                           && !is_gimple_debug (USE_STMT (phi_use_p)))
6924                         phis.safe_push (USE_STMT (phi_use_p));
6925                     }
6926                 }
6927             }
6928         }
6929
6930       FOR_EACH_VEC_ELT (phis, i, exit_phi)
6931         {
6932           /* Replace the uses:  */
6933           orig_name = PHI_RESULT (exit_phi);
6934
6935           /* Look for a single use at the target of the skip edge.  */
6936           if (unify_with_main_loop_p)
6937             {
6938               use_operand_p use_p;
6939               gimple *user;
6940               if (!single_imm_use (orig_name, &use_p, &user))
6941                 gcc_unreachable ();
6942               orig_name = gimple_get_lhs (user);
6943             }
6944
6945           scalar_result = scalar_results[k];
6946           FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
6947             {
6948               FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
6949                 SET_USE (use_p, scalar_result);
6950               update_stmt (use_stmt);
6951             }
6952         }
6953
6954       phis.truncate (0);
6955     }
6956 }
6957
6958 /* Return a vector of type VECTYPE that is equal to the vector select
6959    operation "MASK ? VEC : IDENTITY".  Insert the select statement
6960    before GSI.
        Exactly one VEC_COND_EXPR assignment is built.  Its result is a new
        temporary SSA name of type VECTYPE (created with the name "cond"),
        and that SSA name is the value returned to the caller.  */
6961
6962 static tree
6963 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
6964                      tree vec, tree identity)
6965 {
       /* Fresh SSA name that receives the selected vector.  */
6966   tree cond = make_temp_ssa_name (vectype, NULL, "cond");
       /* cond = MASK ? VEC : IDENTITY.  */
6967   gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
6968                                           mask, vec, identity);
       /* Place the select in front of the statement GSI points at.  */
6969   gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6970   return cond;
6971 }
6972
6973 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
6974    order, starting with LHS.  Insert the extraction statements before GSI and
6975    associate the new scalar SSA names with variable SCALAR_DEST.
6976    If MASK is nonzero mask the input and then operate on it unconditionally.
6977    Return the SSA name for the result.
        The vector is walked from bit offset 0 up to its total size in steps
        of one element size.  Each step emits two statements before GSI: a
        BIT_FIELD_REF extraction of the element and an assignment that
        combines the running value with that element using CODE.  When MASK
        is given, a VEC_COND_EXPR is emitted first that keeps VECTOR_RHS where
        MASK selects it and otherwise uses a splat of the value returned by
        neutral_op_for_reduction for CODE.
        NOTE(review): the sizes are read with tree_to_uhwi, so this presumably
        expects a vector type with a compile-time constant size -- confirm
        against the callers.  */
6978
6979 static tree
6980 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
6981                        tree_code code, tree lhs, tree vector_rhs,
6982                        tree mask)
6983 {
6984   tree vectype = TREE_TYPE (vector_rhs);
6985   tree scalar_type = TREE_TYPE (vectype);
6986   tree bitsize = TYPE_SIZE (scalar_type);
6987   unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
6988   unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
6989
6990   /* Re-create a VEC_COND_EXPR to mask the input here in order to be able
6991      to perform an unconditional element-wise reduction of it.  */
6992   if (mask)
6993     {
6994       tree masked_vector_rhs = make_temp_ssa_name (vectype, NULL,
6995                                                    "masked_vector_rhs");
           /* No initial value is supplied (NULL_TREE) when asking for the
              neutral operand of CODE.  */
6996       tree neutral_op = neutral_op_for_reduction (scalar_type, code, NULL_TREE,
6997                                                   false);
6998       tree vector_identity = build_vector_from_val (vectype, neutral_op);
6999       gassign *select = gimple_build_assign (masked_vector_rhs, VEC_COND_EXPR,
7000                                              mask, vector_rhs, vector_identity);
7001       gsi_insert_before (gsi, select, GSI_SAME_STMT);
           /* From here on the element loop reads the masked vector.  */
7002       vector_rhs = masked_vector_rhs;
7003     }
7004
       /* One iteration per vector element, lowest bit offset first.  */
7005   for (unsigned HOST_WIDE_INT bit_offset = 0;
7006        bit_offset < vec_size_in_bits;
7007        bit_offset += element_bitsize)
7008     {
7009       tree bitpos = bitsize_int (bit_offset);
7010       tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
7011                          bitsize, bitpos);
7012
           /* Extract the element into a new SSA name based on SCALAR_DEST.  */
7013       gassign *stmt = gimple_build_assign (scalar_dest, rhs);
7014       rhs = make_ssa_name (scalar_dest, stmt);
7015       gimple_assign_set_lhs (stmt, rhs);
7016       gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
7017
           /* Combine the running value (left operand) with the element just
              extracted (right operand) and make the result the new running
              value.  */
7018       stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
7019       tree new_name = make_ssa_name (scalar_dest, stmt);
7020       gimple_assign_set_lhs (stmt, new_name);
7021       gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
7022       lhs = new_name;
7023     }
7024   return lhs;
7025 }
7026
7027 /* Get a masked internal function equivalent to REDUC_FN.  VECTYPE_IN is the
7028    type of the vector input.
        Only IFN_FOLD_LEFT_PLUS has masked counterparts here; any other
        REDUC_FN yields IFN_LAST.  For IFN_FOLD_LEFT_PLUS the result is
        IFN_MASK_FOLD_LEFT_PLUS if direct_internal_fn_supported_p accepts it
        for VECTYPE_IN (optimizing for speed), otherwise
        IFN_MASK_LEN_FOLD_LEFT_PLUS if that one is accepted, otherwise
        IFN_LAST.  */
7029
7030 static internal_fn
7031 get_masked_reduction_fn (internal_fn reduc_fn, tree vectype_in)
7032 {
7033   internal_fn mask_reduc_fn;
7034   internal_fn mask_len_reduc_fn;
7035
       /* Map REDUC_FN to its mask and mask+length variants.  */
7036   switch (reduc_fn)
7037     {
7038     case IFN_FOLD_LEFT_PLUS:
7039       mask_reduc_fn = IFN_MASK_FOLD_LEFT_PLUS;
7040       mask_len_reduc_fn = IFN_MASK_LEN_FOLD_LEFT_PLUS;
7041       break;
7042
7043     default:
7044       return IFN_LAST;
7045     }
7046
       /* The mask-only variant is tried first, then the mask+length one.  */
7047   if (direct_internal_fn_supported_p (mask_reduc_fn, vectype_in,
7048                                       OPTIMIZE_FOR_SPEED))
7049     return mask_reduc_fn;
7050   if (direct_internal_fn_supported_p (mask_len_reduc_fn, vectype_in,
7051                                       OPTIMIZE_FOR_SPEED))
7052     return mask_len_reduc_fn;
7053   return IFN_LAST;
7054 }
7055
7056 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION).  STMT_INFO is the
7057    statement that sets the live-out value.  REDUC_DEF_STMT is the phi
7058    statement.  CODE is the operation performed by STMT_INFO and OPS are
7059    its scalar operands.  REDUC_INDEX is the index of the operand in
7060    OPS that is set by REDUC_DEF_STMT.  REDUC_FN is the function that
7061    implements in-order reduction, or IFN_LAST if we should open-code it.
7062    VECTYPE_IN is the type of the vector input.  MASKS specifies the masks
7063    that should be used to control the operation in a fully-masked loop.
        LENS is used to obtain the loop length when LOOP_VINFO is fully
        length-controlled.  NUM_OPS is the number of entries in OPS and is
        asserted to be 2 (binary operation) or 4 (conditional internal
        function call).  New statements are inserted at GSI.  SLP_NODE, if
        nonnull, is the SLP node being vectorized and receives the generated
        statements; otherwise they are recorded on STMT_INFO and the last one
        is stored in *VEC_STMT.
        Return false, after dumping a missed-optimization note, when CODE is
        a conditional internal function and SLP_NODE is nonnull; return true
        otherwise.  */
7064
7065 static bool
7066 vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
7067                                stmt_vec_info stmt_info,
7068                                gimple_stmt_iterator *gsi,
7069                                gimple **vec_stmt, slp_tree slp_node,
7070                                gimple *reduc_def_stmt,
7071                                code_helper code, internal_fn reduc_fn,
7072                                tree *ops, int num_ops, tree vectype_in,
7073                                int reduc_index, vec_loop_masks *masks,
7074                                vec_loop_lens *lens)
7075 {
7076   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7077   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
       /* IFN_LAST here means no masked variant of REDUC_FN is available.  */
7078   internal_fn mask_reduc_fn = get_masked_reduction_fn (reduc_fn, vectype_in);
7079
       /* Only a single copy is handled: SLP uses 1 by construction here and
          the non-SLP count is asserted to be 1 below.  */
7080   int ncopies;
7081   if (slp_node)
7082     ncopies = 1;
7083   else
7084     ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7085
7086   gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
7087   gcc_assert (ncopies == 1);
7088
       /* A CODE that is not a tree code is treated as a conditional internal
          function; replace it by the tree code it corresponds to.  */
7089   bool is_cond_op = false;
7090   if (!code.is_tree_code ())
7091     {
7092       code = conditional_internal_fn_code (internal_fn (code));
7093       gcc_assert (code != ERROR_MARK);
7094       is_cond_op = true;
7095     }
7096
7097   gcc_assert (TREE_CODE_LENGTH (tree_code (code)) == binary_op);
7098
7099   if (slp_node)
7100     {
           /* Conditional operations are rejected for SLP.  */
7101       if (is_cond_op)
7102         {
7103           if (dump_enabled_p ())
7104             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7105                              "fold-left reduction on SLP not supported.\n");
7106           return false;
7107         }
7108
7109       gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
7110                             TYPE_VECTOR_SUBPARTS (vectype_in)));
7111     }
7112
7113   /* The operands either come from a binary operation or an IFN_COND operation.
7114      The former is a gimple assign with binary rhs and the latter is a
7115      gimple call with four arguments.  */
7116   gcc_assert (num_ops == 2 || num_ops == 4);
       /* OP0 is the operand that is not the reduction input.  OPMASK is only
          assigned, and only read, for conditional operations.  */
7117   tree op0, opmask;
7118   if (!is_cond_op)
7119     op0 = ops[1 - reduc_index];
7120   else
7121     {
7122       op0 = ops[2 + (1 - reduc_index)];
7123       opmask = ops[0];
7124       gcc_assert (!slp_node);
7125     }
7126
       /* Collect the vector definitions of OP0 and pick the scalar statement
          whose destination will hold the final result: the last scalar
          statement of the SLP node, or STMT_INFO itself without SLP.  */
7127   int group_size = 1;
7128   stmt_vec_info scalar_dest_def_info;
7129   auto_vec<tree> vec_oprnds0, vec_opmask;
7130   if (slp_node)
7131     {
7132       auto_vec<vec<tree> > vec_defs (2);
7133       vect_get_slp_defs (loop_vinfo, slp_node, &vec_defs);
7134       vec_oprnds0.safe_splice (vec_defs[1 - reduc_index]);
7135       vec_defs[0].release ();
7136       vec_defs[1].release ();
7137       group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7138       scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
7139     }
7140   else
7141     {
7142       vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
7143                                      op0, &vec_oprnds0);
7144       scalar_dest_def_info = stmt_info;
7145
7146       /* For an IFN_COND_OP we also need the vector mask operand.  */
7147       if (is_cond_op)
7148           vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
7149                                          opmask, &vec_opmask);
7150     }
7151
7152   gimple *sdef = vect_orig_stmt (scalar_dest_def_info)->stmt;
7153   tree scalar_dest = gimple_get_lhs (sdef);
7154   tree scalar_type = TREE_TYPE (scalar_dest);
       /* The running scalar value starts out as the result of the phi.  */
7155   tree reduc_var = gimple_phi_result (reduc_def_stmt);
7156
       /* More than one input vector is only expected with SLP.  */
7157   int vec_num = vec_oprnds0.length ();
7158   gcc_assert (vec_num == 1 || slp_node);
7159   tree vec_elem_type = TREE_TYPE (vectype_out);
7160   gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
7161
       /* In a fully-masked loop prepare the vector substituted for masked-off
          elements: a zero vector, negated when signed zeros are honored
          (sign-dependent rounding is asserted not to be honored then).  */
7162   tree vector_identity = NULL_TREE;
7163   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
7164     {
7165       vector_identity = build_zero_cst (vectype_out);
7166       if (!HONOR_SIGNED_ZEROS (vectype_out))
7167         ;
7168       else
7169         {
7170           gcc_assert (!HONOR_SIGN_DEPENDENT_ROUNDING (vectype_out));
7171           vector_identity = const_unop (NEGATE_EXPR, vectype_out,
7172                                         vector_identity);
7173         }
7174     }
7175
7176   tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
7177   int i;
7178   tree def0;
       /* Fold each input vector into REDUC_VAR in turn.  */
7179   FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
7180     {
7181       gimple *new_stmt;
7182       tree mask = NULL_TREE;
7183       tree len = NULL_TREE;
7184       tree bias = NULL_TREE;
           /* The loop mask takes precedence; otherwise a conditional
              operation contributes its own vector mask.  */
7185       if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
7186         mask = vect_get_loop_mask (loop_vinfo, gsi, masks, vec_num, vectype_in, i);
7187       else if (is_cond_op)
7188         mask = vec_opmask[0];
           /* Length-controlled loops additionally need the length and the
              partial load/store bias.  When the operation is not
              conditional, MASK becomes a minus-one (all-ones) constant of
              the mask type.  */
7189       if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
7190         {
7191           len = vect_get_loop_len (loop_vinfo, gsi, lens, vec_num, vectype_in,
7192                                    i, 1);
7193           signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
7194           bias = build_int_cst (intQI_type_node, biasval);
7195           if (!is_cond_op)
7196             mask = build_minus_one_cst (truth_type_for (vectype_in));
7197         }
7198
7199       /* Handle MINUS by adding the negative.  */
7200       if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
7201         {
7202           tree negated = make_ssa_name (vectype_out);
7203           new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
7204           gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
7205           def0 = negated;
7206         }
7207
           /* Without a masked reduction function, neutralize the masked-off
              elements up front so that an unmasked reduction can be used.  */
7208       if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
7209           && mask && mask_reduc_fn == IFN_LAST)
7210         def0 = merge_with_identity (gsi, mask, vectype_out, def0,
7211                                     vector_identity);
7212
7213       /* On the first iteration the input is simply the scalar phi
7214          result, and for subsequent iterations it is the output of
7215          the preceding operation.  */
7216       if (reduc_fn != IFN_LAST || (mask && mask_reduc_fn != IFN_LAST))
7217         {
               /* Pick the call form: mask+length+bias, mask only, or the
                  plain two-argument reduction.  */
7218           if (mask && len && mask_reduc_fn == IFN_MASK_LEN_FOLD_LEFT_PLUS)
7219             new_stmt = gimple_build_call_internal (mask_reduc_fn, 5, reduc_var,
7220                                                    def0, mask, len, bias);
7221           else if (mask && mask_reduc_fn == IFN_MASK_FOLD_LEFT_PLUS)
7222             new_stmt = gimple_build_call_internal (mask_reduc_fn, 3, reduc_var,
7223                                                    def0, mask);
7224           else
7225             new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var,
7226                                                    def0);
7227           /* For chained SLP reductions the output of the previous reduction
7228              operation serves as the input of the next. For the final statement
7229              the output cannot be a temporary - we reuse the original
7230              scalar destination of the last statement.  */
7231           if (i != vec_num - 1)
7232             {
7233               gimple_set_lhs (new_stmt, scalar_dest_var);
7234               reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
7235               gimple_set_lhs (new_stmt, reduc_var);
7236             }
7237         }
7238       else
7239         {
               /* Open-coded path: expand into per-element scalar
                  operations.  */
7240           reduc_var = vect_expand_fold_left (gsi, scalar_dest_var,
7241                                              tree_code (code), reduc_var, def0,
7242                                              mask);
7243           new_stmt = SSA_NAME_DEF_STMT (reduc_var);
7244           /* Remove the statement, so that we can use the same code paths
7245              as for statements that we've just created.  */
7246           gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
7247           gsi_remove (&tmp_gsi, true);
7248         }
7249
           /* The last statement takes over the original scalar destination
              and replaces the scalar statement; earlier ones are inserted as
              new statements at GSI.  */
7250       if (i == vec_num - 1)
7251         {
7252           gimple_set_lhs (new_stmt, scalar_dest);
7253           vect_finish_replace_stmt (loop_vinfo,
7254                                     scalar_dest_def_info,
7255                                     new_stmt);
7256         }
7257       else
7258         vect_finish_stmt_generation (loop_vinfo,
7259                                      scalar_dest_def_info,
7260                                      new_stmt, gsi);
7261
           /* Record the generated statement for later consumers.  */
7262       if (slp_node)
7263         slp_node->push_vec_def (new_stmt);
7264       else
7265         {
7266           STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
7267           *vec_stmt = new_stmt;
7268         }
7269     }
7270
7271   return true;
7272 }
7273
7274 /* Function is_nonwrapping_integer_induction.
7275
7276    Check if STMT_VINO (which is part of loop LOOP) both increments and
7277    does not cause overflow.  */
7278
7279 static bool
7280 is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, class loop *loop)
7281 {
7282   gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
7283   tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
7284   tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
7285   tree lhs_type = TREE_TYPE (gimple_phi_result (phi));
7286   widest_int ni, max_loop_value, lhs_max;
7287   wi::overflow_type overflow = wi::OVF_NONE;
7288
7289   /* Make sure the loop is integer based.  */
7290   if (TREE_CODE (base) != INTEGER_CST
7291       || TREE_CODE (step) != INTEGER_CST)
7292     return false;
7293
7294   /* Check that the max size of the loop will not wrap.  */
7295
7296   if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
7297     return true;
7298
7299   if (! max_stmt_executions (loop, &ni))
7300     return false;
7301
7302   max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
7303                             &overflow);
7304   if (overflow)
7305     return false;
7306
7307   max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
7308                             TYPE_SIGN (lhs_type), &overflow);
7309   if (overflow)
7310     return false;
7311
7312   return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
7313           <= TYPE_PRECISION (lhs_type));
7314 }
7315
7316 /* Check if masking can be supported by inserting a conditional expression.
7317    CODE is the code for the operation.  COND_FN is the conditional internal
7318    function, if it exists.  VECTYPE_IN is the type of the vector input.  */
7319 static bool
7320 use_mask_by_cond_expr_p (code_helper code, internal_fn cond_fn,
7321                          tree vectype_in)
7322 {
7323   if (cond_fn != IFN_LAST
7324       && direct_internal_fn_supported_p (cond_fn, vectype_in,
7325                                          OPTIMIZE_FOR_SPEED))
7326     return false;
7327
7328   if (code.is_tree_code ())
7329     switch (tree_code (code))
7330       {
7331       case DOT_PROD_EXPR:
7332       case SAD_EXPR:
7333         return true;
7334
7335       default:
7336         break;
7337       }
7338   return false;
7339 }
7340
7341 /* Insert a conditional expression to enable masked vectorization.  CODE is the
7342    code for the operation.  VOP is the array of operands.  MASK is the loop
7343    mask.  GSI is a statement iterator used to place the new conditional
7344    expression.  */
7345 static void
7346 build_vect_cond_expr (code_helper code, tree vop[3], tree mask,
7347                       gimple_stmt_iterator *gsi)
7348 {
7349   switch (tree_code (code))
7350     {
7351     case DOT_PROD_EXPR:
7352       {
7353         tree vectype = TREE_TYPE (vop[1]);
7354         tree zero = build_zero_cst (vectype);
7355         tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
7356         gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
7357                                                mask, vop[1], zero);
7358         gsi_insert_before (gsi, select, GSI_SAME_STMT);
7359         vop[1] = masked_op1;
7360         break;
7361       }
7362
7363     case SAD_EXPR:
7364       {
7365         tree vectype = TREE_TYPE (vop[1]);
7366         tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
7367         gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
7368                                                mask, vop[1], vop[0]);
7369         gsi_insert_before (gsi, select, GSI_SAME_STMT);
7370         vop[1] = masked_op1;
7371         break;
7372       }
7373
7374     default:
7375       gcc_unreachable ();
7376     }
7377 }
7378
7379 /* Function vectorizable_reduction.
7380
7381    Check if STMT_INFO performs a reduction operation that can be vectorized.
7382    If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
7383    stmt to replace it, put it in VEC_STMT, and insert it at GSI.
7384    Return true if STMT_INFO is vectorizable in this way.
7385
7386    This function also handles reduction idioms (patterns) that have been
7387    recognized in advance during vect_pattern_recog.  In this case, STMT_INFO
7388    may be of this form:
7389      X = pattern_expr (arg0, arg1, ..., X)
7390    and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
7391    sequence that had been detected and replaced by the pattern-stmt
7392    (STMT_INFO).
7393
7394    This function also handles reduction of condition expressions, for example:
7395      for (int i = 0; i < N; i++)
7396        if (a[i] < value)
7397          last = a[i];
7398    This is handled by vectorising the loop and creating an additional vector
7399    containing the loop indexes for which "a[i] < value" was true.  In the
7400    function epilogue this is reduced to a single max value and then used to
7401    index into the vector of results.
7402
7403    In some cases of reduction patterns, the type of the reduction variable X is
7404    different than the type of the other arguments of STMT_INFO.
7405    In such cases, the vectype that is used when transforming STMT_INFO into
7406    a vector stmt is different than the vectype that is used to determine the
7407    vectorization factor, because it consists of a different number of elements
7408    than the actual number of elements that are being operated upon in parallel.
7409
7410    For example, consider an accumulation of shorts into an int accumulator.
7411    On some targets it's possible to vectorize this pattern operating on 8
7412    shorts at a time (hence, the vectype for purposes of determining the
7413    vectorization factor should be V8HI); on the other hand, the vectype that
7414    is used to create the vector form is actually V4SI (the type of the result).
7415
7416    Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
7417    indicates what is the actual level of parallelism (V8HI in the example), so
7418    that the right vectorization factor would be derived.  This vectype
7419    corresponds to the type of arguments to the reduction stmt, and should *NOT*
7420    be used to create the vectorized stmt.  The right vectype for the vectorized
7421    stmt is obtained from the type of the result X:
7422       get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
7423
7424    This means that, contrary to "regular" reductions (or "regular" stmts in
7425    general), the following equation:
7426       STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
7427    does *NOT* necessarily hold for reduction patterns.  */
7428
7429 bool
7430 vectorizable_reduction (loop_vec_info loop_vinfo,
7431                         stmt_vec_info stmt_info, slp_tree slp_node,
7432                         slp_instance slp_node_instance,
7433                         stmt_vector_for_cost *cost_vec)
7434 {
7435   tree vectype_in = NULL_TREE;
7436   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7437   enum vect_def_type cond_reduc_dt = vect_unknown_def_type;
7438   stmt_vec_info cond_stmt_vinfo = NULL;
7439   int i;
7440   int ncopies;
7441   bool single_defuse_cycle = false;
7442   bool nested_cycle = false;
7443   bool double_reduc = false;
7444   int vec_num;
7445   tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
7446   tree cond_reduc_val = NULL_TREE;
7447
7448   /* Make sure it was already recognized as a reduction computation.  */
7449   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
7450       && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
7451       && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
7452     return false;
7453
7454   /* The stmt we store reduction analysis meta on.  */
7455   stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
7456   reduc_info->is_reduc_info = true;
7457
7458   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
7459     {
7460       if (is_a <gphi *> (stmt_info->stmt))
7461         {
7462           if (slp_node)
7463             {
7464               /* We eventually need to set a vector type on invariant
7465                  arguments.  */
7466               unsigned j;
7467               slp_tree child;
7468               FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
7469                 if (!vect_maybe_update_slp_op_vectype
7470                        (child, SLP_TREE_VECTYPE (slp_node)))
7471                   {
7472                     if (dump_enabled_p ())
7473                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7474                                        "incompatible vector types for "
7475                                        "invariants\n");
7476                     return false;
7477                   }
7478             }
7479           /* Analysis for double-reduction is done on the outer
7480              loop PHI, nested cycles have no further restrictions.  */
7481           STMT_VINFO_TYPE (stmt_info) = cycle_phi_info_type;
7482         }
7483       else
7484         STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
7485       return true;
7486     }
7487
7488   stmt_vec_info orig_stmt_of_analysis = stmt_info;
7489   stmt_vec_info phi_info = stmt_info;
7490   if (!is_a <gphi *> (stmt_info->stmt))
7491     {
7492       STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
7493       return true;
7494     }
7495   if (slp_node)
7496     {
7497       slp_node_instance->reduc_phis = slp_node;
7498       /* ???  We're leaving slp_node to point to the PHIs, we only
7499          need it to get at the number of vector stmts which wasn't
7500          yet initialized for the instance root.  */
7501     }
7502   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
7503     {
7504       use_operand_p use_p;
7505       gimple *use_stmt;
7506       bool res = single_imm_use (gimple_phi_result (stmt_info->stmt),
7507                                  &use_p, &use_stmt);
7508       gcc_assert (res);
7509       phi_info = loop_vinfo->lookup_stmt (use_stmt);
7510     }
7511
7512   /* PHIs should not participate in patterns.  */
7513   gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
7514   gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
7515
7516   /* Verify following REDUC_IDX from the latch def leads us back to the PHI
7517      and compute the reduction chain length.  Discover the real
7518      reduction operation stmt on the way (stmt_info and slp_for_stmt_info).  */
7519   tree reduc_def
7520     = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
7521                              loop_latch_edge
7522                                (gimple_bb (reduc_def_phi)->loop_father));
7523   unsigned reduc_chain_length = 0;
7524   bool only_slp_reduc_chain = true;
7525   stmt_info = NULL;
7526   slp_tree slp_for_stmt_info = slp_node ? slp_node_instance->root : NULL;
7527   while (reduc_def != PHI_RESULT (reduc_def_phi))
7528     {
7529       stmt_vec_info def = loop_vinfo->lookup_def (reduc_def);
7530       stmt_vec_info vdef = vect_stmt_to_vectorize (def);
7531       if (STMT_VINFO_REDUC_IDX (vdef) == -1)
7532         {
7533           if (dump_enabled_p ())
7534             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7535                              "reduction chain broken by patterns.\n");
7536           return false;
7537         }
7538       if (!REDUC_GROUP_FIRST_ELEMENT (vdef))
7539         only_slp_reduc_chain = false;
7540       /* For epilogue generation live members of the chain need
7541          to point back to the PHI via their original stmt for
7542          info_for_reduction to work.  For SLP we need to look at
7543          all lanes here - even though we only will vectorize from
7544          the SLP node with live lane zero the other live lanes also
7545          need to be identified as part of a reduction to be able
7546          to skip code generation for them.  */
7547       if (slp_for_stmt_info)
7548         {
7549           for (auto s : SLP_TREE_SCALAR_STMTS (slp_for_stmt_info))
7550             if (STMT_VINFO_LIVE_P (s))
7551               STMT_VINFO_REDUC_DEF (vect_orig_stmt (s)) = phi_info;
7552         }
7553       else if (STMT_VINFO_LIVE_P (vdef))
7554         STMT_VINFO_REDUC_DEF (def) = phi_info;
7555       gimple_match_op op;
7556       if (!gimple_extract_op (vdef->stmt, &op))
7557         {
7558           if (dump_enabled_p ())
7559             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7560                              "reduction chain includes unsupported"
7561                              " statement type.\n");
7562           return false;
7563         }
7564       if (CONVERT_EXPR_CODE_P (op.code))
7565         {
7566           if (!tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0])))
7567             {
7568               if (dump_enabled_p ())
7569                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7570                                  "conversion in the reduction chain.\n");
7571               return false;
7572             }
7573         }
7574       else if (!stmt_info)
7575         /* First non-conversion stmt.  */
7576         stmt_info = vdef;
7577       reduc_def = op.ops[STMT_VINFO_REDUC_IDX (vdef)];
7578       reduc_chain_length++;
7579       if (!stmt_info && slp_node)
7580         slp_for_stmt_info = SLP_TREE_CHILDREN (slp_for_stmt_info)[0];
7581     }
7582   /* PHIs should not participate in patterns.  */
7583   gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
7584
7585   if (nested_in_vect_loop_p (loop, stmt_info))
7586     {
7587       loop = loop->inner;
7588       nested_cycle = true;
7589     }
7590
7591   /* STMT_VINFO_REDUC_DEF doesn't point to the first but the last
7592      element.  */
7593   if (slp_node && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7594     {
7595       gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (stmt_info));
7596       stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
7597     }
7598   if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7599     gcc_assert (slp_node
7600                 && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info);
7601
7602   /* 1. Is vectorizable reduction?  */
7603   /* Not supportable if the reduction variable is used in the loop, unless
7604      it's a reduction chain.  */
7605   if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
7606       && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7607     return false;
7608
7609   /* Reductions that are not used even in an enclosing outer-loop,
7610      are expected to be "live" (used out of the loop).  */
7611   if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
7612       && !STMT_VINFO_LIVE_P (stmt_info))
7613     return false;
7614
7615   /* 2. Has this been recognized as a reduction pattern?
7616
7617      Check if STMT represents a pattern that has been recognized
7618      in earlier analysis stages.  For stmts that represent a pattern,
7619      the STMT_VINFO_RELATED_STMT field records the last stmt in
7620      the original sequence that constitutes the pattern.  */
7621
7622   stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
7623   if (orig_stmt_info)
7624     {
7625       gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
7626       gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
7627     }
7628
7629   /* 3. Check the operands of the operation.  The first operands are defined
7630         inside the loop body. The last operand is the reduction variable,
7631         which is defined by the loop-header-phi.  */
7632
7633   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7634   STMT_VINFO_REDUC_VECTYPE (reduc_info) = vectype_out;
7635   gimple_match_op op;
7636   if (!gimple_extract_op (stmt_info->stmt, &op))
7637     gcc_unreachable ();
7638   bool lane_reduc_code_p = (op.code == DOT_PROD_EXPR
7639                             || op.code == WIDEN_SUM_EXPR
7640                             || op.code == SAD_EXPR);
7641
7642   if (!POINTER_TYPE_P (op.type) && !INTEGRAL_TYPE_P (op.type)
7643       && !SCALAR_FLOAT_TYPE_P (op.type))
7644     return false;
7645
7646   /* Do not try to vectorize bit-precision reductions.  */
7647   if (!type_has_mode_precision_p (op.type))
7648     return false;
7649
7650   /* For lane-reducing ops we're reducing the number of reduction PHIs
7651      which means the only use of that may be in the lane-reducing operation.  */
7652   if (lane_reduc_code_p
7653       && reduc_chain_length != 1
7654       && !only_slp_reduc_chain)
7655     {
7656       if (dump_enabled_p ())
7657         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7658                          "lane-reducing reduction with extra stmts.\n");
7659       return false;
7660     }
7661
7662   /* All uses but the last are expected to be defined in the loop.
7663      The last use is the reduction variable.  In case of nested cycle this
7664      assumption is not true: we use reduc_index to record the index of the
7665      reduction variable.  */
7666   slp_tree *slp_op = XALLOCAVEC (slp_tree, op.num_ops);
7667   tree *vectype_op = XALLOCAVEC (tree, op.num_ops);
7668   /* We need to skip an extra operand for COND_EXPRs with embedded
7669      comparison.  */
7670   unsigned opno_adjust = 0;
7671   if (op.code == COND_EXPR && COMPARISON_CLASS_P (op.ops[0]))
7672     opno_adjust = 1;
7673   for (i = 0; i < (int) op.num_ops; i++)
7674     {
7675       /* The condition of COND_EXPR is checked in vectorizable_condition().  */
7676       if (i == 0 && op.code == COND_EXPR)
7677         continue;
7678
7679       stmt_vec_info def_stmt_info;
7680       enum vect_def_type dt;
7681       if (!vect_is_simple_use (loop_vinfo, stmt_info, slp_for_stmt_info,
7682                                i + opno_adjust, &op.ops[i], &slp_op[i], &dt,
7683                                &vectype_op[i], &def_stmt_info))
7684         {
7685           if (dump_enabled_p ())
7686             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7687                              "use not simple.\n");
7688           return false;
7689         }
7690       if (i == STMT_VINFO_REDUC_IDX (stmt_info))
7691         continue;
7692
7693       /* For an IFN_COND_OP we might hit the reduction definition operand
7694          twice (once as definition, once as else).  */
7695       if (op.ops[i] == op.ops[STMT_VINFO_REDUC_IDX (stmt_info)])
7696         continue;
7697
7698       /* There should be only one cycle def in the stmt, the one
7699          leading to reduc_def.  */
7700       if (VECTORIZABLE_CYCLE_DEF (dt))
7701         return false;
7702
7703       if (!vectype_op[i])
7704         vectype_op[i]
7705           = get_vectype_for_scalar_type (loop_vinfo,
7706                                          TREE_TYPE (op.ops[i]), slp_op[i]);
7707
7708       /* To properly compute ncopies we are interested in the widest
7709          non-reduction input type in case we're looking at a widening
7710          accumulation that we later handle in vect_transform_reduction.  */
7711       if (lane_reduc_code_p
7712           && vectype_op[i]
7713           && (!vectype_in
7714               || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
7715                   < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_op[i]))))))
7716         vectype_in = vectype_op[i];
7717
7718       /* Record how the non-reduction-def value of COND_EXPR is defined.
7719          ???  For a chain of multiple CONDs we'd have to match them up all.  */
7720       if (op.code == COND_EXPR && reduc_chain_length == 1)
7721         {
7722           if (dt == vect_constant_def)
7723             {
7724               cond_reduc_dt = dt;
7725               cond_reduc_val = op.ops[i];
7726             }
7727           else if (dt == vect_induction_def
7728                    && def_stmt_info
7729                    && is_nonwrapping_integer_induction (def_stmt_info, loop))
7730             {
7731               cond_reduc_dt = dt;
7732               cond_stmt_vinfo = def_stmt_info;
7733             }
7734         }
7735     }
7736   if (!vectype_in)
7737     vectype_in = STMT_VINFO_VECTYPE (phi_info);
7738   STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in;
7739
7740   enum vect_reduction_type v_reduc_type = STMT_VINFO_REDUC_TYPE (phi_info);
7741   STMT_VINFO_REDUC_TYPE (reduc_info) = v_reduc_type;
7742   /* If we have a condition reduction, see if we can simplify it further.  */
7743   if (v_reduc_type == COND_REDUCTION)
7744     {
7745       if (slp_node)
7746         return false;
7747
7748       /* When the condition uses the reduction value in the condition, fail.  */
7749       if (STMT_VINFO_REDUC_IDX (stmt_info) == 0)
7750         {
7751           if (dump_enabled_p ())
7752             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7753                              "condition depends on previous iteration\n");
7754           return false;
7755         }
7756
7757       if (reduc_chain_length == 1
7758           && (direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST, vectype_in,
7759                                               OPTIMIZE_FOR_SPEED)
7760               || direct_internal_fn_supported_p (IFN_LEN_FOLD_EXTRACT_LAST,
7761                                                  vectype_in,
7762                                                  OPTIMIZE_FOR_SPEED)))
7763         {
7764           if (dump_enabled_p ())
7765             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7766                              "optimizing condition reduction with"
7767                              " FOLD_EXTRACT_LAST.\n");
7768           STMT_VINFO_REDUC_TYPE (reduc_info) = EXTRACT_LAST_REDUCTION;
7769         }
7770       else if (cond_reduc_dt == vect_induction_def)
7771         {
7772           tree base
7773             = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
7774           tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
7775
7776           gcc_assert (TREE_CODE (base) == INTEGER_CST
7777                       && TREE_CODE (step) == INTEGER_CST);
7778           cond_reduc_val = NULL_TREE;
7779           enum tree_code cond_reduc_op_code = ERROR_MARK;
7780           tree res = PHI_RESULT (STMT_VINFO_STMT (cond_stmt_vinfo));
7781           if (!types_compatible_p (TREE_TYPE (res), TREE_TYPE (base)))
7782             ;
7783           /* Find a suitable value, for MAX_EXPR below base, for MIN_EXPR
7784              above base; punt if base is the minimum value of the type for
7785              MAX_EXPR or maximum value of the type for MIN_EXPR for now.  */
7786           else if (tree_int_cst_sgn (step) == -1)
7787             {
7788               cond_reduc_op_code = MIN_EXPR;
7789               if (tree_int_cst_sgn (base) == -1)
7790                 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
7791               else if (tree_int_cst_lt (base,
7792                                         TYPE_MAX_VALUE (TREE_TYPE (base))))
7793                 cond_reduc_val
7794                   = int_const_binop (PLUS_EXPR, base, integer_one_node);
7795             }
7796           else
7797             {
7798               cond_reduc_op_code = MAX_EXPR;
7799               if (tree_int_cst_sgn (base) == 1)
7800                 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
7801               else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
7802                                         base))
7803                 cond_reduc_val
7804                   = int_const_binop (MINUS_EXPR, base, integer_one_node);
7805             }
7806           if (cond_reduc_val)
7807             {
7808               if (dump_enabled_p ())
7809                 dump_printf_loc (MSG_NOTE, vect_location,
7810                                  "condition expression based on "
7811                                  "integer induction.\n");
7812               STMT_VINFO_REDUC_CODE (reduc_info) = cond_reduc_op_code;
7813               STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info)
7814                 = cond_reduc_val;
7815               STMT_VINFO_REDUC_TYPE (reduc_info) = INTEGER_INDUC_COND_REDUCTION;
7816             }
7817         }
7818       else if (cond_reduc_dt == vect_constant_def)
7819         {
7820           enum vect_def_type cond_initial_dt;
7821           tree cond_initial_val = vect_phi_initial_value (reduc_def_phi);
7822           vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
7823           if (cond_initial_dt == vect_constant_def
7824               && types_compatible_p (TREE_TYPE (cond_initial_val),
7825                                      TREE_TYPE (cond_reduc_val)))
7826             {
7827               tree e = fold_binary (LE_EXPR, boolean_type_node,
7828                                     cond_initial_val, cond_reduc_val);
7829               if (e && (integer_onep (e) || integer_zerop (e)))
7830                 {
7831                   if (dump_enabled_p ())
7832                     dump_printf_loc (MSG_NOTE, vect_location,
7833                                      "condition expression based on "
7834                                      "compile time constant.\n");
7835                   /* Record reduction code at analysis stage.  */
7836                   STMT_VINFO_REDUC_CODE (reduc_info)
7837                     = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
7838                   STMT_VINFO_REDUC_TYPE (reduc_info) = CONST_COND_REDUCTION;
7839                 }
7840             }
7841         }
7842     }
7843
7844   if (STMT_VINFO_LIVE_P (phi_info))
7845     return false;
7846
7847   if (slp_node)
7848     ncopies = 1;
7849   else
7850     ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7851
7852   gcc_assert (ncopies >= 1);
7853
7854   poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
7855
7856   if (nested_cycle)
7857     {
7858       gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info)
7859                   == vect_double_reduction_def);
7860       double_reduc = true;
7861     }
7862
7863   /* 4.2. Check support for the epilog operation.
7864
7865           If STMT represents a reduction pattern, then the type of the
7866           reduction variable may be different than the type of the rest
7867           of the arguments.  For example, consider the case of accumulation
7868           of shorts into an int accumulator; The original code:
7869                         S1: int_a = (int) short_a;
7870           orig_stmt->   S2: int_acc = plus <int_a ,int_acc>;
7871
7872           was replaced with:
7873                         STMT: int_acc = widen_sum <short_a, int_acc>
7874
7875           This means that:
7876           1. The tree-code that is used to create the vector operation in the
7877              epilog code (that reduces the partial results) is not the
7878              tree-code of STMT, but is rather the tree-code of the original
7879              stmt from the pattern that STMT is replacing.  I.e, in the example
7880              above we want to use 'widen_sum' in the loop, but 'plus' in the
7881              epilog.
7882           2. The type (mode) we use to check available target support
7883              for the vector operation to be created in the *epilog*, is
7884              determined by the type of the reduction variable (in the example
7885              above we'd check this: optab_handler (plus_optab, vect_int_mode)).
7886              However the type (mode) we use to check available target support
7887              for the vector operation to be created *inside the loop*, is
7888              determined by the type of the other arguments to STMT (in the
7889              example we'd check this: optab_handler (widen_sum_optab,
7890              vect_short_mode)).
7891
7892           This is contrary to "regular" reductions, in which the types of all
7893           the arguments are the same as the type of the reduction variable.
7894           For "regular" reductions we can therefore use the same vector type
7895           (and also the same tree-code) when generating the epilog code and
7896           when generating the code inside the loop.  */
7897
7898   code_helper orig_code = STMT_VINFO_REDUC_CODE (phi_info);
7899
7900   /* If-conversion might already have created a conditional operation like
7901      IFN_COND_ADD.  Use the internal code for the following checks.  */
7902   if (orig_code.is_internal_fn ())
7903     {
7904       tree_code new_code = conditional_internal_fn_code (internal_fn (orig_code));
7905       orig_code = new_code != ERROR_MARK ? new_code : orig_code;
7906     }
7907
7908   STMT_VINFO_REDUC_CODE (reduc_info) = orig_code;
7909
7910   vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
7911   if (reduction_type == TREE_CODE_REDUCTION)
7912     {
7913       /* Check whether it's ok to change the order of the computation.
7914          Generally, when vectorizing a reduction we change the order of the
7915          computation.  This may change the behavior of the program in some
7916          cases, so we need to check that this is ok.  One exception is when
7917          vectorizing an outer-loop: the inner-loop is executed sequentially,
7918          and therefore vectorizing reductions in the inner-loop during
7919          outer-loop vectorization is safe.  Likewise when we are vectorizing
7920          a series of reductions using SLP and the VF is one the reductions
7921          are performed in scalar order.  */
7922       if (slp_node
7923           && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
7924           && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1u))
7925         ;
7926       else if (needs_fold_left_reduction_p (op.type, orig_code))
7927         {
7928           /* When vectorizing a reduction chain w/o SLP the reduction PHI
7929              is not directly used in stmt.  */
7930           if (!only_slp_reduc_chain
7931               && reduc_chain_length != 1)
7932             {
7933               if (dump_enabled_p ())
7934                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7935                                  "in-order reduction chain without SLP.\n");
7936               return false;
7937             }
7938           STMT_VINFO_REDUC_TYPE (reduc_info)
7939             = reduction_type = FOLD_LEFT_REDUCTION;
7940         }
7941       else if (!commutative_binary_op_p (orig_code, op.type)
7942                || !associative_binary_op_p (orig_code, op.type))
7943         {
7944           if (dump_enabled_p ())
7945             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7946                             "reduction: not commutative/associative\n");
7947           return false;
7948         }
7949     }
7950
7951   if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
7952       && ncopies > 1)
7953     {
7954       if (dump_enabled_p ())
7955         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7956                          "multiple types in double reduction or condition "
7957                          "reduction or fold-left reduction.\n");
7958       return false;
7959     }
7960
7961   internal_fn reduc_fn = IFN_LAST;
7962   if (reduction_type == TREE_CODE_REDUCTION
7963       || reduction_type == FOLD_LEFT_REDUCTION
7964       || reduction_type == INTEGER_INDUC_COND_REDUCTION
7965       || reduction_type == CONST_COND_REDUCTION)
7966     {
7967       if (reduction_type == FOLD_LEFT_REDUCTION
7968           ? fold_left_reduction_fn (orig_code, &reduc_fn)
7969           : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
7970         {
7971           if (reduc_fn != IFN_LAST
7972               && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
7973                                                   OPTIMIZE_FOR_SPEED))
7974             {
7975               if (dump_enabled_p ())
7976                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7977                                  "reduc op not supported by target.\n");
7978
7979               reduc_fn = IFN_LAST;
7980             }
7981         }
7982       else
7983         {
7984           if (!nested_cycle || double_reduc)
7985             {
7986               if (dump_enabled_p ())
7987                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7988                                  "no reduc code for scalar code.\n");
7989
7990               return false;
7991             }
7992         }
7993     }
7994   else if (reduction_type == COND_REDUCTION)
7995     {
7996       int scalar_precision
7997         = GET_MODE_PRECISION (SCALAR_TYPE_MODE (op.type));
7998       cr_index_scalar_type = make_unsigned_type (scalar_precision);
7999       cr_index_vector_type = get_same_sized_vectype (cr_index_scalar_type,
8000                                                 vectype_out);
8001
8002       if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
8003                                           OPTIMIZE_FOR_SPEED))
8004         reduc_fn = IFN_REDUC_MAX;
8005     }
8006   STMT_VINFO_REDUC_FN (reduc_info) = reduc_fn;
8007
8008   if (reduction_type != EXTRACT_LAST_REDUCTION
8009       && (!nested_cycle || double_reduc)
8010       && reduc_fn == IFN_LAST
8011       && !nunits_out.is_constant ())
8012     {
8013       if (dump_enabled_p ())
8014         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8015                          "missing target support for reduction on"
8016                          " variable-length vectors.\n");
8017       return false;
8018     }
8019
8020   /* For SLP reductions, see if there is a neutral value we can use.  */
8021   tree neutral_op = NULL_TREE;
8022   if (slp_node)
8023     {
8024       tree initial_value = NULL_TREE;
8025       if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL)
8026         initial_value = vect_phi_initial_value (reduc_def_phi);
8027       neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype_out),
8028                                              orig_code, initial_value);
8029     }
8030
8031   if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
8032     {
8033       /* We can't support in-order reductions of code such as this:
8034
8035            for (int i = 0; i < n1; ++i)
8036              for (int j = 0; j < n2; ++j)
8037                l += a[j];
8038
8039          since GCC effectively transforms the loop when vectorizing:
8040
8041            for (int i = 0; i < n1 / VF; ++i)
8042              for (int j = 0; j < n2; ++j)
8043                for (int k = 0; k < VF; ++k)
8044                  l += a[j];
8045
8046          which is a reassociation of the original operation.  */
8047       if (dump_enabled_p ())
8048         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8049                          "in-order double reduction not supported.\n");
8050
8051       return false;
8052     }
8053
8054   if (reduction_type == FOLD_LEFT_REDUCTION
8055       && slp_node
8056       && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
8057     {
8058       /* We cannot use in-order reductions in this case because there is
8059          an implicit reassociation of the operations involved.  */
8060       if (dump_enabled_p ())
8061         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8062                          "in-order unchained SLP reductions not supported.\n");
8063       return false;
8064     }
8065
8066   /* For double reductions, and for SLP reductions with a neutral value,
8067      we construct a variable-length initial vector by loading a vector
8068      full of the neutral value and then shift-and-inserting the start
8069      values into the low-numbered elements.  */
8070   if ((double_reduc || neutral_op)
8071       && !nunits_out.is_constant ()
8072       && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
8073                                           vectype_out, OPTIMIZE_FOR_SPEED))
8074     {
8075       if (dump_enabled_p ())
8076         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8077                          "reduction on variable-length vectors requires"
8078                          " target support for a vector-shift-and-insert"
8079                          " operation.\n");
8080       return false;
8081     }
8082
8083   /* Check extra constraints for variable-length unchained SLP reductions.  */
8084   if (slp_node
8085       && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
8086       && !nunits_out.is_constant ())
8087     {
8088       /* We checked above that we could build the initial vector when
8089          there's a neutral element value.  Check here for the case in
8090          which each SLP statement has its own initial value and in which
8091          that value needs to be repeated for every instance of the
8092          statement within the initial vector.  */
8093       unsigned int group_size = SLP_TREE_LANES (slp_node);
8094       if (!neutral_op
8095           && !can_duplicate_and_interleave_p (loop_vinfo, group_size,
8096                                               TREE_TYPE (vectype_out)))
8097         {
8098           if (dump_enabled_p ())
8099             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8100                              "unsupported form of SLP reduction for"
8101                              " variable-length vectors: cannot build"
8102                              " initial vector.\n");
8103           return false;
8104         }
8105       /* The epilogue code relies on the number of elements being a multiple
8106          of the group size.  The duplicate-and-interleave approach to setting
8107          up the initial vector does too.  */
8108       if (!multiple_p (nunits_out, group_size))
8109         {
8110           if (dump_enabled_p ())
8111             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8112                              "unsupported form of SLP reduction for"
8113                              " variable-length vectors: the vector size"
8114                              " is not a multiple of the number of results.\n");
8115           return false;
8116         }
8117     }
8118
8119   if (reduction_type == COND_REDUCTION)
8120     {
8121       widest_int ni;
8122
8123       if (! max_loop_iterations (loop, &ni))
8124         {
8125           if (dump_enabled_p ())
8126             dump_printf_loc (MSG_NOTE, vect_location,
8127                              "loop count not known, cannot create cond "
8128                              "reduction.\n");
8129           return false;
8130         }
8131       /* Convert backedges to iterations.  */
8132       ni += 1;
8133
8134       /* The additional index will be the same type as the condition.  Check
8135          that the loop can fit into this less one (because we'll use up the
8136          zero slot for when there are no matches).  */
8137       tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
8138       if (wi::geu_p (ni, wi::to_widest (max_index)))
8139         {
8140           if (dump_enabled_p ())
8141             dump_printf_loc (MSG_NOTE, vect_location,
8142                              "loop size is greater than data size.\n");
8143           return false;
8144         }
8145     }
8146
8147   /* In case the vectorization factor (VF) is bigger than the number
8148      of elements that we can fit in a vectype (nunits), we have to generate
8149      more than one vector stmt - i.e - we need to "unroll" the
8150      vector stmt by a factor VF/nunits.  For more details see documentation
8151      in vectorizable_operation.  */
8152
8153   /* If the reduction is used in an outer loop we need to generate
8154      VF intermediate results, like so (e.g. for ncopies=2):
8155         r0 = phi (init, r0)
8156         r1 = phi (init, r1)
8157         r0 = x0 + r0;
8158         r1 = x1 + r1;
8159     (i.e. we generate VF results in 2 registers).
8160     In this case we have a separate def-use cycle for each copy, and therefore
8161     for each copy we get the vector def for the reduction variable from the
8162     respective phi node created for this copy.
8163
8164     Otherwise (the reduction is unused in the loop nest), we can combine
8165     together intermediate results, like so (e.g. for ncopies=2):
8166         r = phi (init, r)
8167         r = x0 + r;
8168         r = x1 + r;
8169    (i.e. we generate VF/2 results in a single register).
8170    In this case for each copy we get the vector def for the reduction variable
8171    from the vectorized reduction operation generated in the previous iteration.
8172
8173    This only works when we see both the reduction PHI and its only consumer
8174    in vectorizable_reduction and there are no intermediate stmts
8175    participating.  When unrolling we want each unrolled iteration to have its
8176    own reduction accumulator since one of the main goals of unrolling a
8177    reduction is to reduce the aggregate loop-carried latency.  */
8178   if (ncopies > 1
8179       && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
8180       && reduc_chain_length == 1
8181       && loop_vinfo->suggested_unroll_factor == 1)
8182     single_defuse_cycle = true;
8183
8184   if (single_defuse_cycle || lane_reduc_code_p)
8185     {
8186       gcc_assert (op.code != COND_EXPR);
8187
8188       /* 4. Supportable by target?  */
8189       bool ok = true;
8190
8191       /* 4.1. check support for the operation in the loop
8192
8193          This isn't necessary for the lane reduction codes, since they
8194          can only be produced by pattern matching, and it's up to the
8195          pattern matcher to test for support.  The main reason for
8196          specifically skipping this step is to avoid rechecking whether
8197          mixed-sign dot-products can be implemented using signed
8198          dot-products.  */
8199       machine_mode vec_mode = TYPE_MODE (vectype_in);
8200       if (!lane_reduc_code_p
8201           && !directly_supported_p (op.code, vectype_in, optab_vector))
8202         {
8203           if (dump_enabled_p ())
8204             dump_printf (MSG_NOTE, "op not supported by target.\n");
8205           if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
8206               || !vect_can_vectorize_without_simd_p (op.code))
8207             ok = false;
8208           else
8209             if (dump_enabled_p ())
8210               dump_printf (MSG_NOTE, "proceeding using word mode.\n");
8211         }
8212
8213       if (vect_emulated_vector_p (vectype_in)
8214           && !vect_can_vectorize_without_simd_p (op.code))
8215         {
8216           if (dump_enabled_p ())
8217             dump_printf (MSG_NOTE, "using word mode not possible.\n");
8218           return false;
8219         }
8220
8221       /* lane-reducing operations have to go through vect_transform_reduction.
8222          For the other cases try without the single cycle optimization.  */
8223       if (!ok)
8224         {
8225           if (lane_reduc_code_p)
8226             return false;
8227           else
8228             single_defuse_cycle = false;
8229         }
8230     }
8231   STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info) = single_defuse_cycle;
8232
8233   /* If the reduction stmt is one of the patterns that have lane
8234      reduction embedded we cannot handle the case of ! single_defuse_cycle.  */
8235   if ((ncopies > 1 && ! single_defuse_cycle)
8236       && lane_reduc_code_p)
8237     {
8238       if (dump_enabled_p ())
8239         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8240                          "multi def-use cycle not possible for lane-reducing "
8241                          "reduction operation\n");
8242       return false;
8243     }
8244
8245   if (slp_node
8246       && !(!single_defuse_cycle
8247            && !lane_reduc_code_p
8248            && reduction_type != FOLD_LEFT_REDUCTION))
8249     for (i = 0; i < (int) op.num_ops; i++)
8250       if (!vect_maybe_update_slp_op_vectype (slp_op[i], vectype_op[i]))
8251         {
8252           if (dump_enabled_p ())
8253             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8254                              "incompatible vector types for invariants\n");
8255           return false;
8256         }
8257
8258   if (slp_node)
8259     vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8260   else
8261     vec_num = 1;
8262
8263   vect_model_reduction_cost (loop_vinfo, stmt_info, reduc_fn,
8264                              reduction_type, ncopies, cost_vec);
8265   /* Cost the reduction op inside the loop if transformed via
8266      vect_transform_reduction.  Otherwise this is costed by the
8267      separate vectorizable_* routines.  */
8268   if (single_defuse_cycle || lane_reduc_code_p)
8269     {
8270       int factor = 1;
8271       if (vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info))
8272         /* Three dot-products and a subtraction.  */
8273         factor = 4;
8274       record_stmt_cost (cost_vec, ncopies * factor, vector_stmt,
8275                         stmt_info, 0, vect_body);
8276     }
8277
8278   if (dump_enabled_p ()
8279       && reduction_type == FOLD_LEFT_REDUCTION)
8280     dump_printf_loc (MSG_NOTE, vect_location,
8281                      "using an in-order (fold-left) reduction.\n");
8282   STMT_VINFO_TYPE (orig_stmt_of_analysis) = cycle_phi_info_type;
8283   /* All but single defuse-cycle optimized, lane-reducing and fold-left
8284      reductions go through their own vectorizable_* routines.  */
8285   if (!single_defuse_cycle
8286       && !lane_reduc_code_p
8287       && reduction_type != FOLD_LEFT_REDUCTION)
8288     {
8289       stmt_vec_info tem
8290         = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
8291       if (slp_node && REDUC_GROUP_FIRST_ELEMENT (tem))
8292         {
8293           gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (tem));
8294           tem = REDUC_GROUP_FIRST_ELEMENT (tem);
8295         }
8296       STMT_VINFO_DEF_TYPE (vect_orig_stmt (tem)) = vect_internal_def;
8297       STMT_VINFO_DEF_TYPE (tem) = vect_internal_def;
8298     }
8299   else if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
8300     {
8301       vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
8302       vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
8303       internal_fn cond_fn = get_conditional_internal_fn (op.code, op.type);
8304
8305       if (reduction_type != FOLD_LEFT_REDUCTION
8306           && !use_mask_by_cond_expr_p (op.code, cond_fn, vectype_in)
8307           && (cond_fn == IFN_LAST
8308               || !direct_internal_fn_supported_p (cond_fn, vectype_in,
8309                                                   OPTIMIZE_FOR_SPEED)))
8310         {
8311           if (dump_enabled_p ())
8312             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8313                              "can't operate on partial vectors because"
8314                              " no conditional operation is available.\n");
8315           LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8316         }
8317       else if (reduction_type == FOLD_LEFT_REDUCTION
8318                && reduc_fn == IFN_LAST
8319                && !expand_vec_cond_expr_p (vectype_in,
8320                                            truth_type_for (vectype_in),
8321                                            SSA_NAME))
8322         {
8323           if (dump_enabled_p ())
8324             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8325                              "can't operate on partial vectors because"
8326                              " no conditional operation is available.\n");
8327           LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8328         }
8329       else if (reduction_type == FOLD_LEFT_REDUCTION
8330                && internal_fn_mask_index (reduc_fn) == -1
8331                && FLOAT_TYPE_P (vectype_in)
8332                && HONOR_SIGN_DEPENDENT_ROUNDING (vectype_in))
8333         {
8334           if (dump_enabled_p ())
8335             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8336                              "can't operate on partial vectors because"
8337                              " signed zeros cannot be preserved.\n");
8338           LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8339         }
8340       else
8341         {
8342           internal_fn mask_reduc_fn
8343             = get_masked_reduction_fn (reduc_fn, vectype_in);
8344
8345           if (mask_reduc_fn == IFN_MASK_LEN_FOLD_LEFT_PLUS)
8346             vect_record_loop_len (loop_vinfo, lens, ncopies * vec_num,
8347                                   vectype_in, 1);
8348           else
8349             vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
8350                                    vectype_in, NULL);
8351         }
8352     }
8353   return true;
8354 }
8355
8356 /* STMT_INFO is a dot-product reduction whose multiplication operands
8357    have different signs.  Emit a sequence to emulate the operation
8358    using a series of signed DOT_PROD_EXPRs and return the last
8359    statement generated.  VEC_DEST is the result of the vector operation
8360    and VOP lists its inputs.

        VOP[0] and VOP[1] are the multiplication operands and VOP[2] is the
        value the products are accumulated into.  VOP is modified in place:
        the multiplication operands are swapped if necessary so that VOP[0]
        is the unsigned one, and every unsigned entry is then replaced by a
        signed copy.

        All intermediate statements are emitted at GSI through
        vect_finish_stmt_generation.  The returned assignment to VEC_DEST
        is only built, not emitted; that is left to the caller.  */
8361
8362 static gassign *
8363 vect_emulate_mixed_dot_prod (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
8364                              gimple_stmt_iterator *gsi, tree vec_dest,
8365                              tree vop[3])
8366 {
       /* Signed variants of the result vector type and of the multiplication
          operand vector type; all the arithmetic below is done in these.  */
8367   tree wide_vectype = signed_type_for (TREE_TYPE (vec_dest));
8368   tree narrow_vectype = signed_type_for (TREE_TYPE (vop[0]));
8369   tree narrow_elttype = TREE_TYPE (narrow_vectype);
8370   gimple *new_stmt;
8371
8372   /* Make VOP[0] the unsigned operand and VOP[1] the signed operand.  */
8373   if (!TYPE_UNSIGNED (TREE_TYPE (vop[0])))
8374     std::swap (vop[0], vop[1]);
8375
8376   /* Convert all inputs to signed types.  This covers the accumulator
          VOP[2] as well as the multiplication operands; each conversion is
          a NOP_EXPR into a fresh SSA name that replaces the entry in VOP.  */
8377   for (int i = 0; i < 3; ++i)
8378     if (TYPE_UNSIGNED (TREE_TYPE (vop[i])))
8379       {
8380         tree tmp = make_ssa_name (signed_type_for (TREE_TYPE (vop[i])));
8381         new_stmt = gimple_build_assign (tmp, NOP_EXPR, vop[i]);
8382         vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8383         vop[i] = tmp;
8384       }
8385
8386   /* In the comments below we assume 8-bit inputs for simplicity,
8387      but the approach works for any full integer type.  */
8388
8389   /* Create a vector of -128.  */
8390   tree min_narrow_elttype = TYPE_MIN_VALUE (narrow_elttype);
8391   tree min_narrow = build_vector_from_val (narrow_vectype,
8392                                            min_narrow_elttype);
8393
8394   /* Create a vector of 64.  The shift is a logical one, so shifting the
          bit pattern of -128 right by one bit gives +64 rather than -64.  */
8395   auto half_wi = wi::lrshift (wi::to_wide (min_narrow_elttype), 1);
8396   tree half_narrow = wide_int_to_tree (narrow_elttype, half_wi);
8397   half_narrow = build_vector_from_val (narrow_vectype, half_narrow);
8398
8399   /* Emit: SUB_RES = VOP[0] - 128, expressed as an addition of the
          vector of -128 built above.  */
8400   tree sub_res = make_ssa_name (narrow_vectype);
8401   new_stmt = gimple_build_assign (sub_res, PLUS_EXPR, vop[0], min_narrow);
8402   vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8403
8404   /* Emit:
8405
8406        STAGE1 = DOT_PROD_EXPR <VOP[1], 64, VOP[2]>;
8407        STAGE2 = DOT_PROD_EXPR <VOP[1], 64, STAGE1>;
8408        STAGE3 = DOT_PROD_EXPR <SUB_RES, VOP[1], STAGE2>;
8409
8410      on the basis that x * y == (x - 128) * y + 64 * y + 64 * y,
          where x is the originally unsigned operand VOP[0] and y is VOP[1].
8411      Doing the two 64 * y steps first allows more time to compute x.  */
8412   tree stage1 = make_ssa_name (wide_vectype);
8413   new_stmt = gimple_build_assign (stage1, DOT_PROD_EXPR,
8414                                   vop[1], half_narrow, vop[2]);
8415   vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8416
8417   tree stage2 = make_ssa_name (wide_vectype);
8418   new_stmt = gimple_build_assign (stage2, DOT_PROD_EXPR,
8419                                   vop[1], half_narrow, stage1);
8420   vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8421
8422   tree stage3 = make_ssa_name (wide_vectype);
8423   new_stmt = gimple_build_assign (stage3, DOT_PROD_EXPR,
8424                                   sub_res, vop[1], stage2);
8425   vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8426
8427   /* Convert STAGE3 to the reduction type.  This final statement is
          returned without being emitted.  */
8428   return gimple_build_assign (vec_dest, CONVERT_EXPR, stage3);
8429 }
8430
8431 /* Transform the definition stmt STMT_INFO of a reduction PHI backedge
8432    value.

        LOOP_VINFO describes the loop being vectorized and GSI is the
        position at which new statements are inserted.  Fold-left
        reductions are handed to vectorize_fold_left_reduction and its
        result is returned.  Otherwise one vector statement is generated
        per element of the first operand vector and true is returned.

        With SLP_NODE, each generated statement is recorded on the node
        with push_vec_def.  Without it, statements are pushed onto
        STMT_VINFO_VEC_STMTS (STMT_INFO) and *VEC_STMT is set to the first
        of them; in the single def-use cycle case only the statement of the
        last copy is recorded, since every earlier result is fed back in as
        the reduction operand of the next copy.  */
8433
8434 bool
8435 vect_transform_reduction (loop_vec_info loop_vinfo,
8436                           stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
8437                           gimple **vec_stmt, slp_tree slp_node)
8438 {
8439   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
8440   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8441   int i;
8442   int ncopies;
8443   int vec_num;
8444
       /* The reduction-wide properties (reduction type, reduction function,
          input vector type, single-cycle flag) are read from REDUC_INFO.  */
8445   stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
8446   gcc_assert (reduc_info->is_reduc_info);
8447
8448   if (nested_in_vect_loop_p (loop, stmt_info))
8449     {
8450       loop = loop->inner;
8451       gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info) == vect_double_reduction_def);
8452     }
8453
       /* Split the scalar statement into its code and operands.  Failing to
          do so is treated as impossible at this point.  */
8454   gimple_match_op op;
8455   if (!gimple_extract_op (stmt_info->stmt, &op))
8456     gcc_unreachable ();
8457
8458   /* All uses but the last are expected to be defined in the loop.
8459      The last use is the reduction variable.  In case of nested cycle this
8460      assumption is not true: we use reduc_index to record the index of the
8461      reduction variable.  */
8462   stmt_vec_info phi_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info));
8463   gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
8464   int reduc_index = STMT_VINFO_REDUC_IDX (stmt_info);
8465   tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
8466
       /* SLP: a single copy made of SLP_TREE_NUMBER_OF_VEC_STMTS vectors.
          Non-SLP: NCOPIES copies of one vector each.  */
8467   if (slp_node)
8468     {
8469       ncopies = 1;
8470       vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8471     }
8472   else
8473     {
8474       ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
8475       vec_num = 1;
8476     }
8477
       /* COND_FN is only called on below when the loop is fully masked and
          MASK_BY_COND_EXPR is false.  */
8478   code_helper code = canonicalize_code (op.code, op.type);
8479   internal_fn cond_fn = get_conditional_internal_fn (code, op.type);
8480
8481   vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
8482   vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
8483   bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
8484
8485   /* Transform.  */
8486   tree new_temp = NULL_TREE;
8487   auto_vec<tree> vec_oprnds0;
8488   auto_vec<tree> vec_oprnds1;
8489   auto_vec<tree> vec_oprnds2;
8490   tree def0;
8491
8492   if (dump_enabled_p ())
8493     dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
8494
8495   /* FORNOW: Multiple types are not supported for condition.  */
8496   if (code == COND_EXPR)
8497     gcc_assert (ncopies == 1);
8498
8499   /* A binary COND_OP reduction must have the same definition and else
8500      value. */
8501   bool cond_fn_p = code.is_internal_fn ()
8502     && conditional_internal_fn_code (internal_fn (code)) != ERROR_MARK;
8503   if (cond_fn_p)
8504     {
8505       gcc_assert (code == IFN_COND_ADD || code == IFN_COND_SUB
8506                   || code == IFN_COND_MUL || code == IFN_COND_AND
8507                   || code == IFN_COND_IOR || code == IFN_COND_XOR);
8508       gcc_assert (op.num_ops == 4
8509                   && (op.ops[reduc_index]
8510                       == op.ops[internal_fn_else_index ((internal_fn) code)]));
8511     }
8512
8513   bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
8514
       /* In-order reductions are generated by a dedicated routine.  */
8515   vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
8516   if (reduction_type == FOLD_LEFT_REDUCTION)
8517     {
8518       internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
8519       gcc_assert (code.is_tree_code () || cond_fn_p);
8520       return vectorize_fold_left_reduction
8521           (loop_vinfo, stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi,
8522            code, reduc_fn, op.ops, op.num_ops, vectype_in,
8523            reduc_index, masks, lens);
8524     }
8525
       /* Only single def-use cycle reductions and the lane-reducing codes
          are expected to reach this point.  */
8526   bool single_defuse_cycle = STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info);
8527   gcc_assert (single_defuse_cycle
8528               || code == DOT_PROD_EXPR
8529               || code == WIDEN_SUM_EXPR
8530               || code == SAD_EXPR);
8531
8532   /* Create the destination vector  */
8533   tree scalar_dest = gimple_get_lhs (stmt_info->stmt);
8534   tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
8535
8536   /* Get NCOPIES vector definitions for all operands except the reduction
8537      definition.  In the single def-use cycle case the reduction operand
          is passed as NULL_TREE here and fetched separately below.  */
8538   if (!cond_fn_p)
8539     {
8540       vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
8541                          single_defuse_cycle && reduc_index == 0
8542                          ? NULL_TREE : op.ops[0], &vec_oprnds0,
8543                          single_defuse_cycle && reduc_index == 1
8544                          ? NULL_TREE : op.ops[1], &vec_oprnds1,
8545                          op.num_ops == 3
8546                          && !(single_defuse_cycle && reduc_index == 2)
8547                          ? op.ops[2] : NULL_TREE, &vec_oprnds2);
8548     }
8549   else
8550     {
8551       /* For a conditional operation pass the truth type as mask
8552          vectype.  */
8553       gcc_assert (single_defuse_cycle
8554                   && (reduc_index == 1 || reduc_index == 2));
8555       vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
8556                          op.ops[0], truth_type_for (vectype_in), &vec_oprnds0,
8557                          reduc_index == 1 ? NULL_TREE : op.ops[1],
8558                          NULL_TREE, &vec_oprnds1,
8559                          reduc_index == 2 ? NULL_TREE : op.ops[2],
8560                          NULL_TREE, &vec_oprnds2);
8561     }
8562
8563   /* For single def-use cycles get one copy of the vectorized reduction
8564      definition.  It goes into the operand vector selected by REDUC_INDEX;
          that vector is extended with each new result inside the loop
          below.  */
8565   if (single_defuse_cycle)
8566     {
8567       gcc_assert (!slp_node);
8568       vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
8569                                      op.ops[reduc_index],
8570                                      reduc_index == 0 ? &vec_oprnds0
8571                                      : (reduc_index == 1 ? &vec_oprnds1
8572                                         : &vec_oprnds2));
8573     }
8574
8575   bool emulated_mixed_dot_prod
8576     = vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info);
       /* Generate one vector statement per element of VEC_OPRNDS0.  */
8577   FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
8578     {
8579       gimple *new_stmt;
8580       tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
8581       if (masked_loop_p && !mask_by_cond_expr)
8582         {
8583           /* No conditional ifns have been defined for dot-product yet.  */
8584           gcc_assert (code != DOT_PROD_EXPR);
8585
8586           /* Make sure that the reduction accumulator is vop[0].  */
8587           if (reduc_index == 1)
8588             {
8589               gcc_assert (commutative_binary_op_p (code, op.type));
8590               std::swap (vop[0], vop[1]);
8591             }
               /* Emit COND_FN (MASK, ACC, OTHER, ACC): the accumulator is
                  passed both as the first value operand and as the final
                  (else) argument.  */
8592           tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
8593                                           vec_num * ncopies, vectype_in, i);
8594           gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
8595                                                     vop[0], vop[1], vop[0]);
8596           new_temp = make_ssa_name (vec_dest, call);
8597           gimple_call_set_lhs (call, new_temp);
8598           gimple_call_set_nothrow (call, true);
8599           vect_finish_stmt_generation (loop_vinfo, stmt_info, call, gsi);
8600           new_stmt = call;
8601         }
8602       else
8603         {
               /* Third operand, when the operation has one.  */
8604           if (op.num_ops >= 3)
8605             vop[2] = vec_oprnds2[i];
8606
               /* NOTE(review): build_vect_cond_expr is defined elsewhere;
                  presumably it rewrites VOP so that inactive lanes do not
                  affect the result - confirm against its definition.  */
8607           if (masked_loop_p && mask_by_cond_expr)
8608             {
8609               tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
8610                                               vec_num * ncopies, vectype_in, i);
8611               build_vect_cond_expr (code, vop, mask, gsi);
8612             }
8613
8614           if (emulated_mixed_dot_prod)
8615             new_stmt = vect_emulate_mixed_dot_prod (loop_vinfo, stmt_info, gsi,
8616                                                     vec_dest, vop);
8617
8618           else if (code.is_internal_fn () && !cond_fn_p)
8619             new_stmt = gimple_build_call_internal (internal_fn (code),
8620                                                    op.num_ops,
8621                                                    vop[0], vop[1], vop[2]);
               /* The else value must be the reduction operand, mirroring the
                  invariant asserted above for the scalar statement
                  (op.ops[reduc_index] is its else operand).  REDUC_INDEX may
                  be 1 or 2 here, so index VOP with it rather than assuming
                  the accumulator is always operand 1.  */
8622           else if (code.is_internal_fn () && cond_fn_p)
8623             new_stmt = gimple_build_call_internal (internal_fn (code),
8624                                                    op.num_ops,
8625                                                    vop[0], vop[1], vop[2],
8626                                                    vop[reduc_index]);
8627           else
8628             new_stmt = gimple_build_assign (vec_dest, tree_code (op.code),
8629                                             vop[0], vop[1], vop[2]);
8630           new_temp = make_ssa_name (vec_dest, new_stmt);
8631           gimple_set_lhs (new_stmt, new_temp);
8632           vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8633         }
8634
           /* Record the result.  For a single def-use cycle every result but
              the last becomes the reduction operand of the next copy, which
              also extends the operand vector the next iteration reads.  */
8635       if (slp_node)
8636         slp_node->push_vec_def (new_stmt);
8637       else if (single_defuse_cycle
8638                && i < ncopies - 1)
8639         {
8640           if (reduc_index == 0)
8641             vec_oprnds0.safe_push (gimple_get_lhs (new_stmt));
8642           else if (reduc_index == 1)
8643             vec_oprnds1.safe_push (gimple_get_lhs (new_stmt));
8644           else if (reduc_index == 2)
8645             vec_oprnds2.safe_push (gimple_get_lhs (new_stmt));
8646         }
8647       else
8648         STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
8649     }
8650
8651   if (!slp_node)
8652     *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
8653
8654   return true;
8655 }
8656
8657 /* Transform phase of a cycle PHI.

        Create the vector PHI node(s) in the loop header for the cycle PHI
        STMT_INFO.  Only the loop-entry (preheader) argument of each new PHI
        is set here; the loop-latch argument is set in epilogue processing.
        The new PHIs are pushed onto SLP_NODE when it is non-null, otherwise
        onto STMT_VINFO_VEC_STMTS (STMT_INFO) with the first one also stored
        in *VEC_STMT.  SLP_NODE_INSTANCE is only consulted to assert that
        SLP_NODE is the instance's reduc_phis node in the non-nested SLP case.

        For EXTRACT_LAST_REDUCTION and FOLD_LEFT_REDUCTION the scalar PHI is
        left in place and nothing is created.  Always returns true.  */
8658
8659 bool
8660 vect_transform_cycle_phi (loop_vec_info loop_vinfo,
8661                           stmt_vec_info stmt_info, gimple **vec_stmt,
8662                           slp_tree slp_node, slp_instance slp_node_instance)
8663 {
8664   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
8665   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8666   int i;
8667   int ncopies;
8668   int j;
8669   bool nested_cycle = false;
8670   int vec_num;
8671
       /* When STMT_INFO is nested in the loop being vectorized, work on the
          inner loop instead.  */
8672   if (nested_in_vect_loop_p (loop, stmt_info))
8673     {
8674       loop = loop->inner;
8675       nested_cycle = true;
8676     }
8677
       /* REDUC_INFO is expected to be the stmt_vec_info flagged is_reduc_info
          for this cycle (asserted below).  */
8678   stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
8679   reduc_stmt_info = vect_stmt_to_vectorize (reduc_stmt_info);
8680   stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
8681   gcc_assert (reduc_info->is_reduc_info);
8682
8683   if (STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION
8684       || STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION)
8685     /* Leave the scalar phi in place.  */
8686     return true;
8687
8688   tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
8689   /* For a nested cycle we do not fill the above.  */
8690   if (!vectype_in)
8691     vectype_in = STMT_VINFO_VECTYPE (stmt_info);
8692   gcc_assert (vectype_in);
8693
       /* VEC_NUM counts the PHIs created for SLP (outer loop below), NCOPIES
          the PHIs created per entry for non-SLP (inner loop below).  */
8694   if (slp_node)
8695     {
8696       /* The size vect_schedule_slp_instance computes is off for us.  */
8697       vec_num = vect_get_num_vectors (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
8698                                       * SLP_TREE_LANES (slp_node), vectype_in);
8699       ncopies = 1;
8700     }
8701   else
8702     {
8703       vec_num = 1;
8704       ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
8705     }
8706
8707   /* Check whether we should use a single PHI node and accumulate
8708      vectors to one before the backedge.  */
8709   if (STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info))
8710     ncopies = 1;
8711
8712   /* Create the destination vector  */
8713   gphi *phi = as_a <gphi *> (stmt_info->stmt);
8714   tree vec_dest = vect_create_destination_var (gimple_phi_result (phi),
8715                                                vectype_out);
8716
8717   /* Get the loop-entry arguments.  */
8718   tree vec_initial_def = NULL_TREE;
8719   auto_vec<tree> vec_initial_defs;
8720   if (slp_node)
8721     {
8722       vec_initial_defs.reserve (vec_num);
8723       if (nested_cycle)
8724         {
               /* Take the initial defs from the SLP child that corresponds to
                  the preheader edge.  */
8725           unsigned phi_idx = loop_preheader_edge (loop)->dest_idx;
8726           vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[phi_idx],
8727                              &vec_initial_defs);
8728         }
8729       else
8730         {
8731           gcc_assert (slp_node == slp_node_instance->reduc_phis);
8732           vec<tree> &initial_values = reduc_info->reduc_initial_values;
8733           vec<stmt_vec_info> &stmts = SLP_TREE_SCALAR_STMTS (slp_node);
8734
               /* Record one scalar initial value per PHI, or only the first
                  PHI's value when the reduction statement belongs to a
                  reduction group.  */
8735           unsigned int num_phis = stmts.length ();
8736           if (REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info))
8737             num_phis = 1;
8738           initial_values.reserve (num_phis);
8739           for (unsigned int i = 0; i < num_phis; ++i)
8740             {
8741               gphi *this_phi = as_a<gphi *> (stmts[i]->stmt);
8742               initial_values.quick_push (vect_phi_initial_value (this_phi));
8743             }
8744           if (vec_num == 1)
8745             vect_find_reusable_accumulator (loop_vinfo, reduc_info);
8746           if (!initial_values.is_empty ())
8747             {
8748               tree initial_value
8749                 = (num_phis == 1 ? initial_values[0] : NULL_TREE);
8750               code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
8751               tree neutral_op
8752                 = neutral_op_for_reduction (TREE_TYPE (vectype_out),
8753                                             code, initial_value);
8754               get_initial_defs_for_reduction (loop_vinfo, reduc_info,
8755                                               &vec_initial_defs, vec_num,
8756                                               stmts.length (), neutral_op);
8757             }
8758         }
8759     }
8760   else
8761     {
8762       /* Get at the scalar def before the loop, that defines the initial
8763          value of the reduction variable.  */
8764       tree initial_def = vect_phi_initial_value (phi);
8765       reduc_info->reduc_initial_values.safe_push (initial_def);
8766       /* Optimize: if initial_def is for REDUC_MAX smaller than the base
8767          and we can't use zero for induc_val, use initial_def.  Similarly
8768          for REDUC_MIN and initial_def larger than the base.  */
8769       if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
8770         {
8771           tree induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
8772           if (TREE_CODE (initial_def) == INTEGER_CST
8773               && !integer_zerop (induc_val)
8774               && ((STMT_VINFO_REDUC_CODE (reduc_info) == MAX_EXPR
8775                    && tree_int_cst_lt (initial_def, induc_val))
8776                   || (STMT_VINFO_REDUC_CODE (reduc_info) == MIN_EXPR
8777                       && tree_int_cst_lt (induc_val, initial_def))))
8778             {
8779               induc_val = initial_def;
8780               /* Communicate we used the initial_def to epilogue
8781                  generation.  */
8782               STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info) = NULL_TREE;
8783             }
8784           vec_initial_def = build_vector_from_val (vectype_out, induc_val);
8785         }
8786       else if (nested_cycle)
8787         {
8788           /* Do not use an adjustment def as that case is not supported
8789              correctly if ncopies is not one.  */
8790           vect_get_vec_defs_for_operand (loop_vinfo, reduc_stmt_info,
8791                                          ncopies, initial_def,
8792                                          &vec_initial_defs);
8793         }
8794       else if (STMT_VINFO_REDUC_TYPE (reduc_info) == CONST_COND_REDUCTION
8795                || STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
8796         /* Fill the initial vector with the initial scalar value.  */
8797         vec_initial_def
8798           = get_initial_def_for_reduction (loop_vinfo, reduc_stmt_info,
8799                                            initial_def, initial_def);
8800       else
8801         {
8802           if (ncopies == 1)
8803             vect_find_reusable_accumulator (loop_vinfo, reduc_info);
8804           if (!reduc_info->reduc_initial_values.is_empty ())
8805             {
8806               initial_def = reduc_info->reduc_initial_values[0];
8807               code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
8808               tree neutral_op
8809                 = neutral_op_for_reduction (TREE_TYPE (initial_def),
8810                                             code, initial_def);
8811               gcc_assert (neutral_op);
8812               /* Try to simplify the vector initialization by applying an
8813                  adjustment after the reduction has been performed.  */
8814               if (!reduc_info->reused_accumulator
8815                   && STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
8816                   && !operand_equal_p (neutral_op, initial_def))
8817                 {
8818                   STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info)
8819                     = initial_def;
8820                   initial_def = neutral_op;
8821                 }
8822               vec_initial_def
8823                 = get_initial_def_for_reduction (loop_vinfo, reduc_info,
8824                                                  initial_def, neutral_op);
8825             }
8826         }
8827     }
8828
       /* A single initial vector computed above is shared by all NCOPIES
          PHIs.  */
8829   if (vec_initial_def)
8830     {
8831       vec_initial_defs.create (ncopies);
8832       for (i = 0; i < ncopies; ++i)
8833         vec_initial_defs.quick_push (vec_initial_def);
8834     }
8835
       /* If an accumulator is being reused, bring its value into VECTYPE_OUT
          (partially reducing and/or converting it as needed) and use it as
          the initial value: combined with vec_initial_defs[0] through
          vect_get_main_loop_result when there is a main loop edge, pushed as
          a new initial def otherwise.  */
8836   if (auto *accumulator = reduc_info->reused_accumulator)
8837     {
8838       tree def = accumulator->reduc_input;
8839       if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
8840         {
8841           unsigned int nreduc;
8842           bool res = constant_multiple_p (TYPE_VECTOR_SUBPARTS
8843                                             (TREE_TYPE (def)),
8844                                           TYPE_VECTOR_SUBPARTS (vectype_out),
8845                                           &nreduc);
8846           gcc_assert (res);
8847           gimple_seq stmts = NULL;
8848           /* Reduce the single vector to a smaller one.  */
8849           if (nreduc != 1)
8850             {
8851               /* Perform the reduction in the appropriate type.  */
8852               tree rvectype = vectype_out;
8853               if (!useless_type_conversion_p (TREE_TYPE (vectype_out),
8854                                               TREE_TYPE (TREE_TYPE (def))))
8855                 rvectype = build_vector_type (TREE_TYPE (TREE_TYPE (def)),
8856                                               TYPE_VECTOR_SUBPARTS
8857                                                 (vectype_out));
8858               def = vect_create_partial_epilog (def, rvectype,
8859                                                 STMT_VINFO_REDUC_CODE
8860                                                   (reduc_info),
8861                                                 &stmts);
8862             }
8863           /* The epilogue loop might use a different vector mode, like
8864              VNx2DI vs. V2DI.  */
8865           if (TYPE_MODE (vectype_out) != TYPE_MODE (TREE_TYPE (def)))
8866             {
8867               tree reduc_type = build_vector_type_for_mode
8868                 (TREE_TYPE (TREE_TYPE (def)), TYPE_MODE (vectype_out));
8869               def = gimple_convert (&stmts, reduc_type, def);
8870             }
8871           /* Adjust the input so we pick up the partially reduced value
8872              for the skip edge in vect_create_epilog_for_reduction.  */
8873           accumulator->reduc_input = def;
8874           /* And the reduction could be carried out using a different sign.  */
8875           if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
8876             def = gimple_convert (&stmts, vectype_out, def);
8877           if (loop_vinfo->main_loop_edge)
8878             {
8879               /* While we'd like to insert on the edge this will split
8880                  blocks and disturb bookkeeping, we also will eventually
8881                  need this on the skip edge.  Rely on sinking to
8882                  fixup optimal placement and insert in the pred.  */
8883               gimple_stmt_iterator gsi
8884                 = gsi_last_bb (loop_vinfo->main_loop_edge->src);
8885               /* Insert before a cond that eventually skips the
8886                  epilogue.  */
8887               if (!gsi_end_p (gsi) && stmt_ends_bb_p (gsi_stmt (gsi)))
8888                 gsi_prev (&gsi);
8889               gsi_insert_seq_after (&gsi, stmts, GSI_CONTINUE_LINKING);
8890             }
8891           else
8892             gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop),
8893                                               stmts);
8894         }
8895       if (loop_vinfo->main_loop_edge)
8896         vec_initial_defs[0]
8897           = vect_get_main_loop_result (loop_vinfo, def,
8898                                        vec_initial_defs[0]);
8899       else
8900         vec_initial_defs.safe_push (def);
8901     }
8902
8903   /* Generate the reduction PHIs upfront.  */
8904   for (i = 0; i < vec_num; i++)
8905     {
8906       tree vec_init_def = vec_initial_defs[i];
8907       for (j = 0; j < ncopies; j++)
8908         {
8909           /* Create the reduction-phi that defines the reduction
8910              operand.  */
8911           gphi *new_phi = create_phi_node (vec_dest, loop->header);
8912
8913           /* Set the loop-entry arg of the reduction-phi.  For a nested
                  cycle the J-th copy uses the J-th initial def; otherwise
                  every copy shares VEC_INIT_DEF.  */
8914           if (j != 0 && nested_cycle)
8915             vec_init_def = vec_initial_defs[j];
8916           add_phi_arg (new_phi, vec_init_def, loop_preheader_edge (loop),
8917                        UNKNOWN_LOCATION);
8918
8919           /* The loop-latch arg is set in epilogue processing.  */
8920
8921           if (slp_node)
8922             slp_node->push_vec_def (new_phi);
8923           else
8924             {
8925               if (j == 0)
8926                 *vec_stmt = new_phi;
8927               STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
8928             }
8929         }
8930     }
8931
8932   return true;
8933 }
8934
8935 /* Vectorizes LC PHIs.

        Handle a PHI STMT_INFO with exactly one argument.  Return false if
        LOOP_VINFO is null, STMT_INFO is not such a PHI, or its def type is
        neither vect_internal_def nor vect_double_reduction_def.

        When VEC_STMT is null only analysis is done: for SLP,
        vect_maybe_update_slp_op_vectype must succeed on the single child
        with the node's vector type, and STMT_VINFO_TYPE is set to
        lc_phi_info_type.

        Otherwise create, in the PHI's block, one vector PHI per vectorized
        def of the argument, each fed from the block's single predecessor
        edge.  The PHIs are pushed onto SLP_NODE, or onto
        STMT_VINFO_VEC_STMTS (STMT_INFO) with the first stored in *VEC_STMT.
        Return true on success.  */
8936
8937 bool
8938 vectorizable_lc_phi (loop_vec_info loop_vinfo,
8939                      stmt_vec_info stmt_info, gimple **vec_stmt,
8940                      slp_tree slp_node)
8941 {
8942   if (!loop_vinfo
8943       || !is_a <gphi *> (stmt_info->stmt)
8944       || gimple_phi_num_args (stmt_info->stmt) != 1)
8945     return false;
8946
8947   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def
8948       && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
8949     return false;
8950
8951   if (!vec_stmt) /* transformation not required.  */
8952     {
8953       /* Deal with copies from externs or constants that disguise as
8954          loop-closed PHI nodes (PR97886).  */
8955       if (slp_node
8956           && !vect_maybe_update_slp_op_vectype (SLP_TREE_CHILDREN (slp_node)[0],
8957                                                 SLP_TREE_VECTYPE (slp_node)))
8958         {
8959           if (dump_enabled_p ())
8960             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8961                              "incompatible vector types for invariants\n");
8962           return false;
8963         }
8964       STMT_VINFO_TYPE (stmt_info) = lc_phi_info_type;
8965       return true;
8966     }
8967
8968   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
8969   tree scalar_dest = gimple_phi_result (stmt_info->stmt);
8970   basic_block bb = gimple_bb (stmt_info->stmt);
8971   edge e = single_pred_edge (bb);
8972   tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
8973   auto_vec<tree> vec_oprnds;
       /* Fetch the vectorized defs of the single PHI argument; the copy
          count is computed from VECTYPE only in the non-SLP case.  */
8974   vect_get_vec_defs (loop_vinfo, stmt_info, slp_node,
8975                      !slp_node ? vect_get_num_copies (loop_vinfo, vectype) : 1,
8976                      gimple_phi_arg_def (stmt_info->stmt, 0), &vec_oprnds);
8977   for (unsigned i = 0; i < vec_oprnds.length (); i++)
8978     {
8979       /* Create the vectorized LC PHI node.  */
8980       gphi *new_phi = create_phi_node (vec_dest, bb);
8981       add_phi_arg (new_phi, vec_oprnds[i], e, UNKNOWN_LOCATION);
8982       if (slp_node)
8983         slp_node->push_vec_def (new_phi);
8984       else
8985         STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
8986     }
8987   if (!slp_node)
8988     *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
8989
8990   return true;
8991 }
8992
8993 /* Vectorizes PHIs.

        Only SLP is handled: return false if STMT_INFO is not a PHI, SLP_NODE
        is null, or the def type is not vect_internal_def.

        When VEC_STMT is null only analysis is done.  It fails if an SLP
        child is missing, if vect_maybe_update_slp_op_vectype rejects a
        child, or if for an internal-def child useless_type_conversion_p
        from the child's vector type to the node's vector type does not
        hold.  For PHIs with more than one argument a vector_stmt cost per
        vector statement is recorded in COST_VEC.  STMT_VINFO_TYPE is set to
        phi_info_type.

        Otherwise create the vector PHIs in the PHI's block, push them onto
        SLP_NODE, and add an argument for every child that already has
        vector defs; internal-def children without vector defs are skipped
        here.  Return true on success.  */
8994
8995 bool
8996 vectorizable_phi (vec_info *,
8997                   stmt_vec_info stmt_info, gimple **vec_stmt,
8998                   slp_tree slp_node, stmt_vector_for_cost *cost_vec)
8999 {
9000   if (!is_a <gphi *> (stmt_info->stmt) || !slp_node)
9001     return false;
9002
9003   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def)
9004     return false;
9005
9006   tree vectype = SLP_TREE_VECTYPE (slp_node);
9007
9008   if (!vec_stmt) /* transformation not required.  */
9009     {
9010       slp_tree child;
9011       unsigned i;
9012       FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), i, child)
9013         if (!child)
9014           {
9015             if (dump_enabled_p ())
9016               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9017                                "PHI node with unvectorized backedge def\n");
9018             return false;
9019           }
9020         else if (!vect_maybe_update_slp_op_vectype (child, vectype))
9021           {
9022             if (dump_enabled_p ())
9023               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9024                                "incompatible vector types for invariants\n");
9025             return false;
9026           }
9027         else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
9028                  && !useless_type_conversion_p (vectype,
9029                                                 SLP_TREE_VECTYPE (child)))
9030           {
9031             /* With bools we can have mask and non-mask precision vectors
9032                or different non-mask precisions.  While pattern recog is
9033                supposed to guarantee consistency here, bugs in it can cause
9034                mismatches (PR103489 and PR103800 for example).
9035                Deal with them here instead of ICEing later.  */
9036             if (dump_enabled_p ())
9037               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9038                                "incompatible vector type setup from "
9039                                "bool pattern detection\n");
9040             return false;
9041           }
9042
9043       /* For single-argument PHIs assume coalescing which means zero cost
9044          for the scalar and the vector PHIs.  This avoids artificially
9045          favoring the vector path (but may pessimize it in some cases).  */
9046       if (gimple_phi_num_args (as_a <gphi *> (stmt_info->stmt)) > 1)
9047         record_stmt_cost (cost_vec, SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
9048                           vector_stmt, stmt_info, vectype, 0, vect_body);
9049       STMT_VINFO_TYPE (stmt_info) = phi_info_type;
9050       return true;
9051     }
9052
9053   tree scalar_dest = gimple_phi_result (stmt_info->stmt);
9054   basic_block bb = gimple_bb (stmt_info->stmt);
9055   tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
9056   auto_vec<gphi *> new_phis;
9057   for (unsigned i = 0; i < gimple_phi_num_args (stmt_info->stmt); ++i)
9058     {
9059       slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
9060
9061       /* Skip not yet vectorized defs.  */
9062       if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
9063           && SLP_TREE_VEC_DEFS (child).is_empty ())
9064         continue;
9065
9066       auto_vec<tree> vec_oprnds;
9067       vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[i], &vec_oprnds);
           /* Create the PHIs lazily, one per vector def of the first child
              that is already vectorized.  */
9068       if (!new_phis.exists ())
9069         {
9070           new_phis.create (vec_oprnds.length ());
9071           for (unsigned j = 0; j < vec_oprnds.length (); j++)
9072             {
9073               /* Create the vectorized PHI node.  */
9074               new_phis.quick_push (create_phi_node (vec_dest, bb));
9075               slp_node->push_vec_def (new_phis[j]);
9076             }
9077         }
           /* Add this child's defs as the arguments on the I-th incoming
              edge of the scalar PHI.  */
9078       edge e = gimple_phi_arg_edge (as_a <gphi *> (stmt_info->stmt), i);
9079       for (unsigned j = 0; j < vec_oprnds.length (); j++)
9080         add_phi_arg (new_phis[j], vec_oprnds[j], e, UNKNOWN_LOCATION);
9081     }
9082   /* We should have at least one already vectorized child.  */
9083   gcc_assert (new_phis.exists ());
9084
9085   return true;
9086 }
9087
9088 /* Vectorizes first order recurrences.  An overview of the transformation
9089    is described below.  Suppose we have the following loop.
9090
9091      int t = 0;
9092      for (int i = 0; i < n; ++i)
9093        {
9094          b[i] = a[i] - t;
9095          t = a[i];
9096        }
9097
9098    There is a first-order recurrence on 'a'.  For this loop, the scalar IR
9099    looks (simplified) like:
9100
9101     scalar.preheader:
9102       init = 0;
9103
9104     scalar.body:
9105       i = PHI <0(scalar.preheader), i+1(scalar.body)>
9106       _2 = PHI <init(scalar.preheader), _1(scalar.body)>
9107       _1 = a[i]
9108       b[i] = _1 - _2
9109       if (i < n) goto scalar.body
9110
9111    In this example, _2 is a recurrence because its value depends on the
9112    previous iteration.  We vectorize this as (VF = 4)
9113
9114     vector.preheader:
9115       vect_init = vect_cst(..., ..., ..., 0)
9116
9117     vector.body
9118       i = PHI <0(vector.preheader), i+4(vector.body)>
9119       vect_1 = PHI <vect_init(vector.preheader), vect_2(vector.body)>
9120       vect_2 = a[i, i+1, i+2, i+3];
9121       vect_3 = vec_perm (vect_1, vect_2, { 3, 4, 5, 6 })
9122       b[i, i+1, i+2, i+3] = vect_2 - vect_3
9123       if (..) goto vector.body
9124
9125    In this function, vectorizable_recurr, we code generate both the
9126    vector PHI node and the permute since those together compute the
9127    vectorized value of the scalar PHI.  We do not yet have the
9128    backedge value to fill in there nor into the vec_perm.  Those
9129    are filled in maybe_set_vectorized_backedge_value and
9130    vect_schedule_scc.
9131
9132    TODO:  Since the scalar loop does not have a use of the recurrence
9133    outside of the loop the natural way to implement peeling via
9134    vectorizing the live value doesn't work.  For now peeling of loops
9135    with a recurrence is not implemented.  For SLP the supported cases
9136    are restricted to those requiring a single vector recurrence PHI.

        Return false if LOOP_VINFO is null, STMT_INFO is not a PHI with def
        type vect_first_order_recurrence, twice the recurrence distance
        exceeds the number of vector elements, or (during analysis) the
        required permutation or the vector types of SLP invariants are not
        supported.  When VEC_STMT is null only analysis and costing into
        COST_VEC are done and STMT_VINFO_TYPE is set to recurr_info_type.
        Otherwise the permutes are pushed onto SLP_NODE, or onto
        STMT_VINFO_VEC_STMTS (STMT_INFO) with the first stored in
        *VEC_STMT.  */
9137
9138 bool
9139 vectorizable_recurr (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
9140                      gimple **vec_stmt, slp_tree slp_node,
9141                      stmt_vector_for_cost *cost_vec)
9142 {
9143   if (!loop_vinfo || !is_a<gphi *> (stmt_info->stmt))
9144     return false;
9145
9146   gphi *phi = as_a<gphi *> (stmt_info->stmt);
9147
9148   /* So far we only support first-order recurrence auto-vectorization.  */
9149   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_first_order_recurrence)
9150     return false;
9151
9152   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
9153   unsigned ncopies;
9154   if (slp_node)
9155     ncopies = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
9156   else
9157     ncopies = vect_get_num_copies (loop_vinfo, vectype);
9158   poly_int64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
       /* Number of lanes the permute shifts by: the SLP lane count, or 1
          without SLP.  */
9159   unsigned dist = slp_node ? SLP_TREE_LANES (slp_node) : 1;
9160   /* We need to be able to make progress with a single vector.  */
9161   if (maybe_gt (dist * 2, nunits))
9162     {
9163       if (dump_enabled_p ())
9164         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9165                          "first order recurrence exceeds half of "
9166                          "a vector\n");
9167       return false;
9168     }
9169
9170   /* First-order recurrence autovectorization needs to handle permutation
9171      with indices = [nunits-dist, nunits-dist+1, nunits-dist+2, ...], i.e.
         [nunits-1, nunits, nunits+1, ...] when dist is 1.  */
9172   vec_perm_builder sel (nunits, 1, 3);
9173   for (int i = 0; i < 3; ++i)
9174     sel.quick_push (nunits - dist + i);
9175   vec_perm_indices indices (sel, 2, nunits);
9176
9177   if (!vec_stmt) /* transformation not required.  */
9178     {
9179       if (!can_vec_perm_const_p (TYPE_MODE (vectype), TYPE_MODE (vectype),
9180                                  indices))
9181         return false;
9182
9183       if (slp_node)
9184         {
9185           /* We eventually need to set a vector type on invariant
9186              arguments.  */
9187           unsigned j;
9188           slp_tree child;
9189           FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
9190             if (!vect_maybe_update_slp_op_vectype
9191                   (child, SLP_TREE_VECTYPE (slp_node)))
9192               {
9193                 if (dump_enabled_p ())
9194                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9195                                    "incompatible vector types for "
9196                                    "invariants\n");
9197                 return false;
9198               }
9199         }
9200       /* The recurrence costs the initialization vector and one permute
9201          for each copy.  */
9202       unsigned prologue_cost = record_stmt_cost (cost_vec, 1, scalar_to_vec,
9203                                                  stmt_info, 0, vect_prologue);
9204       unsigned inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
9205                                                stmt_info, 0, vect_body);
9206       if (dump_enabled_p ())
9207         dump_printf_loc (MSG_NOTE, vect_location,
9208                          "vectorizable_recurr: inside_cost = %d, "
9209                          "prologue_cost = %d .\n", inside_cost,
9210                          prologue_cost);
9211
9212       STMT_VINFO_TYPE (stmt_info) = recurr_info_type;
9213       return true;
9214     }
9215
       /* Build the initial vector as a splat of the scalar preheader value,
          converting that value to the vector element type first if needed.  */
9216   edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
9217   basic_block bb = gimple_bb (phi);
9218   tree preheader = PHI_ARG_DEF_FROM_EDGE (phi, pe);
9219   if (!useless_type_conversion_p (TREE_TYPE (vectype), TREE_TYPE (preheader)))
9220     {
9221       gimple_seq stmts = NULL;
9222       preheader = gimple_convert (&stmts, TREE_TYPE (vectype), preheader);
9223       gsi_insert_seq_on_edge_immediate (pe, stmts);
9224     }
9225   tree vec_init = build_vector_from_val (vectype, preheader);
9226   vec_init = vect_init_vector (loop_vinfo, stmt_info, vec_init, vectype, NULL);
9227
9228   /* Create the vectorized first-order PHI node.  */
9229   tree vec_dest = vect_get_new_vect_var (vectype,
9230                                          vect_simple_var, "vec_recur_");
9231   gphi *new_phi = create_phi_node (vec_dest, bb);
9232   add_phi_arg (new_phi, vec_init, pe, UNKNOWN_LOCATION);
9233
9234   /* Insert the shuffle for first-order recurrence autovectorization.
9235        result = VEC_PERM <vec_recur, vect_1, index[nunits-1, nunits, ...]>.  */
9236   tree perm = vect_gen_perm_mask_checked (vectype, indices);
9237
9238   /* Insert the required permute after the latch definition.  The
9239      second and later operands are tentative and will be updated when we have
9240      vectorized the latch definition.  */
9241   edge le = loop_latch_edge (LOOP_VINFO_LOOP (loop_vinfo));
9242   gimple *latch_def = SSA_NAME_DEF_STMT (PHI_ARG_DEF_FROM_EDGE (phi, le));
9243   gimple_stmt_iterator gsi2 = gsi_for_stmt (latch_def);
9244   gsi_next (&gsi2);
9245
9246   for (unsigned i = 0; i < ncopies; ++i)
9247     {
           /* Only the first permute gets the new PHI result as its first
              operand here; all other vector operands are left NULL for
              now.  */
9248       vec_dest = make_ssa_name (vectype);
9249       gassign *vperm
9250           = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
9251                                  i == 0 ? gimple_phi_result (new_phi) : NULL,
9252                                  NULL, perm);
9253       vect_finish_stmt_generation (loop_vinfo, stmt_info, vperm, &gsi2);
9254
9255       if (slp_node)
9256         slp_node->push_vec_def (vperm);
9257       else
9258         STMT_VINFO_VEC_STMTS (stmt_info).safe_push (vperm);
9259     }
9260
9261   if (!slp_node)
9262     *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
9263   return true;
9264 }
9265
9266 /* Return true if VECTYPE has no vector mode and is not a vector of
9267    single-bit booleans, i.e. it needs the vector lowering pass.  */
9268
9269 bool
9270 vect_emulated_vector_p (tree vectype)
9271 {
9272   bool has_vector_mode = VECTOR_MODE_P (TYPE_MODE (vectype));
9273   return !has_vector_mode && !(VECTOR_BOOLEAN_TYPE_P (vectype)
9274                                && TYPE_PRECISION (TREE_TYPE (vectype)) == 1);
9275 }
9276
9277 /* Return true if CODE can be emulated on an integer mode representation
9278    of a vector.  Every code not tested below is rejected.  */
9279
9280 bool
9281 vect_can_vectorize_without_simd_p (tree_code code)
9282 {
9283   /* Additive arithmetic.  */
9284   const bool arith_p
9285     = (code == PLUS_EXPR
9286        || code == MINUS_EXPR
9287        || code == NEGATE_EXPR);
9288
9289   /* Bitwise logic.  */
9290   const bool bitwise_p
9291     = (code == BIT_AND_EXPR
9292        || code == BIT_IOR_EXPR
9293        || code == BIT_XOR_EXPR
9294        || code == BIT_NOT_EXPR);
9295
9296   return arith_p || bitwise_p;
9297 }
9298
9299 /* Return true if CODE is a tree code accepted by the tree_code overload.  */
9300
9301 bool
9302 vect_can_vectorize_without_simd_p (code_helper code)
9303 {
9304   bool tree_code_p = code.is_tree_code ();
9305   return tree_code_p && vect_can_vectorize_without_simd_p (tree_code (code));
9306 }
9307
9308 /* Create vector init for vectorized iv.

        Build, appending statements to STMTS, the initial vector of type
        VECTYPE for a nonlinear induction of kind INDUCTION_TYPE, with
        X = INIT_EXPR (converted to the element type) and S = STEP_EXPR:

          vect_step_op_shr/shl: a splat of X shifted element-wise by the
                                series [0, S, 2*S, ...]
          vect_step_op_neg:     [X, -X, X, -X, ...]
          vect_step_op_mul:     [X, X*S, X*S*S, ...], computed in the
                                corresponding unsigned type

        NUNITS is the vector element count used for the permute encoding
        and, for vect_step_op_mul, the number of elements built; it must be
        constant in that case.  Any other INDUCTION_TYPE is unreachable.
        Return the resulting vector.  */
9309 static tree
9310 vect_create_nonlinear_iv_init (gimple_seq* stmts, tree init_expr,
9311                                tree step_expr, poly_uint64 nunits,
9312                                tree vectype,
9313                                enum vect_induction_op_type induction_type)
9314 {
9315   unsigned HOST_WIDE_INT const_nunits;
9316   tree vec_shift, vec_init, new_name;
9317   unsigned i;
9318   tree itype = TREE_TYPE (vectype);
9319
9320   /* Create vec_init from X = init_expr and S = step_expr; the exact
9321      form depends on INDUCTION_TYPE, see the cases below.  */
9322   new_name = gimple_convert (stmts, itype, init_expr);
9323   switch (induction_type)
9324     {
9325     case vect_step_op_shr:
9326     case vect_step_op_shl:
9327       /* Build the initial value by shifting a splat of X by
             [0, S, 2*S, ...].  */
9328       vec_init = gimple_build_vector_from_val (stmts,
9329                                                vectype,
9330                                                new_name);
9331       vec_shift = gimple_build (stmts, VEC_SERIES_EXPR, vectype,
9332                                 build_zero_cst (itype), step_expr);
9333       vec_init = gimple_build (stmts,
9334                                (induction_type == vect_step_op_shr
9335                                 ? RSHIFT_EXPR : LSHIFT_EXPR),
9336                                vectype, vec_init, vec_shift);
9337       break;
9338
9339     case vect_step_op_neg:
9340       {
             /* Interleave a splat of X with its negation.  */
9341         vec_init = gimple_build_vector_from_val (stmts,
9342                                                  vectype,
9343                                                  new_name);
9344         tree vec_neg = gimple_build (stmts, NEGATE_EXPR,
9345                                      vectype, vec_init);
9346         /* The encoding has 2 interleaved stepped patterns.  */
9347         vec_perm_builder sel (nunits, 2, 3);
9348         sel.quick_grow (6);
9349         for (i = 0; i < 3; i++)
9350           {
9351             sel[2 * i] = i;
9352             sel[2 * i + 1] = i + nunits;
9353           }
9354         vec_perm_indices indices (sel, 2, nunits);
9355         /* Don't use vect_gen_perm_mask_checked since can_vec_perm_const_p may
9356            fail when vec_init is const vector. In that situation vec_perm is not
9357            really needed.  */
9358         tree perm_mask_even
9359           = vect_gen_perm_mask_any (vectype, indices);
9360         vec_init = gimple_build (stmts, VEC_PERM_EXPR,
9361                                  vectype,
9362                                  vec_init, vec_neg,
9363                                  perm_mask_even);
9364       }
9365       break;
9366
9367     case vect_step_op_mul:
9368       {
9369         /* Use unsigned mult to avoid UD integer overflow.  */
             /* NOTE(review): CONST_NUNITS is only written by the is_constant
                call inside the assert below -- confirm gcc_assert evaluates
                its argument in every build configuration.  */
9370         gcc_assert (nunits.is_constant (&const_nunits));
9371         tree utype = unsigned_type_for (itype);
9372         tree uvectype = build_vector_type (utype,
9373                                            TYPE_VECTOR_SUBPARTS (vectype));
9374         new_name = gimple_convert (stmts, utype, new_name);
9375         vec_init = gimple_build_vector_from_val (stmts,
9376                                                  uvectype,
9377                                                  new_name);
9378         tree_vector_builder elts (uvectype, const_nunits, 1);
9379         tree elt_step = build_one_cst (utype);
9380
9381         elts.quick_push (elt_step);
9382         for (i = 1; i < const_nunits; i++)
9383           {
9384             /* Create: elt_step_i = elt_step_(i-1) * step_expr.  */
9385             elt_step = gimple_build (stmts, MULT_EXPR,
9386                                      utype, elt_step, step_expr);
9387             elts.quick_push (elt_step);
9388           }
9389         /* Create a vector from [elt_step_0, elt_step_1, ...,
9390            elt_step_nunits-1].  */
9391         tree vec_mul = gimple_build_vector (stmts, &elts);
9392         vec_init = gimple_build (stmts, MULT_EXPR, uvectype,
9393                                  vec_init, vec_mul);
9394         vec_init = gimple_convert (stmts, vectype, vec_init);
9395       }
9396       break;
9397
9398     default:
9399       gcc_unreachable ();
9400     }
9401
9402   return vec_init;
9403 }
9404
9405 /* Peel init_expr by skip_niter for induction_type.
        Return the value a nonlinear IV of kind INDUCTION_TYPE has after
        SKIP_NITERS scalar iterations, starting from INIT_EXPR and applying
        STEP_EXPR once per iteration.  SKIP_NITERS must be an INTEGER_CST.
        Statements needed for the computation are built into STMTS.  */
9406 tree
9407 vect_peel_nonlinear_iv_init (gimple_seq* stmts, tree init_expr,
9408                              tree skip_niters, tree step_expr,
9409                              enum vect_induction_op_type induction_type)
9410 {
9411   gcc_assert (TREE_CODE (skip_niters) == INTEGER_CST);
9412   tree type = TREE_TYPE (init_expr);
9413   unsigned prec = TYPE_PRECISION (type);
9414   switch (induction_type)
9415     {
9416     case vect_step_op_neg:
9417       if (TREE_INT_CST_LOW (skip_niters) % 2)
9418         init_expr = gimple_build (stmts, NEGATE_EXPR, type, init_expr);
9419       /* else no change: an even number of negations cancels out.  */
9420       break;
9421
9422     case vect_step_op_shr:
9423     case vect_step_op_shl:
9424       skip_niters = gimple_convert (stmts, type, skip_niters);
9425       step_expr = gimple_build (stmts, MULT_EXPR, type, step_expr, skip_niters);
9426       /* When the total shift amount >= precision, avoid undefined behavior.
9427          In the original loop there is none, and according to its semantics
9428          init_expr should be 0 for lshr, ashl, and >>= (prec - 1) for ashr.  */
9429       if (!tree_fits_uhwi_p (step_expr)
9430           || tree_to_uhwi (step_expr) >= prec)
9431         {
9432           if (induction_type == vect_step_op_shl
9433               || TYPE_UNSIGNED (type))
9434             init_expr = build_zero_cst (type);
9435           else
9436             init_expr = gimple_build (stmts, RSHIFT_EXPR, type,
9437                                       init_expr,
9438                                       wide_int_to_tree (type, prec - 1));
9439         }
9440       else
9441         init_expr = gimple_build (stmts, (induction_type == vect_step_op_shr
9442                                           ? RSHIFT_EXPR : LSHIFT_EXPR),
9443                                   type, init_expr, step_expr);
9444       break;
9445
9446     case vect_step_op_mul:
9447       {
             /* Multiply in the unsigned type so that wrapping is well defined;
                the multiplier is pow (step, skip_niters) modulo 2^prec.  */
9448         tree utype = unsigned_type_for (type);
9449         init_expr = gimple_convert (stmts, utype, init_expr);
9450         wide_int skipn = wi::to_wide (skip_niters);
9451         wide_int begin = wi::to_wide (step_expr);
9452         auto_mpz base, exp, mod, res;
9453         wi::to_mpz (begin, base, TYPE_SIGN (type));
9454         wi::to_mpz (skipn, exp, UNSIGNED);
9455         mpz_ui_pow_ui (mod, 2, TYPE_PRECISION (type));
9456         mpz_powm (res, base, exp, mod);
             /* RES lies in [0, 2^prec), so read it back as an unsigned value.
                The third argument of wi::from_mpz is a WRAP flag, not a
                signop; without wrapping, a signed TYPE would clamp every
                value above its maximum.  */
9457         begin = wi::from_mpz (utype, res, true);
9458         tree mult_expr = wide_int_to_tree (utype, begin);
9459         init_expr = gimple_build (stmts, MULT_EXPR, utype,
9460                                   init_expr, mult_expr);
9461         init_expr = gimple_convert (stmts, type, init_expr);
9462       }
9463       break;
9464
9465     default:
9466       gcc_unreachable ();
9467     }
9468
9469   return init_expr;
9470 }
9471
9472 /* Return the scalar amount by which a nonlinear IV of kind INDUCTION_TYPE
9473    with step STEP_EXPR advances over VF iterations, or NULL for negation,
9474    which needs no step.  Statements, if any, are built into STMTS.  */
9475 static tree
9476 vect_create_nonlinear_iv_step (gimple_seq* stmts, tree step_expr,
9477                                poly_uint64 vf,
9478                                enum vect_induction_op_type induction_type)
9479 {
9480   tree step_type = TREE_TYPE (step_expr);
9481   tree vf_cst = build_int_cst (step_type, vf);
9482   switch (induction_type)
9483     {
9484     case vect_step_op_neg:
9485       return NULL;
9486     case vect_step_op_mul:
9487       {
9488         /* VF iterations multiply the IV by pow (step, VF).  */
9489         gcc_assert (vf.is_constant ());
9490         wide_int power = wi::to_wide (step_expr);
9491         for (unsigned k = 0; k != vf.to_constant () - 1; k++)
9492           power = wi::mul (power, wi::to_wide (step_expr));
9493         return wide_int_to_tree (step_type, power);
9494       }
9495     default:
9496       return gimple_build (stmts, MULT_EXPR, step_type, vf_cst, step_expr);
9497     }
9498 }
9499
9500 /* Broadcast scalar step NEW_NAME to a VECTYPE vector; NULL for negation.  */
9501 static tree
9502 vect_create_nonlinear_iv_vec_step (loop_vec_info loop_vinfo,
9503                                    stmt_vec_info stmt_info,
9504                                    tree new_name, tree vectype,
9505                                    enum vect_induction_op_type induction_type)
9506 {
9507   if (induction_type != vect_step_op_neg)
9508     {
9509       tree scalar_step = unshare_expr (new_name);
9510       gcc_assert (TREE_CODE (new_name) == SSA_NAME
9511                   || CONSTANT_CLASS_P (new_name));
9512       tree splat = build_vector_from_val (vectype, scalar_step);
9513       return vect_init_vector (loop_vinfo, stmt_info, splat, vectype, NULL);
9514     }
9515   /* Negation has no step vector.  */
9516   return NULL;
9517 }
9518
9519 /* Build into STMTS one update of the vector IV INDUC_DEF by VEC_STEP and
9520    return the updated value.  INDUCTION_TYPE selects the operation:
9521      vect_step_op_mul: INDUC_DEF * VEC_STEP, computed in the unsigned
9522        variant of VECTYPE and converted back to VECTYPE;
9523      vect_step_op_shr: INDUC_DEF >> VEC_STEP;
9524      vect_step_op_shl: INDUC_DEF << VEC_STEP;
9525      vect_step_op_neg: INDUC_DEF itself, VEC_STEP is not used.  */
9526 static tree
9527 vect_update_nonlinear_iv (gimple_seq* stmts, tree vectype,
9528                           tree induc_def, tree vec_step,
9529                           enum vect_induction_op_type induction_type)
9530 {
9531   switch (induction_type)
9532     {
9533     case vect_step_op_neg:
9534       /* Nothing to compute.  */
9535       return induc_def;
9536
9537     case vect_step_op_shr:
9538       return gimple_build (stmts, RSHIFT_EXPR, vectype,
9539                            induc_def, vec_step);
9540
9541     case vect_step_op_shl:
9542       return gimple_build (stmts, LSHIFT_EXPR, vectype,
9543                            induc_def, vec_step);
9544
9545     case vect_step_op_mul:
9546       {
9547         /* An unsigned multiplication avoids undefined overflow.  */
9548         tree elt_utype = unsigned_type_for (TREE_TYPE (vectype));
9549         tree uvectype
9550           = build_vector_type (elt_utype, TYPE_VECTOR_SUBPARTS (vectype));
9551         tree lhs = gimple_convert (stmts, uvectype, induc_def);
9552         tree rhs = gimple_convert (stmts, uvectype, vec_step);
9553         tree product = gimple_build (stmts, MULT_EXPR, uvectype, lhs, rhs);
9554         tree result = gimple_convert (stmts, vectype, product);
9555         return result;
9556       }
9557
9558     default:
9559       gcc_unreachable ();
9560     }
9561 }
9562
9563 /* Function vectorizable_nonlinear_induction
9564
9565    Check if STMT_INFO performs a nonlinear induction computation (neg, mul,
9566    shl or shr) that can be vectorized.  If VEC_STMT is NULL only analyze and
9567    record costs in COST_VEC; otherwise create the vector induction PHI in
9568    the loop header and store it in *VEC_STMT.  SLP_NODE is not supported.
9569    Return true if STMT_INFO is vectorizable in this way.  */
9570
9571 static bool
9572 vectorizable_nonlinear_induction (loop_vec_info loop_vinfo,
9573                                   stmt_vec_info stmt_info,
9574                                   gimple **vec_stmt, slp_tree slp_node,
9575                                   stmt_vector_for_cost *cost_vec)
9576 {
9577   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9578   unsigned ncopies;
9579   bool nested_in_vect_loop = false;
9580   class loop *iv_loop;
9581   tree vec_def;
9582   edge pe = loop_preheader_edge (loop);
9583   basic_block new_bb;
9584   tree vec_init, vec_step;
9585   tree new_name;
9586   gimple *new_stmt;
9587   gphi *induction_phi;
9588   tree induc_def, vec_dest;
9589   tree init_expr, step_expr;
9590   tree niters_skip;
9591   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9592   unsigned i;
9593   gimple_stmt_iterator si;
9594
9595   gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
9596
9597   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
9598   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
9599   enum vect_induction_op_type induction_type
9600     = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info);
9601
9602   gcc_assert (induction_type > vect_step_op_add);
9603
9604   if (slp_node)
9605     ncopies = 1;
9606   else
9607     ncopies = vect_get_num_copies (loop_vinfo, vectype);
9608   gcc_assert (ncopies >= 1);
9609
9610   /* FORNOW. Only handle nonlinear induction in the same loop.  */
9611   if (nested_in_vect_loop_p (loop, stmt_info))
9612     {
9613       if (dump_enabled_p ())
9614         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9615                          "nonlinear induction in nested loop.\n");
9616       return false;
9617     }
9618
9619   iv_loop = loop;
9620   gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
9621
9622   /* TODO: Support slp for nonlinear iv. There should be separate vector iv
9623      update for each iv and a permutation to generate wanted vector iv.  */
9624   if (slp_node)
9625     {
9626       if (dump_enabled_p ())
9627         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9628                          "SLP induction not supported for nonlinear"
9629                          " induction.\n");
9630       return false;
9631     }
9632
9633   if (!INTEGRAL_TYPE_P (TREE_TYPE (vectype)))
9634     {
9635       if (dump_enabled_p ())
9636         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9637                          "floating point nonlinear induction vectorization"
9638                          " not supported.\n");
9639       return false;
9640     }
9641
9642   step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
9643   init_expr = vect_phi_initial_value (phi);
9644   gcc_assert (step_expr != NULL_TREE && init_expr != NULL
9645               && TREE_CODE (step_expr) == INTEGER_CST);
9646   /* step_expr should have the same type as init_expr, e.g. for
9647      uint64 a >> 1 the step is int, but a vector<uint64> shift is used.  */
9648   step_expr = fold_convert (TREE_TYPE (vectype), step_expr);
9649
9650   if (TREE_CODE (init_expr) == INTEGER_CST)
9651     init_expr = fold_convert (TREE_TYPE (vectype), init_expr);
9652   else if (!tree_nop_conversion_p (TREE_TYPE (vectype), TREE_TYPE (init_expr)))
9653     {
9654       /* INIT_EXPR could be a bit_field, bail out for such case.  */
9655       if (dump_enabled_p ())
9656         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9657                          "nonlinear induction vectorization failed:"
9658                          " component type of vectype is not a nop conversion"
9659                          " from type of init_expr.\n");
9660       return false;
9661     }
9662
9663   switch (induction_type)
9664     {
9665     case vect_step_op_neg:
9666       if (TREE_CODE (init_expr) != INTEGER_CST
9667           && TREE_CODE (init_expr) != REAL_CST)
9668         {
9669           /* Check for backend support of NEGATE_EXPR and vec_perm.  */
9670           if (!directly_supported_p (NEGATE_EXPR, vectype))
9671             return false;
9672
9673           /* The encoding has 2 interleaved stepped patterns.  */
9674           vec_perm_builder sel (nunits, 2, 3);
9675           machine_mode mode = TYPE_MODE (vectype);
9676           sel.quick_grow (6);
9677           for (i = 0; i < 3; i++)
9678             {
9679               sel[i * 2] = i;
9680               sel[i * 2 + 1] = i + nunits;
9681             }
9682           vec_perm_indices indices (sel, 2, nunits);
9683           if (!can_vec_perm_const_p (mode, mode, indices))
9684             return false;
9685         }
9686       break;
9687
9688     case vect_step_op_mul:
9689       {
9690         /* Check for backend support of MULT_EXPR.  */
9691         if (!directly_supported_p (MULT_EXPR, vectype))
9692           return false;
9693
9694         /* ??? How to construct the vector step for variable-length vectors:
9695            [ 1, step, pow (step, 2), pow (step, 3), .. ].  */
9696         if (!vf.is_constant ())
9697           return false;
9698       }
9699       break;
9700
9701     case vect_step_op_shr:
9702       /* Check for backend support of RSHIFT_EXPR.  */
9703       if (!directly_supported_p (RSHIFT_EXPR, vectype, optab_vector))
9704         return false;
9705
9706       /* Don't shift more than type precision to avoid undefined behavior.
              NOTE(review): this bounds NUNITS * step, while the loop-carried
              step built in the transform below is VF * step -- confirm the
              latter is also below the precision when ncopies > 1.  */
9707       if (!tree_fits_uhwi_p (step_expr)
9708           || maybe_ge (nunits * tree_to_uhwi (step_expr),
9709                        TYPE_PRECISION (TREE_TYPE (init_expr))))
9710         return false;
9711       break;
9712
9713     case vect_step_op_shl:
9714       /* Check for backend support of LSHIFT_EXPR.  */
9715       if (!directly_supported_p (LSHIFT_EXPR, vectype, optab_vector))
9716         return false;
9717
9718       /* Don't shift more than type precision to avoid undefined behavior.
              NOTE(review): same NUNITS versus VF question as for the
              vect_step_op_shr case.  */
9719       if (!tree_fits_uhwi_p (step_expr)
9720           || maybe_ge (nunits * tree_to_uhwi (step_expr),
9721                        TYPE_PRECISION (TREE_TYPE (init_expr))))
9722         return false;
9723
9724       break;
9725
9726     default:
9727       gcc_unreachable ();
9728     }
9729
9730   if (!vec_stmt) /* transformation not required.  */
9731     {
9732       unsigned inside_cost = 0, prologue_cost = 0;
9733       /* loop cost for vec_loop: one vector stmt per copy for the
9734          iv update.  */
9735       inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
9736                                       stmt_info, 0, vect_body);
9737
9738       /* Neg induction doesn't have any inside_cost.  This only resets the
9739          value dumped below; record_stmt_cost was already called above.  */
9740       if (induction_type == vect_step_op_neg)
9741         inside_cost = 0;
9742
9743       /* prologue cost for vec_init and vec_step.  */
9744       prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
9745                                         stmt_info, 0, vect_prologue);
9746
9747       if (dump_enabled_p ())
9748         dump_printf_loc (MSG_NOTE, vect_location,
9749                          "vect_model_induction_cost: inside_cost = %d, "
9750                          "prologue_cost = %d. \n", inside_cost,
9751                          prologue_cost);
9752
9753       STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
9754       DUMP_VECT_SCOPE ("vectorizable_nonlinear_induction");
9755       return true;
9756     }
9757
9758   /* Transform.  */
9759
9760   /* Compute a vector variable, initialized with the first NUNITS values
9761      of the induction variable.  E.g., for a mult iv with IV_PHI='X' and
9762      step S, for a vector of 4 units, we want to compute:
9763      [X, X * S, X * S^2, X * S^3].  */
9764
9765   if (dump_enabled_p ())
9766     dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
9767
9768   pe = loop_preheader_edge (iv_loop);
9769   /* Find the first insertion point in the BB.  */
9770   basic_block bb = gimple_bb (phi);
9771   si = gsi_after_labels (bb);
9772
9773   gimple_seq stmts = NULL;
9774
9775   niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
9776   /* If we are using the loop mask to "peel" for alignment then we need
9777      to adjust the start value here.  */
9778   if (niters_skip != NULL_TREE)
9779     init_expr = vect_peel_nonlinear_iv_init (&stmts, init_expr, niters_skip,
9780                                              step_expr, induction_type);
9781
9782   vec_init = vect_create_nonlinear_iv_init (&stmts, init_expr,
9783                                             step_expr, nunits, vectype,
9784                                             induction_type);
9785   if (stmts)
9786     {
9787       new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
9788       gcc_assert (!new_bb);
9789     }
9790
9791   stmts = NULL;
9792   new_name = vect_create_nonlinear_iv_step (&stmts, step_expr,
9793                                             vf, induction_type);
9794   if (stmts)
9795     {
9796       new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
9797       gcc_assert (!new_bb);
9798     }
9799
9800   vec_step = vect_create_nonlinear_iv_vec_step (loop_vinfo, stmt_info,
9801                                                 new_name, vectype,
9802                                                 induction_type);
9803   /* Create the following def-use cycle:
9804      loop prolog:
9805      vec_init = ...
9806      vec_step = ...
9807      loop:
9808      vec_iv = PHI <vec_init, vec_loop>
9809      ...
9810      STMT
9811      ...
9812      vec_loop = vec_iv OP vec_step;  (OP per vect_update_nonlinear_iv)  */
9813
9814   /* Create the induction-phi that defines the induction-operand.  */
9815   vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
9816   induction_phi = create_phi_node (vec_dest, iv_loop->header);
9817   induc_def = PHI_RESULT (induction_phi);
9818
9819   /* Create the iv update inside the loop.  */
9820   stmts = NULL;
9821   vec_def = vect_update_nonlinear_iv (&stmts, vectype,
9822                                       induc_def, vec_step,
9823                                       induction_type);
9824
9825   gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9826   new_stmt = SSA_NAME_DEF_STMT (vec_def);
9827
9828   /* Set the arguments of the phi node:  */
9829   add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
9830   add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
9831                UNKNOWN_LOCATION);
9832
9833   STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi);
9834   *vec_stmt = induction_phi;
9835
9836   /* In case that vectorization factor (VF) is bigger than the number
9837      of elements that we can fit in a vectype (nunits), we have to generate
9838      more than one vector stmt - i.e - we need to "unroll" the
9839      vector stmt by a factor VF/nunits.  For more details see documentation
9840      in vectorizable_operation.  */
9841
9842   if (ncopies > 1)
9843     {
9844       stmts = NULL;
9845       /* FORNOW. This restriction should be relaxed.  */
9846       gcc_assert (!nested_in_vect_loop);
9847
9848       new_name = vect_create_nonlinear_iv_step (&stmts, step_expr,
9849                                                 nunits, induction_type);
9850
9851       vec_step = vect_create_nonlinear_iv_vec_step (loop_vinfo, stmt_info,
9852                                                     new_name, vectype,
9853                                                     induction_type);
9854       vec_def = induc_def;
9855       for (i = 1; i < ncopies; i++)
9856         {
9857           /* vec_i = vec_prev OP vec_step, stepping by NUNITS iterations.  */
9858           stmts = NULL;
9859           vec_def = vect_update_nonlinear_iv (&stmts, vectype,
9860                                               vec_def, vec_step,
9861                                               induction_type);
9862           gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9863           new_stmt = SSA_NAME_DEF_STMT (vec_def);
9864           STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
9865         }
9866     }
9867
9868   if (dump_enabled_p ())
9869     dump_printf_loc (MSG_NOTE, vect_location,
9870                      "transform induction: created def-use cycle: %G%G",
9871                      (gimple *) induction_phi, SSA_NAME_DEF_STMT (vec_def));
9872
9873   return true;
9874 }
9875
9876 /* Function vectorizable_induction
9877
9878    Check if STMT_INFO performs an induction computation that can be vectorized.
9879    If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
9880    phi to replace it, put it in VEC_STMT, and add it to the same basic block.
9881    Return true if STMT_INFO is vectorizable in this way.  */
9882
9883 bool
9884 vectorizable_induction (loop_vec_info loop_vinfo,
9885                         stmt_vec_info stmt_info,
9886                         gimple **vec_stmt, slp_tree slp_node,
9887                         stmt_vector_for_cost *cost_vec)
9888 {
9889   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9890   unsigned ncopies;
9891   bool nested_in_vect_loop = false;
9892   class loop *iv_loop;
9893   tree vec_def;
9894   edge pe = loop_preheader_edge (loop);
9895   basic_block new_bb;
9896   tree new_vec, vec_init, vec_step, t;
9897   tree new_name;
9898   gimple *new_stmt;
9899   gphi *induction_phi;
9900   tree induc_def, vec_dest;
9901   tree init_expr, step_expr;
9902   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9903   unsigned i;
9904   tree expr;
9905   gimple_stmt_iterator si;
9906   enum vect_induction_op_type induction_type
9907     = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info);
9908
9909   gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
9910   if (!phi)
9911     return false;
9912
9913   if (!STMT_VINFO_RELEVANT_P (stmt_info))
9914     return false;
9915
9916   /* Make sure it was recognized as induction computation.  */
9917   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
9918     return false;
9919
9920   /* Handle nonlinear induction in a separate place.  */
9921   if (induction_type != vect_step_op_add)
9922     return vectorizable_nonlinear_induction (loop_vinfo, stmt_info,
9923                                              vec_stmt, slp_node, cost_vec);
9924
9925   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
9926   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
9927
9928   if (slp_node)
9929     ncopies = 1;
9930   else
9931     ncopies = vect_get_num_copies (loop_vinfo, vectype);
9932   gcc_assert (ncopies >= 1);
9933
9934   /* FORNOW. These restrictions should be relaxed.  */
9935   if (nested_in_vect_loop_p (loop, stmt_info))
9936     {
9937       imm_use_iterator imm_iter;
9938       use_operand_p use_p;
9939       gimple *exit_phi;
9940       edge latch_e;
9941       tree loop_arg;
9942
9943       if (ncopies > 1)
9944         {
9945           if (dump_enabled_p ())
9946             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9947                              "multiple types in nested loop.\n");
9948           return false;
9949         }
9950
9951       exit_phi = NULL;
9952       latch_e = loop_latch_edge (loop->inner);
9953       loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
9954       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
9955         {
9956           gimple *use_stmt = USE_STMT (use_p);
9957           if (is_gimple_debug (use_stmt))
9958             continue;
9959
9960           if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
9961             {
9962               exit_phi = use_stmt;
9963               break;
9964             }
9965         }
9966       if (exit_phi)
9967         {
9968           stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
9969           if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
9970                 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
9971             {
9972               if (dump_enabled_p ())
9973                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9974                                  "inner-loop induction only used outside "
9975                                  "of the outer vectorized loop.\n");
9976               return false;
9977             }
9978         }
9979
9980       nested_in_vect_loop = true;
9981       iv_loop = loop->inner;
9982     }
9983   else
9984     iv_loop = loop;
9985   gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
9986
9987   if (slp_node && !nunits.is_constant ())
9988     {
9989       /* The current SLP code creates the step value element-by-element.  */
9990       if (dump_enabled_p ())
9991         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9992                          "SLP induction not supported for variable-length"
9993                          " vectors.\n");
9994       return false;
9995     }
9996
9997   if (FLOAT_TYPE_P (vectype) && !param_vect_induction_float)
9998     {
9999       if (dump_enabled_p ())
10000         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10001                          "floating point induction vectorization disabled\n");
10002       return false;
10003     }
10004
10005   step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
10006   gcc_assert (step_expr != NULL_TREE);
10007   if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
10008       && !type_has_mode_precision_p (TREE_TYPE (step_expr)))
10009     {
10010       if (dump_enabled_p ())
10011         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10012                          "bit-precision induction vectorization not "
10013                          "supported.\n");
10014       return false;
10015     }
10016   tree step_vectype = get_same_sized_vectype (TREE_TYPE (step_expr), vectype);
10017
10018   /* Check for backend support of PLUS/MINUS_EXPR. */
10019   if (!directly_supported_p (PLUS_EXPR, step_vectype)
10020       || !directly_supported_p (MINUS_EXPR, step_vectype))
10021     return false;
10022
10023   if (!vec_stmt) /* transformation not required.  */
10024     {
10025       unsigned inside_cost = 0, prologue_cost = 0;
10026       if (slp_node)
10027         {
10028           /* We eventually need to set a vector type on invariant
10029              arguments.  */
10030           unsigned j;
10031           slp_tree child;
10032           FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
10033             if (!vect_maybe_update_slp_op_vectype
10034                 (child, SLP_TREE_VECTYPE (slp_node)))
10035               {
10036                 if (dump_enabled_p ())
10037                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10038                                    "incompatible vector types for "
10039                                    "invariants\n");
10040                 return false;
10041               }
10042           /* loop cost for vec_loop.  */
10043           inside_cost
10044             = record_stmt_cost (cost_vec,
10045                                 SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
10046                                 vector_stmt, stmt_info, 0, vect_body);
10047           /* prologue cost for vec_init (if not nested) and step.  */
10048           prologue_cost = record_stmt_cost (cost_vec, 1 + !nested_in_vect_loop,
10049                                             scalar_to_vec,
10050                                             stmt_info, 0, vect_prologue);
10051         }
10052       else /* if (!slp_node) */
10053         {
10054           /* loop cost for vec_loop.  */
10055           inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
10056                                           stmt_info, 0, vect_body);
10057           /* prologue cost for vec_init and vec_step.  */
10058           prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
10059                                             stmt_info, 0, vect_prologue);
10060         }
10061       if (dump_enabled_p ())
10062         dump_printf_loc (MSG_NOTE, vect_location,
10063                          "vect_model_induction_cost: inside_cost = %d, "
10064                          "prologue_cost = %d .\n", inside_cost,
10065                          prologue_cost);
10066
10067       STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
10068       DUMP_VECT_SCOPE ("vectorizable_induction");
10069       return true;
10070     }
10071
10072   /* Transform.  */
10073
10074   /* Compute a vector variable, initialized with the first VF values of
10075      the induction variable.  E.g., for an iv with IV_PHI='X' and
10076      evolution S, for a vector of 4 units, we want to compute:
10077      [X, X + S, X + 2*S, X + 3*S].  */
10078
10079   if (dump_enabled_p ())
10080     dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
10081
10082   pe = loop_preheader_edge (iv_loop);
10083   /* Find the first insertion point in the BB.  */
10084   basic_block bb = gimple_bb (phi);
10085   si = gsi_after_labels (bb);
10086
10087   /* For SLP induction we have to generate several IVs as for example
10088      with group size 3 we need
10089        [i0, i1, i2, i0 + S0] [i1 + S1, i2 + S2, i0 + 2*S0, i1 + 2*S1]
10090        [i2 + 2*S2, i0 + 3*S0, i1 + 3*S1, i2 + 3*S2].  */
10091   if (slp_node)
10092     {
10093       /* Enforced above.  */
10094       unsigned int const_nunits = nunits.to_constant ();
10095
10096       /* The initial values are vectorized, but any lanes > group_size
10097          need adjustment.  */
10098       slp_tree init_node
10099         = SLP_TREE_CHILDREN (slp_node)[pe->dest_idx];
10100
10101       /* Gather steps.  Since we do not vectorize inductions as
10102          cycles we have to reconstruct the step from SCEV data.  */
10103       unsigned group_size = SLP_TREE_LANES (slp_node);
10104       tree *steps = XALLOCAVEC (tree, group_size);
10105       tree *inits = XALLOCAVEC (tree, group_size);
10106       stmt_vec_info phi_info;
10107       FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (slp_node), i, phi_info)
10108         {
10109           steps[i] = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
10110           if (!init_node)
10111             inits[i] = gimple_phi_arg_def (as_a<gphi *> (phi_info->stmt),
10112                                            pe->dest_idx);
10113         }
10114
10115       /* Now generate the IVs.  */
10116       unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
10117       gcc_assert ((const_nunits * nvects) % group_size == 0);
10118       unsigned nivs;
10119       if (nested_in_vect_loop)
10120         nivs = nvects;
10121       else
10122         {
10123           /* Compute the number of distinct IVs we need.  First reduce
10124              group_size if it is a multiple of const_nunits so we get
10125              one IV for a group_size of 4 but const_nunits 2.  */
10126           unsigned group_sizep = group_size;
10127           if (group_sizep % const_nunits == 0)
10128             group_sizep = group_sizep / const_nunits;
10129           nivs = least_common_multiple (group_sizep,
10130                                         const_nunits) / const_nunits;
10131         }
10132       tree stept = TREE_TYPE (step_vectype);
10133       tree lupdate_mul = NULL_TREE;
10134       if (!nested_in_vect_loop)
10135         {
10136           /* The number of iterations covered in one vector iteration.  */
10137           unsigned lup_mul = (nvects * const_nunits) / group_size;
10138           lupdate_mul
10139             = build_vector_from_val (step_vectype,
10140                                      SCALAR_FLOAT_TYPE_P (stept)
10141                                      ? build_real_from_wide (stept, lup_mul,
10142                                                              UNSIGNED)
10143                                      : build_int_cstu (stept, lup_mul));
10144         }
10145       tree peel_mul = NULL_TREE;
10146       gimple_seq init_stmts = NULL;
10147       if (LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo))
10148         {
10149           if (SCALAR_FLOAT_TYPE_P (stept))
10150             peel_mul = gimple_build (&init_stmts, FLOAT_EXPR, stept,
10151                                      LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
10152           else
10153             peel_mul = gimple_convert (&init_stmts, stept,
10154                                        LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
10155           peel_mul = gimple_build_vector_from_val (&init_stmts,
10156                                                    step_vectype, peel_mul);
10157         }
10158       unsigned ivn;
10159       auto_vec<tree> vec_steps;
10160       for (ivn = 0; ivn < nivs; ++ivn)
10161         {
10162           tree_vector_builder step_elts (step_vectype, const_nunits, 1);
10163           tree_vector_builder init_elts (vectype, const_nunits, 1);
10164           tree_vector_builder mul_elts (step_vectype, const_nunits, 1);
10165           for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
10166             {
10167               /* The scalar steps of the IVs.  */
10168               tree elt = steps[(ivn*const_nunits + eltn) % group_size];
10169               elt = gimple_convert (&init_stmts, TREE_TYPE (step_vectype), elt);
10170               step_elts.quick_push (elt);
10171               if (!init_node)
10172                 {
10173                   /* The scalar inits of the IVs if not vectorized.  */
10174                   elt = inits[(ivn*const_nunits + eltn) % group_size];
10175                   if (!useless_type_conversion_p (TREE_TYPE (vectype),
10176                                                   TREE_TYPE (elt)))
10177                     elt = gimple_build (&init_stmts, VIEW_CONVERT_EXPR,
10178                                         TREE_TYPE (vectype), elt);
10179                   init_elts.quick_push (elt);
10180                 }
10181               /* The number of steps to add to the initial values.  */
10182               unsigned mul_elt = (ivn*const_nunits + eltn) / group_size;
10183               mul_elts.quick_push (SCALAR_FLOAT_TYPE_P (stept)
10184                                    ? build_real_from_wide (stept,
10185                                                            mul_elt, UNSIGNED)
10186                                    : build_int_cstu (stept, mul_elt));
10187             }
10188           vec_step = gimple_build_vector (&init_stmts, &step_elts);
10189           vec_steps.safe_push (vec_step);
10190           tree step_mul = gimple_build_vector (&init_stmts, &mul_elts);
10191           if (peel_mul)
10192             step_mul = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
10193                                      step_mul, peel_mul);
10194           if (!init_node)
10195             vec_init = gimple_build_vector (&init_stmts, &init_elts);
10196
10197           /* Create the induction-phi that defines the induction-operand.  */
10198           vec_dest = vect_get_new_vect_var (vectype, vect_simple_var,
10199                                             "vec_iv_");
10200           induction_phi = create_phi_node (vec_dest, iv_loop->header);
10201           induc_def = PHI_RESULT (induction_phi);
10202
10203           /* Create the iv update inside the loop  */
10204           tree up = vec_step;
10205           if (lupdate_mul)
10206             up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
10207                                vec_step, lupdate_mul);
10208           gimple_seq stmts = NULL;
10209           vec_def = gimple_convert (&stmts, step_vectype, induc_def);
10210           vec_def = gimple_build (&stmts,
10211                                   PLUS_EXPR, step_vectype, vec_def, up);
10212           vec_def = gimple_convert (&stmts, vectype, vec_def);
10213           gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10214           add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
10215                        UNKNOWN_LOCATION);
10216
10217           if (init_node)
10218             vec_init = vect_get_slp_vect_def (init_node, ivn);
10219           if (!nested_in_vect_loop
10220               && !integer_zerop (step_mul))
10221             {
10222               vec_def = gimple_convert (&init_stmts, step_vectype, vec_init);
10223               up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
10224                                  vec_step, step_mul);
10225               vec_def = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
10226                                       vec_def, up);
10227               vec_init = gimple_convert (&init_stmts, vectype, vec_def);
10228             }
10229
10230           /* Set the arguments of the phi node:  */
10231           add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
10232
10233           slp_node->push_vec_def (induction_phi);
10234         }
10235       if (!nested_in_vect_loop)
10236         {
10237           /* Fill up to the number of vectors we need for the whole group.  */
10238           nivs = least_common_multiple (group_size,
10239                                         const_nunits) / const_nunits;
10240           vec_steps.reserve (nivs-ivn);
10241           for (; ivn < nivs; ++ivn)
10242             {
10243               slp_node->push_vec_def (SLP_TREE_VEC_DEFS (slp_node)[0]);
10244               vec_steps.quick_push (vec_steps[0]);
10245             }
10246         }
10247
10248       /* Re-use IVs when we can.  We are generating further vector
10249          stmts by adding VF' * stride to the IVs generated above.  */
10250       if (ivn < nvects)
10251         {
10252           unsigned vfp
10253             = least_common_multiple (group_size, const_nunits) / group_size;
10254           tree lupdate_mul
10255             = build_vector_from_val (step_vectype,
10256                                      SCALAR_FLOAT_TYPE_P (stept)
10257                                      ? build_real_from_wide (stept,
10258                                                              vfp, UNSIGNED)
10259                                      : build_int_cstu (stept, vfp));
10260           for (; ivn < nvects; ++ivn)
10261             {
10262               gimple *iv
10263                 = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (slp_node)[ivn - nivs]);
10264               tree def = gimple_get_lhs (iv);
10265               if (ivn < 2*nivs)
10266                 vec_steps[ivn - nivs]
10267                   = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
10268                                   vec_steps[ivn - nivs], lupdate_mul);
10269               gimple_seq stmts = NULL;
10270               def = gimple_convert (&stmts, step_vectype, def);
10271               def = gimple_build (&stmts, PLUS_EXPR, step_vectype,
10272                                   def, vec_steps[ivn % nivs]);
10273               def = gimple_convert (&stmts, vectype, def);
10274               if (gimple_code (iv) == GIMPLE_PHI)
10275                 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10276               else
10277                 {
10278                   gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
10279                   gsi_insert_seq_after (&tgsi, stmts, GSI_CONTINUE_LINKING);
10280                 }
10281               slp_node->push_vec_def (def);
10282             }
10283         }
10284
10285       new_bb = gsi_insert_seq_on_edge_immediate (pe, init_stmts);
10286       gcc_assert (!new_bb);
10287
10288       return true;
10289     }
10290
10291   init_expr = vect_phi_initial_value (phi);
10292
10293   gimple_seq stmts = NULL;
10294   if (!nested_in_vect_loop)
10295     {
10296       /* Convert the initial value to the IV update type.  */
10297       tree new_type = TREE_TYPE (step_expr);
10298       init_expr = gimple_convert (&stmts, new_type, init_expr);
10299
10300       /* If we are using the loop mask to "peel" for alignment then we need
10301          to adjust the start value here.  */
10302       tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
10303       if (skip_niters != NULL_TREE)
10304         {
10305           if (FLOAT_TYPE_P (vectype))
10306             skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
10307                                         skip_niters);
10308           else
10309             skip_niters = gimple_convert (&stmts, new_type, skip_niters);
10310           tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
10311                                          skip_niters, step_expr);
10312           init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
10313                                     init_expr, skip_step);
10314         }
10315     }
10316
10317   if (stmts)
10318     {
10319       new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
10320       gcc_assert (!new_bb);
10321     }
10322
10323   /* Create the vector that holds the initial_value of the induction.  */
10324   if (nested_in_vect_loop)
10325     {
10326       /* iv_loop is nested in the loop to be vectorized.  init_expr had already
10327          been created during vectorization of previous stmts.  We obtain it
10328          from the STMT_VINFO_VEC_STMT of the defining stmt.  */
10329       auto_vec<tree> vec_inits;
10330       vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
10331                                      init_expr, &vec_inits);
10332       vec_init = vec_inits[0];
10333       /* If the initial value is not of proper type, convert it.  */
10334       if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
10335         {
10336           new_stmt
10337             = gimple_build_assign (vect_get_new_ssa_name (vectype,
10338                                                           vect_simple_var,
10339                                                           "vec_iv_"),
10340                                    VIEW_CONVERT_EXPR,
10341                                    build1 (VIEW_CONVERT_EXPR, vectype,
10342                                            vec_init));
10343           vec_init = gimple_assign_lhs (new_stmt);
10344           new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
10345                                                  new_stmt);
10346           gcc_assert (!new_bb);
10347         }
10348     }
10349   else
10350     {
10351       /* iv_loop is the loop to be vectorized. Create:
10352          vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr)  */
10353       stmts = NULL;
10354       new_name = gimple_convert (&stmts, TREE_TYPE (step_expr), init_expr);
10355
10356       unsigned HOST_WIDE_INT const_nunits;
10357       if (nunits.is_constant (&const_nunits))
10358         {
10359           tree_vector_builder elts (step_vectype, const_nunits, 1);
10360           elts.quick_push (new_name);
10361           for (i = 1; i < const_nunits; i++)
10362             {
10363               /* Create: new_name_i = new_name + step_expr  */
10364               new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
10365                                        new_name, step_expr);
10366               elts.quick_push (new_name);
10367             }
10368           /* Create a vector from [new_name_0, new_name_1, ...,
10369              new_name_nunits-1]  */
10370           vec_init = gimple_build_vector (&stmts, &elts);
10371         }
10372       else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
10373         /* Build the initial value directly from a VEC_SERIES_EXPR.  */
10374         vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, step_vectype,
10375                                  new_name, step_expr);
10376       else
10377         {
10378           /* Build:
10379                 [base, base, base, ...]
10380                 + (vectype) [0, 1, 2, ...] * [step, step, step, ...].  */
10381           gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
10382           gcc_assert (flag_associative_math);
10383           tree index = build_index_vector (step_vectype, 0, 1);
10384           tree base_vec = gimple_build_vector_from_val (&stmts, step_vectype,
10385                                                         new_name);
10386           tree step_vec = gimple_build_vector_from_val (&stmts, step_vectype,
10387                                                         step_expr);
10388           vec_init = gimple_build (&stmts, FLOAT_EXPR, step_vectype, index);
10389           vec_init = gimple_build (&stmts, MULT_EXPR, step_vectype,
10390                                    vec_init, step_vec);
10391           vec_init = gimple_build (&stmts, PLUS_EXPR, step_vectype,
10392                                    vec_init, base_vec);
10393         }
10394       vec_init = gimple_convert (&stmts, vectype, vec_init);
10395
10396       if (stmts)
10397         {
10398           new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
10399           gcc_assert (!new_bb);
10400         }
10401     }
10402
10403
10404   /* Create the vector that holds the step of the induction.  */
10405   gimple_stmt_iterator *step_iv_si = NULL;
10406   if (nested_in_vect_loop)
10407     /* iv_loop is nested in the loop to be vectorized. Generate:
10408        vec_step = [S, S, S, S]  */
10409     new_name = step_expr;
10410   else if (LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo))
10411     {
10412       /* When we're using loop_len produced by SELEC_VL, the non-final
10413          iterations are not always processing VF elements.  So vectorize
10414          induction variable instead of
10415
10416            _21 = vect_vec_iv_.6_22 + { VF, ... };
10417
10418          We should generate:
10419
10420            _35 = .SELECT_VL (ivtmp_33, VF);
10421            vect_cst__22 = [vec_duplicate_expr] _35;
10422            _21 = vect_vec_iv_.6_22 + vect_cst__22;  */
10423       gcc_assert (!slp_node);
10424       gimple_seq seq = NULL;
10425       vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
10426       tree len = vect_get_loop_len (loop_vinfo, NULL, lens, 1, vectype, 0, 0);
10427       expr = force_gimple_operand (fold_convert (TREE_TYPE (step_expr),
10428                                                  unshare_expr (len)),
10429                                    &seq, true, NULL_TREE);
10430       new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr), expr,
10431                                step_expr);
10432       gsi_insert_seq_before (&si, seq, GSI_SAME_STMT);
10433       step_iv_si = &si;
10434     }
10435   else
10436     {
10437       /* iv_loop is the loop to be vectorized. Generate:
10438           vec_step = [VF*S, VF*S, VF*S, VF*S]  */
10439       gimple_seq seq = NULL;
10440       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
10441         {
10442           expr = build_int_cst (integer_type_node, vf);
10443           expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
10444         }
10445       else
10446         expr = build_int_cst (TREE_TYPE (step_expr), vf);
10447       new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
10448                                expr, step_expr);
10449       if (seq)
10450         {
10451           new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
10452           gcc_assert (!new_bb);
10453         }
10454     }
10455
10456   t = unshare_expr (new_name);
10457   gcc_assert (CONSTANT_CLASS_P (new_name)
10458               || TREE_CODE (new_name) == SSA_NAME);
10459   new_vec = build_vector_from_val (step_vectype, t);
10460   vec_step = vect_init_vector (loop_vinfo, stmt_info,
10461                                new_vec, step_vectype, step_iv_si);
10462
10463
10464   /* Create the following def-use cycle:
10465      loop prolog:
10466          vec_init = ...
10467          vec_step = ...
10468      loop:
10469          vec_iv = PHI <vec_init, vec_loop>
10470          ...
10471          STMT
10472          ...
10473          vec_loop = vec_iv + vec_step;  */
10474
10475   /* Create the induction-phi that defines the induction-operand.  */
10476   vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
10477   induction_phi = create_phi_node (vec_dest, iv_loop->header);
10478   induc_def = PHI_RESULT (induction_phi);
10479
10480   /* Create the iv update inside the loop  */
10481   stmts = NULL;
10482   vec_def = gimple_convert (&stmts, step_vectype, induc_def);
10483   vec_def = gimple_build (&stmts, PLUS_EXPR, step_vectype, vec_def, vec_step);
10484   vec_def = gimple_convert (&stmts, vectype, vec_def);
10485   gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10486   new_stmt = SSA_NAME_DEF_STMT (vec_def);
10487
10488   /* Set the arguments of the phi node:  */
10489   add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
10490   add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
10491                UNKNOWN_LOCATION);
10492
10493   STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi);
10494   *vec_stmt = induction_phi;
10495
10496   /* In case that vectorization factor (VF) is bigger than the number
10497      of elements that we can fit in a vectype (nunits), we have to generate
10498      more than one vector stmt - i.e - we need to "unroll" the
10499      vector stmt by a factor VF/nunits.  For more details see documentation
10500      in vectorizable_operation.  */
10501
10502   if (ncopies > 1)
10503     {
10504       gimple_seq seq = NULL;
10505       /* FORNOW. This restriction should be relaxed.  */
10506       gcc_assert (!nested_in_vect_loop);
10507       /* We expect LOOP_VINFO_USING_SELECT_VL_P to be false if ncopies > 1.  */
10508       gcc_assert (!LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo));
10509
10510       /* Create the vector that holds the step of the induction.  */
10511       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
10512         {
10513           expr = build_int_cst (integer_type_node, nunits);
10514           expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
10515         }
10516       else
10517         expr = build_int_cst (TREE_TYPE (step_expr), nunits);
10518       new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
10519                                expr, step_expr);
10520       if (seq)
10521         {
10522           new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
10523           gcc_assert (!new_bb);
10524         }
10525
10526       t = unshare_expr (new_name);
10527       gcc_assert (CONSTANT_CLASS_P (new_name)
10528                   || TREE_CODE (new_name) == SSA_NAME);
10529       new_vec = build_vector_from_val (step_vectype, t);
10530       vec_step = vect_init_vector (loop_vinfo, stmt_info,
10531                                    new_vec, step_vectype, NULL);
10532
10533       vec_def = induc_def;
10534       for (i = 1; i < ncopies + 1; i++)
10535         {
10536           /* vec_i = vec_prev + vec_step  */
10537           gimple_seq stmts = NULL;
10538           vec_def = gimple_convert (&stmts, step_vectype, vec_def);
10539           vec_def = gimple_build (&stmts,
10540                                   PLUS_EXPR, step_vectype, vec_def, vec_step);
10541           vec_def = gimple_convert (&stmts, vectype, vec_def);
10542
10543           gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10544           if (i < ncopies)
10545             {
10546               new_stmt = SSA_NAME_DEF_STMT (vec_def);
10547               STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
10548             }
10549           else
10550             {
10551               /* vec_1 = vec_iv + (VF/n * S)
10552                  vec_2 = vec_1 + (VF/n * S)
10553                  ...
10554                  vec_n = vec_prev + (VF/n * S) = vec_iv + VF * S = vec_loop
10555
10556                  vec_n is used as vec_loop to save the large step register and
10557                  related operations.  */
10558               add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
10559                            UNKNOWN_LOCATION);
10560             }
10561         }
10562     }
10563
10564   if (dump_enabled_p ())
10565     dump_printf_loc (MSG_NOTE, vect_location,
10566                      "transform induction: created def-use cycle: %G%G",
10567                      (gimple *) induction_phi, SSA_NAME_DEF_STMT (vec_def));
10568
10569   return true;
10570 }
10571
10572 /* Function vectorizable_live_operation_1.
10573
10574    helper function for vectorizable_live_operation.  */
10575
10576 static tree
10577 vectorizable_live_operation_1 (loop_vec_info loop_vinfo,
10578                                stmt_vec_info stmt_info, basic_block exit_bb,
10579                                tree vectype, int ncopies, slp_tree slp_node,
10580                                tree bitsize, tree bitstart, tree vec_lhs,
10581                                tree lhs_type, gimple_stmt_iterator *exit_gsi)
10582 {
10583   gcc_assert (single_pred_p (exit_bb) || LOOP_VINFO_EARLY_BREAKS (loop_vinfo));
10584
10585   tree vec_lhs_phi = copy_ssa_name (vec_lhs);
10586   gimple *phi = create_phi_node (vec_lhs_phi, exit_bb);
10587   for (unsigned i = 0; i < gimple_phi_num_args (phi); i++)
10588     SET_PHI_ARG_DEF (phi, i, vec_lhs);
10589
10590   gimple_seq stmts = NULL;
10591   tree new_tree;
10592
10593   /* If bitstart is 0 then we can use a BIT_FIELD_REF  */
10594   if (integer_zerop (bitstart))
10595     {
10596       tree scalar_res = gimple_build (&stmts, BIT_FIELD_REF, TREE_TYPE (vectype),
10597                                       vec_lhs_phi, bitsize, bitstart);
10598
10599       /* Convert the extracted vector element to the scalar type.  */
10600       new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
10601     }
10602   else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
10603     {
10604       /* Emit:
10605
10606          SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>
10607
10608          where VEC_LHS is the vectorized live-out result and MASK is
10609          the loop mask for the final iteration.  */
10610       gcc_assert (ncopies == 1 && !slp_node);
10611       gimple_seq tem = NULL;
10612       gimple_stmt_iterator gsi = gsi_last (tem);
10613       tree len = vect_get_loop_len (loop_vinfo, &gsi,
10614                                     &LOOP_VINFO_LENS (loop_vinfo),
10615                                     1, vectype, 0, 0);
10616
10617       /* BIAS - 1.  */
10618       signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
10619       tree bias_minus_one
10620         = int_const_binop (MINUS_EXPR,
10621                            build_int_cst (TREE_TYPE (len), biasval),
10622                            build_one_cst (TREE_TYPE (len)));
10623
10624       /* LAST_INDEX = LEN + (BIAS - 1).  */
10625       tree last_index = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (len),
10626                                      len, bias_minus_one);
10627
10628       /* SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>.  */
10629       tree scalar_res
10630         = gimple_build (&stmts, CFN_VEC_EXTRACT, TREE_TYPE (vectype),
10631                         vec_lhs_phi, last_index);
10632
10633       /* Convert the extracted vector element to the scalar type.  */
10634       new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
10635     }
10636   else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
10637     {
10638       /* Emit:
10639
10640          SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
10641
10642          where VEC_LHS is the vectorized live-out result and MASK is
10643          the loop mask for the final iteration.  */
10644       gcc_assert (!slp_node);
10645       tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
10646       gimple_seq tem = NULL;
10647       gimple_stmt_iterator gsi = gsi_last (tem);
10648       tree mask = vect_get_loop_mask (loop_vinfo, &gsi,
10649                                       &LOOP_VINFO_MASKS (loop_vinfo),
10650                                       1, vectype, 0);
10651       tree scalar_res;
10652       gimple_seq_add_seq (&stmts, tem);
10653
10654       scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
10655                                  mask, vec_lhs_phi);
10656
10657       /* Convert the extracted vector element to the scalar type.  */
10658       new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
10659     }
10660   else
10661     {
10662       tree bftype = TREE_TYPE (vectype);
10663       if (VECTOR_BOOLEAN_TYPE_P (vectype))
10664         bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
10665       new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs_phi, bitsize, bitstart);
10666       new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
10667                                        &stmts, true, NULL_TREE);
10668     }
10669
10670   *exit_gsi = gsi_after_labels (exit_bb);
10671   if (stmts)
10672     gsi_insert_seq_before (exit_gsi, stmts, GSI_SAME_STMT);
10673
10674   return new_tree;
10675 }
10676
10677 /* Function vectorizable_live_operation.
10678
10679    STMT_INFO computes a value that is used outside the loop.  Check if
10680    it can be supported.  */
10681
10682 bool
10683 vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
10684                              slp_tree slp_node, slp_instance slp_node_instance,
10685                              int slp_index, bool vec_stmt_p,
10686                              stmt_vector_for_cost *cost_vec)
10687 {
10688   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
10689   imm_use_iterator imm_iter;
10690   tree lhs, lhs_type, bitsize;
10691   tree vectype = (slp_node
10692                   ? SLP_TREE_VECTYPE (slp_node)
10693                   : STMT_VINFO_VECTYPE (stmt_info));
10694   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
10695   int ncopies;
10696   gimple *use_stmt;
10697   use_operand_p use_p;
10698   auto_vec<tree> vec_oprnds;
10699   int vec_entry = 0;
10700   poly_uint64 vec_index = 0;
10701
10702   gcc_assert (STMT_VINFO_LIVE_P (stmt_info)
10703               || LOOP_VINFO_EARLY_BREAKS (loop_vinfo));
10704
10705   /* If a stmt of a reduction is live, vectorize it via
10706      vect_create_epilog_for_reduction.  vectorizable_reduction assessed
10707      validity so just trigger the transform here.  */
10708   if (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info)))
10709     {
10710       if (!vec_stmt_p)
10711         return true;
10712       /* For SLP reductions we vectorize the epilogue for all involved stmts
10713          together.  */
10714       if (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info) && slp_index != 0)
10715         return true;
10716       stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
10717       gcc_assert (reduc_info->is_reduc_info);
10718       if (STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION
10719           || STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION)
10720         return true;
10721
10722       if (!LOOP_VINFO_EARLY_BREAKS (loop_vinfo)
10723           || !LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo))
10724         vect_create_epilog_for_reduction (loop_vinfo, stmt_info, slp_node,
10725                                           slp_node_instance,
10726                                           LOOP_VINFO_IV_EXIT (loop_vinfo));
10727
10728       /* If early break we only have to materialize the reduction on the merge
10729          block, but we have to find an alternate exit first.  */
10730       if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
10731         {
10732           for (auto exit : get_loop_exit_edges (LOOP_VINFO_LOOP (loop_vinfo)))
10733             if (exit != LOOP_VINFO_IV_EXIT (loop_vinfo))
10734               {
10735                 vect_create_epilog_for_reduction (loop_vinfo, reduc_info,
10736                                                   slp_node, slp_node_instance,
10737                                                   exit);
10738                 break;
10739               }
10740           if (LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo))
10741             vect_create_epilog_for_reduction (loop_vinfo, reduc_info, slp_node,
10742                                               slp_node_instance,
10743                                               LOOP_VINFO_IV_EXIT (loop_vinfo));
10744         }
10745
10746       return true;
10747     }
10748
10749   /* If STMT is not relevant and it is a simple assignment and its inputs are
10750      invariant then it can remain in place, unvectorized.  The original last
10751      scalar value that it computes will be used.  */
10752   if (!STMT_VINFO_RELEVANT_P (stmt_info))
10753     {
10754       gcc_assert (is_simple_and_all_uses_invariant (stmt_info, loop_vinfo));
10755       if (dump_enabled_p ())
10756         dump_printf_loc (MSG_NOTE, vect_location,
10757                          "statement is simple and uses invariant.  Leaving in "
10758                          "place.\n");
10759       return true;
10760     }
10761
10762   if (slp_node)
10763     ncopies = 1;
10764   else
10765     ncopies = vect_get_num_copies (loop_vinfo, vectype);
10766
10767   if (slp_node)
10768     {
10769       gcc_assert (slp_index >= 0);
10770
10771       /* Get the last occurrence of the scalar index from the concatenation of
10772          all the slp vectors. Calculate which slp vector it is and the index
10773          within.  */
10774       int num_scalar = SLP_TREE_LANES (slp_node);
10775       int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
10776       poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
10777
10778       /* Calculate which vector contains the result, and which lane of
10779          that vector we need.  */
10780       if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
10781         {
10782           if (dump_enabled_p ())
10783             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10784                              "Cannot determine which vector holds the"
10785                              " final result.\n");
10786           return false;
10787         }
10788     }
10789
10790   if (!vec_stmt_p)
10791     {
10792       /* No transformation required.  */
10793       if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
10794         {
10795           if (slp_node)
10796             {
10797               if (dump_enabled_p ())
10798                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10799                                  "can't operate on partial vectors "
10800                                  "because an SLP statement is live after "
10801                                  "the loop.\n");
10802               LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10803             }
10804           else if (ncopies > 1)
10805             {
10806               if (dump_enabled_p ())
10807                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10808                                  "can't operate on partial vectors "
10809                                  "because ncopies is greater than 1.\n");
10810               LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10811             }
10812           else
10813             {
10814               gcc_assert (ncopies == 1 && !slp_node);
10815               if (direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
10816                                                   OPTIMIZE_FOR_SPEED))
10817                 vect_record_loop_mask (loop_vinfo,
10818                                        &LOOP_VINFO_MASKS (loop_vinfo),
10819                                        1, vectype, NULL);
10820               else if (can_vec_extract_var_idx_p (
10821                          TYPE_MODE (vectype), TYPE_MODE (TREE_TYPE (vectype))))
10822                 vect_record_loop_len (loop_vinfo,
10823                                       &LOOP_VINFO_LENS (loop_vinfo),
10824                                       1, vectype, 1);
10825               else
10826                 {
10827                   if (dump_enabled_p ())
10828                     dump_printf_loc (
10829                       MSG_MISSED_OPTIMIZATION, vect_location,
10830                       "can't operate on partial vectors "
10831                       "because the target doesn't support extract "
10832                       "last reduction.\n");
10833                   LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10834                 }
10835             }
10836         }
10837       /* ???  Enable for loop costing as well.  */
10838       if (!loop_vinfo)
10839         record_stmt_cost (cost_vec, 1, vec_to_scalar, stmt_info, NULL_TREE,
10840                           0, vect_epilogue);
10841       return true;
10842     }
10843
10844   /* Use the lhs of the original scalar statement.  */
10845   gimple *stmt = vect_orig_stmt (stmt_info)->stmt;
10846   if (dump_enabled_p ())
10847     dump_printf_loc (MSG_NOTE, vect_location, "extracting lane for live "
10848                      "stmt %G", stmt);
10849
10850   lhs = gimple_get_lhs (stmt);
10851   lhs_type = TREE_TYPE (lhs);
10852
10853   bitsize = vector_element_bits_tree (vectype);
10854
10855   /* Get the vectorized lhs of STMT and the lane to use (counted in bits).  */
10856   tree vec_lhs, vec_lhs0, bitstart;
10857   gimple *vec_stmt, *vec_stmt0;
10858   if (slp_node)
10859     {
10860       gcc_assert (!loop_vinfo
10861                   || (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
10862                       && !LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo)));
10863
10864       /* Get the correct slp vectorized stmt.  */
10865       vec_lhs = SLP_TREE_VEC_DEFS (slp_node)[vec_entry];
10866       vec_stmt = SSA_NAME_DEF_STMT (vec_lhs);
10867
10868       /* In case we need to early break vectorize also get the first stmt.  */
10869       vec_lhs0 = SLP_TREE_VEC_DEFS (slp_node)[0];
10870       vec_stmt0 = SSA_NAME_DEF_STMT (vec_lhs0);
10871
10872       /* Get entry to use.  */
10873       bitstart = bitsize_int (vec_index);
10874       bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
10875     }
10876   else
10877     {
10878       /* For multiple copies, get the last copy.  */
10879       vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info).last ();
10880       vec_lhs = gimple_get_lhs (vec_stmt);
10881
10882       /* In case we need to early break vectorize also get the first stmt.  */
10883       vec_stmt0 = STMT_VINFO_VEC_STMTS (stmt_info)[0];
10884       vec_lhs0 = gimple_get_lhs (vec_stmt0);
10885
10886       /* Get the last lane in the vector.  */
10887       bitstart = int_const_binop (MULT_EXPR, bitsize, bitsize_int (nunits - 1));
10888     }
10889
10890   if (loop_vinfo)
10891     {
10892       /* Ensure the VEC_LHS for lane extraction stmts satisfy loop-closed PHI
10893          requirement, insert one phi node for it.  It looks like:
10894            loop;
10895          BB:
10896            # lhs' = PHI <lhs>
10897          ==>
10898            loop;
10899          BB:
10900            # vec_lhs' = PHI <vec_lhs>
10901            new_tree = lane_extract <vec_lhs', ...>;
10902            lhs' = new_tree;  */
10903
10904       class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
10905       /* Check if we have a loop where the chosen exit is not the main exit,
10906          in these cases for an early break we restart the iteration the vector code
10907          did.  For the live values we want the value at the start of the iteration
10908          rather than at the end.  */
10909       edge main_e = LOOP_VINFO_IV_EXIT (loop_vinfo);
10910       bool all_exits_as_early_p = LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo);
10911       FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
10912         if (!is_gimple_debug (use_stmt)
10913             && !flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
10914           FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
10915             {
10916               edge e = gimple_phi_arg_edge (as_a <gphi *> (use_stmt),
10917                                            phi_arg_index_from_use (use_p));
10918               gcc_assert (loop_exit_edge_p (loop, e));
10919               bool main_exit_edge = e == main_e;
10920               tree tmp_vec_lhs = vec_lhs;
10921               tree tmp_bitstart = bitstart;
10922
10923               /* For early exit where the exit is not in the BB that leads
10924                  to the latch then we're restarting the iteration in the
10925                  scalar loop.  So get the first live value.  */
10926               if ((all_exits_as_early_p || !main_exit_edge)
10927                   && STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
10928                 {
10929                   tmp_vec_lhs = vec_lhs0;
10930                   tmp_bitstart = build_zero_cst (TREE_TYPE (bitstart));
10931                 }
10932
10933               gimple_stmt_iterator exit_gsi;
10934               tree new_tree
10935                 = vectorizable_live_operation_1 (loop_vinfo, stmt_info,
10936                                                  e->dest, vectype, ncopies,
10937                                                  slp_node, bitsize,
10938                                                  tmp_bitstart, tmp_vec_lhs,
10939                                                  lhs_type, &exit_gsi);
10940
10941               auto gsi = gsi_for_stmt (use_stmt);
10942               remove_phi_node (&gsi, false);
10943               tree lhs_phi = gimple_phi_result (use_stmt);
10944               gimple *copy = gimple_build_assign (lhs_phi, new_tree);
10945               gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT);
10946               break;
10947             }
10948
10949       /* There a no further out-of-loop uses of lhs by LC-SSA construction.  */
10950       FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
10951         gcc_assert (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)));
10952     }
10953   else
10954     {
10955       /* For basic-block vectorization simply insert the lane-extraction.  */
10956       tree bftype = TREE_TYPE (vectype);
10957       if (VECTOR_BOOLEAN_TYPE_P (vectype))
10958         bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
10959       tree new_tree = build3 (BIT_FIELD_REF, bftype,
10960                               vec_lhs, bitsize, bitstart);
10961       gimple_seq stmts = NULL;
10962       new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
10963                                        &stmts, true, NULL_TREE);
10964       if (TREE_CODE (new_tree) == SSA_NAME
10965           && SSA_NAME_OCCURS_IN_ABNORMAL_PHI (lhs))
10966         SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_tree) = 1;
10967       if (is_a <gphi *> (vec_stmt))
10968         {
10969           gimple_stmt_iterator si = gsi_after_labels (gimple_bb (vec_stmt));
10970           gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10971         }
10972       else
10973         {
10974           gimple_stmt_iterator si = gsi_for_stmt (vec_stmt);
10975           gsi_insert_seq_after (&si, stmts, GSI_SAME_STMT);
10976         }
10977
10978       /* Replace use of lhs with newly computed result.  If the use stmt is a
10979          single arg PHI, just replace all uses of PHI result.  It's necessary
10980          because lcssa PHI defining lhs may be before newly inserted stmt.  */
10981       use_operand_p use_p;
10982       stmt_vec_info use_stmt_info;
10983       FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
10984         if (!is_gimple_debug (use_stmt)
10985             && (!(use_stmt_info = vinfo->lookup_stmt (use_stmt))
10986                 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info))))
10987           {
10988             /* ???  This can happen when the live lane ends up being
10989                rooted in a vector construction code-generated by an
10990                external SLP node (and code-generation for that already
10991                happened).  See gcc.dg/vect/bb-slp-47.c.
10992                Doing this is what would happen if that vector CTOR
10993                were not code-generated yet so it is not too bad.
10994                ???  In fact we'd likely want to avoid this situation
10995                in the first place.  */
10996             if (TREE_CODE (new_tree) == SSA_NAME
10997                 && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
10998                 && gimple_code (use_stmt) != GIMPLE_PHI
10999                 && !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (new_tree),
11000                                                 use_stmt))
11001               {
11002                 if (dump_enabled_p ())
11003                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
11004                                    "Using original scalar computation for "
11005                                    "live lane because use preceeds vector "
11006                                    "def\n");
11007                 continue;
11008               }
11009             /* ???  It can also happen that we end up pulling a def into
11010                a loop where replacing out-of-loop uses would require
11011                a new LC SSA PHI node.  Retain the original scalar in
11012                those cases as well.  PR98064.  */
11013             if (TREE_CODE (new_tree) == SSA_NAME
11014                 && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
11015                 && (gimple_bb (use_stmt)->loop_father
11016                     != gimple_bb (vec_stmt)->loop_father)
11017                 && !flow_loop_nested_p (gimple_bb (vec_stmt)->loop_father,
11018                                         gimple_bb (use_stmt)->loop_father))
11019               {
11020                 if (dump_enabled_p ())
11021                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
11022                                    "Using original scalar computation for "
11023                                    "live lane because there is an out-of-loop "
11024                                    "definition for it\n");
11025                 continue;
11026               }
11027             FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
11028               SET_USE (use_p, new_tree);
11029             update_stmt (use_stmt);
11030           }
11031     }
11032
11033   return true;
11034 }
11035
11036 /* Kill any debug uses outside LOOP of SSA names defined in STMT_INFO.  */
11037
11038 static void
11039 vect_loop_kill_debug_uses (class loop *loop, stmt_vec_info stmt_info)
11040 {
11041   ssa_op_iter op_iter;
11042   imm_use_iterator imm_iter;
11043   def_operand_p def_p;
11044   gimple *ustmt;
11045
11046   FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt_info->stmt, op_iter, SSA_OP_DEF)
11047     {
11048       FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
11049         {
11050           basic_block bb;
11051
11052           if (!is_gimple_debug (ustmt))
11053             continue;
11054
11055           bb = gimple_bb (ustmt);
11056
11057           if (!flow_bb_inside_loop_p (loop, bb))
11058             {
11059               if (gimple_debug_bind_p (ustmt))
11060                 {
11061                   if (dump_enabled_p ())
11062                     dump_printf_loc (MSG_NOTE, vect_location,
11063                                      "killing debug use\n");
11064
11065                   gimple_debug_bind_reset_value (ustmt);
11066                   update_stmt (ustmt);
11067                 }
11068               else
11069                 gcc_unreachable ();
11070             }
11071         }
11072     }
11073 }
11074
11075 /* Given loop represented by LOOP_VINFO, return true if computation of
11076    LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
11077    otherwise.  */
11078
11079 static bool
11080 loop_niters_no_overflow (loop_vec_info loop_vinfo)
11081 {
11082   /* Constant case.  */
11083   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
11084     {
11085       tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
11086       tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
11087
11088       gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
11089       gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
11090       if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
11091         return true;
11092     }
11093
11094   widest_int max;
11095   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
11096   /* Check the upper bound of loop niters.  */
11097   if (get_max_loop_iterations (loop, &max))
11098     {
11099       tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
11100       signop sgn = TYPE_SIGN (type);
11101       widest_int type_max = widest_int::from (wi::max_value (type), sgn);
11102       if (max < type_max)
11103         return true;
11104     }
11105   return false;
11106 }
11107
11108 /* Return a mask type with half the number of elements as OLD_TYPE,
11109    given that it should have mode NEW_MODE.  */
11110
11111 tree
11112 vect_halve_mask_nunits (tree old_type, machine_mode new_mode)
11113 {
11114   poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (old_type), 2);
11115   return build_truth_vector_type_for_mode (nunits, new_mode);
11116 }
11117
11118 /* Return a mask type with twice as many elements as OLD_TYPE,
11119    given that it should have mode NEW_MODE.  */
11120
11121 tree
11122 vect_double_mask_nunits (tree old_type, machine_mode new_mode)
11123 {
11124   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (old_type) * 2;
11125   return build_truth_vector_type_for_mode (nunits, new_mode);
11126 }
11127
11128 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
11129    contain a sequence of NVECTORS masks that each control a vector of type
11130    VECTYPE.  If SCALAR_MASK is nonnull, the fully-masked loop would AND
11131    these vector masks with the vector version of SCALAR_MASK.  */
11132
11133 void
11134 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
11135                        unsigned int nvectors, tree vectype, tree scalar_mask)
11136 {
11137   gcc_assert (nvectors != 0);
11138
11139   if (scalar_mask)
11140     {
11141       scalar_cond_masked_key cond (scalar_mask, nvectors);
11142       loop_vinfo->scalar_cond_masked_set.add (cond);
11143     }
11144
11145   masks->mask_set.add (std::make_pair (vectype, nvectors));
11146 }
11147
/* Given a complete set of masks MASKS, extract mask number INDEX
   for an rgroup that operates on NVECTORS vectors of type VECTYPE,
   where 0 <= INDEX < NVECTORS.  Insert any set-up statements before GSI.

   The lookup depends on LOOP_VINFO_PARTIAL_VECTORS_STYLE (LOOP_VINFO):

   - vect_partial_vectors_while_ult: the rgroup is indexed by NVECTORS and
     control number INDEX is returned, view-converted to the truth type of
     VECTYPE if the element counts differ.

   - vect_partial_vectors_avx512: the rgroup is indexed by the number of
     scalars per iteration.  If the stored mask type has the same number
     of elements as VECTYPE, control number INDEX is returned directly;
     otherwise the requested part is extracted by shifting the integer
     representation of the stored mask that contains it.

   Any other style is unreachable.  In both styles the rgroup's control
   SSA names are created on first use, with a GIMPLE nop as placeholder
   definition.

   See the comment above vec_loop_masks for more details about the mask
   arrangement.  */

tree
vect_get_loop_mask (loop_vec_info loop_vinfo,
                    gimple_stmt_iterator *gsi, vec_loop_masks *masks,
                    unsigned int nvectors, tree vectype, unsigned int index)
{
  if (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
      == vect_partial_vectors_while_ult)
    {
      /* The rgroup for NVECTORS vectors is stored at index NVECTORS - 1.  */
      rgroup_controls *rgm = &(masks->rgc_vec)[nvectors - 1];
      tree mask_type = rgm->type;

      /* Populate the rgroup's mask array, if this is the first time we've
         used it.  */
      if (rgm->controls.is_empty ())
        {
          rgm->controls.safe_grow_cleared (nvectors, true);
          for (unsigned int i = 0; i < nvectors; ++i)
            {
              tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
              /* Provide a dummy definition until the real one is available.  */
              SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
              rgm->controls[i] = mask;
            }
        }

      tree mask = rgm->controls[index];
      if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
                    TYPE_VECTOR_SUBPARTS (vectype)))
        {
          /* A loop mask for data type X can be reused for data type Y
             if X has N times more elements than Y and if Y's elements
             are N times bigger than X's.  In this case each sequence
             of N elements in the loop mask will be all-zero or all-one.
             We can then view-convert the mask so that each sequence of
             N elements is replaced by a single element.  */
          gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
                                  TYPE_VECTOR_SUBPARTS (vectype)));
          gimple_seq seq = NULL;
          mask_type = truth_type_for (vectype);
          mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
          /* SEQ stays empty if gimple_build needed no new statement.  */
          if (seq)
            gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
        }
      return mask;
    }
  else if (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
           == vect_partial_vectors_avx512)
    {
      /* The number of scalars per iteration and the number of vectors are
         both compile-time constants.  */
      unsigned int nscalars_per_iter
        = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
                     LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();

      /* For this style rgroups are indexed by scalars per iteration, not
         by NVECTORS.  */
      rgroup_controls *rgm = &masks->rgc_vec[nscalars_per_iter - 1];

      /* The stored nV is dependent on the mask type produced.  */
      gcc_assert (exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
                             TYPE_VECTOR_SUBPARTS (rgm->type)).to_constant ()
                  == rgm->factor);
      /* From here on NVECTORS is the rgroup's stored factor; it sizes the
         control array below.  */
      nvectors = rgm->factor;

      /* Populate the rgroup's mask array, if this is the first time we've
         used it.  */
      if (rgm->controls.is_empty ())
        {
          rgm->controls.safe_grow_cleared (nvectors, true);
          for (unsigned int i = 0; i < nvectors; ++i)
            {
              tree mask = make_temp_ssa_name (rgm->type, NULL, "loop_mask");
              /* Provide a dummy definition until the real one is available.  */
              SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
              rgm->controls[i] = mask;
            }
        }
      /* Same number of elements: the stored mask can be used as is.  */
      if (known_eq (TYPE_VECTOR_SUBPARTS (rgm->type),
                    TYPE_VECTOR_SUBPARTS (vectype)))
        return rgm->controls[index];

      /* Split the vector if needed.  Since we are dealing with integer mode
         masks with AVX512 we can operate on the integer representation
         performing the whole vector shifting.  */
      unsigned HOST_WIDE_INT factor;
      bool ok = constant_multiple_p (TYPE_VECTOR_SUBPARTS (rgm->type),
                                     TYPE_VECTOR_SUBPARTS (vectype), &factor);
      gcc_assert (ok);
      gcc_assert (GET_MODE_CLASS (TYPE_MODE (rgm->type)) == MODE_INT);
      tree mask_type = truth_type_for (vectype);
      gcc_assert (GET_MODE_CLASS (TYPE_MODE (mask_type)) == MODE_INT);
      /* Each stored mask covers FACTOR requested masks: VI selects the
         stored mask, VPART the part within it.  */
      unsigned vi = index / factor;
      unsigned vpart = index % factor;
      tree vec = rgm->controls[vi];
      gimple_seq seq = NULL;
      /* Reinterpret the stored mask as the integer type that the language
         hook provides for the stored mask type's mode.  */
      vec = gimple_build (&seq, VIEW_CONVERT_EXPR,
                          lang_hooks.types.type_for_mode
                                (TYPE_MODE (rgm->type), 1), vec);
      /* For integer mode masks simply shift the right bits into position.  */
      if (vpart != 0)
        vec = gimple_build (&seq, RSHIFT_EXPR, TREE_TYPE (vec), vec,
                            build_int_cst (integer_type_node,
                                           (TYPE_VECTOR_SUBPARTS (vectype)
                                            * vpart)));
      /* Convert to the integer type for MASK_TYPE's mode, then reinterpret
         the result as MASK_TYPE.  */
      vec = gimple_convert (&seq, lang_hooks.types.type_for_mode
                                    (TYPE_MODE (mask_type), 1), vec);
      vec = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, vec);
      if (seq)
        gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
      return vec;
    }
  else
    gcc_unreachable ();
}
11267
11268 /* Record that LOOP_VINFO would need LENS to contain a sequence of NVECTORS
11269    lengths for controlling an operation on VECTYPE.  The operation splits
11270    each element of VECTYPE into FACTOR separate subelements, measuring the
11271    length as a number of these subelements.  */
11272
11273 void
11274 vect_record_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
11275                       unsigned int nvectors, tree vectype, unsigned int factor)
11276 {
11277   gcc_assert (nvectors != 0);
11278   if (lens->length () < nvectors)
11279     lens->safe_grow_cleared (nvectors, true);
11280   rgroup_controls *rgl = &(*lens)[nvectors - 1];
11281
11282   /* The number of scalars per iteration, scalar occupied bytes and
11283      the number of vectors are both compile-time constants.  */
11284   unsigned int nscalars_per_iter
11285     = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
11286                  LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
11287
11288   if (rgl->max_nscalars_per_iter < nscalars_per_iter)
11289     {
11290       /* For now, we only support cases in which all loads and stores fall back
11291          to VnQI or none do.  */
11292       gcc_assert (!rgl->max_nscalars_per_iter
11293                   || (rgl->factor == 1 && factor == 1)
11294                   || (rgl->max_nscalars_per_iter * rgl->factor
11295                       == nscalars_per_iter * factor));
11296       rgl->max_nscalars_per_iter = nscalars_per_iter;
11297       rgl->type = vectype;
11298       rgl->factor = factor;
11299     }
11300 }
11301
11302 /* Given a complete set of lengths LENS, extract length number INDEX
11303    for an rgroup that operates on NVECTORS vectors of type VECTYPE,
11304    where 0 <= INDEX < NVECTORS.  Return a value that contains FACTOR
11305    multipled by the number of elements that should be processed.
11306    Insert any set-up statements before GSI.  */
11307
11308 tree
11309 vect_get_loop_len (loop_vec_info loop_vinfo, gimple_stmt_iterator *gsi,
11310                    vec_loop_lens *lens, unsigned int nvectors, tree vectype,
11311                    unsigned int index, unsigned int factor)
11312 {
11313   rgroup_controls *rgl = &(*lens)[nvectors - 1];
11314   bool use_bias_adjusted_len =
11315     LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) != 0;
11316
11317   /* Populate the rgroup's len array, if this is the first time we've
11318      used it.  */
11319   if (rgl->controls.is_empty ())
11320     {
11321       rgl->controls.safe_grow_cleared (nvectors, true);
11322       for (unsigned int i = 0; i < nvectors; ++i)
11323         {
11324           tree len_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
11325           gcc_assert (len_type != NULL_TREE);
11326
11327           tree len = make_temp_ssa_name (len_type, NULL, "loop_len");
11328
11329           /* Provide a dummy definition until the real one is available.  */
11330           SSA_NAME_DEF_STMT (len) = gimple_build_nop ();
11331           rgl->controls[i] = len;
11332
11333           if (use_bias_adjusted_len)
11334             {
11335               gcc_assert (i == 0);
11336               tree adjusted_len =
11337                 make_temp_ssa_name (len_type, NULL, "adjusted_loop_len");
11338               SSA_NAME_DEF_STMT (adjusted_len) = gimple_build_nop ();
11339               rgl->bias_adjusted_ctrl = adjusted_len;
11340             }
11341         }
11342     }
11343
11344   if (use_bias_adjusted_len)
11345     return rgl->bias_adjusted_ctrl;
11346
11347   tree loop_len = rgl->controls[index];
11348   if (rgl->factor == 1 && factor == 1)
11349     {
11350       poly_int64 nunits1 = TYPE_VECTOR_SUBPARTS (rgl->type);
11351       poly_int64 nunits2 = TYPE_VECTOR_SUBPARTS (vectype);
11352       if (maybe_ne (nunits1, nunits2))
11353         {
11354           /* A loop len for data type X can be reused for data type Y
11355              if X has N times more elements than Y and if Y's elements
11356              are N times bigger than X's.  */
11357           gcc_assert (multiple_p (nunits1, nunits2));
11358           factor = exact_div (nunits1, nunits2).to_constant ();
11359           tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
11360           gimple_seq seq = NULL;
11361           loop_len = gimple_build (&seq, RDIV_EXPR, iv_type, loop_len,
11362                                    build_int_cst (iv_type, factor));
11363           if (seq)
11364             gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
11365         }
11366     }
11367   return loop_len;
11368 }
11369
/* Scale profiling counters by estimation for LOOP which is vectorized
   by factor VF.  EXIT_E is the exit edge whose probability is adjusted.
   If FLAT is true, the loop we started with had unrealistically flat
   profile; the profile is then only capped by the likely maximum
   iteration count and EXIT_E is left untouched.  */

static void
scale_profile_for_vect_loop (class loop *loop, edge exit_e, unsigned vf, bool flat)
{
  /* For flat profiles do not scale down proportionally by VF and only
     cap by known iteration count bounds.  */
  if (flat)
    {
      if (dump_file && (dump_flags & TDF_DETAILS))
        fprintf (dump_file,
                 "Vectorized loop profile seems flat; not scaling iteration "
                 "count down by the vectorization factor %i\n", vf);
      scale_loop_profile (loop, profile_probability::always (),
                          get_likely_max_loop_iterations_int (loop));
      return;
    }
  /* Loop body executes VF fewer times and exit increases VF times.  */
  profile_count entry_count = loop_preheader_edge (loop)->count ();

  /* If we have an unreliable loop profile, avoid dropping the entry
     count below the header count.  This can happen since loops
     have unrealistically low trip counts.  Halve VF until the header
     count is no longer smaller than VF times the entry count, or VF
     reaches 1.  */
  while (vf > 1
         && loop->header->count > entry_count
         && loop->header->count < entry_count * vf)
    {
      if (dump_file && (dump_flags & TDF_DETAILS))
        fprintf (dump_file,
                 "Vectorization factor %i seems too large for profile "
                 "prevoiusly believed to be consistent; reducing.\n", vf);
      vf /= 2;
    }

  /* With a nonzero entry count, derive the exit probability from the
     entry count relative to the header count divided by VF.  */
  if (entry_count.nonzero_p ())
    set_edge_probability_and_rescale_others
            (exit_e,
             entry_count.probability_in (loop->header->count / vf));
  /* Avoid producing very large exit probability when we do not have
     sensible profile.  */
  else if (exit_e->probability < profile_probability::always () / (vf * 2))
    set_edge_probability_and_rescale_others (exit_e, exit_e->probability * vf);
  /* Take the latch count from its single incoming edge now that the edge
     probabilities may have changed.  */
  loop->latch->count = single_pred_edge (loop->latch)->count ();

  /* Finally scale the whole loop by 1/VF, bounded by the likely maximum
     iteration count.  */
  scale_loop_profile (loop, profile_probability::always () / vf,
                      get_likely_max_loop_iterations_int (loop));
}
11420
/* For a vectorized stmt DEF_STMT_INFO adjust all vectorized PHI
   latch edge values originally defined by it.

   The scalar lhs of DEF_STMT_INFO's original statement is looked up and
   its uses are walked.  Every use in a relevant PHI that sits in the
   header of its loop and takes that lhs on the latch edge gets the latch
   arguments of its vectorized PHIs filled in from the vectorized
   statements of DEF_STMT_INFO.  Nothing is done if the original statement
   has no SSA name lhs.  */

static void
maybe_set_vectorized_backedge_value (loop_vec_info loop_vinfo,
                                     stmt_vec_info def_stmt_info)
{
  tree def = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
  if (!def || TREE_CODE (def) != SSA_NAME)
    return;
  stmt_vec_info phi_info;
  imm_use_iterator iter;
  use_operand_p use_p;
  FOR_EACH_IMM_USE_FAST (use_p, iter, def)
    {
      /* Only PHI uses are of interest.  */
      gphi *phi = dyn_cast <gphi *> (USE_STMT (use_p));
      if (!phi)
        continue;
      /* The PHI has to be in the header of its loop, be known to
         LOOP_VINFO and be relevant.  */
      if (!(gimple_bb (phi)->loop_father->header == gimple_bb (phi)
            && (phi_info = loop_vinfo->lookup_stmt (phi))
            && STMT_VINFO_RELEVANT_P (phi_info)))
        continue;
      loop_p loop = gimple_bb (phi)->loop_father;
      edge e = loop_latch_edge (loop);
      /* DEF must be the value flowing in over the latch edge, not some
         other PHI argument.  */
      if (PHI_ARG_DEF_FROM_EDGE (phi, e) != def)
        continue;

      if (VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (phi_info))
          && STMT_VINFO_REDUC_TYPE (phi_info) != FOLD_LEFT_REDUCTION
          && STMT_VINFO_REDUC_TYPE (phi_info) != EXTRACT_LAST_REDUCTION)
        {
          /* Pair the vectorized PHIs one-to-one with the vectorized latch
             definitions and add each definition's lhs as latch
             argument.  */
          vec<gimple *> &phi_defs = STMT_VINFO_VEC_STMTS (phi_info);
          vec<gimple *> &latch_defs = STMT_VINFO_VEC_STMTS (def_stmt_info);
          gcc_assert (phi_defs.length () == latch_defs.length ());
          for (unsigned i = 0; i < phi_defs.length (); ++i)
            add_phi_arg (as_a <gphi *> (phi_defs[i]),
                         gimple_get_lhs (latch_defs[i]), e,
                         gimple_phi_arg_location (phi, e->dest_idx));
        }
      else if (STMT_VINFO_DEF_TYPE (phi_info) == vect_first_order_recurrence)
        {
          /* For first order recurrences we have to update both uses of
             the latch definition, the one in the PHI node and the one
             in the generated VEC_PERM_EXPR.  */
          vec<gimple *> &phi_defs = STMT_VINFO_VEC_STMTS (phi_info);
          vec<gimple *> &latch_defs = STMT_VINFO_VEC_STMTS (def_stmt_info);
          gcc_assert (phi_defs.length () == latch_defs.length ());
          /* Here the recorded vector statements are assignments; the
             vector PHI is the definition of the first one's rhs1.  */
          tree phidef = gimple_assign_rhs1 (phi_defs[0]);
          gphi *vphi = as_a <gphi *> (SSA_NAME_DEF_STMT (phidef));
          for (unsigned i = 0; i < phi_defs.length (); ++i)
            {
              gassign *perm = as_a <gassign *> (phi_defs[i]);
              /* Assignment I combines latch definition I - 1 (rhs1, left
                 untouched for the first assignment) with latch
                 definition I (rhs2).  */
              if (i > 0)
                gimple_assign_set_rhs1 (perm, gimple_get_lhs (latch_defs[i-1]));
              gimple_assign_set_rhs2 (perm, gimple_get_lhs (latch_defs[i]));
              update_stmt (perm);
            }
          /* The vector PHI receives the last latch definition.  */
          add_phi_arg (vphi, gimple_get_lhs (latch_defs.last ()), e,
                       gimple_phi_arg_location (phi, e->dest_idx));
        }
    }
}
11483
11484 /* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI.
11485    When vectorizing STMT_INFO as a store, set *SEEN_STORE to its
11486    stmt_vec_info.  */
11487
11488 static bool
11489 vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
11490                           gimple_stmt_iterator *gsi, stmt_vec_info *seen_store)
11491 {
11492   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
11493   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
11494
11495   if (dump_enabled_p ())
11496     dump_printf_loc (MSG_NOTE, vect_location,
11497                      "------>vectorizing statement: %G", stmt_info->stmt);
11498
11499   if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
11500     vect_loop_kill_debug_uses (loop, stmt_info);
11501
11502   if (!STMT_VINFO_RELEVANT_P (stmt_info)
11503       && !STMT_VINFO_LIVE_P (stmt_info))
11504     {
11505       if (is_gimple_call (stmt_info->stmt)
11506           && gimple_call_internal_p (stmt_info->stmt, IFN_MASK_CALL))
11507         {
11508           gcc_assert (!gimple_call_lhs (stmt_info->stmt));
11509           *seen_store = stmt_info;
11510           return false;
11511         }
11512       return false;
11513     }
11514
11515   if (STMT_VINFO_VECTYPE (stmt_info))
11516     {
11517       poly_uint64 nunits
11518         = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
11519       if (!STMT_SLP_TYPE (stmt_info)
11520           && maybe_ne (nunits, vf)
11521           && dump_enabled_p ())
11522         /* For SLP VF is set according to unrolling factor, and not
11523            to vector size, hence for SLP this print is not valid.  */
11524         dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
11525     }
11526
11527   /* Pure SLP statements have already been vectorized.  We still need
11528      to apply loop vectorization to hybrid SLP statements.  */
11529   if (PURE_SLP_STMT (stmt_info))
11530     return false;
11531
11532   if (dump_enabled_p ())
11533     dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
11534
11535   if (vect_transform_stmt (loop_vinfo, stmt_info, gsi, NULL, NULL))
11536     *seen_store = stmt_info;
11537
11538   return true;
11539 }
11540
11541 /* Helper function to pass to simplify_replace_tree to enable replacing tree's
11542    in the hash_map with its corresponding values.  */
11543
11544 static tree
11545 find_in_mapping (tree t, void *context)
11546 {
11547   hash_map<tree,tree>* mapping = (hash_map<tree, tree>*) context;
11548
11549   tree *value = mapping->get (t);
11550   return value ? *value : t;
11551 }
11552
11553 /* Update EPILOGUE's loop_vec_info.  EPILOGUE was constructed as a copy of the
11554    original loop that has now been vectorized.
11555
11556    The inits of the data_references need to be advanced with the number of
11557    iterations of the main loop.  This has been computed in vect_do_peeling and
11558    is stored in parameter ADVANCE.  We first restore the data_references
   initial offset with the values recorded in ORIG_DRS_INIT.
11560
11561    Since the loop_vec_info of this EPILOGUE was constructed for the original
11562    loop, its stmt_vec_infos all point to the original statements.  These need
11563    to be updated to point to their corresponding copies as well as the SSA_NAMES
11564    in their PATTERN_DEF_SEQs and RELATED_STMTs.
11565
11566    The data_reference's connections also need to be updated.  Their
11567    corresponding dr_vec_info need to be reconnected to the EPILOGUE's
11568    stmt_vec_infos, their statements need to point to their corresponding copy,
11569    if they are gather loads or scatter stores then their reference needs to be
11570    updated to point to its corresponding copy and finally we set
11571    'base_misaligned' to false as we have already peeled for alignment in the
11572    prologue of the main loop.  */
11573
11574 static void
11575 update_epilogue_loop_vinfo (class loop *epilogue, tree advance)
11576 {
11577   loop_vec_info epilogue_vinfo = loop_vec_info_for_loop (epilogue);
11578   auto_vec<gimple *> stmt_worklist;
11579   hash_map<tree,tree> mapping;
11580   gimple *orig_stmt, *new_stmt;
11581   gimple_stmt_iterator epilogue_gsi;
11582   gphi_iterator epilogue_phi_gsi;
11583   stmt_vec_info stmt_vinfo = NULL, related_vinfo;
11584   basic_block *epilogue_bbs = get_loop_body (epilogue);
11585   unsigned i;
11586
11587   free (LOOP_VINFO_BBS (epilogue_vinfo));
11588   LOOP_VINFO_BBS (epilogue_vinfo) = epilogue_bbs;
11589
11590   /* Advance data_reference's with the number of iterations of the previous
11591      loop and its prologue.  */
11592   vect_update_inits_of_drs (epilogue_vinfo, advance, PLUS_EXPR);
11593
11594
11595   /* The EPILOGUE loop is a copy of the original loop so they share the same
11596      gimple UIDs.  In this loop we update the loop_vec_info of the EPILOGUE to
11597      point to the copied statements.  We also create a mapping of all LHS' in
11598      the original loop and all the LHS' in the EPILOGUE and create worklists to
11599      update teh STMT_VINFO_PATTERN_DEF_SEQs and STMT_VINFO_RELATED_STMTs.  */
11600   for (unsigned i = 0; i < epilogue->num_nodes; ++i)
11601     {
11602       for (epilogue_phi_gsi = gsi_start_phis (epilogue_bbs[i]);
11603            !gsi_end_p (epilogue_phi_gsi); gsi_next (&epilogue_phi_gsi))
11604         {
11605           new_stmt = epilogue_phi_gsi.phi ();
11606
11607           gcc_assert (gimple_uid (new_stmt) > 0);
11608           stmt_vinfo
11609             = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
11610
11611           orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
11612           STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
11613
11614           mapping.put (gimple_phi_result (orig_stmt),
11615                        gimple_phi_result (new_stmt));
11616           /* PHI nodes can not have patterns or related statements.  */
11617           gcc_assert (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo) == NULL
11618                       && STMT_VINFO_RELATED_STMT (stmt_vinfo) == NULL);
11619         }
11620
11621       for (epilogue_gsi = gsi_start_bb (epilogue_bbs[i]);
11622            !gsi_end_p (epilogue_gsi); gsi_next (&epilogue_gsi))
11623         {
11624           new_stmt = gsi_stmt (epilogue_gsi);
11625           if (is_gimple_debug (new_stmt))
11626             continue;
11627
11628           gcc_assert (gimple_uid (new_stmt) > 0);
11629           stmt_vinfo
11630             = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
11631
11632           orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
11633           STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
11634
11635           if (tree old_lhs = gimple_get_lhs (orig_stmt))
11636             mapping.put (old_lhs, gimple_get_lhs (new_stmt));
11637
11638           if (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo))
11639             {
11640               gimple_seq seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo);
11641               for (gimple_stmt_iterator gsi = gsi_start (seq);
11642                    !gsi_end_p (gsi); gsi_next (&gsi))
11643                 stmt_worklist.safe_push (gsi_stmt (gsi));
11644             }
11645
11646           related_vinfo = STMT_VINFO_RELATED_STMT (stmt_vinfo);
11647           if (related_vinfo != NULL && related_vinfo != stmt_vinfo)
11648             {
11649               gimple *stmt = STMT_VINFO_STMT (related_vinfo);
11650               stmt_worklist.safe_push (stmt);
11651               /* Set BB such that the assert in
11652                 'get_initial_def_for_reduction' is able to determine that
11653                 the BB of the related stmt is inside this loop.  */
11654               gimple_set_bb (stmt,
11655                              gimple_bb (new_stmt));
11656               related_vinfo = STMT_VINFO_RELATED_STMT (related_vinfo);
11657               gcc_assert (related_vinfo == NULL
11658                           || related_vinfo == stmt_vinfo);
11659             }
11660         }
11661     }
11662
11663   /* The PATTERN_DEF_SEQs and RELATED_STMTs in the epilogue were constructed
11664      using the original main loop and thus need to be updated to refer to the
11665      cloned variables used in the epilogue.  */
11666   for (unsigned i = 0; i < stmt_worklist.length (); ++i)
11667     {
11668       gimple *stmt = stmt_worklist[i];
11669       tree *new_op;
11670
11671       for (unsigned j = 1; j < gimple_num_ops (stmt); ++j)
11672         {
11673           tree op = gimple_op (stmt, j);
11674           if ((new_op = mapping.get(op)))
11675             gimple_set_op (stmt, j, *new_op);
11676           else
11677             {
11678               /* PR92429: The last argument of simplify_replace_tree disables
11679                  folding when replacing arguments.  This is required as
11680                  otherwise you might end up with different statements than the
11681                  ones analyzed in vect_loop_analyze, leading to different
11682                  vectorization.  */
11683               op = simplify_replace_tree (op, NULL_TREE, NULL_TREE,
11684                                           &find_in_mapping, &mapping, false);
11685               gimple_set_op (stmt, j, op);
11686             }
11687         }
11688     }
11689
11690   struct data_reference *dr;
11691   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (epilogue_vinfo);
11692   FOR_EACH_VEC_ELT (datarefs, i, dr)
11693     {
11694       orig_stmt = DR_STMT (dr);
11695       gcc_assert (gimple_uid (orig_stmt) > 0);
11696       stmt_vinfo = epilogue_vinfo->stmt_vec_infos[gimple_uid (orig_stmt) - 1];
11697       /* Data references for gather loads and scatter stores do not use the
11698          updated offset we set using ADVANCE.  Instead we have to make sure the
11699          reference in the data references point to the corresponding copy of
11700          the original in the epilogue.  Make sure to update both
11701          gather/scatters recognized by dataref analysis and also other
11702          refs that get_load_store_type classified as VMAT_GATHER_SCATTER.  */
11703       auto vstmt_vinfo = vect_stmt_to_vectorize (stmt_vinfo);
11704       if (STMT_VINFO_MEMORY_ACCESS_TYPE (vstmt_vinfo) == VMAT_GATHER_SCATTER
11705           || STMT_VINFO_GATHER_SCATTER_P (vstmt_vinfo))
11706         {
11707           DR_REF (dr)
11708             = simplify_replace_tree (DR_REF (dr), NULL_TREE, NULL_TREE,
11709                                      &find_in_mapping, &mapping);
11710           DR_BASE_ADDRESS (dr)
11711             = simplify_replace_tree (DR_BASE_ADDRESS (dr), NULL_TREE, NULL_TREE,
11712                                      &find_in_mapping, &mapping);
11713         }
11714       DR_STMT (dr) = STMT_VINFO_STMT (stmt_vinfo);
11715       stmt_vinfo->dr_aux.stmt = stmt_vinfo;
11716       /* The vector size of the epilogue is smaller than that of the main loop
11717          so the alignment is either the same or lower. This means the dr will
11718          thus by definition be aligned.  */
11719       STMT_VINFO_DR_INFO (stmt_vinfo)->base_misaligned = false;
11720     }
11721
11722   epilogue_vinfo->shared->datarefs_copy.release ();
11723   epilogue_vinfo->shared->save_datarefs ();
11724 }
11725
11726 /*  When vectorizing early break statements instructions that happen before
11727     the early break in the current BB need to be moved to after the early
11728     break.  This function deals with that and assumes that any validity
11729     checks have already been performed.
11730
11731     While moving the instructions if it encounters a VUSE or VDEF it then
11732     corrects the VUSES as it moves the statements along.  The statements to
11733     move are taken from LOOP_VINFO_EARLY_BRK_STORES (LOOP_VINFO) and are
          inserted after the labels of LOOP_VINFO_EARLY_BRK_DEST_BB (LOOP_VINFO).
          Nothing is done when LOOP_VINFO_EARLY_BRK_STORES is empty.  */
11734
11735 static void
11736 move_early_exit_stmts (loop_vec_info loop_vinfo)
11737 {
11738   DUMP_VECT_SCOPE ("move_early_exit_stmts");
11739
11740   if (LOOP_VINFO_EARLY_BRK_STORES (loop_vinfo).is_empty ())
11741     return;
11742
11743   /* Move all stmts that need moving.  */
11744   basic_block dest_bb = LOOP_VINFO_EARLY_BRK_DEST_BB (loop_vinfo);
11745   gimple_stmt_iterator dest_gsi = gsi_after_labels (dest_bb);
11746
        /* Virtual use of the most recently processed statement (or the argument
           of the most recently elided virtual PHI); it is what the recorded
           VUSE statements and the exit PHIs are rewritten to below.  */
11747   tree last_seen_vuse = NULL_TREE;
11748   for (gimple *stmt : LOOP_VINFO_EARLY_BRK_STORES (loop_vinfo))
11749     {
11750       /* We have to update crossed degenerate virtual PHIs.  Simply
11751          elide them.  */
11752       if (gphi *vphi = dyn_cast <gphi *> (stmt))
11753         {
                /* Replace every use of the PHI result by the PHI's first
                   argument, then remove the PHI.  */
11754           tree vdef = gimple_phi_result (vphi);
11755           tree vuse = gimple_phi_arg_def (vphi, 0);
11756           imm_use_iterator iter;
11757           use_operand_p use_p;
11758           gimple *use_stmt;
11759           FOR_EACH_IMM_USE_STMT (use_stmt, iter, vdef)
11760             {
11761               FOR_EACH_IMM_USE_ON_STMT (use_p, iter)
11762                 SET_USE (use_p, vuse);
11763             }
11764           auto gsi = gsi_for_stmt (stmt);
11765           remove_phi_node (&gsi, true);
11766           last_seen_vuse = vuse;
11767           continue;
11768         }
11769
11770       /* Check to see if statement is still required for vect or has been
11771          elided.  */
11772       auto stmt_info = loop_vinfo->lookup_stmt (stmt);
11773       if (!stmt_info)
11774         continue;
11775
11776       if (dump_enabled_p ())
11777         dump_printf_loc (MSG_NOTE, vect_location, "moving stmt %G", stmt);
11778
            /* Move STMT in front of the current destination position and record
               its virtual use as the last one seen.  */
11779       gimple_stmt_iterator stmt_gsi = gsi_for_stmt (stmt);
11780       gsi_move_before (&stmt_gsi, &dest_gsi, GSI_NEW_STMT);
11781       last_seen_vuse = gimple_vuse (stmt);
11782     }
11783
11784   /* Update all the stmts with their new reaching VUSES.  */
11785   for (auto p : LOOP_VINFO_EARLY_BRK_VUSES (loop_vinfo))
11786     {
11787       if (dump_enabled_p ())
11788           dump_printf_loc (MSG_NOTE, vect_location,
11789                            "updating vuse to %T for load %G",
11790                            last_seen_vuse, p);
11791       gimple_set_vuse (p, last_seen_vuse);
11792       update_stmt (p);
11793     }
11794
11795   /* And update the LC PHIs on exits.  Only exits whose source block is not
           dominated by DEST_BB get the virtual PHI argument on the exit edge
           replaced.  */
11796   for (edge e : get_loop_exit_edges (LOOP_VINFO_LOOP  (loop_vinfo)))
11797     if (!dominated_by_p (CDI_DOMINATORS, e->src, dest_bb))
11798       if (gphi *phi = get_virtual_phi (e->dest))
11799         SET_PHI_ARG_DEF_ON_EDGE (phi, e, last_seen_vuse);
11800 }
11801
11802 /* Function vect_transform_loop.
11803
11804    The analysis phase has determined that the loop is vectorizable.
11805    Vectorize the loop - create vectorized stmts to replace the scalar
11806    stmts in the loop, and update the loop exit condition.
         LOOP_VECTORIZED_CALL is passed on to vect_loop_versioning when the loop
         requires versioning.
11807    Returns the epilogue loop produced by vect_do_peeling (NULL if there is
         none), after its loop_vec_info has been updated by
         update_epilogue_loop_vinfo.  */
11808
11809 class loop *
11810 vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
11811 {
11812   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
11813   class loop *epilogue = NULL;
11814   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
11815   int nbbs = loop->num_nodes;
11816   int i;
11817   tree niters_vector = NULL_TREE;
11818   tree step_vector = NULL_TREE;
11819   tree niters_vector_mult_vf = NULL_TREE;
11820   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
11821   unsigned int lowest_vf = constant_lower_bound (vf);
11822   gimple *stmt;
11823   bool check_profitability = false;
11824   unsigned int th;
11825   bool flat = maybe_flat_loop_profile (loop);
11826
11827   DUMP_VECT_SCOPE ("vec_transform_loop");
11828
11829   loop_vinfo->shared->check_datarefs ();
11830
11831   /* Use the more conservative vectorization threshold.  If the number
11832      of iterations is constant assume the cost check has been performed
11833      by our caller.  If the threshold makes all loops profitable that
11834      run at least the (estimated) vectorization factor number of times
11835      checking is pointless, too.  */
11836   th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
11837   if (vect_apply_runtime_profitability_check_p (loop_vinfo))
11838     {
11839       if (dump_enabled_p ())
11840         dump_printf_loc (MSG_NOTE, vect_location,
11841                          "Profitability threshold is %d loop iterations.\n",
11842                          th);
11843       check_profitability = true;
11844     }
11845
11846   /* Make sure there exists a single-predecessor exit bb.  Do this before
11847      versioning.   */
11848   edge e = LOOP_VINFO_IV_EXIT (loop_vinfo);
11849   if (! single_pred_p (e->dest) && !LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
11850     {
11851       split_loop_exit_edge (e, true);
11852       if (dump_enabled_p ())
11853         dump_printf (MSG_NOTE, "split exit edge\n");
11854     }
11855
11856   /* Version the loop first, if required, so the profitability check
11857      comes first.  */
11858
11859   if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
11860     {
11861       class loop *sloop
11862         = vect_loop_versioning (loop_vinfo, loop_vectorized_call);
11863       sloop->force_vectorize = false;
            /* The check is not requested from vect_do_peeling below once the
               loop has been versioned.  */
11864       check_profitability = false;
11865     }
11866
11867   /* Make sure there exists a single-predecessor exit bb also on the
11868      scalar loop copy.  Do this after versioning but before peeling
11869      so CFG structure is fine for both scalar and if-converted loop
11870      to make slpeel_duplicate_current_defs_from_edges face matched
11871      loop closed PHI nodes on the exit.  */
11872   if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
11873     {
11874       e = LOOP_VINFO_SCALAR_IV_EXIT (loop_vinfo);
11875       if (! single_pred_p (e->dest))
11876         {
11877           split_loop_exit_edge (e, true);
11878           if (dump_enabled_p ())
11879             dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
11880         }
11881     }
11882
11883   tree niters = vect_build_loop_niters (loop_vinfo);
11884   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
11885   tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
11886   bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
11887   tree advance;
        /* NOTE(review): ORIG_DRS_INIT is not referenced anywhere else in this
           function.  */
11888   drs_init_vec orig_drs_init;
11889
        /* Peel the loop.  ADVANCE is filled in here and later handed to
           update_epilogue_loop_vinfo if an epilogue was created.  */
11890   epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
11891                               &step_vector, &niters_vector_mult_vf, th,
11892                               check_profitability, niters_no_overflow,
11893                               &advance);
11894   if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)
11895       && LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo).initialized_p ())
11896     {
11897       /* Ifcvt duplicates loop preheader, loop body and produces a basic
11898          block after loop exit.  We need to scale all that.  */
11899       basic_block preheader
11900         = loop_preheader_edge (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))->src;
11901       preheader->count
11902         = preheader->count.apply_probability
11903               (LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
11904       scale_loop_frequencies (LOOP_VINFO_SCALAR_LOOP (loop_vinfo),
11905                               LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
11906       LOOP_VINFO_SCALAR_IV_EXIT (loop_vinfo)->dest->count = preheader->count;
11907     }
11908
        /* Compute the vector iteration count here if vect_do_peeling did not
           provide one.  */
11909   if (niters_vector == NULL_TREE)
11910     {
11911       if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
11912           && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
11913           && known_eq (lowest_vf, vf))
11914         {
11915           niters_vector
11916             = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
11917                              LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
11918           step_vector = build_one_cst (TREE_TYPE (niters));
11919         }
11920       else if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
11921         vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
11922                                      &step_vector, niters_no_overflow);
11923       else
11924         /* vect_do_peeling subtracted the number of peeled prologue
11925            iterations from LOOP_VINFO_NITERS.  */
11926         vect_gen_vector_loop_niters (loop_vinfo, LOOP_VINFO_NITERS (loop_vinfo),
11927                                      &niters_vector, &step_vector,
11928                                      niters_no_overflow);
11929     }
11930
11931   /* 1) Make sure the loop header has exactly two entries
11932      2) Make sure we have a preheader basic block.  */
11933
11934   gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
11935
11936   split_edge (loop_preheader_edge (loop));
11937
11938   if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
11939     /* This will deal with any possible peeling.  */
11940     vect_prepare_for_masked_peels (loop_vinfo);
11941
11942   /* Handle any code motion that we need to for early-break vectorization after
11943      we've done peeling but just before we start vectorizing.  */
11944   if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
11945     move_early_exit_stmts (loop_vinfo);
11946
11947   /* Schedule the SLP instances first, then handle loop vectorization
11948      below.  */
11949   if (!loop_vinfo->slp_instances.is_empty ())
11950     {
11951       DUMP_VECT_SCOPE ("scheduling SLP instances");
11952       vect_schedule_slp (loop_vinfo, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
11953     }
11954
11955   /* FORNOW: the vectorizer supports only loops whose body consists
11956      of one basic block (header + empty latch). When the vectorizer will
11957      support more involved loop forms, the order by which the BBs are
11958      traversed need to be reconsidered.  */
11959
11960   for (i = 0; i < nbbs; i++)
11961     {
11962       basic_block bb = bbs[i];
11963       stmt_vec_info stmt_info;
11964
            /* First pass over the PHIs of BB: transform the relevant or live
               PHIs that are not pure SLP.  */
11965       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
11966            gsi_next (&si))
11967         {
11968           gphi *phi = si.phi ();
11969           if (dump_enabled_p ())
11970             dump_printf_loc (MSG_NOTE, vect_location,
11971                              "------>vectorizing phi: %G", (gimple *) phi);
11972           stmt_info = loop_vinfo->lookup_stmt (phi);
11973           if (!stmt_info)
11974             continue;
11975
11976           if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
11977             vect_loop_kill_debug_uses (loop, stmt_info);
11978
11979           if (!STMT_VINFO_RELEVANT_P (stmt_info)
11980               && !STMT_VINFO_LIVE_P (stmt_info))
11981             continue;
11982
11983           if (STMT_VINFO_VECTYPE (stmt_info)
11984               && (maybe_ne
11985                   (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
11986               && dump_enabled_p ())
11987             dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
11988
11989           if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
11990                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
11991                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
11992                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
11993                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_first_order_recurrence
11994                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
11995               && ! PURE_SLP_STMT (stmt_info))
11996             {
11997               if (dump_enabled_p ())
11998                 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
11999               vect_transform_stmt (loop_vinfo, stmt_info, NULL, NULL, NULL);
12000             }
12001         }
12002
            /* Second pass over the PHIs of BB: for the same kinds of PHIs as
               above, call maybe_set_vectorized_backedge_value now that the
               first pass over all PHIs of BB is complete.  */
12003       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
12004            gsi_next (&si))
12005         {
12006           gphi *phi = si.phi ();
12007           stmt_info = loop_vinfo->lookup_stmt (phi);
12008           if (!stmt_info)
12009             continue;
12010
12011           if (!STMT_VINFO_RELEVANT_P (stmt_info)
12012               && !STMT_VINFO_LIVE_P (stmt_info))
12013             continue;
12014
12015           if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
12016                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
12017                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
12018                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
12019                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
12020                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_first_order_recurrence)
12021               && ! PURE_SLP_STMT (stmt_info))
12022             maybe_set_vectorized_backedge_value (loop_vinfo, stmt_info);
12023         }
12024
            /* Walk the non-PHI statements of BB.  The iterator is advanced
               inside the loop body because statements may be removed while
               iterating.  */
12025       for (gimple_stmt_iterator si = gsi_start_bb (bb);
12026            !gsi_end_p (si);)
12027         {
12028           stmt = gsi_stmt (si);
12029           /* During vectorization remove existing clobber stmts.  */
12030           if (gimple_clobber_p (stmt))
12031             {
12032               unlink_stmt_vdef (stmt);
12033               gsi_remove (&si, true);
12034               release_defs (stmt);
12035             }
12036           else
12037             {
12038               /* Ignore vector stmts created in the outer loop.  */
12039               stmt_info = loop_vinfo->lookup_stmt (stmt);
12040
12041               /* vector stmts created in the outer-loop during vectorization of
12042                  stmts in an inner-loop may not have a stmt_info, and do not
12043                  need to be vectorized.  */
12044               stmt_vec_info seen_store = NULL;
12045               if (stmt_info)
12046                 {
12047                   if (STMT_VINFO_IN_PATTERN_P (stmt_info))
12048                     {
                            /* Transform the pattern definition sequence first,
                               then the pattern statement itself.  */
12049                       gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
12050                       for (gimple_stmt_iterator subsi = gsi_start (def_seq);
12051                            !gsi_end_p (subsi); gsi_next (&subsi))
12052                         {
12053                           stmt_vec_info pat_stmt_info
12054                             = loop_vinfo->lookup_stmt (gsi_stmt (subsi));
12055                           vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
12056                                                     &si, &seen_store);
12057                         }
12058                       stmt_vec_info pat_stmt_info
12059                         = STMT_VINFO_RELATED_STMT (stmt_info);
12060                       if (vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
12061                                                     &si, &seen_store))
12062                         maybe_set_vectorized_backedge_value (loop_vinfo,
12063                                                              pat_stmt_info);
12064                     }
12065                   else
12066                     {
12067                       if (vect_transform_loop_stmt (loop_vinfo, stmt_info, &si,
12068                                                     &seen_store))
12069                         maybe_set_vectorized_backedge_value (loop_vinfo,
12070                                                              stmt_info);
12071                     }
12072                 }
                    /* Advance before any removal below so that SI stays
                       valid.  */
12073               gsi_next (&si);
12074               if (seen_store)
12075                 {
12076                   if (STMT_VINFO_GROUPED_ACCESS (seen_store))
12077                     /* Interleaving.  SEEN_STORE being set means the
12078                        vectorization of the interleaving chain was
12079                        completed - free all the stores in the chain.  */
12080                     vect_remove_stores (loop_vinfo,
12081                                         DR_GROUP_FIRST_ELEMENT (seen_store));
12082                   else
12083                     /* Free the attached stmt_vec_info and remove the stmt.  */
12084                     loop_vinfo->remove_stmt (stmt_info);
12085                 }
12086             }
12087         }
12088
12089       /* Stub out scalar statements that must not survive vectorization.
12090          Doing this here helps with grouped statements, or statements that
12091          are involved in patterns.  */
12092       for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
12093            !gsi_end_p (gsi); gsi_next (&gsi))
12094         {
12095           gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
12096           if (!call || !gimple_call_internal_p (call))
12097             continue;
12098           internal_fn ifn = gimple_call_internal_fn (call);
12099           if (ifn == IFN_MASK_LOAD)
12100             {
                    /* A scalar (non-vector) IFN_MASK_LOAD is replaced by an
                       assignment of zero to its LHS.  */
12101               tree lhs = gimple_get_lhs (call);
12102               if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
12103                 {
12104                   tree zero = build_zero_cst (TREE_TYPE (lhs));
12105                   gimple *new_stmt = gimple_build_assign (lhs, zero);
12106                   gsi_replace (&gsi, new_stmt, true);
12107                 }
12108             }
12109           else if (conditional_internal_fn_code (ifn) != ERROR_MARK)
12110             {
                    /* A scalar conditional internal function call is replaced
                       by an assignment of its last argument (the else value)
                       to its LHS.  */
12111               tree lhs = gimple_get_lhs (call);
12112               if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
12113                 {
12114                   tree else_arg
12115                     = gimple_call_arg (call, gimple_call_num_args (call) - 1);
12116                   gimple *new_stmt = gimple_build_assign (lhs, else_arg);
12117                   gsi_replace (&gsi, new_stmt, true);
12118                 }
12119             }
12120         }
12121     }                           /* BBs in loop */
12122
12123   /* The vectorization factor is always > 1, so if we use an IV increment of 1,
12124      a zero NITERS becomes a nonzero NITERS_VECTOR.  */
12125   if (integer_onep (step_vector))
12126     niters_no_overflow = true;
12127   vect_set_loop_condition (loop, LOOP_VINFO_IV_EXIT (loop_vinfo), loop_vinfo,
12128                            niters_vector, step_vector, niters_vector_mult_vf,
12129                            !niters_no_overflow);
12130
12131   unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
12132
12133   /* True if the final iteration might not handle a full vector's
12134      worth of scalar iterations.  */
12135   bool final_iter_may_be_partial
12136     = LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
12137       || LOOP_VINFO_EARLY_BREAKS (loop_vinfo);
12138   /* The minimum number of iterations performed by the epilogue.  This
12139      is 1 when peeling for gaps because we always need a final scalar
12140      iteration.  */
12141   int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
12142   /* +1 to convert latch counts to loop iteration counts,
12143      -min_epilogue_iters to remove iterations that cannot be performed
12144        by the vector code.  */
12145   int bias_for_lowest = 1 - min_epilogue_iters;
12146   int bias_for_assumed = bias_for_lowest;
12147   int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
12148   if (alignment_npeels && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
12149     {
12150       /* When the amount of peeling is known at compile time, the first
12151          iteration will have exactly alignment_npeels active elements.
12152          In the worst case it will have at least one.  */
12153       int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
12154       bias_for_lowest += lowest_vf - min_first_active;
12155       bias_for_assumed += assumed_vf - min_first_active;
12156     }
12157   /* In these calculations the "- 1" converts loop iteration counts
12158      back to latch counts.  */
12159   if (loop->any_upper_bound)
12160     {
12161       loop_vec_info main_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
12162       loop->nb_iterations_upper_bound
12163         = (final_iter_may_be_partial
12164            ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
12165                             lowest_vf) - 1
12166            : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
12167                              lowest_vf) - 1);
12168       if (main_vinfo
12169           /* Both peeling for alignment and peeling for gaps can end up
12170              with the scalar epilogue running for more than VF-1 iterations.  */
12171           && !main_vinfo->peeling_for_alignment
12172           && !main_vinfo->peeling_for_gaps)
12173         {
                /* Further cap the upper bound using the largest of the main
                   loop's vectorization factor, cost-model threshold and
                   versioning threshold, divided (rounding away from zero) by
                   this loop's vectorization factor.  */
12174           unsigned int bound;
12175           poly_uint64 main_iters
12176             = upper_bound (LOOP_VINFO_VECT_FACTOR (main_vinfo),
12177                            LOOP_VINFO_COST_MODEL_THRESHOLD (main_vinfo));
12178           main_iters
12179             = upper_bound (main_iters,
12180                            LOOP_VINFO_VERSIONING_THRESHOLD (main_vinfo));
12181           if (can_div_away_from_zero_p (main_iters,
12182                                         LOOP_VINFO_VECT_FACTOR (loop_vinfo),
12183                                         &bound))
12184             loop->nb_iterations_upper_bound
12185               = wi::umin ((bound_wide_int) (bound - 1),
12186                           loop->nb_iterations_upper_bound);
12187       }
12188   }
12189   if (loop->any_likely_upper_bound)
12190     loop->nb_iterations_likely_upper_bound
12191       = (final_iter_may_be_partial
12192          ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
12193                           + bias_for_lowest, lowest_vf) - 1
12194          : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
12195                            + bias_for_lowest, lowest_vf) - 1);
12196   if (loop->any_estimate)
12197     loop->nb_iterations_estimate
12198       = (final_iter_may_be_partial
12199          ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
12200                           assumed_vf) - 1
12201          : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
12202                            assumed_vf) - 1);
12203   scale_profile_for_vect_loop (loop, LOOP_VINFO_IV_EXIT (loop_vinfo),
12204                                assumed_vf, flat);
12205
12206   if (dump_enabled_p ())
12207     {
12208       if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
12209         {
12210           dump_printf_loc (MSG_NOTE, vect_location,
12211                            "LOOP VECTORIZED\n");
12212           if (loop->inner)
12213             dump_printf_loc (MSG_NOTE, vect_location,
12214                              "OUTER LOOP VECTORIZED\n");
12215           dump_printf (MSG_NOTE, "\n");
12216         }
12217       else
12218         dump_printf_loc (MSG_NOTE, vect_location,
12219                          "LOOP EPILOGUE VECTORIZED (MODE=%s)\n",
12220                          GET_MODE_NAME (loop_vinfo->vector_mode));
12221     }
12222
12223   /* Loops vectorized with a variable factor won't benefit from
12224      unrolling/peeling.  */
12225   if (!vf.is_constant ())
12226     {
12227       loop->unroll = 1;
12228       if (dump_enabled_p ())
12229         dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to"
12230                          " variable-length vectorization factor\n");
12231     }
12232   /* Free SLP instances here because otherwise stmt reference counting
12233      won't work.  */
12234   slp_instance instance;
12235   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
12236     vect_free_slp_instance (instance);
12237   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
12238   /* Clear-up safelen field since its value is invalid after vectorization
12239      since vectorized loop can have loop-carried dependencies.  */
12240   loop->safelen = 0;
12241
12242   if (epilogue)
12243     {
            /* Update the epilogue's loop_vec_info, then copy simduid and
               force_vectorize from LOOP and clear dont_vectorize on the
               epilogue.  */
12244       update_epilogue_loop_vinfo (epilogue, advance);
12245
12246       epilogue->simduid = loop->simduid;
12247       epilogue->force_vectorize = loop->force_vectorize;
12248       epilogue->dont_vectorize = false;
12249     }
12250
12251   return epilogue;
12252 }
12253
12254 /* The code below is trying to perform simple optimization - revert
12255    if-conversion for masked stores, i.e. if the mask of a store is zero
12256    do not perform it and all stored value producers also if possible.
12257    For example,
12258      for (i=0; i<n; i++)
12259        if (c[i])
12260         {
12261           p1[i] += 1;
12262           p2[i] = p3[i] +2;
12263         }
12264    this transformation will produce the following semi-hammock:
12265
12266    if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
12267      {
12268        vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
12269        vect__12.22_172 = vect__11.19_170 + vect_cst__171;
12270        MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
12271        vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
12272        vect__19.28_184 = vect__18.25_182 + vect_cst__183;
12273        MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
12274      }
12275 */
12276
12277 void
12278 optimize_mask_stores (class loop *loop)
12279 {
12280   basic_block *bbs = get_loop_body (loop);
12281   unsigned nbbs = loop->num_nodes;
12282   unsigned i;
12283   basic_block bb;
12284   class loop *bb_loop;
12285   gimple_stmt_iterator gsi;
12286   gimple *stmt;
12287   auto_vec<gimple *> worklist;
12288   auto_purge_vect_location sentinel;
12289
12290   vect_location = find_loop_location (loop);
12291   /* Pick up all masked stores in loop if any.  */
12292   for (i = 0; i < nbbs; i++)
12293     {
12294       bb = bbs[i];
12295       for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
12296            gsi_next (&gsi))
12297         {
12298           stmt = gsi_stmt (gsi);
12299           if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
12300             worklist.safe_push (stmt);
12301         }
12302     }
12303
12304   free (bbs);
12305   if (worklist.is_empty ())
12306     return;
12307
12308   /* Loop has masked stores.  */
12309   while (!worklist.is_empty ())
12310     {
12311       gimple *last, *last_store;
12312       edge e, efalse;
12313       tree mask;
12314       basic_block store_bb, join_bb;
12315       gimple_stmt_iterator gsi_to;
12316       tree vdef, new_vdef;
12317       gphi *phi;
12318       tree vectype;
12319       tree zero;
12320
12321       last = worklist.pop ();
12322       mask = gimple_call_arg (last, 2);
12323       bb = gimple_bb (last);
12324       /* Create then_bb and if-then structure in CFG, then_bb belongs to
12325          the same loop as if_bb.  It could be different to LOOP when two
12326          level loop-nest is vectorized and mask_store belongs to the inner
12327          one.  */
12328       e = split_block (bb, last);
12329       bb_loop = bb->loop_father;
12330       gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
12331       join_bb = e->dest;
12332       store_bb = create_empty_bb (bb);
12333       add_bb_to_loop (store_bb, bb_loop);
12334       e->flags = EDGE_TRUE_VALUE;
12335       efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
12336       /* Put STORE_BB to likely part.  */
12337       efalse->probability = profile_probability::likely ();
12338       e->probability = efalse->probability.invert ();
12339       store_bb->count = efalse->count ();
12340       make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
12341       if (dom_info_available_p (CDI_DOMINATORS))
12342         set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
12343       if (dump_enabled_p ())
12344         dump_printf_loc (MSG_NOTE, vect_location,
12345                          "Create new block %d to sink mask stores.",
12346                          store_bb->index);
12347       /* Create vector comparison with boolean result.  */
12348       vectype = TREE_TYPE (mask);
12349       zero = build_zero_cst (vectype);
12350       stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
12351       gsi = gsi_last_bb (bb);
12352       gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
12353       /* Create new PHI node for vdef of the last masked store:
12354          .MEM_2 = VDEF <.MEM_1>
12355          will be converted to
12356          .MEM.3 = VDEF <.MEM_1>
12357          and new PHI node will be created in join bb
12358          .MEM_2 = PHI <.MEM_1, .MEM_3>
12359       */
12360       vdef = gimple_vdef (last);
12361       new_vdef = make_ssa_name (gimple_vop (cfun), last);
12362       gimple_set_vdef (last, new_vdef);
12363       phi = create_phi_node (vdef, join_bb);
12364       add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
12365
12366       /* Put all masked stores with the same mask to STORE_BB if possible.  */
12367       while (true)
12368         {
12369           gimple_stmt_iterator gsi_from;
12370           gimple *stmt1 = NULL;
12371
12372           /* Move masked store to STORE_BB.  */
12373           last_store = last;
12374           gsi = gsi_for_stmt (last);
12375           gsi_from = gsi;
12376           /* Shift GSI to the previous stmt for further traversal.  */
12377           gsi_prev (&gsi);
12378           gsi_to = gsi_start_bb (store_bb);
12379           gsi_move_before (&gsi_from, &gsi_to);
12380           /* Setup GSI_TO to the non-empty block start.  */
12381           gsi_to = gsi_start_bb (store_bb);
12382           if (dump_enabled_p ())
12383             dump_printf_loc (MSG_NOTE, vect_location,
12384                              "Move stmt to created bb\n%G", last);
12385           /* Move all stored value producers if possible.  */
12386           while (!gsi_end_p (gsi))
12387             {
12388               tree lhs;
12389               imm_use_iterator imm_iter;
12390               use_operand_p use_p;
12391               bool res;
12392
12393               /* Skip debug statements.  */
12394               if (is_gimple_debug (gsi_stmt (gsi)))
12395                 {
12396                   gsi_prev (&gsi);
12397                   continue;
12398                 }
12399               stmt1 = gsi_stmt (gsi);
12400               /* Do not consider statements writing to memory or having
12401                  volatile operand.  */
12402               if (gimple_vdef (stmt1)
12403                   || gimple_has_volatile_ops (stmt1))
12404                 break;
12405               gsi_from = gsi;
12406               gsi_prev (&gsi);
12407               lhs = gimple_get_lhs (stmt1);
12408               if (!lhs)
12409                 break;
12410
12411               /* LHS of vectorized stmt must be SSA_NAME.  */
12412               if (TREE_CODE (lhs) != SSA_NAME)
12413                 break;
12414
12415               if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
12416                 {
12417                   /* Remove dead scalar statement.  */
12418                   if (has_zero_uses (lhs))
12419                     {
12420                       gsi_remove (&gsi_from, true);
12421                       continue;
12422                     }
12423                 }
12424
12425               /* Check that LHS does not have uses outside of STORE_BB.  */
12426               res = true;
12427               FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
12428                 {
12429                   gimple *use_stmt;
12430                   use_stmt = USE_STMT (use_p);
12431                   if (is_gimple_debug (use_stmt))
12432                     continue;
12433                   if (gimple_bb (use_stmt) != store_bb)
12434                     {
12435                       res = false;
12436                       break;
12437                     }
12438                 }
12439               if (!res)
12440                 break;
12441
12442               if (gimple_vuse (stmt1)
12443                   && gimple_vuse (stmt1) != gimple_vuse (last_store))
12444                 break;
12445
12446               /* Can move STMT1 to STORE_BB.  */
12447               if (dump_enabled_p ())
12448                 dump_printf_loc (MSG_NOTE, vect_location,
12449                                  "Move stmt to created bb\n%G", stmt1);
12450               gsi_move_before (&gsi_from, &gsi_to);
12451               /* Shift GSI_TO for further insertion.  */
12452               gsi_prev (&gsi_to);
12453             }
12454           /* Put other masked stores with the same mask to STORE_BB.  */
12455           if (worklist.is_empty ()
12456               || gimple_call_arg (worklist.last (), 2) != mask
12457               || worklist.last () != stmt1)
12458             break;
12459           last = worklist.pop ();
12460         }
12461       add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
12462     }
12463 }
12464
12465 /* Decide whether it is possible to use a zero-based induction variable
12466    when vectorizing LOOP_VINFO with partial vectors.  If it is, return
12467    the value that the induction variable must be able to hold in order
12468    to ensure that the rgroups eventually have no active vector elements.
12469    Return -1 otherwise.  */
12470
12471 widest_int
12472 vect_iv_limit_for_partial_vectors (loop_vec_info loop_vinfo)
12473 {
12474   tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
12475   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
12476   unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
12477
12478   /* Calculate the value that the induction variable must be able
12479      to hit in order to ensure that we end the loop with an all-false mask.
12480      This involves adding the maximum number of inactive trailing scalar
12481      iterations.  */
12482   widest_int iv_limit = -1;
12483   if (max_loop_iterations (loop, &iv_limit))
12484     {
12485       if (niters_skip)
12486         {
12487           /* Add the maximum number of skipped iterations to the
12488              maximum iteration count.  */
12489           if (TREE_CODE (niters_skip) == INTEGER_CST)
12490             iv_limit += wi::to_widest (niters_skip);
12491           else
12492             iv_limit += max_vf - 1;
12493         }
12494       else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
12495         /* Make a conservatively-correct assumption.  */
12496         iv_limit += max_vf - 1;
12497
12498       /* IV_LIMIT is the maximum number of latch iterations, which is also
12499          the maximum in-range IV value.  Round this value down to the previous
12500          vector alignment boundary and then add an extra full iteration.  */
12501       poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
12502       iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
12503     }
12504   return iv_limit;
12505 }
12506
12507 /* For the given rgroup_controls RGC, check whether an induction variable
12508    would ever hit a value that produces a set of all-false masks or zero
12509    lengths before wrapping around.  Return true if it's possible to wrap
12510    around before hitting the desirable value, otherwise return false.  */
12511
12512 bool
12513 vect_rgroup_iv_might_wrap_p (loop_vec_info loop_vinfo, rgroup_controls *rgc)
12514 {
12515   widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
12516
12517   if (iv_limit == -1)
12518     return true;
12519
12520   tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
12521   unsigned int compare_precision = TYPE_PRECISION (compare_type);
12522   unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
12523
12524   if (wi::min_precision (iv_limit * nitems, UNSIGNED) > compare_precision)
12525     return true;
12526
12527   return false;
12528 }