gcc/tree-vect-loop.cc
1 /* Loop Vectorization
2 Copyright (C) 2003-2023 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #define INCLUDE_ALGORITHM
23 #include "config.h"
24 #include "system.h"
25 #include "coretypes.h"
26 #include "backend.h"
27 #include "target.h"
28 #include "rtl.h"
29 #include "tree.h"
30 #include "gimple.h"
31 #include "cfghooks.h"
32 #include "tree-pass.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "diagnostic-core.h"
36 #include "fold-const.h"
37 #include "stor-layout.h"
38 #include "cfganal.h"
39 #include "gimplify.h"
40 #include "gimple-iterator.h"
41 #include "gimplify-me.h"
42 #include "tree-ssa-loop-ivopts.h"
43 #include "tree-ssa-loop-manip.h"
44 #include "tree-ssa-loop-niter.h"
45 #include "tree-ssa-loop.h"
46 #include "cfgloop.h"
47 #include "tree-scalar-evolution.h"
48 #include "tree-vectorizer.h"
49 #include "gimple-fold.h"
50 #include "cgraph.h"
51 #include "tree-cfg.h"
52 #include "tree-if-conv.h"
53 #include "internal-fn.h"
54 #include "tree-vector-builder.h"
55 #include "vec-perm-indices.h"
56 #include "tree-eh.h"
57 #include "case-cfn-macros.h"
58
59 /* Loop Vectorization Pass.
60
61 This pass tries to vectorize loops.
62
63 For example, the vectorizer transforms the following simple loop:
64
65 short a[N]; short b[N]; short c[N]; int i;
66
67 for (i=0; i<N; i++){
68 a[i] = b[i] + c[i];
69 }
70
71 as if it were manually vectorized by rewriting the source code into:
72
73 typedef int __attribute__((mode(V8HI))) v8hi;
74 short a[N]; short b[N]; short c[N]; int i;
75 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
76 v8hi va, vb, vc;
77
78 for (i=0; i<N/8; i++){
79 vb = pb[i];
80 vc = pc[i];
81 va = vb + vc;
82 pa[i] = va;
83 }
84
85 The main entry to this pass is vectorize_loops(), in which
86 the vectorizer applies a set of analyses on a given set of loops,
87 followed by the actual vectorization transformation for the loops that
88 had successfully passed the analysis phase.
89 Throughout this pass we make a distinction between two types of
90 data: scalars (which are represented by SSA_NAMES), and memory references
91 ("data-refs"). These two types of data require different handling both
92 during analysis and transformation. The types of data-refs that the
93 vectorizer currently supports are ARRAY_REFS whose base is an array DECL
94 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
95 accesses are required to have a simple (consecutive) access pattern.
96
97 Analysis phase:
98 ===============
99 The driver for the analysis phase is vect_analyze_loop().
100 It applies a set of analyses, some of which rely on the scalar evolution
101 analyzer (scev) developed by Sebastian Pop.
102
103 During the analysis phase the vectorizer records some information
104 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
105 loop, as well as general information about the loop as a whole, which is
106 recorded in a "loop_vec_info" struct attached to each loop.
107
108 Transformation phase:
109 =====================
110 The loop transformation phase scans all the stmts in the loop, and
111 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
112 the loop that needs to be vectorized. It inserts the vector code sequence
113 just before the scalar stmt S, and records a pointer to the vector code
114 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
115 attached to S). This pointer will be used for the vectorization of following
116 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
117 otherwise, we rely on dead code elimination for removing it.
118
119 For example, say stmt S1 was vectorized into stmt VS1:
120
121 VS1: vb = px[i];
122 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
123 S2: a = b;
124
125 To vectorize stmt S2, the vectorizer first finds the stmt that defines
126 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
127 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
128 resulting sequence would be:
129
130 VS1: vb = px[i];
131 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
132 VS2: va = vb;
133 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
134
135 Operands that are not SSA_NAMEs are data-refs that appear in
136 load/store operations (like 'x[i]' in S1), and are handled differently.
137
138 Target modeling:
139 =================
140 Currently the only target specific information that is used is the
141 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
142 Targets that can support different vector sizes will, for now, need to
143 specify a single value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
144 flexibility will be added in the future.
145
146 Since we only vectorize operations whose vector form can be
147 expressed using existing tree codes, to verify that an operation is
148 supported, the vectorizer checks the relevant optab at the relevant
149 machine_mode (e.g., optab_handler (add_optab, V8HImode)). If
150 the value found is CODE_FOR_nothing, then there's no target support, and
151 we can't vectorize the stmt.
152
153 For additional information on this project see:
154 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
155 */
156
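
/* A minimal sketch of the optab query described above, for illustration
   only (not used by the pass).  It assumes a target that provides
   V8HImode, as in the example in the comment; CODE_FOR_nothing means the
   target has no instruction for the operation in that mode, so the stmt
   cannot be vectorized.  */

static inline bool
example_v8hi_add_supported_p (void)
{
  /* Illustrative only: query the handler for a vector add in V8HImode.  */
  return optab_handler (add_optab, V8HImode) != CODE_FOR_nothing;
}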
157 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *,
158 unsigned *);
159 static stmt_vec_info vect_is_simple_reduction (loop_vec_info, stmt_vec_info,
160 bool *, bool *, bool);
161
162 /* Subroutine of vect_determine_vf_for_stmt that handles only one
163 statement. VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
164 may already be set for general statements (not just data refs). */
165
166 static opt_result
167 vect_determine_vf_for_stmt_1 (vec_info *vinfo, stmt_vec_info stmt_info,
168 bool vectype_maybe_set_p,
169 poly_uint64 *vf)
170 {
171 gimple *stmt = stmt_info->stmt;
172
173 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
174 && !STMT_VINFO_LIVE_P (stmt_info))
175 || gimple_clobber_p (stmt))
176 {
177 if (dump_enabled_p ())
178 dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
179 return opt_result::success ();
180 }
181
182 tree stmt_vectype, nunits_vectype;
183 opt_result res = vect_get_vector_types_for_stmt (vinfo, stmt_info,
184 &stmt_vectype,
185 &nunits_vectype);
186 if (!res)
187 return res;
188
189 if (stmt_vectype)
190 {
191 if (STMT_VINFO_VECTYPE (stmt_info))
192 /* The only case when a vectype had been already set is for stmts
193 that contain a data ref, or for "pattern-stmts" (stmts generated
194 by the vectorizer to represent/replace a certain idiom). */
195 gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
196 || vectype_maybe_set_p)
197 && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
198 else
199 STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
200 }
201
202 if (nunits_vectype)
203 vect_update_max_nunits (vf, nunits_vectype);
204
205 return opt_result::success ();
206 }
207
208 /* Subroutine of vect_determine_vectorization_factor. Set the vector
209 types of STMT_INFO and all attached pattern statements and update
210 the vectorization factor VF accordingly. Return true on success
211 or false if something prevented vectorization. */
212
213 static opt_result
214 vect_determine_vf_for_stmt (vec_info *vinfo,
215 stmt_vec_info stmt_info, poly_uint64 *vf)
216 {
217 if (dump_enabled_p ())
218 dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: %G",
219 stmt_info->stmt);
220 opt_result res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, false, vf);
221 if (!res)
222 return res;
223
224 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
225 && STMT_VINFO_RELATED_STMT (stmt_info))
226 {
227 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
228 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
229
230 /* If a pattern statement has def stmts, analyze them too. */
231 for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
232 !gsi_end_p (si); gsi_next (&si))
233 {
234 stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
235 if (dump_enabled_p ())
236 dump_printf_loc (MSG_NOTE, vect_location,
237 "==> examining pattern def stmt: %G",
238 def_stmt_info->stmt);
239 res = vect_determine_vf_for_stmt_1 (vinfo, def_stmt_info, true, vf);
240 if (!res)
241 return res;
242 }
243
244 if (dump_enabled_p ())
245 dump_printf_loc (MSG_NOTE, vect_location,
246 "==> examining pattern statement: %G",
247 stmt_info->stmt);
248 res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, true, vf);
249 if (!res)
250 return res;
251 }
252
253 return opt_result::success ();
254 }
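
/* An illustrative (hypothetical) source loop showing where pattern
   statements come from: idioms such as the dot product below are replaced
   by a pattern stmt (e.g. one based on DOT_PROD_EXPR), and the pattern's
   def sequence is what the STMT_VINFO_PATTERN_DEF_SEQ walk above
   examines.  */

int
example_dot_product_idiom (short *a, short *b, int n)
{
  int sum = 0;
  for (int i = 0; i < n; i++)
    sum += a[i] * b[i];		/* widening multiply-accumulate idiom */
  return sum;
}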
255
256 /* Function vect_determine_vectorization_factor
257
258 Determine the vectorization factor (VF). VF is the number of data elements
259 that are operated upon in parallel in a single iteration of the vectorized
260 loop. For example, when vectorizing a loop that operates on 4-byte elements,
261 on a target with a 16-byte vector size (VS), the VF is set to 4, since 4
262 elements can fit in a single vector register.
263
264 We currently support vectorization of loops in which all types operated upon
265 are of the same size. Therefore this function currently sets VF according to
266 the size of the types operated upon, and fails if there are multiple sizes
267 in the loop.
268
269 VF is also the factor by which the loop iterations are strip-mined, e.g.:
270 original loop:
271 for (i=0; i<N; i++){
272 a[i] = b[i] + c[i];
273 }
274
275 vectorized loop:
276 for (i=0; i<N; i+=VF){
277 a[i:VF] = b[i:VF] + c[i:VF];
278 }
279 */
280
281 static opt_result
282 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
283 {
284 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
285 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
286 unsigned nbbs = loop->num_nodes;
287 poly_uint64 vectorization_factor = 1;
288 tree scalar_type = NULL_TREE;
289 gphi *phi;
290 tree vectype;
291 stmt_vec_info stmt_info;
292 unsigned i;
293
294 DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");
295
296 for (i = 0; i < nbbs; i++)
297 {
298 basic_block bb = bbs[i];
299
300 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
301 gsi_next (&si))
302 {
303 phi = si.phi ();
304 stmt_info = loop_vinfo->lookup_stmt (phi);
305 if (dump_enabled_p ())
306 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: %G",
307 (gimple *) phi);
308
309 gcc_assert (stmt_info);
310
311 if (STMT_VINFO_RELEVANT_P (stmt_info)
312 || STMT_VINFO_LIVE_P (stmt_info))
313 {
314 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
315 scalar_type = TREE_TYPE (PHI_RESULT (phi));
316
317 if (dump_enabled_p ())
318 dump_printf_loc (MSG_NOTE, vect_location,
319 "get vectype for scalar type: %T\n",
320 scalar_type);
321
322 vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
323 if (!vectype)
324 return opt_result::failure_at (phi,
325 "not vectorized: unsupported "
326 "data-type %T\n",
327 scalar_type);
328 STMT_VINFO_VECTYPE (stmt_info) = vectype;
329
330 if (dump_enabled_p ())
331 dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n",
332 vectype);
333
334 if (dump_enabled_p ())
335 {
336 dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
337 dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
338 dump_printf (MSG_NOTE, "\n");
339 }
340
341 vect_update_max_nunits (&vectorization_factor, vectype);
342 }
343 }
344
345 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
346 gsi_next (&si))
347 {
348 if (is_gimple_debug (gsi_stmt (si)))
349 continue;
350 stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
351 opt_result res
352 = vect_determine_vf_for_stmt (loop_vinfo,
353 stmt_info, &vectorization_factor);
354 if (!res)
355 return res;
356 }
357 }
358
359 /* TODO: Analyze cost. Decide if worth while to vectorize. */
360 if (dump_enabled_p ())
361 {
362 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
363 dump_dec (MSG_NOTE, vectorization_factor);
364 dump_printf (MSG_NOTE, "\n");
365 }
366
367 if (known_le (vectorization_factor, 1U))
368 return opt_result::failure_at (vect_location,
369 "not vectorized: unsupported data-type\n");
370 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
371 return opt_result::success ();
372 }
373
374
375 /* Function vect_is_simple_iv_evolution.
376
377 FORNOW: A simple evolution of an induction variable in the loop is
378 considered a polynomial evolution. */
379
380 static bool
381 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
382 tree * step)
383 {
384 tree init_expr;
385 tree step_expr;
386 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
387 basic_block bb;
388
389 /* When there is no evolution in this loop, the evolution function
390 is not "simple". */
391 if (evolution_part == NULL_TREE)
392 return false;
393
394 /* When the evolution is a polynomial of degree >= 2
395 the evolution function is not "simple". */
396 if (tree_is_chrec (evolution_part))
397 return false;
398
399 step_expr = evolution_part;
400 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
401
402 if (dump_enabled_p ())
403 dump_printf_loc (MSG_NOTE, vect_location, "step: %T, init: %T\n",
404 step_expr, init_expr);
405
406 *init = init_expr;
407 *step = step_expr;
408
409 if (TREE_CODE (step_expr) != INTEGER_CST
410 && (TREE_CODE (step_expr) != SSA_NAME
411 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
412 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
413 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
414 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
415 || !flag_associative_math)))
416 && (TREE_CODE (step_expr) != REAL_CST
417 || !flag_associative_math))
418 {
419 if (dump_enabled_p ())
420 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
421 "step unknown.\n");
422 return false;
423 }
424
425 return true;
426 }
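
/* An illustrative (hypothetical) loop whose IVs have the "simple"
   evolution accepted above: the scalar evolution of i is the affine chrec
   {0, +, 1}, and that of p is {p_0, +, 4} on a typical target with
   4-byte int - a loop-invariant initial value plus a constant step per
   iteration.  */

int
example_simple_iv_loop (int *p, int n)
{
  int sum = 0;
  for (int i = 0; i < n; i++, p++)
    sum += *p;			/* p advances by sizeof (int) each iteration */
  return sum;
}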
427
428 /* Function vect_is_nonlinear_iv_evolution
429
430 Only support nonlinear induction for integer types:
431 1. neg
432 2. mul by constant
433 3. lshift/rshift by constant.
434
435 For neg induction, return a fake step as integer -1. */
436 static bool
437 vect_is_nonlinear_iv_evolution (class loop* loop, stmt_vec_info stmt_info,
438 gphi* loop_phi_node, tree *init, tree *step)
439 {
440 tree init_expr, ev_expr, result, op1, op2;
441 gimple* def;
442
443 if (gimple_phi_num_args (loop_phi_node) != 2)
444 return false;
445
446 init_expr = PHI_ARG_DEF_FROM_EDGE (loop_phi_node, loop_preheader_edge (loop));
447 ev_expr = PHI_ARG_DEF_FROM_EDGE (loop_phi_node, loop_latch_edge (loop));
448
449 /* Support nonlinear induction only for integer types. */
450 if (!INTEGRAL_TYPE_P (TREE_TYPE (init_expr)))
451 return false;
452
453 *init = init_expr;
454 result = PHI_RESULT (loop_phi_node);
455
456 if (TREE_CODE (ev_expr) != SSA_NAME
457 || ((def = SSA_NAME_DEF_STMT (ev_expr)), false)
458 || !is_gimple_assign (def))
459 return false;
460
461 enum tree_code t_code = gimple_assign_rhs_code (def);
462 switch (t_code)
463 {
464 case NEGATE_EXPR:
465 if (gimple_assign_rhs1 (def) != result)
466 return false;
467 *step = build_int_cst (TREE_TYPE (init_expr), -1);
468 STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_neg;
469 break;
470
471 case RSHIFT_EXPR:
472 case LSHIFT_EXPR:
473 case MULT_EXPR:
474 op1 = gimple_assign_rhs1 (def);
475 op2 = gimple_assign_rhs2 (def);
476 if (TREE_CODE (op2) != INTEGER_CST
477 || op1 != result)
478 return false;
479 *step = op2;
480 if (t_code == LSHIFT_EXPR)
481 STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_shl;
482 else if (t_code == RSHIFT_EXPR)
483 STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_shr;
484 /* NEGATE_EXPR and MULT_EXPR are both vect_step_op_mul. */
485 else
486 STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_mul;
487 break;
488
489 default:
490 return false;
491 }
492
493 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_info) = *init;
494 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info) = *step;
495
496 return true;
497 }
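
/* Illustrative (hypothetical) loops containing the three nonlinear
   induction shapes accepted above; in each case the latch value is a
   negate, multiply or shift of the PHI result by a constant, giving the
   vect_step_op_neg/mul/shl/shr classifications.  */

void
example_nonlinear_ivs (int *a, int *b, int *c, int n)
{
  int x = 1, y = 1 << 16, z = 3;
  for (int i = 0; i < n; i++)
    {
      a[i] = x; x = -x;		/* neg: fake step -1 */
      b[i] = y; y >>= 1;	/* rshift by constant: step 1 */
      c[i] = z; z *= 5;		/* mul by constant: step 5 */
    }
}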
498
499 /* Return true if PHI, described by STMT_INFO, is the inner PHI in
500 what we are assuming is a double reduction. For example, given
501 a structure like this:
502
503 outer1:
504 x_1 = PHI <x_4(outer2), ...>;
505 ...
506
507 inner:
508 x_2 = PHI <x_1(outer1), ...>;
509 ...
510 x_3 = ...;
511 ...
512
513 outer2:
514 x_4 = PHI <x_3(inner)>;
515 ...
516
517 outer loop analysis would treat x_1 as a double reduction phi and
518 this function would then return true for x_2. */
519
520 static bool
521 vect_inner_phi_in_double_reduction_p (loop_vec_info loop_vinfo, gphi *phi)
522 {
523 use_operand_p use_p;
524 ssa_op_iter op_iter;
525 FOR_EACH_PHI_ARG (use_p, phi, op_iter, SSA_OP_USE)
526 if (stmt_vec_info def_info = loop_vinfo->lookup_def (USE_FROM_PTR (use_p)))
527 if (STMT_VINFO_DEF_TYPE (def_info) == vect_double_reduction_def)
528 return true;
529 return false;
530 }
531
532 /* Returns true if PHI is a first-order recurrence. A first-order
533 recurrence is a non-reduction recurrence relation in which the value of
534 the recurrence in the current loop iteration equals a value defined in
535 the previous iteration. */
536
537 static bool
538 vect_phi_first_order_recurrence_p (loop_vec_info loop_vinfo, class loop *loop,
539 gphi *phi)
540 {
541 /* A nested cycle isn't vectorizable as a first-order recurrence. */
542 if (LOOP_VINFO_LOOP (loop_vinfo) != loop)
543 return false;
544
545 /* Ensure the loop latch definition is from within the loop. */
546 edge latch = loop_latch_edge (loop);
547 tree ldef = PHI_ARG_DEF_FROM_EDGE (phi, latch);
548 if (TREE_CODE (ldef) != SSA_NAME
549 || SSA_NAME_IS_DEFAULT_DEF (ldef)
550 || is_a <gphi *> (SSA_NAME_DEF_STMT (ldef))
551 || !flow_bb_inside_loop_p (loop, gimple_bb (SSA_NAME_DEF_STMT (ldef))))
552 return false;
553
554 tree def = gimple_phi_result (phi);
555
556 /* Ensure every use_stmt of the phi node is dominated by the latch
557 definition. */
558 imm_use_iterator imm_iter;
559 use_operand_p use_p;
560 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, def)
561 if (!is_gimple_debug (USE_STMT (use_p))
562 && (SSA_NAME_DEF_STMT (ldef) == USE_STMT (use_p)
563 || !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (ldef),
564 USE_STMT (use_p))))
565 return false;
566
567 /* First-order recurrence autovectorization needs vector shuffles, so require a suitable vector type. */
568 tree scalar_type = TREE_TYPE (def);
569 tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
570 if (!vectype)
571 return false;
572
573 return true;
574 }
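
/* An illustrative (hypothetical) first-order recurrence: the value of
   "prev" used in iteration i is the value defined in iteration i-1, and
   it does not feed a reduction, matching the conditions checked above.  */

void
example_first_order_recurrence (int *a, int *b, int n)
{
  int prev = 0;
  for (int i = 0; i < n; i++)
    {
      b[i] = a[i] + prev;	/* uses the previous iteration's a[i] */
      prev = a[i];
    }
}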
575
576 /* Function vect_analyze_scalar_cycles_1.
577
578 Examine the cross iteration def-use cycles of scalar variables
579 in LOOP. LOOP_VINFO represents the loop that is now being
580 considered for vectorization (can be LOOP, or an outer-loop
581 enclosing LOOP). SLP indicates whether there will be subsequent
582 SLP analyses. */
583
584 static void
585 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, class loop *loop,
586 bool slp)
587 {
588 basic_block bb = loop->header;
589 tree init, step;
590 auto_vec<stmt_vec_info, 64> worklist;
591 gphi_iterator gsi;
592 bool double_reduc, reduc_chain;
593
594 DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
595
596 /* First - identify all inductions. Reduction detection assumes that all the
597 inductions have been identified; therefore, this order must not be
598 changed. */
599 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
600 {
601 gphi *phi = gsi.phi ();
602 tree access_fn = NULL;
603 tree def = PHI_RESULT (phi);
604 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);
605
606 if (dump_enabled_p ())
607 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
608 (gimple *) phi);
609
610 /* Skip virtual PHIs. The data dependences that are associated with
611 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
612 if (virtual_operand_p (def))
613 continue;
614
615 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
616
617 /* Analyze the evolution function. */
618 access_fn = analyze_scalar_evolution (loop, def);
619 if (access_fn)
620 {
621 STRIP_NOPS (access_fn);
622 if (dump_enabled_p ())
623 dump_printf_loc (MSG_NOTE, vect_location,
624 "Access function of PHI: %T\n", access_fn);
625 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
626 = initial_condition_in_loop_num (access_fn, loop->num);
627 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
628 = evolution_part_in_loop_num (access_fn, loop->num);
629 }
630
631 if ((!access_fn
632 || vect_inner_phi_in_double_reduction_p (loop_vinfo, phi)
633 || !vect_is_simple_iv_evolution (loop->num, access_fn,
634 &init, &step)
635 || (LOOP_VINFO_LOOP (loop_vinfo) != loop
636 && TREE_CODE (step) != INTEGER_CST))
637 /* Only handle nonlinear IVs for the loop being vectorized itself. */
638 && (LOOP_VINFO_LOOP (loop_vinfo) != loop
639 || !vect_is_nonlinear_iv_evolution (loop, stmt_vinfo,
640 phi, &init, &step)))
641 {
642 worklist.safe_push (stmt_vinfo);
643 continue;
644 }
645
646 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
647 != NULL_TREE);
648 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
649
650 if (dump_enabled_p ())
651 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
652 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
653 }
654
655
656 /* Second - identify all reductions and nested cycles. */
657 while (worklist.length () > 0)
658 {
659 stmt_vec_info stmt_vinfo = worklist.pop ();
660 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
661 tree def = PHI_RESULT (phi);
662
663 if (dump_enabled_p ())
664 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
665 (gimple *) phi);
666
667 gcc_assert (!virtual_operand_p (def)
668 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
669
670 stmt_vec_info reduc_stmt_info
671 = vect_is_simple_reduction (loop_vinfo, stmt_vinfo, &double_reduc,
672 &reduc_chain, slp);
673 if (reduc_stmt_info)
674 {
675 STMT_VINFO_REDUC_DEF (stmt_vinfo) = reduc_stmt_info;
676 STMT_VINFO_REDUC_DEF (reduc_stmt_info) = stmt_vinfo;
677 if (double_reduc)
678 {
679 if (dump_enabled_p ())
680 dump_printf_loc (MSG_NOTE, vect_location,
681 "Detected double reduction.\n");
682
683 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
684 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_double_reduction_def;
685 }
686 else
687 {
688 if (loop != LOOP_VINFO_LOOP (loop_vinfo))
689 {
690 if (dump_enabled_p ())
691 dump_printf_loc (MSG_NOTE, vect_location,
692 "Detected vectorizable nested cycle.\n");
693
694 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
695 }
696 else
697 {
698 if (dump_enabled_p ())
699 dump_printf_loc (MSG_NOTE, vect_location,
700 "Detected reduction.\n");
701
702 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
703 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
704 /* Store the reduction cycles for possible vectorization in
705 loop-aware SLP if it was not detected as reduction
706 chain. */
707 if (! reduc_chain)
708 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push
709 (reduc_stmt_info);
710 }
711 }
712 }
713 else if (vect_phi_first_order_recurrence_p (loop_vinfo, loop, phi))
714 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_first_order_recurrence;
715 else
716 if (dump_enabled_p ())
717 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
718 "Unknown def-use cycle pattern.\n");
719 }
720 }
721
722
723 /* Function vect_analyze_scalar_cycles.
724
725 Examine the cross iteration def-use cycles of scalar variables, by
726 analyzing the loop-header PHIs of scalar variables. Classify each
727 cycle as one of the following: invariant, induction, reduction, unknown.
728 We do that for the loop represented by LOOP_VINFO, and also for its
729 inner loop, if it exists.
730 Examples for scalar cycles:
731
732 Example1: reduction:
733
734 loop1:
735 for (i=0; i<N; i++)
736 sum += a[i];
737
738 Example2: induction:
739
740 loop2:
741 for (i=0; i<N; i++)
742 a[i] = i; */
743
744 static void
745 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo, bool slp)
746 {
747 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
748
749 vect_analyze_scalar_cycles_1 (loop_vinfo, loop, slp);
750
751 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
752 Reductions in such inner-loop therefore have different properties than
753 the reductions in the nest that gets vectorized:
754 1. When vectorized, they are executed in the same order as in the original
755 scalar loop, so we can't change the order of computation when
756 vectorizing them.
757 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
758 current checks are too strict. */
759
760 if (loop->inner)
761 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner, slp);
762 }
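
/* An illustrative (hypothetical) double reduction, i.e. a reduction whose
   def-use cycle spans both loops of the nest.  When the outer loop is
   analyzed, the outer PHI of "sum" is classified as
   vect_double_reduction_def and the inner PHI is the case recognized by
   vect_inner_phi_in_double_reduction_p.  */

int
example_double_reduction (int (*a)[64], int n)
{
  int sum = 0;
  for (int i = 0; i < n; i++)
    for (int j = 0; j < 64; j++)
      sum += a[i][j];
  return sum;
}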
763
764 /* Transfer group and reduction information from STMT_INFO to its
765 pattern stmt. */
766
767 static void
768 vect_fixup_reduc_chain (stmt_vec_info stmt_info)
769 {
770 stmt_vec_info firstp = STMT_VINFO_RELATED_STMT (stmt_info);
771 stmt_vec_info stmtp;
772 gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp)
773 && REDUC_GROUP_FIRST_ELEMENT (stmt_info));
774 REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info);
775 do
776 {
777 stmtp = STMT_VINFO_RELATED_STMT (stmt_info);
778 gcc_checking_assert (STMT_VINFO_DEF_TYPE (stmtp)
779 == STMT_VINFO_DEF_TYPE (stmt_info));
780 REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp;
781 stmt_info = REDUC_GROUP_NEXT_ELEMENT (stmt_info);
782 if (stmt_info)
783 REDUC_GROUP_NEXT_ELEMENT (stmtp)
784 = STMT_VINFO_RELATED_STMT (stmt_info);
785 }
786 while (stmt_info);
787 }
788
789 /* Fixup scalar cycles that now have their stmts detected as patterns. */
790
791 static void
792 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
793 {
794 stmt_vec_info first;
795 unsigned i;
796
797 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
798 {
799 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
800 while (next)
801 {
802 if ((STMT_VINFO_IN_PATTERN_P (next)
803 != STMT_VINFO_IN_PATTERN_P (first))
804 || STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (next)) == -1)
805 break;
806 next = REDUC_GROUP_NEXT_ELEMENT (next);
807 }
808 /* If all reduction chain members are well-formed patterns adjust
809 the group to group the pattern stmts instead. */
810 if (! next
811 && STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (first)) != -1)
812 {
813 if (STMT_VINFO_IN_PATTERN_P (first))
814 {
815 vect_fixup_reduc_chain (first);
816 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
817 = STMT_VINFO_RELATED_STMT (first);
818 }
819 }
820 /* If not all stmt in the chain are patterns or if we failed
821 to update STMT_VINFO_REDUC_IDX dissolve the chain and handle
822 it as regular reduction instead. */
823 else
824 {
825 stmt_vec_info vinfo = first;
826 stmt_vec_info last = NULL;
827 while (vinfo)
828 {
829 next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
830 REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
831 REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
832 last = vinfo;
833 vinfo = next;
834 }
835 STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize (first))
836 = vect_internal_def;
837 loop_vinfo->reductions.safe_push (vect_stmt_to_vectorize (last));
838 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).unordered_remove (i);
839 --i;
840 }
841 }
842 }
843
844 /* Function vect_get_loop_niters.
845
846 Determine the number of iterations the loop executes and place it
847 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
848 in NUMBER_OF_ITERATIONSM1. Place the condition under which the
849 niter information holds in ASSUMPTIONS.
850
851 Return the loop exit condition. */
852
853
854 static gcond *
855 vect_get_loop_niters (class loop *loop, tree *assumptions,
856 tree *number_of_iterations, tree *number_of_iterationsm1)
857 {
858 edge exit = single_exit (loop);
859 class tree_niter_desc niter_desc;
860 tree niter_assumptions, niter, may_be_zero;
861 gcond *cond = get_loop_exit_condition (loop);
862
863 *assumptions = boolean_true_node;
864 *number_of_iterationsm1 = chrec_dont_know;
865 *number_of_iterations = chrec_dont_know;
866 DUMP_VECT_SCOPE ("get_loop_niters");
867
868 if (!exit)
869 return cond;
870
871 may_be_zero = NULL_TREE;
872 if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
873 || chrec_contains_undetermined (niter_desc.niter))
874 return cond;
875
876 niter_assumptions = niter_desc.assumptions;
877 may_be_zero = niter_desc.may_be_zero;
878 niter = niter_desc.niter;
879
880 if (may_be_zero && integer_zerop (may_be_zero))
881 may_be_zero = NULL_TREE;
882
883 if (may_be_zero)
884 {
885 if (COMPARISON_CLASS_P (may_be_zero))
886 {
887 /* Try to combine may_be_zero with assumptions, this can simplify
888 computation of niter expression. */
889 if (niter_assumptions && !integer_nonzerop (niter_assumptions))
890 niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
891 niter_assumptions,
892 fold_build1 (TRUTH_NOT_EXPR,
893 boolean_type_node,
894 may_be_zero));
895 else
896 niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
897 build_int_cst (TREE_TYPE (niter), 0),
898 rewrite_to_non_trapping_overflow (niter));
899
900 may_be_zero = NULL_TREE;
901 }
902 else if (integer_nonzerop (may_be_zero))
903 {
904 *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
905 *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
906 return cond;
907 }
908 else
909 return cond;
910 }
911
912 *assumptions = niter_assumptions;
913 *number_of_iterationsm1 = niter;
914
915 /* We want the number of loop header executions which is the number
916 of latch executions plus one.
917 ??? For UINT_MAX latch executions this number overflows to zero
918 for loops like do { n++; } while (n != 0); */
919 if (niter && !chrec_contains_undetermined (niter))
920 niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
921 build_int_cst (TREE_TYPE (niter), 1));
922 *number_of_iterations = niter;
923
924 return cond;
925 }
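
/* An illustrative (hypothetical) countable loop for the analysis above:
   niter analysis reports n - 1 latch executions (NUMBER_OF_ITERATIONSM1)
   together with an "n might be zero" condition, and the code above adds 1
   to obtain the n header executions stored in NUMBER_OF_ITERATIONS.  */

unsigned
example_countable_loop (unsigned char *a, unsigned n)
{
  unsigned sum = 0;
  for (unsigned i = 0; i < n; i++)
    sum += a[i];		/* loop entered only when n > 0 */
  return sum;
}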
926
927 /* Function bb_in_loop_p
928
929 Used as predicate for dfs order traversal of the loop bbs. */
930
931 static bool
932 bb_in_loop_p (const_basic_block bb, const void *data)
933 {
934 const class loop *const loop = (const class loop *)data;
935 if (flow_bb_inside_loop_p (loop, bb))
936 return true;
937 return false;
938 }
939
940
941 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
942 stmt_vec_info structs for all the stmts in LOOP_IN. */
943
944 _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
945 : vec_info (vec_info::loop, shared),
946 loop (loop_in),
947 bbs (XCNEWVEC (basic_block, loop->num_nodes)),
948 num_itersm1 (NULL_TREE),
949 num_iters (NULL_TREE),
950 num_iters_unchanged (NULL_TREE),
951 num_iters_assumptions (NULL_TREE),
952 vector_costs (nullptr),
953 scalar_costs (nullptr),
954 th (0),
955 versioning_threshold (0),
956 vectorization_factor (0),
957 main_loop_edge (nullptr),
958 skip_main_loop_edge (nullptr),
959 skip_this_loop_edge (nullptr),
960 reusable_accumulators (),
961 suggested_unroll_factor (1),
962 max_vectorization_factor (0),
963 mask_skip_niters (NULL_TREE),
964 rgroup_compare_type (NULL_TREE),
965 simd_if_cond (NULL_TREE),
966 unaligned_dr (NULL),
967 peeling_for_alignment (0),
968 ptr_mask (0),
969 ivexpr_map (NULL),
970 scan_map (NULL),
971 slp_unrolling_factor (1),
972 inner_loop_cost_factor (param_vect_inner_loop_cost_factor),
973 vectorizable (false),
974 can_use_partial_vectors_p (param_vect_partial_vector_usage != 0),
975 using_partial_vectors_p (false),
976 epil_using_partial_vectors_p (false),
977 partial_load_store_bias (0),
978 peeling_for_gaps (false),
979 peeling_for_niter (false),
980 no_data_dependencies (false),
981 has_mask_store (false),
982 scalar_loop_scaling (profile_probability::uninitialized ()),
983 scalar_loop (NULL),
984 orig_loop_info (NULL)
985 {
986 /* CHECKME: We want to visit all BBs before their successors (except for
987 latch blocks, for which this assertion wouldn't hold). In the simple
988 case of the loop forms we allow, a dfs order of the BBs would be the same
989 as reversed postorder traversal, so we are safe. */
990
991 unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
992 bbs, loop->num_nodes, loop);
993 gcc_assert (nbbs == loop->num_nodes);
994
995 for (unsigned int i = 0; i < nbbs; i++)
996 {
997 basic_block bb = bbs[i];
998 gimple_stmt_iterator si;
999
1000 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1001 {
1002 gimple *phi = gsi_stmt (si);
1003 gimple_set_uid (phi, 0);
1004 add_stmt (phi);
1005 }
1006
1007 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1008 {
1009 gimple *stmt = gsi_stmt (si);
1010 gimple_set_uid (stmt, 0);
1011 if (is_gimple_debug (stmt))
1012 continue;
1013 add_stmt (stmt);
1014 /* If the .GOMP_SIMD_LANE call for the current loop has 3 arguments, the
1015 third argument is the #pragma omp simd if (x) condition: when it is 0,
1016 the loop shouldn't be vectorized; when it is a non-zero constant, it
1017 should be vectorized normally; otherwise the loop is versioned, with
1018 the vectorized copy taken when the condition is non-zero at runtime. */
1019 if (loop_in->simduid
1020 && is_gimple_call (stmt)
1021 && gimple_call_internal_p (stmt)
1022 && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
1023 && gimple_call_num_args (stmt) >= 3
1024 && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
1025 && (loop_in->simduid
1026 == SSA_NAME_VAR (gimple_call_arg (stmt, 0))))
1027 {
1028 tree arg = gimple_call_arg (stmt, 2);
1029 if (integer_zerop (arg) || TREE_CODE (arg) == SSA_NAME)
1030 simd_if_cond = arg;
1031 else
1032 gcc_assert (integer_nonzerop (arg));
1033 }
1034 }
1035 }
1036
1037 epilogue_vinfos.create (6);
1038 }
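
/* An illustrative (hypothetical) loop carrying the "simd if" condition
   handled above: for a non-constant C the loop gets versioned and the
   vectorized copy runs only when C is non-zero at run time, while
   "if (0)" disables vectorization entirely.  */

void
example_simd_if (int *a, int n, int c)
{
#pragma omp simd if (c)
  for (int i = 0; i < n; i++)
    a[i] += 1;
}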
1039
1040 /* Free all levels of rgroup CONTROLS. */
1041
1042 void
1043 release_vec_loop_controls (vec<rgroup_controls> *controls)
1044 {
1045 rgroup_controls *rgc;
1046 unsigned int i;
1047 FOR_EACH_VEC_ELT (*controls, i, rgc)
1048 rgc->controls.release ();
1049 controls->release ();
1050 }
1051
1052 /* Free all memory used by the _loop_vec_info, as well as all the
1053 stmt_vec_info structs of all the stmts in the loop. */
1054
1055 _loop_vec_info::~_loop_vec_info ()
1056 {
1057 free (bbs);
1058
1059 release_vec_loop_controls (&masks);
1060 release_vec_loop_controls (&lens);
1061 delete ivexpr_map;
1062 delete scan_map;
1063 epilogue_vinfos.release ();
1064 delete scalar_costs;
1065 delete vector_costs;
1066
1067 /* When we release an epilogue vinfo that we do not intend to use,
1068 avoid clearing AUX of the main loop which should continue to
1069 point to the main loop vinfo since otherwise we'll leak that. */
1070 if (loop->aux == this)
1071 loop->aux = NULL;
1072 }
1073
1074 /* Return an invariant or register for EXPR and emit necessary
1075 computations in the LOOP_VINFO loop preheader. */
1076
1077 tree
1078 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
1079 {
1080 if (is_gimple_reg (expr)
1081 || is_gimple_min_invariant (expr))
1082 return expr;
1083
1084 if (! loop_vinfo->ivexpr_map)
1085 loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
1086 tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
1087 if (! cached)
1088 {
1089 gimple_seq stmts = NULL;
1090 cached = force_gimple_operand (unshare_expr (expr),
1091 &stmts, true, NULL_TREE);
1092 if (stmts)
1093 {
1094 edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1095 gsi_insert_seq_on_edge_immediate (e, stmts);
1096 }
1097 }
1098 return cached;
1099 }
1100
1101 /* Return true if we can use CMP_TYPE as the comparison type to produce
1102 all masks required to mask LOOP_VINFO. */
1103
1104 static bool
1105 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
1106 {
1107 rgroup_controls *rgm;
1108 unsigned int i;
1109 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
1110 if (rgm->type != NULL_TREE
1111 && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
1112 cmp_type, rgm->type,
1113 OPTIMIZE_FOR_SPEED))
1114 return false;
1115 return true;
1116 }
1117
1118 /* Calculate the maximum number of scalars per iteration for every
1119 rgroup in LOOP_VINFO. */
1120
1121 static unsigned int
1122 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
1123 {
1124 unsigned int res = 1;
1125 unsigned int i;
1126 rgroup_controls *rgm;
1127 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
1128 res = MAX (res, rgm->max_nscalars_per_iter);
1129 return res;
1130 }
1131
1132 /* Calculate the minimum precision necessary to represent:
1133
1134 MAX_NITERS * FACTOR
1135
1136 as an unsigned integer, where MAX_NITERS is the maximum number of
1137 loop header iterations for the original scalar form of LOOP_VINFO. */
1138
1139 static unsigned
1140 vect_min_prec_for_max_niters (loop_vec_info loop_vinfo, unsigned int factor)
1141 {
1142 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1143
1144 /* Get the maximum number of iterations that is representable
1145 in the counter type. */
1146 tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
1147 widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
1148
1149 /* Get a more refined estimate for the number of iterations. */
1150 widest_int max_back_edges;
1151 if (max_loop_iterations (loop, &max_back_edges))
1152 max_ni = wi::smin (max_ni, max_back_edges + 1);
1153
1154 /* Work out how many bits we need to represent the limit. */
1155 return wi::min_precision (max_ni * factor, UNSIGNED);
1156 }
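
/* A minimal sketch of the computation above for values that fit in 64
   bits, for illustration only: e.g. a refined bound of 1000 latch
   iterations (1001 header iterations) and FACTOR 4 give a product of
   4004, which needs 12 bits.  */

static inline unsigned
example_min_prec (unsigned long long max_ni, unsigned factor)
{
  unsigned long long limit = max_ni * factor;	/* assumed not to overflow */
  unsigned prec = 0;
  while (limit)
    {
      prec++;			/* count the bits needed for LIMIT */
      limit >>= 1;
    }
  return prec;
}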
1157
1158 /* True if the loop needs peeling or partial vectors when vectorized. */
1159
1160 static bool
1161 vect_need_peeling_or_partial_vectors_p (loop_vec_info loop_vinfo)
1162 {
1163 unsigned HOST_WIDE_INT const_vf;
1164 HOST_WIDE_INT max_niter
1165 = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1166
1167 unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
1168 if (!th && LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
1169 th = LOOP_VINFO_COST_MODEL_THRESHOLD (LOOP_VINFO_ORIG_LOOP_INFO
1170 (loop_vinfo));
1171
1172 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1173 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
1174 {
1175 /* Work out the (constant) number of iterations that need to be
1176 peeled for reasons other than niters. */
1177 unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1178 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
1179 peel_niter += 1;
1180 if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
1181 LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1182 return true;
1183 }
1184 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1185 /* ??? When peeling for gaps but not alignment, we could
1186 try to check whether the (variable) niters is known to be
1187 VF * N + 1. That's something of a niche case though. */
1188 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1189 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
1190 || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
1191 < (unsigned) exact_log2 (const_vf))
1192 /* In case of versioning, check if the maximum number of
1193 iterations is greater than th. If they are identical,
1194 the epilogue is unnecessary. */
1195 && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
1196 || ((unsigned HOST_WIDE_INT) max_niter
1197 > (th / const_vf) * const_vf))))
1198 return true;
1199
1200 return false;
1201 }
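
/* A minimal sketch of the constant-niters case above, for illustration
   only: with no alignment or gap peeling, 100 iterations at VF 8 leave
   100 % 8 == 4 iterations for an epilogue or for partial vectors,
   whereas 96 iterations need neither.  */

static inline bool
example_needs_peeling_p (unsigned long long niters, unsigned vf,
			 unsigned peel_niter)
{
  return (niters - peel_niter) % vf != 0;
}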
1202
1203 /* Each statement in LOOP_VINFO can be masked where necessary. Check
1204 whether we can actually generate the masks required. Return true if so,
1205 storing the type of the scalar IV in LOOP_VINFO_RGROUP_COMPARE_TYPE. */
1206
1207 static bool
1208 vect_verify_full_masking (loop_vec_info loop_vinfo)
1209 {
1210 unsigned int min_ni_width;
1211 unsigned int max_nscalars_per_iter
1212 = vect_get_max_nscalars_per_iter (loop_vinfo);
1213
1214 /* Use a normal loop if there are no statements that need masking.
1215 This only happens in rare degenerate cases: it means that the loop
1216 has no loads, no stores, and no live-out values. */
1217 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1218 return false;
1219
1220 /* Work out how many bits we need to represent the limit. */
1221 min_ni_width
1222 = vect_min_prec_for_max_niters (loop_vinfo, max_nscalars_per_iter);
1223
1224 /* Find a scalar mode for which WHILE_ULT is supported. */
1225 opt_scalar_int_mode cmp_mode_iter;
1226 tree cmp_type = NULL_TREE;
1227 tree iv_type = NULL_TREE;
1228 widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
1229 unsigned int iv_precision = UINT_MAX;
1230
1231 if (iv_limit != -1)
1232 iv_precision = wi::min_precision (iv_limit * max_nscalars_per_iter,
1233 UNSIGNED);
1234
1235 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1236 {
1237 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1238 if (cmp_bits >= min_ni_width
1239 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1240 {
1241 tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1242 if (this_type
1243 && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1244 {
1245 /* Although we could stop as soon as we find a valid mode,
1246 there are at least two reasons why that's not always the
1247 best choice:
1248
1249 - An IV that's Pmode or wider is more likely to be reusable
1250 in address calculations than an IV that's narrower than
1251 Pmode.
1252
1253 - Doing the comparison in IV_PRECISION or wider allows
1254 a natural 0-based IV, whereas using a narrower comparison
1255 type requires mitigations against wrap-around.
1256
1257 Conversely, if the IV limit is variable, doing the comparison
1258 in a wider type than the original type can introduce
1259 unnecessary extensions, so picking the widest valid mode
1260 is not always a good choice either.
1261
1262 Here we prefer the first IV type that's Pmode or wider,
1263 and the first comparison type that's IV_PRECISION or wider.
1264 (The comparison type must be no wider than the IV type,
1265 to avoid extensions in the vector loop.)
1266
1267 ??? We might want to try continuing beyond Pmode for ILP32
1268 targets if CMP_BITS < IV_PRECISION. */
1269 iv_type = this_type;
1270 if (!cmp_type || iv_precision > TYPE_PRECISION (cmp_type))
1271 cmp_type = this_type;
1272 if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1273 break;
1274 }
1275 }
1276 }
1277
1278 if (!cmp_type)
1279 return false;
1280
1281 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = cmp_type;
1282 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1283 return true;
1284 }
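
/* An illustrative (hypothetical) scalar model of a fully-masked loop:
   every vector iteration processes VF lanes under a mask derived from
   the remaining iteration count, which is what the WHILE_ULT-based
   rgroup masks verified above implement.  */

void
example_fully_masked_copy (int *d, int *s, unsigned n)
{
  const unsigned vf = 4;	/* hypothetical vectorization factor */
  for (unsigned i = 0; i < n; i += vf)
    for (unsigned lane = 0; lane < vf; lane++)
      if (i + lane < n)		/* the per-lane mask */
	d[i + lane] = s[i + lane];
}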
1285
1286 /* Check whether we can use vector accesses with length, based on a precision
1287 comparison. So far, to keep it simple, we only allow the case in which the
1288 precision of the target-supported length is larger than the precision
1289 required by the loop niters. */
1290
1291 static bool
1292 vect_verify_loop_lens (loop_vec_info loop_vinfo)
1293 {
1294 if (LOOP_VINFO_LENS (loop_vinfo).is_empty ())
1295 return false;
1296
1297 machine_mode len_load_mode = get_len_load_store_mode
1298 (loop_vinfo->vector_mode, true).require ();
1299 machine_mode len_store_mode = get_len_load_store_mode
1300 (loop_vinfo->vector_mode, false).require ();
1301
1302 signed char partial_load_bias = internal_len_load_store_bias
1303 (IFN_LEN_LOAD, len_load_mode);
1304
1305 signed char partial_store_bias = internal_len_load_store_bias
1306 (IFN_LEN_STORE, len_store_mode);
1307
1308 gcc_assert (partial_load_bias == partial_store_bias);
1309
1310 if (partial_load_bias == VECT_PARTIAL_BIAS_UNSUPPORTED)
1311 return false;
1312
1313 /* If the backend requires a bias of -1 for LEN_LOAD, we must not emit
1314 len_loads with a length of zero. In order to avoid that we prohibit
1315 more than one loop length here. */
1316 if (partial_load_bias == -1
1317 && LOOP_VINFO_LENS (loop_vinfo).length () > 1)
1318 return false;
1319
1320 LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) = partial_load_bias;
1321
1322 unsigned int max_nitems_per_iter = 1;
1323 unsigned int i;
1324 rgroup_controls *rgl;
1325 /* Find the maximum number of items per iteration for every rgroup. */
1326 FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), i, rgl)
1327 {
1328 unsigned nitems_per_iter = rgl->max_nscalars_per_iter * rgl->factor;
1329 max_nitems_per_iter = MAX (max_nitems_per_iter, nitems_per_iter);
1330 }
1331
1332 /* Work out how many bits we need to represent the length limit. */
1333 unsigned int min_ni_prec
1334 = vect_min_prec_for_max_niters (loop_vinfo, max_nitems_per_iter);
1335
1336 /* Now use the maximum of the precisions below for one suitable IV type:
1337 - the IV's natural precision
1338 - the precision needed to hold: the maximum number of scalar
1339 iterations multiplied by the scale factor (min_ni_prec above)
1340 - the Pmode precision
1341
1342 If min_ni_prec is less than the precision of the current niters,
1343 we prefer to still use the niters type. Prefer to use Pmode and
1344 a wider IV to avoid narrow conversions. */
1345
1346 unsigned int ni_prec
1347 = TYPE_PRECISION (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)));
1348 min_ni_prec = MAX (min_ni_prec, ni_prec);
1349 min_ni_prec = MAX (min_ni_prec, GET_MODE_BITSIZE (Pmode));
1350
1351 tree iv_type = NULL_TREE;
1352 opt_scalar_int_mode tmode_iter;
1353 FOR_EACH_MODE_IN_CLASS (tmode_iter, MODE_INT)
1354 {
1355 scalar_mode tmode = tmode_iter.require ();
1356 unsigned int tbits = GET_MODE_BITSIZE (tmode);
1357
1358 /* ??? Do we really want to construct one IV whose precision exceeds
1359 BITS_PER_WORD? */
1360 if (tbits > BITS_PER_WORD)
1361 break;
1362
1363 /* Find the first available standard integral type. */
1364 if (tbits >= min_ni_prec && targetm.scalar_mode_supported_p (tmode))
1365 {
1366 iv_type = build_nonstandard_integer_type (tbits, true);
1367 break;
1368 }
1369 }
1370
1371 if (!iv_type)
1372 {
1373 if (dump_enabled_p ())
1374 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1375 "can't vectorize with length-based partial vectors"
1376 " because there is no suitable iv type.\n");
1377 return false;
1378 }
1379
1380 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = iv_type;
1381 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1382
1383 return true;
1384 }
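
/* An illustrative (hypothetical) scalar model of the length-based
   alternative: each vector iteration operates on MIN (VF, remaining)
   elements, the quantity that the LEN_LOAD/LEN_STORE lengths (possibly
   adjusted by the bias above) describe.  */

void
example_len_based_copy (int *d, int *s, unsigned n)
{
  const unsigned vf = 4;	/* hypothetical vectorization factor */
  for (unsigned i = 0; i < n; i += vf)
    {
      unsigned len = n - i < vf ? n - i : vf;
      for (unsigned lane = 0; lane < len; lane++)
	d[i + lane] = s[i + lane];
    }
}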
1385
1386 /* Calculate the cost of one scalar iteration of the loop. */
1387 static void
1388 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1389 {
1390 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1391 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1392 int nbbs = loop->num_nodes, factor;
1393 int innerloop_iters, i;
1394
1395 DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");
1396
1397 /* Gather costs for statements in the scalar loop. */
1398
1399 /* FORNOW. */
1400 innerloop_iters = 1;
1401 if (loop->inner)
1402 innerloop_iters = LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo);
1403
1404 for (i = 0; i < nbbs; i++)
1405 {
1406 gimple_stmt_iterator si;
1407 basic_block bb = bbs[i];
1408
1409 if (bb->loop_father == loop->inner)
1410 factor = innerloop_iters;
1411 else
1412 factor = 1;
1413
1414 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1415 {
1416 gimple *stmt = gsi_stmt (si);
1417 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
1418
1419 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1420 continue;
1421
1422 /* Skip stmts that are not vectorized inside the loop. */
1423 stmt_vec_info vstmt_info = vect_stmt_to_vectorize (stmt_info);
1424 if (!STMT_VINFO_RELEVANT_P (vstmt_info)
1425 && (!STMT_VINFO_LIVE_P (vstmt_info)
1426 || !VECTORIZABLE_CYCLE_DEF
1427 (STMT_VINFO_DEF_TYPE (vstmt_info))))
1428 continue;
1429
1430 vect_cost_for_stmt kind;
1431 if (STMT_VINFO_DATA_REF (stmt_info))
1432 {
1433 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1434 kind = scalar_load;
1435 else
1436 kind = scalar_store;
1437 }
1438 else if (vect_nop_conversion_p (stmt_info))
1439 continue;
1440 else
1441 kind = scalar_stmt;
1442
1443 /* We are using vect_prologue here to avoid scaling twice
1444 by the inner loop factor. */
1445 record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1446 factor, kind, stmt_info, 0, vect_prologue);
1447 }
1448 }
1449
1450 /* Now accumulate cost. */
1451 loop_vinfo->scalar_costs = init_cost (loop_vinfo, true);
1452 add_stmt_costs (loop_vinfo->scalar_costs,
1453 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo));
1454 loop_vinfo->scalar_costs->finish_cost (nullptr);
1455 }
1456
1457
1458 /* Function vect_analyze_loop_form.
1459
1460 Verify that certain CFG restrictions hold, including:
1461 - the loop has a pre-header
1462 - the loop has a single entry and exit
1463 - the loop exit condition is simple enough
1464 - the number of iterations can be analyzed, i.e., it is a countable loop.
1465 The niter may only be analyzable under some assumptions. */
1466
1467 opt_result
1468 vect_analyze_loop_form (class loop *loop, vect_loop_form_info *info)
1469 {
1470 DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1471
1472 /* Different restrictions apply when we are considering an inner-most loop,
1473 vs. an outer (nested) loop.
1474 (FORNOW. May want to relax some of these restrictions in the future). */
1475
1476 info->inner_loop_cond = NULL;
1477 if (!loop->inner)
1478 {
1479 /* Inner-most loop. We currently require that the number of BBs is
1480 exactly 2 (the header and latch). Vectorizable inner-most loops
1481 look like this:
1482
1483 (pre-header)
1484 |
1485 header <--------+
1486 | | |
1487 | +--> latch --+
1488 |
1489 (exit-bb) */
1490
1491 if (loop->num_nodes != 2)
1492 return opt_result::failure_at (vect_location,
1493 "not vectorized:"
1494 " control flow in loop.\n");
1495
1496 if (empty_block_p (loop->header))
1497 return opt_result::failure_at (vect_location,
1498 "not vectorized: empty loop.\n");
1499 }
1500 else
1501 {
1502 class loop *innerloop = loop->inner;
1503 edge entryedge;
1504
1505 /* Nested loop. We currently require that the loop is doubly-nested,
1506 contains a single inner loop, and the number of BBs is exactly 5.
1507 Vectorizable outer-loops look like this:
1508
1509 (pre-header)
1510 |
1511 header <---+
1512 | |
1513 inner-loop |
1514 | |
1515 tail ------+
1516 |
1517 (exit-bb)
1518
1519 The inner-loop has the properties expected of inner-most loops
1520 as described above. */
1521
1522 if ((loop->inner)->inner || (loop->inner)->next)
1523 return opt_result::failure_at (vect_location,
1524 "not vectorized:"
1525 " multiple nested loops.\n");
1526
1527 if (loop->num_nodes != 5)
1528 return opt_result::failure_at (vect_location,
1529 "not vectorized:"
1530 " control flow in loop.\n");
1531
1532 entryedge = loop_preheader_edge (innerloop);
1533 if (entryedge->src != loop->header
1534 || !single_exit (innerloop)
1535 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1536 return opt_result::failure_at (vect_location,
1537 "not vectorized:"
1538 " unsupported outerloop form.\n");
1539
1540 /* Analyze the inner-loop. */
1541 vect_loop_form_info inner;
1542 opt_result res = vect_analyze_loop_form (loop->inner, &inner);
1543 if (!res)
1544 {
1545 if (dump_enabled_p ())
1546 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1547 "not vectorized: Bad inner loop.\n");
1548 return res;
1549 }
1550
1551 /* We don't support analyzing niter under assumptions for the inner
1552 loop. */
1553 if (!integer_onep (inner.assumptions))
1554 return opt_result::failure_at (vect_location,
1555 "not vectorized: Bad inner loop.\n");
1556
1557 if (!expr_invariant_in_loop_p (loop, inner.number_of_iterations))
1558 return opt_result::failure_at (vect_location,
1559 "not vectorized: inner-loop count not"
1560 " invariant.\n");
1561
1562 if (dump_enabled_p ())
1563 dump_printf_loc (MSG_NOTE, vect_location,
1564 "Considering outer-loop vectorization.\n");
1565 info->inner_loop_cond = inner.loop_cond;
1566 }
1567
1568 if (!single_exit (loop))
1569 return opt_result::failure_at (vect_location,
1570 "not vectorized: multiple exits.\n");
1571 if (EDGE_COUNT (loop->header->preds) != 2)
1572 return opt_result::failure_at (vect_location,
1573 "not vectorized:"
1574 " too many incoming edges.\n");
1575
1576 /* We assume that the loop exit condition is at the end of the loop, i.e.,
1577 that the loop is represented as a do-while (with a proper if-guard
1578 before the loop if needed), where the loop header contains all the
1579 executable statements, and the latch is empty. */
1580 if (!empty_block_p (loop->latch)
1581 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1582 return opt_result::failure_at (vect_location,
1583 "not vectorized: latch block not empty.\n");
1584
1585 /* Make sure the exit is not abnormal. */
1586 edge e = single_exit (loop);
1587 if (e->flags & EDGE_ABNORMAL)
1588 return opt_result::failure_at (vect_location,
1589 "not vectorized:"
1590 " abnormal loop exit edge.\n");
1591
1592 info->loop_cond
1593 = vect_get_loop_niters (loop, &info->assumptions,
1594 &info->number_of_iterations,
1595 &info->number_of_iterationsm1);
1596 if (!info->loop_cond)
1597 return opt_result::failure_at
1598 (vect_location,
1599 "not vectorized: complicated exit condition.\n");
1600
1601 if (integer_zerop (info->assumptions)
1602 || !info->number_of_iterations
1603 || chrec_contains_undetermined (info->number_of_iterations))
1604 return opt_result::failure_at
1605 (info->loop_cond,
1606 "not vectorized: number of iterations cannot be computed.\n");
1607
1608 if (integer_zerop (info->number_of_iterations))
1609 return opt_result::failure_at
1610 (info->loop_cond,
1611 "not vectorized: number of iterations = 0.\n");
1612
1613 if (!(tree_fits_shwi_p (info->number_of_iterations)
1614 && tree_to_shwi (info->number_of_iterations) > 0))
1615 {
1616 if (dump_enabled_p ())
1617 {
1618 dump_printf_loc (MSG_NOTE, vect_location,
1619 "Symbolic number of iterations is ");
1620 dump_generic_expr (MSG_NOTE, TDF_DETAILS, info->number_of_iterations);
1621 dump_printf (MSG_NOTE, "\n");
1622 }
1623 }
1624
1625 return opt_result::success ();
1626 }
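
/* An illustrative (hypothetical) loop rejected by the form checks above:
   the early break adds a second exit (and extra control flow in the
   body), so it fails the single-exit / two-basic-block requirements.  */

int
example_rejected_loop_form (int *a, int n, int key)
{
  int i;
  for (i = 0; i < n; i++)
    if (a[i] == key)
      break;			/* second exit out of the loop */
  return i;
}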
1627
1628 /* Create a loop_vec_info for LOOP with SHARED and the
1629 vect_analyze_loop_form result. */
1630
1631 loop_vec_info
1632 vect_create_loop_vinfo (class loop *loop, vec_info_shared *shared,
1633 const vect_loop_form_info *info,
1634 loop_vec_info main_loop_info)
1635 {
1636 loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
1637 LOOP_VINFO_NITERSM1 (loop_vinfo) = info->number_of_iterationsm1;
1638 LOOP_VINFO_NITERS (loop_vinfo) = info->number_of_iterations;
1639 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = info->number_of_iterations;
1640 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = main_loop_info;
1641 /* Also record the assumptions for versioning. */
1642 if (!integer_onep (info->assumptions) && !main_loop_info)
1643 LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = info->assumptions;
1644
1645 stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (info->loop_cond);
1646 STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type;
1647 if (info->inner_loop_cond)
1648 {
1649 stmt_vec_info inner_loop_cond_info
1650 = loop_vinfo->lookup_stmt (info->inner_loop_cond);
1651 STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type;
1652 /* If we have an estimate on the number of iterations of the inner
1653 loop use that to limit the scale for costing, otherwise use
1654 --param vect-inner-loop-cost-factor literally. */
1655 widest_int nit;
1656 if (estimated_stmt_executions (loop->inner, &nit))
1657 LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo)
1658 = wi::smin (nit, param_vect_inner_loop_cost_factor).to_uhwi ();
1659 }
1660
1661 return loop_vinfo;
1662 }
1663
1664
1665
1666 /* Scan the loop stmts and dependent on whether there are any (non-)SLP
1667 statements update the vectorization factor. */
1668
1669 static void
1670 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1671 {
1672 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1673 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1674 int nbbs = loop->num_nodes;
1675 poly_uint64 vectorization_factor;
1676 int i;
1677
1678 DUMP_VECT_SCOPE ("vect_update_vf_for_slp");
1679
1680 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1681 gcc_assert (known_ne (vectorization_factor, 0U));
1682
1683 /* If all the stmts in the loop can be SLPed, we perform only SLP, and the
1684 vectorization factor of the loop is the unrolling factor required by
1685 the SLP instances. If that unrolling factor is 1, we say that we
1686 perform pure SLP on the loop - cross-iteration parallelism is not
1687 exploited. */
1688 bool only_slp_in_loop = true;
1689 for (i = 0; i < nbbs; i++)
1690 {
1691 basic_block bb = bbs[i];
1692 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1693 gsi_next (&si))
1694 {
1695 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (si.phi ());
1696 if (!stmt_info)
1697 continue;
1698 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1699 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1700 && !PURE_SLP_STMT (stmt_info))
1701 /* STMT needs both SLP and loop-based vectorization. */
1702 only_slp_in_loop = false;
1703 }
1704 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1705 gsi_next (&si))
1706 {
1707 if (is_gimple_debug (gsi_stmt (si)))
1708 continue;
1709 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
1710 stmt_info = vect_stmt_to_vectorize (stmt_info);
1711 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1712 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1713 && !PURE_SLP_STMT (stmt_info))
1714 /* STMT needs both SLP and loop-based vectorization. */
1715 only_slp_in_loop = false;
1716 }
1717 }
1718
1719 if (only_slp_in_loop)
1720 {
1721 if (dump_enabled_p ())
1722 dump_printf_loc (MSG_NOTE, vect_location,
1723 "Loop contains only SLP stmts\n");
1724 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1725 }
1726 else
1727 {
1728 if (dump_enabled_p ())
1729 dump_printf_loc (MSG_NOTE, vect_location,
1730 "Loop contains SLP and non-SLP stmts\n");
1731 /* Both the vectorization factor and unroll factor have the form
1732 GET_MODE_SIZE (loop_vinfo->vector_mode) * X for some rational X,
1733 so they must have a common multiple. */
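/* For instance (numbers purely illustrative): if loop-based analysis
chose a vectorization factor of 4 and the SLP instances require an
unrolling factor of 8, force_common_multiple (4, 8) yields 8, which
satisfies both the non-SLP statements and the SLP instances. */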
1734 vectorization_factor
1735 = force_common_multiple (vectorization_factor,
1736 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1737 }
1738
1739 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1740 if (dump_enabled_p ())
1741 {
1742 dump_printf_loc (MSG_NOTE, vect_location,
1743 "Updating vectorization factor to ");
1744 dump_dec (MSG_NOTE, vectorization_factor);
1745 dump_printf (MSG_NOTE, ".\n");
1746 }
1747 }
1748
1749 /* Return true if STMT_INFO describes a double reduction phi and if
1750 the other phi in the reduction is also relevant for vectorization.
1751 This rejects cases such as:
1752
1753 outer1:
1754 x_1 = PHI <x_3(outer2), ...>;
1755 ...
1756
1757 inner:
1758 x_2 = ...;
1759 ...
1760
1761 outer2:
1762 x_3 = PHI <x_2(inner)>;
1763
1764 if nothing in x_2 or elsewhere makes x_1 relevant. */
1765
1766 static bool
1767 vect_active_double_reduction_p (stmt_vec_info stmt_info)
1768 {
1769 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
1770 return false;
1771
1772 return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info));
1773 }
1774
1775 /* Function vect_analyze_loop_operations.
1776
1777 Scan the loop stmts and make sure they are all vectorizable. */
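/* In outline (summarising the code below): header PHIs are checked with
vectorizable_induction, vectorizable_reduction or vectorizable_recurr,
loop-closed PHIs of an outer loop with vectorizable_lc_phi, live PHIs
with vectorizable_live_operation, and every other non-debug statement
with vect_analyze_stmt; the costs of all of them are accumulated into
a single cost vector. */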
1778
1779 static opt_result
1780 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1781 {
1782 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1783 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1784 int nbbs = loop->num_nodes;
1785 int i;
1786 stmt_vec_info stmt_info;
1787 bool need_to_vectorize = false;
1788 bool ok;
1789
1790 DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
1791
1792 auto_vec<stmt_info_for_cost> cost_vec;
1793
1794 for (i = 0; i < nbbs; i++)
1795 {
1796 basic_block bb = bbs[i];
1797
1798 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1799 gsi_next (&si))
1800 {
1801 gphi *phi = si.phi ();
1802 ok = true;
1803
1804 stmt_info = loop_vinfo->lookup_stmt (phi);
1805 if (dump_enabled_p ())
1806 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: %G",
1807 (gimple *) phi);
1808 if (virtual_operand_p (gimple_phi_result (phi)))
1809 continue;
1810
1811 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1812 (i.e., a phi in the tail of the outer-loop). */
1813 if (! is_loop_header_bb_p (bb))
1814 {
1815 /* FORNOW: we currently don't support the case in which these phis
1816 are not used in the outer loop (unless it is a double reduction,
1817 i.e., this phi is vect_reduction_def), because this case
1818 requires actually doing something here. */
1819 if (STMT_VINFO_LIVE_P (stmt_info)
1820 && !vect_active_double_reduction_p (stmt_info))
1821 return opt_result::failure_at (phi,
1822 "Unsupported loop-closed phi"
1823 " in outer-loop.\n");
1824
1825 /* If PHI is used in the outer loop, we check that its operand
1826 is defined in the inner loop. */
1827 if (STMT_VINFO_RELEVANT_P (stmt_info))
1828 {
1829 tree phi_op;
1830
1831 if (gimple_phi_num_args (phi) != 1)
1832 return opt_result::failure_at (phi, "unsupported phi");
1833
1834 phi_op = PHI_ARG_DEF (phi, 0);
1835 stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op);
1836 if (!op_def_info)
1837 return opt_result::failure_at (phi, "unsupported phi\n");
1838
1839 if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer
1840 && (STMT_VINFO_RELEVANT (op_def_info)
1841 != vect_used_in_outer_by_reduction))
1842 return opt_result::failure_at (phi, "unsupported phi\n");
1843
1844 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
1845 || (STMT_VINFO_DEF_TYPE (stmt_info)
1846 == vect_double_reduction_def))
1847 && !vectorizable_lc_phi (loop_vinfo,
1848 stmt_info, NULL, NULL))
1849 return opt_result::failure_at (phi, "unsupported phi\n");
1850 }
1851
1852 continue;
1853 }
1854
1855 gcc_assert (stmt_info);
1856
1857 if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1858 || STMT_VINFO_LIVE_P (stmt_info))
1859 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def
1860 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_first_order_recurrence)
1861 /* A scalar-dependence cycle that we don't support. */
1862 return opt_result::failure_at (phi,
1863 "not vectorized:"
1864 " scalar dependence cycle.\n");
1865
1866 if (STMT_VINFO_RELEVANT_P (stmt_info))
1867 {
1868 need_to_vectorize = true;
1869 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
1870 && ! PURE_SLP_STMT (stmt_info))
1871 ok = vectorizable_induction (loop_vinfo,
1872 stmt_info, NULL, NULL,
1873 &cost_vec);
1874 else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1875 || (STMT_VINFO_DEF_TYPE (stmt_info)
1876 == vect_double_reduction_def)
1877 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
1878 && ! PURE_SLP_STMT (stmt_info))
1879 ok = vectorizable_reduction (loop_vinfo,
1880 stmt_info, NULL, NULL, &cost_vec);
1881 else if ((STMT_VINFO_DEF_TYPE (stmt_info)
1882 == vect_first_order_recurrence)
1883 && ! PURE_SLP_STMT (stmt_info))
1884 ok = vectorizable_recurr (loop_vinfo, stmt_info, NULL, NULL,
1885 &cost_vec);
1886 }
1887
1888 /* SLP PHIs are tested by vect_slp_analyze_node_operations. */
1889 if (ok
1890 && STMT_VINFO_LIVE_P (stmt_info)
1891 && !PURE_SLP_STMT (stmt_info))
1892 ok = vectorizable_live_operation (loop_vinfo,
1893 stmt_info, NULL, NULL, NULL,
1894 -1, false, &cost_vec);
1895
1896 if (!ok)
1897 return opt_result::failure_at (phi,
1898 "not vectorized: relevant phi not "
1899 "supported: %G",
1900 static_cast <gimple *> (phi));
1901 }
1902
1903 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1904 gsi_next (&si))
1905 {
1906 gimple *stmt = gsi_stmt (si);
1907 if (!gimple_clobber_p (stmt)
1908 && !is_gimple_debug (stmt))
1909 {
1910 opt_result res
1911 = vect_analyze_stmt (loop_vinfo,
1912 loop_vinfo->lookup_stmt (stmt),
1913 &need_to_vectorize,
1914 NULL, NULL, &cost_vec);
1915 if (!res)
1916 return res;
1917 }
1918 }
1919 } /* bbs */
1920
1921 add_stmt_costs (loop_vinfo->vector_costs, &cost_vec);
1922
1923 /* All operations in the loop are either irrelevant (they deal with loop
1924 control, or are dead), or only used outside the loop and can be moved
1925 out of the loop (e.g. invariants, inductions). The loop can be
1926 optimized away by scalar optimizations. We're better off not
1927 touching this loop. */
1928 if (!need_to_vectorize)
1929 {
1930 if (dump_enabled_p ())
1931 dump_printf_loc (MSG_NOTE, vect_location,
1932 "All the computation can be taken out of the loop.\n");
1933 return opt_result::failure_at
1934 (vect_location,
1935 "not vectorized: redundant loop. no profit to vectorize.\n");
1936 }
1937
1938 return opt_result::success ();
1939 }
1940
1941 /* Return true if we know that the iteration count is smaller than the
1942 vectorization factor. Return false if it isn't, or if we can't be sure
1943 either way. */
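/* For example (numbers purely illustrative): with an assumed VF of 4,
a loop whose iteration count is known to be 3, or whose maximum
statement-execution estimate is 3, makes this return true. */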
1944
1945 static bool
1946 vect_known_niters_smaller_than_vf (loop_vec_info loop_vinfo)
1947 {
1948 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1949
1950 HOST_WIDE_INT max_niter;
1951 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1952 max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
1953 else
1954 max_niter = max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1955
1956 if (max_niter != -1 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
1957 return true;
1958
1959 return false;
1960 }
1961
1962 /* Analyze the cost of the loop described by LOOP_VINFO. Decide if it
1963 is worthwhile to vectorize. Return 1 if definitely yes, 0 if
1964 definitely no, or -1 if it's worth retrying. */
1965
1966 static int
1967 vect_analyze_loop_costing (loop_vec_info loop_vinfo,
1968 unsigned *suggested_unroll_factor)
1969 {
1970 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1971 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1972
1973 /* Only loops that can handle partially-populated vectors can have iteration
1974 counts less than the vectorization factor. */
1975 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
1976 {
1977 if (vect_known_niters_smaller_than_vf (loop_vinfo))
1978 {
1979 if (dump_enabled_p ())
1980 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1981 "not vectorized: iteration count smaller than "
1982 "vectorization factor.\n");
1983 return 0;
1984 }
1985 }
1986
1987 /* If using the "very cheap" model, reject cases in which we'd keep
1988 a copy of the scalar code (even if we might be able to vectorize it). */
1989 if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
1990 && (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1991 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1992 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)))
1993 {
1994 if (dump_enabled_p ())
1995 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1996 "some scalar iterations would need to be peeled\n");
1997 return 0;
1998 }
1999
2000 int min_profitable_iters, min_profitable_estimate;
2001 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
2002 &min_profitable_estimate,
2003 suggested_unroll_factor);
2004
2005 if (min_profitable_iters < 0)
2006 {
2007 if (dump_enabled_p ())
2008 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2009 "not vectorized: vectorization not profitable.\n");
2010 if (dump_enabled_p ())
2011 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2012 "not vectorized: vector version will never be "
2013 "profitable.\n");
2014 return -1;
2015 }
2016
2017 int min_scalar_loop_bound = (param_min_vect_loop_bound
2018 * assumed_vf);
2019
2020 /* Use the cost model only if it is more conservative than the
2021 user-specified threshold. */
2022 unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
2023 min_profitable_iters);
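/* For example (numbers purely illustrative): with
--param min-vect-loop-bound=2 and an assumed VF of 4,
min_scalar_loop_bound is 8; if the cost model computed
min_profitable_iters of 5, the threshold becomes MAX (8, 5) == 8. */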
2024
2025 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
2026
2027 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2028 && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
2029 {
2030 if (dump_enabled_p ())
2031 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2032 "not vectorized: vectorization not profitable.\n");
2033 if (dump_enabled_p ())
2034 dump_printf_loc (MSG_NOTE, vect_location,
2035 "not vectorized: iteration count smaller than user "
2036 "specified loop bound parameter or minimum profitable "
2037 "iterations (whichever is more conservative).\n");
2038 return 0;
2039 }
2040
2041 /* The static profitability threshold min_profitable_estimate includes
2042 the cost of having to check at runtime whether the scalar loop
2043 should be used instead. If it turns out that we don't need or want
2044 such a check, the threshold we should use for the static estimate
2045 is simply the point at which the vector loop becomes more profitable
2046 than the scalar loop. */
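/* For example (numbers purely illustrative): the cost model might
report min_profitable_iters == 8 but min_profitable_estimate == 10
because the latter also accounts for the runtime scalar-vs-vector
check; when no such check will be emitted, 8 is the right static
threshold, so min_profitable_estimate is lowered below. */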
2047 if (min_profitable_estimate > min_profitable_iters
2048 && !LOOP_REQUIRES_VERSIONING (loop_vinfo)
2049 && !LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2050 && !LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2051 && !vect_apply_runtime_profitability_check_p (loop_vinfo))
2052 {
2053 if (dump_enabled_p ())
2054 dump_printf_loc (MSG_NOTE, vect_location, "no need for a runtime"
2055 " choice between the scalar and vector loops\n");
2056 min_profitable_estimate = min_profitable_iters;
2057 }
2058
2059 /* If the vector loop needs multiple iterations to be beneficial then
2060 things are probably too close to call, and the conservative thing
2061 would be to stick with the scalar code. */
2062 if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
2063 && min_profitable_estimate > (int) vect_vf_for_cost (loop_vinfo))
2064 {
2065 if (dump_enabled_p ())
2066 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2067 "one iteration of the vector loop would be"
2068 " more expensive than the equivalent number of"
2069 " iterations of the scalar loop\n");
2070 return 0;
2071 }
2072
2073 HOST_WIDE_INT estimated_niter;
2074
2075 /* If we are vectorizing an epilogue then we know the maximum number of
2076 scalar iterations it will cover is at least one lower than the
2077 vectorization factor of the main loop. */
2078 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2079 estimated_niter
2080 = vect_vf_for_cost (LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo)) - 1;
2081 else
2082 {
2083 estimated_niter = estimated_stmt_executions_int (loop);
2084 if (estimated_niter == -1)
2085 estimated_niter = likely_max_stmt_executions_int (loop);
2086 }
2087 if (estimated_niter != -1
2088 && ((unsigned HOST_WIDE_INT) estimated_niter
2089 < MAX (th, (unsigned) min_profitable_estimate)))
2090 {
2091 if (dump_enabled_p ())
2092 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2093 "not vectorized: estimated iteration count too "
2094 "small.\n");
2095 if (dump_enabled_p ())
2096 dump_printf_loc (MSG_NOTE, vect_location,
2097 "not vectorized: estimated iteration count smaller "
2098 "than specified loop bound parameter or minimum "
2099 "profitable iterations (whichever is more "
2100 "conservative).\n");
2101 return -1;
2102 }
2103
2104 return 1;
2105 }
2106
2107 static opt_result
2108 vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
2109 vec<data_reference_p> *datarefs,
2110 unsigned int *n_stmts)
2111 {
2112 *n_stmts = 0;
2113 for (unsigned i = 0; i < loop->num_nodes; i++)
2114 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
2115 !gsi_end_p (gsi); gsi_next (&gsi))
2116 {
2117 gimple *stmt = gsi_stmt (gsi);
2118 if (is_gimple_debug (stmt))
2119 continue;
2120 ++(*n_stmts);
2121 opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs,
2122 NULL, 0);
2123 if (!res)
2124 {
2125 if (is_gimple_call (stmt) && loop->safelen)
2126 {
2127 tree fndecl = gimple_call_fndecl (stmt), op;
2128 if (fndecl == NULL_TREE
2129 && gimple_call_internal_p (stmt, IFN_MASK_CALL))
2130 {
2131 fndecl = gimple_call_arg (stmt, 0);
2132 gcc_checking_assert (TREE_CODE (fndecl) == ADDR_EXPR);
2133 fndecl = TREE_OPERAND (fndecl, 0);
2134 gcc_checking_assert (TREE_CODE (fndecl) == FUNCTION_DECL);
2135 }
2136 if (fndecl != NULL_TREE)
2137 {
2138 cgraph_node *node = cgraph_node::get (fndecl);
2139 if (node != NULL && node->simd_clones != NULL)
2140 {
2141 unsigned int j, n = gimple_call_num_args (stmt);
2142 for (j = 0; j < n; j++)
2143 {
2144 op = gimple_call_arg (stmt, j);
2145 if (DECL_P (op)
2146 || (REFERENCE_CLASS_P (op)
2147 && get_base_address (op)))
2148 break;
2149 }
2150 op = gimple_call_lhs (stmt);
2151 /* Ignore #pragma omp declare simd functions
2152 if they don't have data references in the
2153 call stmt itself. */
2154 if (j == n
2155 && !(op
2156 && (DECL_P (op)
2157 || (REFERENCE_CLASS_P (op)
2158 && get_base_address (op)))))
2159 continue;
2160 }
2161 }
2162 }
2163 return res;
2164 }
2165 /* If dependence analysis will give up due to the limit on the
2166 number of datarefs, stop here and fail fatally. */
2167 if (datarefs->length ()
2168 > (unsigned)param_loop_max_datarefs_for_datadeps)
2169 return opt_result::failure_at (stmt, "exceeded param "
2170 "loop-max-datarefs-for-datadeps\n");
2171 }
2172 return opt_result::success ();
2173 }
2174
2175 /* Look for SLP-only access groups and turn each individual access into its own
2176 group. */
2177 static void
2178 vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo)
2179 {
2180 unsigned int i;
2181 struct data_reference *dr;
2182
2183 DUMP_VECT_SCOPE ("vect_dissolve_slp_only_groups");
2184
2185 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2186 FOR_EACH_VEC_ELT (datarefs, i, dr)
2187 {
2188 gcc_assert (DR_REF (dr));
2189 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (DR_STMT (dr));
2190
2191 /* Check if the load is a part of an interleaving chain. */
2192 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2193 {
2194 stmt_vec_info first_element = DR_GROUP_FIRST_ELEMENT (stmt_info);
2195 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_element);
2196 unsigned int group_size = DR_GROUP_SIZE (first_element);
2197
2198 /* Check whether this is an SLP-only group. */
2199 if (!STMT_SLP_TYPE (stmt_info)
2200 && STMT_VINFO_SLP_VECT_ONLY (first_element))
2201 {
2202 /* Dissolve the group. */
2203 STMT_VINFO_SLP_VECT_ONLY (first_element) = false;
2204
2205 stmt_vec_info vinfo = first_element;
2206 while (vinfo)
2207 {
2208 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (vinfo);
2209 DR_GROUP_FIRST_ELEMENT (vinfo) = vinfo;
2210 DR_GROUP_NEXT_ELEMENT (vinfo) = NULL;
2211 DR_GROUP_SIZE (vinfo) = 1;
2212 if (STMT_VINFO_STRIDED_P (first_element))
2213 DR_GROUP_GAP (vinfo) = 0;
2214 else
2215 DR_GROUP_GAP (vinfo) = group_size - 1;
2216 /* Duplicate and adjust the alignment info; it needs to
2217 be present on each group leader, see dr_misalignment. */
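/* For example (numbers purely illustrative): if the old group leader
has misalignment 4 wrt a target alignment of 16 and this element's
DR_INIT is 8 bytes further on, the new leader's misalignment becomes
(4 + 8) % 16 == 12. */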
2218 if (vinfo != first_element)
2219 {
2220 dr_vec_info *dr_info2 = STMT_VINFO_DR_INFO (vinfo);
2221 dr_info2->target_alignment = dr_info->target_alignment;
2222 int misalignment = dr_info->misalignment;
2223 if (misalignment != DR_MISALIGNMENT_UNKNOWN)
2224 {
2225 HOST_WIDE_INT diff
2226 = (TREE_INT_CST_LOW (DR_INIT (dr_info2->dr))
2227 - TREE_INT_CST_LOW (DR_INIT (dr_info->dr)));
2228 unsigned HOST_WIDE_INT align_c
2229 = dr_info->target_alignment.to_constant ();
2230 misalignment = (misalignment + diff) % align_c;
2231 }
2232 dr_info2->misalignment = misalignment;
2233 }
2234 vinfo = next;
2235 }
2236 }
2237 }
2238 }
2239 }
2240
2241 /* Determine if operating on full vectors for LOOP_VINFO might leave
2242 some scalar iterations still to do. If so, decide how we should
2243 handle those scalar iterations. The possibilities are:
2244
2245 (1) Make LOOP_VINFO operate on partial vectors instead of full vectors.
2246 In this case:
2247
2248 LOOP_VINFO_USING_PARTIAL_VECTORS_P == true
2249 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2250 LOOP_VINFO_PEELING_FOR_NITER == false
2251
2252 (2) Make LOOP_VINFO operate on full vectors and use an epilogue loop
2253 to handle the remaining scalar iterations. In this case:
2254
2255 LOOP_VINFO_USING_PARTIAL_VECTORS_P == false
2256 LOOP_VINFO_PEELING_FOR_NITER == true
2257
2258 There are two choices:
2259
2260 (2a) Consider vectorizing the epilogue loop at the same VF as the
2261 main loop, but using partial vectors instead of full vectors.
2262 In this case:
2263
2264 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == true
2265
2266 (2b) Consider vectorizing the epilogue loop at lower VFs only.
2267 In this case:
2268
2269 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2270
2271 When FOR_EPILOGUE_P is true, make this determination based on the
2272 assumption that LOOP_VINFO is an epilogue loop, otherwise make it
2273 based on the assumption that LOOP_VINFO is the main loop. The caller
2274 has made sure that the number of iterations is set appropriately for
2275 this value of FOR_EPILOGUE_P. */
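/* As an illustration (numbers purely illustrative): a loop of 10 scalar
iterations with VF 8 leaves 2 scalar iterations over. Under (1) a
single masked or length-controlled vector loop covers all 10
iterations; under (2) the full-vector loop runs once for 8 iterations
and the remaining 2 go to an epilogue, which is itself either
vectorized with partial vectors at the same VF (2a) or considered at
lower VFs only (2b). */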
2276
2277 opt_result
2278 vect_determine_partial_vectors_and_peeling (loop_vec_info loop_vinfo,
2279 bool for_epilogue_p)
2280 {
2281 /* Determine whether there would be any scalar iterations left over. */
2282 bool need_peeling_or_partial_vectors_p
2283 = vect_need_peeling_or_partial_vectors_p (loop_vinfo);
2284
2285 /* Decide whether to vectorize the loop with partial vectors. */
2286 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2287 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2288 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2289 && need_peeling_or_partial_vectors_p)
2290 {
2291 /* For partial-vector-usage=1, try to push the handling of partial
2292 vectors to the epilogue, with the main loop continuing to operate
2293 on full vectors.
2294
2295 If we are unrolling we also do not want to use partial vectors. This
2296 is to avoid the overhead of generating multiple masks and also to
2297 avoid having to execute entire iterations of FALSE masked instructions
2298 when dealing with one or fewer full iterations.
2299
2300 ??? We could then end up failing to use partial vectors if we
2301 decide to peel iterations into a prologue, and if the main loop
2302 then ends up processing fewer than VF iterations. */
2303 if ((param_vect_partial_vector_usage == 1
2304 || loop_vinfo->suggested_unroll_factor > 1)
2305 && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2306 && !vect_known_niters_smaller_than_vf (loop_vinfo))
2307 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2308 else
2309 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2310 }
2311
2312 if (dump_enabled_p ())
2313 {
2314 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2315 dump_printf_loc (MSG_NOTE, vect_location,
2316 "operating on partial vectors%s.\n",
2317 for_epilogue_p ? " for epilogue loop" : "");
2318 else
2319 dump_printf_loc (MSG_NOTE, vect_location,
2320 "operating only on full vectors%s.\n",
2321 for_epilogue_p ? " for epilogue loop" : "");
2322 }
2323
2324 if (for_epilogue_p)
2325 {
2326 loop_vec_info orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2327 gcc_assert (orig_loop_vinfo);
2328 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2329 gcc_assert (known_lt (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
2330 LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo)));
2331 }
2332
2333 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2334 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2335 {
2336 /* Check that the loop processes at least one full vector. */
2337 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2338 tree scalar_niters = LOOP_VINFO_NITERS (loop_vinfo);
2339 if (known_lt (wi::to_widest (scalar_niters), vf))
2340 return opt_result::failure_at (vect_location,
2341 "loop does not have enough iterations"
2342 " to support vectorization.\n");
2343
2344 /* If we need to peel an extra epilogue iteration to handle data
2345 accesses with gaps, check that there are enough scalar iterations
2346 available.
2347
2348 The check above is redundant with this one when peeling for gaps,
2349 but the distinction is useful for diagnostics. */
2350 tree scalar_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
2351 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2352 && known_lt (wi::to_widest (scalar_nitersm1), vf))
2353 return opt_result::failure_at (vect_location,
2354 "loop does not have enough iterations"
2355 " to support peeling for gaps.\n");
2356 }
2357
2358 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2359 = (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2360 && need_peeling_or_partial_vectors_p);
2361
2362 return opt_result::success ();
2363 }
2364
2365 /* Function vect_analyze_loop_2.
2366
2367 Apply a set of analyses on LOOP specified by LOOP_VINFO; the different
2368 analyses will record information in some members of LOOP_VINFO. FATAL
2369 indicates whether some analysis hit a fatal error. If the non-NULL
2370 pointer SUGGESTED_UNROLL_FACTOR is provided, it is intended to be filled
2371 with the suggested unroll factor that was worked out, while a NULL
2372 pointer means the suggested unroll factor is being applied.
2373 SLP_DONE_FOR_SUGGESTED_UF holds the SLP decision made when the
2374 suggested unroll factor was worked out. */
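/* For example, vect_analyze_loop_1 below first calls this function with
a non-NULL SUGGESTED_UNROLL_FACTOR so that the target's finish_cost
can propose a factor; if a factor greater than 1 comes back, the
analysis is re-run on a fresh loop_vec_info with
loop_vinfo->suggested_unroll_factor set and a NULL pointer passed
here, which applies the factor (see the start_over code). */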
2375 static opt_result
2376 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal,
2377 unsigned *suggested_unroll_factor,
2378 bool& slp_done_for_suggested_uf)
2379 {
2380 opt_result ok = opt_result::success ();
2381 int res;
2382 unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
2383 poly_uint64 min_vf = 2;
2384 loop_vec_info orig_loop_vinfo = NULL;
2385
2386 /* If we are dealing with an epilogue then orig_loop_vinfo points to the
2387 loop_vec_info of the first vectorized loop. */
2388 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2389 orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2390 else
2391 orig_loop_vinfo = loop_vinfo;
2392 gcc_assert (orig_loop_vinfo);
2393
2394 /* The first group of checks is independent of the vector size. */
2395 fatal = true;
2396
2397 if (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)
2398 && integer_zerop (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)))
2399 return opt_result::failure_at (vect_location,
2400 "not vectorized: simd if(0)\n");
2401
2402 /* Find all data references in the loop (which correspond to vdefs/vuses)
2403 and analyze their evolution in the loop. */
2404
2405 loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
2406
2407 /* Gather the data references and count stmts in the loop. */
2408 if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
2409 {
2410 opt_result res
2411 = vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
2412 &LOOP_VINFO_DATAREFS (loop_vinfo),
2413 &LOOP_VINFO_N_STMTS (loop_vinfo));
2414 if (!res)
2415 {
2416 if (dump_enabled_p ())
2417 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2418 "not vectorized: loop contains function "
2419 "calls or data references that cannot "
2420 "be analyzed\n");
2421 return res;
2422 }
2423 loop_vinfo->shared->save_datarefs ();
2424 }
2425 else
2426 loop_vinfo->shared->check_datarefs ();
2427
2428 /* Analyze the data references and also adjust the minimal
2429 vectorization factor according to the loads and stores. */
2430
2431 ok = vect_analyze_data_refs (loop_vinfo, &min_vf, &fatal);
2432 if (!ok)
2433 {
2434 if (dump_enabled_p ())
2435 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2436 "bad data references.\n");
2437 return ok;
2438 }
2439
2440 /* Check if we are applying the suggested unroll factor now. */
2441 bool applying_suggested_uf = loop_vinfo->suggested_unroll_factor > 1;
2442 gcc_assert (!applying_suggested_uf || !suggested_unroll_factor);
2443
2444 /* If the SLP decision was false when the suggested unroll factor was
2445 worked out, and we are applying that unroll factor now, we can simply
2446 skip all SLP-related analyses this time. */
2447 bool slp = !applying_suggested_uf || slp_done_for_suggested_uf;
2448
2449 /* Classify all cross-iteration scalar data-flow cycles.
2450 Cross-iteration cycles caused by virtual phis are analyzed separately. */
2451 vect_analyze_scalar_cycles (loop_vinfo, slp);
2452
2453 vect_pattern_recog (loop_vinfo);
2454
2455 vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
2456
2457 /* Analyze the access patterns of the data-refs in the loop (consecutive,
2458 complex, etc.). FORNOW: Only handle consecutive access pattern. */
2459
2460 ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
2461 if (!ok)
2462 {
2463 if (dump_enabled_p ())
2464 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2465 "bad data access.\n");
2466 return ok;
2467 }
2468
2469 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
2470
2471 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo, &fatal);
2472 if (!ok)
2473 {
2474 if (dump_enabled_p ())
2475 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2476 "unexpected pattern.\n");
2477 return ok;
2478 }
2479
2480 /* The rest of the analysis below depends on the vector size in some way, so failures are no longer necessarily fatal. */
2481 fatal = false;
2482
2483 /* Analyze data dependences between the data-refs in the loop
2484 and adjust the maximum vectorization factor according to
2485 the dependences.
2486 FORNOW: fail at the first data dependence that we encounter. */
2487
2488 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
2489 if (!ok)
2490 {
2491 if (dump_enabled_p ())
2492 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2493 "bad data dependence.\n");
2494 return ok;
2495 }
2496 if (max_vf != MAX_VECTORIZATION_FACTOR
2497 && maybe_lt (max_vf, min_vf))
2498 return opt_result::failure_at (vect_location, "bad data dependence.\n");
2499 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
2500
2501 ok = vect_determine_vectorization_factor (loop_vinfo);
2502 if (!ok)
2503 {
2504 if (dump_enabled_p ())
2505 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2506 "can't determine vectorization factor.\n");
2507 return ok;
2508 }
2509 if (max_vf != MAX_VECTORIZATION_FACTOR
2510 && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2511 return opt_result::failure_at (vect_location, "bad data dependence.\n");
2512
2513 /* Compute the scalar iteration cost. */
2514 vect_compute_single_scalar_iteration_cost (loop_vinfo);
2515
2516 poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2517
2518 if (slp)
2519 {
2520 /* Check the SLP opportunities in the loop, analyze and build
2521 SLP trees. */
2522 ok = vect_analyze_slp (loop_vinfo, LOOP_VINFO_N_STMTS (loop_vinfo));
2523 if (!ok)
2524 return ok;
2525
2526 /* If there are any SLP instances mark them as pure_slp. */
2527 slp = vect_make_slp_decision (loop_vinfo);
2528 if (slp)
2529 {
2530 /* Find stmts that need to be both vectorized and SLPed. */
2531 vect_detect_hybrid_slp (loop_vinfo);
2532
2533 /* Update the vectorization factor based on the SLP decision. */
2534 vect_update_vf_for_slp (loop_vinfo);
2535
2536 /* Optimize the SLP graph with the vectorization factor fixed. */
2537 vect_optimize_slp (loop_vinfo);
2538
2539 /* Gather the loads reachable from the SLP graph entries. */
2540 vect_gather_slp_loads (loop_vinfo);
2541 }
2542 }
2543
2544 bool saved_can_use_partial_vectors_p
2545 = LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo);
2546
2547 /* We don't expect to have to roll back to anything other than an empty
2548 set of rgroups. */
2549 gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
2550
2551 /* This is the point where we can re-start analysis with SLP forced off. */
2552 start_over:
2553
2554 /* Apply the suggested unrolling factor; this was determined by the backend
2555 during finish_cost the first time we ran the analysis for this
2556 vector mode. */
2557 if (applying_suggested_uf)
2558 LOOP_VINFO_VECT_FACTOR (loop_vinfo) *= loop_vinfo->suggested_unroll_factor;
2559
2560 /* Now the vectorization factor is final. */
2561 poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2562 gcc_assert (known_ne (vectorization_factor, 0U));
2563
2564 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
2565 {
2566 dump_printf_loc (MSG_NOTE, vect_location,
2567 "vectorization_factor = ");
2568 dump_dec (MSG_NOTE, vectorization_factor);
2569 dump_printf (MSG_NOTE, ", niters = %wd\n",
2570 LOOP_VINFO_INT_NITERS (loop_vinfo));
2571 }
2572
2573 loop_vinfo->vector_costs = init_cost (loop_vinfo, false);
2574
2575 /* Analyze the alignment of the data-refs in the loop.
2576 Fail if a data reference is found that cannot be vectorized. */
2577
2578 ok = vect_analyze_data_refs_alignment (loop_vinfo);
2579 if (!ok)
2580 {
2581 if (dump_enabled_p ())
2582 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2583 "bad data alignment.\n");
2584 return ok;
2585 }
2586
2587 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2588 It is important to call pruning after vect_analyze_data_ref_accesses,
2589 since we use grouping information gathered by interleaving analysis. */
2590 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2591 if (!ok)
2592 return ok;
2593
2594 /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2595 vectorization, since we do not want to add extra peeling or
2596 add versioning for alignment. */
2597 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2598 /* This pass will decide on using loop versioning and/or loop peeling in
2599 order to enhance the alignment of data references in the loop. */
2600 ok = vect_enhance_data_refs_alignment (loop_vinfo);
2601 if (!ok)
2602 return ok;
2603
2604 if (slp)
2605 {
2606 /* Analyze operations in the SLP instances. Note this may
2607 remove unsupported SLP instances which makes the above
2608 SLP kind detection invalid. */
2609 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2610 vect_slp_analyze_operations (loop_vinfo);
2611 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2612 {
2613 ok = opt_result::failure_at (vect_location,
2614 "unsupported SLP instances\n");
2615 goto again;
2616 }
2617
2618 /* Check whether any load in ALL SLP instances is possibly permuted. */
2619 slp_tree load_node, slp_root;
2620 unsigned i, x;
2621 slp_instance instance;
2622 bool can_use_lanes = true;
2623 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), x, instance)
2624 {
2625 slp_root = SLP_INSTANCE_TREE (instance);
2626 int group_size = SLP_TREE_LANES (slp_root);
2627 tree vectype = SLP_TREE_VECTYPE (slp_root);
2628 bool loads_permuted = false;
2629 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
2630 {
2631 if (!SLP_TREE_LOAD_PERMUTATION (load_node).exists ())
2632 continue;
2633 unsigned j;
2634 stmt_vec_info load_info;
2635 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load_info)
2636 if (SLP_TREE_LOAD_PERMUTATION (load_node)[j] != j)
2637 {
2638 loads_permuted = true;
2639 break;
2640 }
2641 }
2642
2643 /* If the loads and stores can be handled with load/store-lane
2644 instructions record it and move on to the next instance. */
2645 if (loads_permuted
2646 && SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
2647 && vect_store_lanes_supported (vectype, group_size, false))
2648 {
2649 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
2650 {
2651 stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT
2652 (SLP_TREE_SCALAR_STMTS (load_node)[0]);
2653 /* Use SLP for strided accesses (or if we can't
2654 load-lanes). */
2655 if (STMT_VINFO_STRIDED_P (stmt_vinfo)
2656 || ! vect_load_lanes_supported
2657 (STMT_VINFO_VECTYPE (stmt_vinfo),
2658 DR_GROUP_SIZE (stmt_vinfo), false))
2659 break;
2660 }
2661
2662 can_use_lanes
2663 = can_use_lanes && i == SLP_INSTANCE_LOADS (instance).length ();
2664
2665 if (can_use_lanes && dump_enabled_p ())
2666 dump_printf_loc (MSG_NOTE, vect_location,
2667 "SLP instance %p can use load/store-lanes\n",
2668 (void *) instance);
2669 }
2670 else
2671 {
2672 can_use_lanes = false;
2673 break;
2674 }
2675 }
2676
2677 /* If all SLP instances can use load/store-lanes, abort SLP and try again
2678 with SLP disabled. */
2679 if (can_use_lanes)
2680 {
2681 ok = opt_result::failure_at (vect_location,
2682 "Built SLP cancelled: can use "
2683 "load/store-lanes\n");
2684 if (dump_enabled_p ())
2685 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2686 "Built SLP cancelled: all SLP instances support "
2687 "load/store-lanes\n");
2688 goto again;
2689 }
2690 }
2691
2692 /* Dissolve SLP-only groups. */
2693 vect_dissolve_slp_only_groups (loop_vinfo);
2694
2695 /* Scan all the remaining operations in the loop that are not subject
2696 to SLP and make sure they are vectorizable. */
2697 ok = vect_analyze_loop_operations (loop_vinfo);
2698 if (!ok)
2699 {
2700 if (dump_enabled_p ())
2701 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2702 "bad operation or unsupported loop bound.\n");
2703 return ok;
2704 }
2705
2706 /* For now, we don't expect to mix both masking and length approaches for one
2707 loop; disable the use of partial vectors if both are recorded. */
2708 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2709 && !LOOP_VINFO_MASKS (loop_vinfo).is_empty ()
2710 && !LOOP_VINFO_LENS (loop_vinfo).is_empty ())
2711 {
2712 if (dump_enabled_p ())
2713 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2714 "can't vectorize a loop with partial vectors"
2715 " because we don't expect to mix different"
2716 " approaches with partial vectors for the"
2717 " same loop.\n");
2718 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2719 }
2720
2721 /* If we still have the option of using partial vectors,
2722 check whether we can generate the necessary loop controls. */
2723 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2724 && !vect_verify_full_masking (loop_vinfo)
2725 && !vect_verify_loop_lens (loop_vinfo))
2726 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2727
2728 /* If we're vectorizing an epilogue loop, the vectorized loop either needs
2729 to be able to handle fewer than VF scalars, or needs to have a lower VF
2730 than the main loop. */
2731 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2732 && !LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2733 && maybe_ge (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
2734 LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo)))
2735 return opt_result::failure_at (vect_location,
2736 "Vectorization factor too high for"
2737 " epilogue loop.\n");
2738
2739 /* Decide whether this loop_vinfo should use partial vectors or peeling,
2740 assuming that the loop will be used as a main loop. We will redo
2741 this analysis later if we instead decide to use the loop as an
2742 epilogue loop. */
2743 ok = vect_determine_partial_vectors_and_peeling (loop_vinfo, false);
2744 if (!ok)
2745 return ok;
2746
2747 /* Check the costings of the loop make vectorizing worthwhile. */
2748 res = vect_analyze_loop_costing (loop_vinfo, suggested_unroll_factor);
2749 if (res < 0)
2750 {
2751 ok = opt_result::failure_at (vect_location,
2752 "Loop costings may not be worthwhile.\n");
2753 goto again;
2754 }
2755 if (!res)
2756 return opt_result::failure_at (vect_location,
2757 "Loop costings not worthwhile.\n");
2758
2759 /* If an epilogue loop is required make sure we can create one. */
2760 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2761 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2762 {
2763 if (dump_enabled_p ())
2764 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2765 if (!vect_can_advance_ivs_p (loop_vinfo)
2766 || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2767 single_exit (LOOP_VINFO_LOOP
2768 (loop_vinfo))))
2769 {
2770 ok = opt_result::failure_at (vect_location,
2771 "not vectorized: can't create required "
2772 "epilog loop\n");
2773 goto again;
2774 }
2775 }
2776
2777 /* During peeling, we need to check if the number of loop iterations is
2778 enough for both the peeled prolog loop and the vector loop. This check
2779 can be merged with the threshold check of loop versioning, so
2780 increase the threshold for this case if necessary.
2781
2782 If we are analyzing an epilogue we still want to check what its
2783 versioning threshold would be. If we decide to vectorize the epilogues we
2784 will want to use the lowest versioning threshold of all epilogues and main
2785 loop. This will enable us to enter a vectorized epilogue even when
2786 versioning the loop. We can't simply check whether the epilogue requires
2787 versioning though since we may have skipped some versioning checks when
2788 analyzing the epilogue. For instance, checks for alias versioning will be
2789 skipped when dealing with epilogues as we assume we already checked them
2790 for the main loop. So instead we always check the 'orig_loop_vinfo'. */
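/* For example (numbers purely illustrative): with VF 4, a prologue peel
of 3 iterations for alignment and peeling for gaps enabled, niters_th
is 3 + 4 + 1 == 8; if the runtime profitability threshold TH is
larger and comparable, niters_th is raised to TH instead. */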
2791 if (LOOP_REQUIRES_VERSIONING (orig_loop_vinfo))
2792 {
2793 poly_uint64 niters_th = 0;
2794 unsigned int th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
2795
2796 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2797 {
2798 /* Niters for peeled prolog loop. */
2799 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2800 {
2801 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2802 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
2803 niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
2804 }
2805 else
2806 niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2807 }
2808
2809 /* Niters for at least one iteration of vectorized loop. */
2810 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2811 niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2812 /* One additional iteration because of peeling for gap. */
2813 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2814 niters_th += 1;
2815
2816 /* Use the same condition as vect_transform_loop to decide when to use
2817 the cost to determine a versioning threshold. */
2818 if (vect_apply_runtime_profitability_check_p (loop_vinfo)
2819 && ordered_p (th, niters_th))
2820 niters_th = ordered_max (poly_uint64 (th), niters_th);
2821
2822 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
2823 }
2824
2825 gcc_assert (known_eq (vectorization_factor,
2826 LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
2827
2828 slp_done_for_suggested_uf = slp;
2829
2830 /* Ok to vectorize! */
2831 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2832 return opt_result::success ();
2833
2834 again:
2835 /* Ensure that "ok" is false (with an opt_problem if dumping is enabled). */
2836 gcc_assert (!ok);
2837
2838 /* Try again with SLP forced off, but if we didn't do any SLP there is
2839 no point in re-trying. */
2840 if (!slp)
2841 return ok;
2842
2843 /* If the SLP decision was true when the suggested unroll factor was
2844 worked out, and we are applying that unroll factor now, we don't
2845 need to re-try any more. */
2846 if (applying_suggested_uf && slp_done_for_suggested_uf)
2847 return ok;
2848
2849 /* If there are reduction chains re-trying will fail anyway. */
2850 if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2851 return ok;
2852
2853 /* Likewise if the grouped loads or stores in the SLP cannot be handled
2854 via interleaving or lane instructions. */
2855 slp_instance instance;
2856 slp_tree node;
2857 unsigned i, j;
2858 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2859 {
2860 stmt_vec_info vinfo;
2861 vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
2862 if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2863 continue;
2864 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2865 unsigned int size = DR_GROUP_SIZE (vinfo);
2866 tree vectype = STMT_VINFO_VECTYPE (vinfo);
2867 if (! vect_store_lanes_supported (vectype, size, false)
2868 && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
2869 && ! vect_grouped_store_supported (vectype, size))
2870 return opt_result::failure_at (vinfo->stmt,
2871 "unsupported grouped store\n");
2872 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2873 {
2874 vinfo = SLP_TREE_SCALAR_STMTS (node)[0];
2875 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2876 bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
2877 size = DR_GROUP_SIZE (vinfo);
2878 vectype = STMT_VINFO_VECTYPE (vinfo);
2879 if (! vect_load_lanes_supported (vectype, size, false)
2880 && ! vect_grouped_load_supported (vectype, single_element_p,
2881 size))
2882 return opt_result::failure_at (vinfo->stmt,
2883 "unsupported grouped load\n");
2884 }
2885 }
2886
2887 if (dump_enabled_p ())
2888 dump_printf_loc (MSG_NOTE, vect_location,
2889 "re-trying with SLP disabled\n");
2890
2891 /* Roll back state appropriately. No SLP this time. */
2892 slp = false;
2893 /* Restore the vectorization factor as it was without SLP. */
2894 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2895 /* Free the SLP instances. */
2896 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2897 vect_free_slp_instance (instance);
2898 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2899 /* Reset SLP type to loop_vect on all stmts. */
2900 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2901 {
2902 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2903 for (gimple_stmt_iterator si = gsi_start_phis (bb);
2904 !gsi_end_p (si); gsi_next (&si))
2905 {
2906 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2907 STMT_SLP_TYPE (stmt_info) = loop_vect;
2908 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
2909 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
2910 {
2911 /* vectorizable_reduction adjusts reduction stmt def-types;
2912 restore them to that of the PHI. */
2913 STMT_VINFO_DEF_TYPE (STMT_VINFO_REDUC_DEF (stmt_info))
2914 = STMT_VINFO_DEF_TYPE (stmt_info);
2915 STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize
2916 (STMT_VINFO_REDUC_DEF (stmt_info)))
2917 = STMT_VINFO_DEF_TYPE (stmt_info);
2918 }
2919 }
2920 for (gimple_stmt_iterator si = gsi_start_bb (bb);
2921 !gsi_end_p (si); gsi_next (&si))
2922 {
2923 if (is_gimple_debug (gsi_stmt (si)))
2924 continue;
2925 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2926 STMT_SLP_TYPE (stmt_info) = loop_vect;
2927 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2928 {
2929 stmt_vec_info pattern_stmt_info
2930 = STMT_VINFO_RELATED_STMT (stmt_info);
2931 if (STMT_VINFO_SLP_VECT_ONLY_PATTERN (pattern_stmt_info))
2932 STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
2933
2934 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
2935 STMT_SLP_TYPE (pattern_stmt_info) = loop_vect;
2936 for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
2937 !gsi_end_p (pi); gsi_next (&pi))
2938 STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi)))
2939 = loop_vect;
2940 }
2941 }
2942 }
2943 /* Free optimized alias test DDRS. */
2944 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
2945 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2946 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
2947 /* Reset target cost data. */
2948 delete loop_vinfo->vector_costs;
2949 loop_vinfo->vector_costs = nullptr;
2950 /* Reset accumulated rgroup information. */
2951 release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo));
2952 release_vec_loop_controls (&LOOP_VINFO_LENS (loop_vinfo));
2953 /* Reset assorted flags. */
2954 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2955 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2956 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2957 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
2958 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2959 = saved_can_use_partial_vectors_p;
2960
2961 goto start_over;
2962 }
2963
2964 /* Return true if vectorizing a loop using NEW_LOOP_VINFO appears
2965 to be better than vectorizing it using OLD_LOOP_VINFO. Assume that
2966 OLD_LOOP_VINFO is better unless something specifically indicates
2967 otherwise.
2968
2969 Note that this deliberately isn't a partial order. */
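/* For example, if the loop has simdlen 8, a candidate whose VF equals 8
is always preferred over one whose VF does not; when neither or both
candidates match the simdlen, the decision is delegated to the
target's vector_costs hooks below. */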
2970
2971 static bool
2972 vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo,
2973 loop_vec_info old_loop_vinfo)
2974 {
2975 struct loop *loop = LOOP_VINFO_LOOP (new_loop_vinfo);
2976 gcc_assert (LOOP_VINFO_LOOP (old_loop_vinfo) == loop);
2977
2978 poly_int64 new_vf = LOOP_VINFO_VECT_FACTOR (new_loop_vinfo);
2979 poly_int64 old_vf = LOOP_VINFO_VECT_FACTOR (old_loop_vinfo);
2980
2981 /* Always prefer a VF of loop->simdlen over any other VF. */
2982 if (loop->simdlen)
2983 {
2984 bool new_simdlen_p = known_eq (new_vf, loop->simdlen);
2985 bool old_simdlen_p = known_eq (old_vf, loop->simdlen);
2986 if (new_simdlen_p != old_simdlen_p)
2987 return new_simdlen_p;
2988 }
2989
2990 const auto *old_costs = old_loop_vinfo->vector_costs;
2991 const auto *new_costs = new_loop_vinfo->vector_costs;
2992 if (loop_vec_info main_loop = LOOP_VINFO_ORIG_LOOP_INFO (old_loop_vinfo))
2993 return new_costs->better_epilogue_loop_than_p (old_costs, main_loop);
2994
2995 return new_costs->better_main_loop_than_p (old_costs);
2996 }
2997
2998 /* Decide whether to replace OLD_LOOP_VINFO with NEW_LOOP_VINFO. Return
2999 true if we should. */
3000
3001 static bool
3002 vect_joust_loop_vinfos (loop_vec_info new_loop_vinfo,
3003 loop_vec_info old_loop_vinfo)
3004 {
3005 if (!vect_better_loop_vinfo_p (new_loop_vinfo, old_loop_vinfo))
3006 return false;
3007
3008 if (dump_enabled_p ())
3009 dump_printf_loc (MSG_NOTE, vect_location,
3010 "***** Preferring vector mode %s to vector mode %s\n",
3011 GET_MODE_NAME (new_loop_vinfo->vector_mode),
3012 GET_MODE_NAME (old_loop_vinfo->vector_mode));
3013 return true;
3014 }
3015
3016 /* Analyze LOOP with VECTOR_MODES[MODE_I] and as epilogue if MAIN_LOOP_VINFO is
3017 not NULL. Set AUTODETECTED_VECTOR_MODE if VOIDmode and advance
3018 MODE_I to the next mode useful to analyze.
3019 Return the loop_vinfo on success and wrapped null on failure. */
3020
3021 static opt_loop_vec_info
3022 vect_analyze_loop_1 (class loop *loop, vec_info_shared *shared,
3023 const vect_loop_form_info *loop_form_info,
3024 loop_vec_info main_loop_vinfo,
3025 const vector_modes &vector_modes, unsigned &mode_i,
3026 machine_mode &autodetected_vector_mode,
3027 bool &fatal)
3028 {
3029 loop_vec_info loop_vinfo
3030 = vect_create_loop_vinfo (loop, shared, loop_form_info, main_loop_vinfo);
3031
3032 machine_mode vector_mode = vector_modes[mode_i];
3033 loop_vinfo->vector_mode = vector_mode;
3034 unsigned int suggested_unroll_factor = 1;
3035 bool slp_done_for_suggested_uf;
3036
3037 /* Run the main analysis. */
3038 opt_result res = vect_analyze_loop_2 (loop_vinfo, fatal,
3039 &suggested_unroll_factor,
3040 slp_done_for_suggested_uf);
3041 if (dump_enabled_p ())
3042 dump_printf_loc (MSG_NOTE, vect_location,
3043 "***** Analysis %s with vector mode %s\n",
3044 res ? "succeeded" : " failed",
3045 GET_MODE_NAME (loop_vinfo->vector_mode));
3046
3047 if (!main_loop_vinfo && suggested_unroll_factor > 1)
3048 {
3049 if (dump_enabled_p ())
3050 dump_printf_loc (MSG_NOTE, vect_location,
3051 "***** Re-trying analysis for unrolling"
3052 " with unroll factor %d and slp %s.\n",
3053 suggested_unroll_factor,
3054 slp_done_for_suggested_uf ? "on" : "off");
3055 loop_vec_info unroll_vinfo
3056 = vect_create_loop_vinfo (loop, shared, loop_form_info, main_loop_vinfo);
3057 unroll_vinfo->vector_mode = vector_mode;
3058 unroll_vinfo->suggested_unroll_factor = suggested_unroll_factor;
3059 opt_result new_res = vect_analyze_loop_2 (unroll_vinfo, fatal, NULL,
3060 slp_done_for_suggested_uf);
3061 if (new_res)
3062 {
3063 delete loop_vinfo;
3064 loop_vinfo = unroll_vinfo;
3065 }
3066 else
3067 delete unroll_vinfo;
3068 }
3069
3070 /* Remember the autodetected vector mode. */
3071 if (vector_mode == VOIDmode)
3072 autodetected_vector_mode = loop_vinfo->vector_mode;
3073
3074 /* Advance mode_i, first skipping modes that would result in the
3075 same analysis result. */
3076 while (mode_i + 1 < vector_modes.length ()
3077 && vect_chooses_same_modes_p (loop_vinfo,
3078 vector_modes[mode_i + 1]))
3079 {
3080 if (dump_enabled_p ())
3081 dump_printf_loc (MSG_NOTE, vect_location,
3082 "***** The result for vector mode %s would"
3083 " be the same\n",
3084 GET_MODE_NAME (vector_modes[mode_i + 1]));
3085 mode_i += 1;
3086 }
3087 if (mode_i + 1 < vector_modes.length ()
3088 && VECTOR_MODE_P (autodetected_vector_mode)
3089 && (related_vector_mode (vector_modes[mode_i + 1],
3090 GET_MODE_INNER (autodetected_vector_mode))
3091 == autodetected_vector_mode)
3092 && (related_vector_mode (autodetected_vector_mode,
3093 GET_MODE_INNER (vector_modes[mode_i + 1]))
3094 == vector_modes[mode_i + 1]))
3095 {
3096 if (dump_enabled_p ())
3097 dump_printf_loc (MSG_NOTE, vect_location,
3098 "***** Skipping vector mode %s, which would"
3099 " repeat the analysis for %s\n",
3100 GET_MODE_NAME (vector_modes[mode_i + 1]),
3101 GET_MODE_NAME (autodetected_vector_mode));
3102 mode_i += 1;
3103 }
3104 mode_i++;
3105
3106 if (!res)
3107 {
3108 delete loop_vinfo;
3109 if (fatal)
3110 gcc_checking_assert (main_loop_vinfo == NULL);
3111 return opt_loop_vec_info::propagate_failure (res);
3112 }
3113
3114 return opt_loop_vec_info::success (loop_vinfo);
3115 }
3116
3117 /* Function vect_analyze_loop.
3118
3119 Apply a set of analyses on LOOP, and create a loop_vec_info struct
3120 for it. The different analyses will record information in the
3121 loop_vec_info struct. */
3122 opt_loop_vec_info
3123 vect_analyze_loop (class loop *loop, vec_info_shared *shared)
3124 {
3125 DUMP_VECT_SCOPE ("analyze_loop_nest");
3126
3127 if (loop_outer (loop)
3128 && loop_vec_info_for_loop (loop_outer (loop))
3129 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
3130 return opt_loop_vec_info::failure_at (vect_location,
3131 "outer-loop already vectorized.\n");
3132
3133 if (!find_loop_nest (loop, &shared->loop_nest))
3134 return opt_loop_vec_info::failure_at
3135 (vect_location,
3136 "not vectorized: loop nest containing two or more consecutive inner"
3137 " loops cannot be vectorized\n");
3138
3139 /* Analyze the loop form. */
3140 vect_loop_form_info loop_form_info;
3141 opt_result res = vect_analyze_loop_form (loop, &loop_form_info);
3142 if (!res)
3143 {
3144 if (dump_enabled_p ())
3145 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3146 "bad loop form.\n");
3147 return opt_loop_vec_info::propagate_failure (res);
3148 }
3149 if (!integer_onep (loop_form_info.assumptions))
3150 {
3151 /* We consider vectorizing this loop by versioning it under
3152 some assumptions. In order to do this, we need to clear
3153 existing information computed by the scev and niter analyzers. */
3154 scev_reset_htab ();
3155 free_numbers_of_iterations_estimates (loop);
3156 /* Also set a flag for this loop so that the following scev and niter
3157 analyses are done under the assumptions. */
3158 loop_constraint_set (loop, LOOP_C_FINITE);
3159 }
3160
3161 auto_vector_modes vector_modes;
3162 /* Autodetect first vector size we try. */
3163 vector_modes.safe_push (VOIDmode);
3164 unsigned int autovec_flags
3165 = targetm.vectorize.autovectorize_vector_modes (&vector_modes,
3166 loop->simdlen != 0);
3167 bool pick_lowest_cost_p = ((autovec_flags & VECT_COMPARE_COSTS)
3168 && !unlimited_cost_model (loop));
3169 machine_mode autodetected_vector_mode = VOIDmode;
3170 opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL);
3171 unsigned int mode_i = 0;
3172 unsigned HOST_WIDE_INT simdlen = loop->simdlen;
3173
3174 /* Keep track of the VF for each mode. Initialize all to 0 which indicates
3175 a mode has not been analyzed. */
3176 auto_vec<poly_uint64, 8> cached_vf_per_mode;
3177 for (unsigned i = 0; i < vector_modes.length (); ++i)
3178 cached_vf_per_mode.safe_push (0);
3179
3180 /* First determine the main loop vectorization mode, either the first
3181 one that works, starting with auto-detecting the vector mode and then
3182 following the targets order of preference, or the one with the
3183 lowest cost if pick_lowest_cost_p. */
3184 while (1)
3185 {
3186 bool fatal;
3187 unsigned int last_mode_i = mode_i;
3188 /* Set cached VF to -1 prior to analysis, which indicates a mode has
3189 failed. */
3190 cached_vf_per_mode[last_mode_i] = -1;
3191 opt_loop_vec_info loop_vinfo
3192 = vect_analyze_loop_1 (loop, shared, &loop_form_info,
3193 NULL, vector_modes, mode_i,
3194 autodetected_vector_mode, fatal);
3195 if (fatal)
3196 break;
3197
3198 if (loop_vinfo)
3199 {
3200 /* Analysis has been successful, so update the VF value. The
3201 VF should always be a multiple of unroll_factor and we want to
3202 capture the original VF here. */
3203 cached_vf_per_mode[last_mode_i]
3204 = exact_div (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
3205 loop_vinfo->suggested_unroll_factor);
3206 /* Once we hit the desired simdlen for the first time,
3207 discard any previous attempts. */
3208 if (simdlen
3209 && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), simdlen))
3210 {
3211 delete first_loop_vinfo;
3212 first_loop_vinfo = opt_loop_vec_info::success (NULL);
3213 simdlen = 0;
3214 }
3215 else if (pick_lowest_cost_p
3216 && first_loop_vinfo
3217 && vect_joust_loop_vinfos (loop_vinfo, first_loop_vinfo))
3218 {
3219 /* Pick loop_vinfo over first_loop_vinfo. */
3220 delete first_loop_vinfo;
3221 first_loop_vinfo = opt_loop_vec_info::success (NULL);
3222 }
3223 if (first_loop_vinfo == NULL)
3224 first_loop_vinfo = loop_vinfo;
3225 else
3226 {
3227 delete loop_vinfo;
3228 loop_vinfo = opt_loop_vec_info::success (NULL);
3229 }
3230
3231 /* Commit to first_loop_vinfo if we have no reason to try
3232 alternatives. */
3233 if (!simdlen && !pick_lowest_cost_p)
3234 break;
3235 }
3236 if (mode_i == vector_modes.length ()
3237 || autodetected_vector_mode == VOIDmode)
3238 break;
3239
3240 /* Try the next biggest vector size. */
3241 if (dump_enabled_p ())
3242 dump_printf_loc (MSG_NOTE, vect_location,
3243 "***** Re-trying analysis with vector mode %s\n",
3244 GET_MODE_NAME (vector_modes[mode_i]));
3245 }
3246 if (!first_loop_vinfo)
3247 return opt_loop_vec_info::propagate_failure (res);
3248
3249 if (dump_enabled_p ())
3250 dump_printf_loc (MSG_NOTE, vect_location,
3251 "***** Choosing vector mode %s\n",
3252 GET_MODE_NAME (first_loop_vinfo->vector_mode));
3253
3254 /* Only vectorize epilogues if PARAM_VECT_EPILOGUES_NOMASK is
3255 enabled, SIMDUID is not set, it is the innermost loop and we have
3256 either already found the loop's SIMDLEN or there was no SIMDLEN to
3257 begin with.
3258 TODO: Enable epilogue vectorization for loops with SIMDUID set. */
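/* As a purely illustrative example of the payoff: a loop of 1003
   iterations with a main-loop VF of 16 leaves 11 tail iterations; a
   vectorized epilogue with VF 8 handles 8 of them, so only 3 iterations
   remain for the scalar epilogue.  */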
3259 bool vect_epilogues = (!simdlen
3260 && loop->inner == NULL
3261 && param_vect_epilogues_nomask
3262 && LOOP_VINFO_PEELING_FOR_NITER (first_loop_vinfo)
3263 && !loop->simduid);
3264 if (!vect_epilogues)
3265 return first_loop_vinfo;
3266
3267 /* Now analyze first_loop_vinfo for epilogue vectorization. */
3268 poly_uint64 lowest_th = LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo);
3269
3270 /* For epilogues start the analysis from the first mode. The motivation
3271 behind starting from the beginning comes from cases where the VECTOR_MODES
3272 array may contain length-agnostic and length-specific modes. Their
3273 ordering is not guaranteed, so we could end up picking a mode for the main
3274 loop that is after the epilogue's optimal mode. */
3275 vector_modes[0] = autodetected_vector_mode;
3276 mode_i = 0;
3277
3278 bool supports_partial_vectors =
3279 partial_vectors_supported_p () && param_vect_partial_vector_usage != 0;
3280 poly_uint64 first_vinfo_vf = LOOP_VINFO_VECT_FACTOR (first_loop_vinfo);
3281
3282 while (1)
3283 {
3284 /* If the target does not support partial vectors we can shorten the
3285 number of modes to analyze for the epilogue: we know we cannot pick a
3286 mode whose cached VF is at least as big as FIRST_VINFO_VF, since the
3287 epilogue needs a smaller VF than the main loop. */
3288 if (!supports_partial_vectors
3289 && maybe_ge (cached_vf_per_mode[mode_i], first_vinfo_vf))
3290 {
3291 mode_i++;
3292 if (mode_i == vector_modes.length ())
3293 break;
3294 continue;
3295 }
3296
3297 if (dump_enabled_p ())
3298 dump_printf_loc (MSG_NOTE, vect_location,
3299 "***** Re-trying epilogue analysis with vector "
3300 "mode %s\n", GET_MODE_NAME (vector_modes[mode_i]));
3301
3302 bool fatal;
3303 opt_loop_vec_info loop_vinfo
3304 = vect_analyze_loop_1 (loop, shared, &loop_form_info,
3305 first_loop_vinfo,
3306 vector_modes, mode_i,
3307 autodetected_vector_mode, fatal);
3308 if (fatal)
3309 break;
3310
3311 if (loop_vinfo)
3312 {
3313 if (pick_lowest_cost_p)
3314 {
3315 /* Keep trying to roll back vectorization attempts while the
3316 loop_vec_infos they produced were worse than this one. */
3317 vec<loop_vec_info> &vinfos = first_loop_vinfo->epilogue_vinfos;
3318 while (!vinfos.is_empty ()
3319 && vect_joust_loop_vinfos (loop_vinfo, vinfos.last ()))
3320 {
3321 gcc_assert (vect_epilogues);
3322 delete vinfos.pop ();
3323 }
3324 }
3325 /* For now only allow one epilogue loop. */
3326 if (first_loop_vinfo->epilogue_vinfos.is_empty ())
3327 {
3328 first_loop_vinfo->epilogue_vinfos.safe_push (loop_vinfo);
3329 poly_uint64 th = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
3330 gcc_assert (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
3331 || maybe_ne (lowest_th, 0U));
3332 /* Keep track of the known smallest versioning
3333 threshold. */
3334 if (ordered_p (lowest_th, th))
3335 lowest_th = ordered_min (lowest_th, th);
3336 }
3337 else
3338 {
3339 delete loop_vinfo;
3340 loop_vinfo = opt_loop_vec_info::success (NULL);
3341 }
3342
3343 /* For now only allow one epilogue loop, but allow
3344 pick_lowest_cost_p to replace it, so commit to the
3345 first epilogue if we have no reason to try alternatives. */
3346 if (!pick_lowest_cost_p)
3347 break;
3348 }
3349
3350 if (mode_i == vector_modes.length ())
3351 break;
3352
3353 }
3354
3355 if (!first_loop_vinfo->epilogue_vinfos.is_empty ())
3356 {
3357 LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo) = lowest_th;
3358 if (dump_enabled_p ())
3359 dump_printf_loc (MSG_NOTE, vect_location,
3360 "***** Choosing epilogue vector mode %s\n",
3361 GET_MODE_NAME
3362 (first_loop_vinfo->epilogue_vinfos[0]->vector_mode));
3363 }
3364
3365 return first_loop_vinfo;
3366 }
3367
3368 /* Return true if there is an in-order reduction function for CODE, storing
3369 it in *REDUC_FN if so. */
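/* The typical consumer is an in-order floating-point accumulation such as

     double s = 0.0;
     for (int i = 0; i < n; ++i)
       s += a[i];

   compiled without -fassociative-math: the additions must happen in
   source order, which IFN_FOLD_LEFT_PLUS provides when the target
   implements it.  */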
3370
3371 static bool
3372 fold_left_reduction_fn (code_helper code, internal_fn *reduc_fn)
3373 {
3374 if (code == PLUS_EXPR)
3375 {
3376 *reduc_fn = IFN_FOLD_LEFT_PLUS;
3377 return true;
3378 }
3379 return false;
3380 }
3381
3382 /* Function reduction_fn_for_scalar_code
3383
3384 Input:
3385 CODE - the code of a reduction operation.
3386
3387 Output:
3388 REDUC_FN - the corresponding internal function to be used to reduce the
3389 vector of partial results into a single scalar result, or IFN_LAST
3390 if the operation is a supported reduction operation, but does not have
3391 such an internal function.
3392
3393 Return FALSE if CODE currently cannot be vectorized as a reduction. */
3394
3395 bool
3396 reduction_fn_for_scalar_code (code_helper code, internal_fn *reduc_fn)
3397 {
3398 if (code.is_tree_code ())
3399 switch (tree_code (code))
3400 {
3401 case MAX_EXPR:
3402 *reduc_fn = IFN_REDUC_MAX;
3403 return true;
3404
3405 case MIN_EXPR:
3406 *reduc_fn = IFN_REDUC_MIN;
3407 return true;
3408
3409 case PLUS_EXPR:
3410 *reduc_fn = IFN_REDUC_PLUS;
3411 return true;
3412
3413 case BIT_AND_EXPR:
3414 *reduc_fn = IFN_REDUC_AND;
3415 return true;
3416
3417 case BIT_IOR_EXPR:
3418 *reduc_fn = IFN_REDUC_IOR;
3419 return true;
3420
3421 case BIT_XOR_EXPR:
3422 *reduc_fn = IFN_REDUC_XOR;
3423 return true;
3424
3425 case MULT_EXPR:
3426 case MINUS_EXPR:
3427 *reduc_fn = IFN_LAST;
3428 return true;
3429
3430 default:
3431 return false;
3432 }
3433 else
3434 switch (combined_fn (code))
3435 {
3436 CASE_CFN_FMAX:
3437 *reduc_fn = IFN_REDUC_FMAX;
3438 return true;
3439
3440 CASE_CFN_FMIN:
3441 *reduc_fn = IFN_REDUC_FMIN;
3442 return true;
3443
3444 default:
3445 return false;
3446 }
3447 }
3448
3449 /* If there is a neutral value X such that a reduction would not be affected
3450 by the introduction of additional X elements, return that X, otherwise
3451 return null. CODE is the code of the reduction and SCALAR_TYPE is the type
3452 of the scalar elements. If the reduction has just a single initial value
3453 then INITIAL_VALUE is that value, otherwise it is null. */
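/* For instance, zero is neutral for addition and for bitwise IOR/XOR,
   one for multiplication and an all-ones value for bitwise AND, while
   MIN/MAX have no neutral value in general and fall back to the single
   initial value when one is available.  */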
3454
3455 tree
3456 neutral_op_for_reduction (tree scalar_type, code_helper code,
3457 tree initial_value)
3458 {
3459 if (code.is_tree_code ())
3460 switch (tree_code (code))
3461 {
3462 case WIDEN_SUM_EXPR:
3463 case DOT_PROD_EXPR:
3464 case SAD_EXPR:
3465 case PLUS_EXPR:
3466 case MINUS_EXPR:
3467 case BIT_IOR_EXPR:
3468 case BIT_XOR_EXPR:
3469 return build_zero_cst (scalar_type);
3470
3471 case MULT_EXPR:
3472 return build_one_cst (scalar_type);
3473
3474 case BIT_AND_EXPR:
3475 return build_all_ones_cst (scalar_type);
3476
3477 case MAX_EXPR:
3478 case MIN_EXPR:
3479 return initial_value;
3480
3481 default:
3482 return NULL_TREE;
3483 }
3484 else
3485 switch (combined_fn (code))
3486 {
3487 CASE_CFN_FMIN:
3488 CASE_CFN_FMAX:
3489 return initial_value;
3490
3491 default:
3492 return NULL_TREE;
3493 }
3494 }
3495
3496 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
3497 STMT is printed with a message MSG. */
3498
3499 static void
3500 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
3501 {
3502 dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt);
3503 }
3504
3505 /* Return true if we need an in-order (fold-left) reduction for
3506 operation CODE on type TYPE, i.e. if the operation cannot be
3507 reassociated for this type under the current flags. */
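/* For example, a floating-point summation compiled without
   -fassociative-math must be reduced in order, whereas fmin/fmax and
   integer min/max are insensitive to association; integer operations
   only need it when overflow could trap (e.g. with -ftrapv).  */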
3508
3509 bool
3510 needs_fold_left_reduction_p (tree type, code_helper code)
3511 {
3512 /* CHECKME: check for !flag_finite_math_only too? */
3513 if (SCALAR_FLOAT_TYPE_P (type))
3514 {
3515 if (code.is_tree_code ())
3516 switch (tree_code (code))
3517 {
3518 case MIN_EXPR:
3519 case MAX_EXPR:
3520 return false;
3521
3522 default:
3523 return !flag_associative_math;
3524 }
3525 else
3526 switch (combined_fn (code))
3527 {
3528 CASE_CFN_FMIN:
3529 CASE_CFN_FMAX:
3530 return false;
3531
3532 default:
3533 return !flag_associative_math;
3534 }
3535 }
3536
3537 if (INTEGRAL_TYPE_P (type))
3538 return (!code.is_tree_code ()
3539 || !operation_no_trapping_overflow (type, tree_code (code)));
3540
3541 if (SAT_FIXED_POINT_TYPE_P (type))
3542 return true;
3543
3544 return false;
3545 }
3546
3547 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG and
3548 has a handled computation expression. Store the main reduction
3549 operation in *CODE. */
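/* For a loop like

     for (i = 0; i < n; ++i)
       sum = sum + a[i] * b[i];

   the walk from the latch definition of sum back to its PHI goes through
   just the PLUS_EXPR statement (the multiplication feeds it but is not
   part of the cycle), and *CODE ends up as PLUS_EXPR.  */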
3550
3551 static bool
3552 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
3553 tree loop_arg, code_helper *code,
3554 vec<std::pair<ssa_op_iter, use_operand_p> > &path)
3555 {
3556 auto_bitmap visited;
3557 tree lookfor = PHI_RESULT (phi);
3558 ssa_op_iter curri;
3559 use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
3560 while (USE_FROM_PTR (curr) != loop_arg)
3561 curr = op_iter_next_use (&curri);
3562 curri.i = curri.numops;
3563 do
3564 {
3565 path.safe_push (std::make_pair (curri, curr));
3566 tree use = USE_FROM_PTR (curr);
3567 if (use == lookfor)
3568 break;
3569 gimple *def = SSA_NAME_DEF_STMT (use);
3570 if (gimple_nop_p (def)
3571 || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
3572 {
3573 pop:
3574 do
3575 {
3576 std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
3577 curri = x.first;
3578 curr = x.second;
3579 do
3580 curr = op_iter_next_use (&curri);
3581 /* Skip already visited or non-SSA operands (from iterating
3582 over PHI args). */
3583 while (curr != NULL_USE_OPERAND_P
3584 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3585 || ! bitmap_set_bit (visited,
3586 SSA_NAME_VERSION
3587 (USE_FROM_PTR (curr)))));
3588 }
3589 while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
3590 if (curr == NULL_USE_OPERAND_P)
3591 break;
3592 }
3593 else
3594 {
3595 if (gimple_code (def) == GIMPLE_PHI)
3596 curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
3597 else
3598 curr = op_iter_init_use (&curri, def, SSA_OP_USE);
3599 while (curr != NULL_USE_OPERAND_P
3600 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3601 || ! bitmap_set_bit (visited,
3602 SSA_NAME_VERSION
3603 (USE_FROM_PTR (curr)))))
3604 curr = op_iter_next_use (&curri);
3605 if (curr == NULL_USE_OPERAND_P)
3606 goto pop;
3607 }
3608 }
3609 while (1);
3610 if (dump_file && (dump_flags & TDF_DETAILS))
3611 {
3612 dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
3613 unsigned i;
3614 std::pair<ssa_op_iter, use_operand_p> *x;
3615 FOR_EACH_VEC_ELT (path, i, x)
3616 dump_printf (MSG_NOTE, "%T ", USE_FROM_PTR (x->second));
3617 dump_printf (MSG_NOTE, "\n");
3618 }
3619
3620 /* Check whether the reduction path detected is valid. */
3621 bool fail = path.length () == 0;
3622 bool neg = false;
3623 int sign = -1;
3624 *code = ERROR_MARK;
3625 for (unsigned i = 1; i < path.length (); ++i)
3626 {
3627 gimple *use_stmt = USE_STMT (path[i].second);
3628 gimple_match_op op;
3629 if (!gimple_extract_op (use_stmt, &op))
3630 {
3631 fail = true;
3632 break;
3633 }
3634 unsigned int opi = op.num_ops;
3635 if (gassign *assign = dyn_cast<gassign *> (use_stmt))
3636 {
3637 /* The following makes sure we can compute the operand index
3638 easily; it also mostly disallows chaining via COND_EXPR condition
3639 operands. */
3640 for (opi = 0; opi < op.num_ops; ++opi)
3641 if (gimple_assign_rhs1_ptr (assign) + opi == path[i].second->use)
3642 break;
3643 }
3644 else if (gcall *call = dyn_cast<gcall *> (use_stmt))
3645 {
3646 for (opi = 0; opi < op.num_ops; ++opi)
3647 if (gimple_call_arg_ptr (call, opi) == path[i].second->use)
3648 break;
3649 }
3650 if (opi == op.num_ops)
3651 {
3652 fail = true;
3653 break;
3654 }
3655 op.code = canonicalize_code (op.code, op.type);
3656 if (op.code == MINUS_EXPR)
3657 {
3658 op.code = PLUS_EXPR;
3659 /* Track whether we negate the reduction value each iteration. */
3660 if (op.ops[1] == op.ops[opi])
3661 neg = ! neg;
3662 }
3663 if (CONVERT_EXPR_CODE_P (op.code)
3664 && tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0])))
3665 ;
3666 else if (*code == ERROR_MARK)
3667 {
3668 *code = op.code;
3669 sign = TYPE_SIGN (op.type);
3670 }
3671 else if (op.code != *code)
3672 {
3673 fail = true;
3674 break;
3675 }
3676 else if ((op.code == MIN_EXPR
3677 || op.code == MAX_EXPR)
3678 && sign != TYPE_SIGN (op.type))
3679 {
3680 fail = true;
3681 break;
3682 }
3683 /* Check that the op is used in only a single stmt. For the
3684 non-value-changing tail and the last stmt allow out-of-loop uses.
3685 ??? We could relax this and handle arbitrary live stmts by
3686 forcing a scalar epilogue for example. */
3687 imm_use_iterator imm_iter;
3688 gimple *op_use_stmt;
3689 unsigned cnt = 0;
3690 FOR_EACH_IMM_USE_STMT (op_use_stmt, imm_iter, op.ops[opi])
3691 if (!is_gimple_debug (op_use_stmt)
3692 && (*code != ERROR_MARK
3693 || flow_bb_inside_loop_p (loop, gimple_bb (op_use_stmt))))
3694 {
3695 /* We want to allow x + x but not x < 1 ? x : 2. */
3696 if (is_gimple_assign (op_use_stmt)
3697 && gimple_assign_rhs_code (op_use_stmt) == COND_EXPR)
3698 {
3699 use_operand_p use_p;
3700 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
3701 cnt++;
3702 }
3703 else
3704 cnt++;
3705 }
3706 if (cnt != 1)
3707 {
3708 fail = true;
3709 break;
3710 }
3711 }
3712 return ! fail && ! neg && *code != ERROR_MARK;
3713 }
3714
3715 bool
3716 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
3717 tree loop_arg, enum tree_code code)
3718 {
3719 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3720 code_helper code_;
3721 return (check_reduction_path (loc, loop, phi, loop_arg, &code_, path)
3722 && code_ == code);
3723 }
3724
3725
3726
3727 /* Function vect_is_simple_reduction
3728
3729 (1) Detect a cross-iteration def-use cycle that represents a simple
3730 reduction computation. We look for the following pattern:
3731
3732 loop_header:
3733 a1 = phi < a0, a2 >
3734 a3 = ...
3735 a2 = operation (a3, a1)
3736
3737 or
3738
3739 a3 = ...
3740 loop_header:
3741 a1 = phi < a0, a2 >
3742 a2 = operation (a3, a1)
3743
3744 such that:
3745 1. operation is commutative and associative and it is safe to
3746 change the order of the computation
3747 2. no uses for a2 in the loop (a2 is used out of the loop)
3748 3. no uses of a1 in the loop besides the reduction operation
3749 4. no uses of a1 outside the loop.
3750
3751 Conditions 1,4 are tested here.
3752 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
3753
3754 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
3755 nested cycles.
3756
3757 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
3758 reductions:
3759
3760 a1 = phi < a0, a2 >
3761 inner loop (def of a3)
3762 a2 = phi < a3 >
3763
3764 (4) Detect condition expressions, i.e.:
3765 for (int i = 0; i < N; i++)
3766 if (a[i] < val)
3767 ret_val = a[i];
3768
3769 */
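/* A classic source-level example of (3) is

     for (i = 0; i < n; ++i)
       for (j = 0; j < m; ++j)
         sum += a[i][j];

   where the inner-loop PHI of sum feeds back into the outer-loop cycle
   through a loop-closed PHI.  */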
3770
3771 static stmt_vec_info
3772 vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
3773 bool *double_reduc, bool *reduc_chain_p, bool slp)
3774 {
3775 gphi *phi = as_a <gphi *> (phi_info->stmt);
3776 gimple *phi_use_stmt = NULL;
3777 imm_use_iterator imm_iter;
3778 use_operand_p use_p;
3779
3780 *double_reduc = false;
3781 *reduc_chain_p = false;
3782 STMT_VINFO_REDUC_TYPE (phi_info) = TREE_CODE_REDUCTION;
3783
3784 tree phi_name = PHI_RESULT (phi);
3785 /* ??? If there are no uses of the PHI result the inner loop reduction
3786 won't be detected as possibly double-reduction by vectorizable_reduction
3787 because that tries to walk the PHI arg from the preheader edge which
3788 can be constant. See PR60382. */
3789 if (has_zero_uses (phi_name))
3790 return NULL;
3791 class loop *loop = (gimple_bb (phi))->loop_father;
3792 unsigned nphi_def_loop_uses = 0;
3793 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
3794 {
3795 gimple *use_stmt = USE_STMT (use_p);
3796 if (is_gimple_debug (use_stmt))
3797 continue;
3798
3799 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3800 {
3801 if (dump_enabled_p ())
3802 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3803 "intermediate value used outside loop.\n");
3804
3805 return NULL;
3806 }
3807
3808 nphi_def_loop_uses++;
3809 phi_use_stmt = use_stmt;
3810 }
3811
3812 tree latch_def = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
3813 if (TREE_CODE (latch_def) != SSA_NAME)
3814 {
3815 if (dump_enabled_p ())
3816 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3817 "reduction: not ssa_name: %T\n", latch_def);
3818 return NULL;
3819 }
3820
3821 stmt_vec_info def_stmt_info = loop_info->lookup_def (latch_def);
3822 if (!def_stmt_info
3823 || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)))
3824 return NULL;
3825
3826 bool nested_in_vect_loop
3827 = flow_loop_nested_p (LOOP_VINFO_LOOP (loop_info), loop);
3828 unsigned nlatch_def_loop_uses = 0;
3829 auto_vec<gphi *, 3> lcphis;
3830 bool inner_loop_of_double_reduc = false;
3831 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, latch_def)
3832 {
3833 gimple *use_stmt = USE_STMT (use_p);
3834 if (is_gimple_debug (use_stmt))
3835 continue;
3836 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3837 nlatch_def_loop_uses++;
3838 else
3839 {
3840 /* We can have more than one loop-closed PHI. */
3841 lcphis.safe_push (as_a <gphi *> (use_stmt));
3842 if (nested_in_vect_loop
3843 && (STMT_VINFO_DEF_TYPE (loop_info->lookup_stmt (use_stmt))
3844 == vect_double_reduction_def))
3845 inner_loop_of_double_reduc = true;
3846 }
3847 }
3848
3849 /* If we are vectorizing an inner reduction, we execute it in the
3850 original order only in case we are not dealing with a
3851 double reduction. */
3852 if (nested_in_vect_loop && !inner_loop_of_double_reduc)
3853 {
3854 if (dump_enabled_p ())
3855 report_vect_op (MSG_NOTE, def_stmt_info->stmt,
3856 "detected nested cycle: ");
3857 return def_stmt_info;
3858 }
3859
3860 /* When the inner loop of a double reduction ends up with more than
3861 one loop-closed PHI we have failed to classify the alternate
3862 PHIs as double reductions, leading to wrong code. See PR103237. */
3863 if (inner_loop_of_double_reduc && lcphis.length () != 1)
3864 {
3865 if (dump_enabled_p ())
3866 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3867 "unhandled double reduction\n");
3868 return NULL;
3869 }
3870
3871 /* If this isn't a nested cycle or if the nested cycle reduction value
3872 is used outside of the inner loop we cannot handle uses of the reduction
3873 value. */
3874 if (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1)
3875 {
3876 if (dump_enabled_p ())
3877 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3878 "reduction used in loop.\n");
3879 return NULL;
3880 }
3881
3882 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
3883 defined in the inner loop. */
3884 if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
3885 {
3886 tree op1 = PHI_ARG_DEF (def_stmt, 0);
3887 if (gimple_phi_num_args (def_stmt) != 1
3888 || TREE_CODE (op1) != SSA_NAME)
3889 {
3890 if (dump_enabled_p ())
3891 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3892 "unsupported phi node definition.\n");
3893
3894 return NULL;
3895 }
3896
3897 /* Verify there is an inner cycle composed of the PHI phi_use_stmt
3898 and the latch definition op1. */
3899 gimple *def1 = SSA_NAME_DEF_STMT (op1);
3900 if (gimple_bb (def1)
3901 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3902 && loop->inner
3903 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
3904 && (is_gimple_assign (def1) || is_gimple_call (def1))
3905 && is_a <gphi *> (phi_use_stmt)
3906 && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt))
3907 && (op1 == PHI_ARG_DEF_FROM_EDGE (phi_use_stmt,
3908 loop_latch_edge (loop->inner))))
3909 {
3910 if (dump_enabled_p ())
3911 report_vect_op (MSG_NOTE, def_stmt,
3912 "detected double reduction: ");
3913
3914 *double_reduc = true;
3915 return def_stmt_info;
3916 }
3917
3918 return NULL;
3919 }
3920
3921 /* Look for the expression computing latch_def from the loop PHI result. */
3922 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3923 code_helper code;
3924 if (check_reduction_path (vect_location, loop, phi, latch_def, &code,
3925 path))
3926 {
3927 STMT_VINFO_REDUC_CODE (phi_info) = code;
3928 if (code == COND_EXPR && !nested_in_vect_loop)
3929 STMT_VINFO_REDUC_TYPE (phi_info) = COND_REDUCTION;
3930
3931 /* Fill in STMT_VINFO_REDUC_IDX and gather stmts for an SLP
3932 reduction chain for which the additional restriction is that
3933 all operations in the chain are the same. */
3934 auto_vec<stmt_vec_info, 8> reduc_chain;
3935 unsigned i;
3936 bool is_slp_reduc = !nested_in_vect_loop && code != COND_EXPR;
3937 for (i = path.length () - 1; i >= 1; --i)
3938 {
3939 gimple *stmt = USE_STMT (path[i].second);
3940 stmt_vec_info stmt_info = loop_info->lookup_stmt (stmt);
3941 gimple_match_op op;
3942 if (!gimple_extract_op (stmt, &op))
3943 gcc_unreachable ();
3944 if (gassign *assign = dyn_cast<gassign *> (stmt))
3945 STMT_VINFO_REDUC_IDX (stmt_info)
3946 = path[i].second->use - gimple_assign_rhs1_ptr (assign);
3947 else
3948 {
3949 gcall *call = as_a<gcall *> (stmt);
3950 STMT_VINFO_REDUC_IDX (stmt_info)
3951 = path[i].second->use - gimple_call_arg_ptr (call, 0);
3952 }
3953 bool leading_conversion = (CONVERT_EXPR_CODE_P (op.code)
3954 && (i == 1 || i == path.length () - 1));
3955 if ((op.code != code && !leading_conversion)
3956 /* We can only handle the final value in epilogue
3957 generation for reduction chains. */
3958 || (i != 1 && !has_single_use (gimple_get_lhs (stmt))))
3959 is_slp_reduc = false;
3960 /* For reduction chains we support trailing/leading
3961 conversions. We do not store those in the actual chain. */
3962 if (leading_conversion)
3963 continue;
3964 reduc_chain.safe_push (stmt_info);
3965 }
3966 if (slp && is_slp_reduc && reduc_chain.length () > 1)
3967 {
3968 for (unsigned i = 0; i < reduc_chain.length () - 1; ++i)
3969 {
3970 REDUC_GROUP_FIRST_ELEMENT (reduc_chain[i]) = reduc_chain[0];
3971 REDUC_GROUP_NEXT_ELEMENT (reduc_chain[i]) = reduc_chain[i+1];
3972 }
3973 REDUC_GROUP_FIRST_ELEMENT (reduc_chain.last ()) = reduc_chain[0];
3974 REDUC_GROUP_NEXT_ELEMENT (reduc_chain.last ()) = NULL;
3975
3976 /* Save the chain for further analysis in SLP detection. */
3977 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (reduc_chain[0]);
3978 REDUC_GROUP_SIZE (reduc_chain[0]) = reduc_chain.length ();
3979
3980 *reduc_chain_p = true;
3981 if (dump_enabled_p ())
3982 dump_printf_loc (MSG_NOTE, vect_location,
3983 "reduction: detected reduction chain\n");
3984 }
3985 else if (dump_enabled_p ())
3986 dump_printf_loc (MSG_NOTE, vect_location,
3987 "reduction: detected reduction\n");
3988
3989 return def_stmt_info;
3990 }
3991
3992 if (dump_enabled_p ())
3993 dump_printf_loc (MSG_NOTE, vect_location,
3994 "reduction: unknown pattern\n");
3995
3996 return NULL;
3997 }
3998
3999 /* Estimate the number of peeled epilogue iterations for LOOP_VINFO.
4000 PEEL_ITERS_PROLOGUE is the number of peeled prologue iterations,
4001 or -1 if not known. */
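/* For instance, with 100 known iterations, an assumed VF of 8 and 3
   prologue iterations peeled for alignment, the epilogue gets
   (100 - 3) % 8 = 1 iteration; if peeling for gaps is required and that
   remainder were zero, a full VF of 8 iterations would be used instead.  */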
4002
4003 static int
4004 vect_get_peel_iters_epilogue (loop_vec_info loop_vinfo, int peel_iters_prologue)
4005 {
4006 int assumed_vf = vect_vf_for_cost (loop_vinfo);
4007 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) || peel_iters_prologue == -1)
4008 {
4009 if (dump_enabled_p ())
4010 dump_printf_loc (MSG_NOTE, vect_location,
4011 "cost model: epilogue peel iters set to vf/2 "
4012 "because loop iterations are unknown.\n");
4013 return assumed_vf / 2;
4014 }
4015 else
4016 {
4017 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
4018 peel_iters_prologue = MIN (niters, peel_iters_prologue);
4019 int peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
4020 /* If we need to peel for gaps but no epilogue peeling would otherwise
4021 be required, we have to peel VF iterations. */
4022 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !peel_iters_epilogue)
4023 peel_iters_epilogue = assumed_vf;
4024 return peel_iters_epilogue;
4025 }
4026 }
4027
4028 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
4029 int
4030 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
4031 int *peel_iters_epilogue,
4032 stmt_vector_for_cost *scalar_cost_vec,
4033 stmt_vector_for_cost *prologue_cost_vec,
4034 stmt_vector_for_cost *epilogue_cost_vec)
4035 {
4036 int retval = 0;
4037
4038 *peel_iters_epilogue
4039 = vect_get_peel_iters_epilogue (loop_vinfo, peel_iters_prologue);
4040
4041 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
4042 {
4043 /* If peeled iterations are known but the number of scalar loop
4044 iterations is unknown, count a taken branch per peeled loop. */
4045 if (peel_iters_prologue > 0)
4046 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
4047 vect_prologue);
4048 if (*peel_iters_epilogue > 0)
4049 retval += record_stmt_cost (epilogue_cost_vec, 1, cond_branch_taken,
4050 vect_epilogue);
4051 }
4052
4053 stmt_info_for_cost *si;
4054 int j;
4055 if (peel_iters_prologue)
4056 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
4057 retval += record_stmt_cost (prologue_cost_vec,
4058 si->count * peel_iters_prologue,
4059 si->kind, si->stmt_info, si->misalign,
4060 vect_prologue);
4061 if (*peel_iters_epilogue)
4062 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
4063 retval += record_stmt_cost (epilogue_cost_vec,
4064 si->count * *peel_iters_epilogue,
4065 si->kind, si->stmt_info, si->misalign,
4066 vect_epilogue);
4067
4068 return retval;
4069 }
4070
4071 /* Function vect_estimate_min_profitable_iters
4072
4073 Return the number of iterations required for the vector version of the
4074 loop to be profitable relative to the cost of the scalar version of the
4075 loop.
4076
4077 *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
4078 of iterations for vectorization. A value of -1 means loop
4079 vectorization is not profitable. This returned value may be used
4080 for a dynamic profitability check.
4081
4082 *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
4083 for static check against estimated number of iterations. */
4084
4085 static void
4086 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
4087 int *ret_min_profitable_niters,
4088 int *ret_min_profitable_estimate,
4089 unsigned *suggested_unroll_factor)
4090 {
4091 int min_profitable_iters;
4092 int min_profitable_estimate;
4093 int peel_iters_prologue;
4094 int peel_iters_epilogue;
4095 unsigned vec_inside_cost = 0;
4096 int vec_outside_cost = 0;
4097 unsigned vec_prologue_cost = 0;
4098 unsigned vec_epilogue_cost = 0;
4099 int scalar_single_iter_cost = 0;
4100 int scalar_outside_cost = 0;
4101 int assumed_vf = vect_vf_for_cost (loop_vinfo);
4102 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
4103 vector_costs *target_cost_data = loop_vinfo->vector_costs;
4104
4105 /* Cost model disabled. */
4106 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
4107 {
4108 if (dump_enabled_p ())
4109 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
4110 *ret_min_profitable_niters = 0;
4111 *ret_min_profitable_estimate = 0;
4112 return;
4113 }
4114
4115 /* Requires loop versioning tests to handle misalignment. */
4116 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
4117 {
4118 /* FIXME: Make cost depend on complexity of individual check. */
4119 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
4120 (void) add_stmt_cost (target_cost_data, len, scalar_stmt, vect_prologue);
4121 if (dump_enabled_p ())
4122 dump_printf (MSG_NOTE,
4123 "cost model: Adding cost of checks for loop "
4124 "versioning to treat misalignment.\n");
4125 }
4126
4127 /* Requires loop versioning with alias checks. */
4128 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
4129 {
4130 /* FIXME: Make cost depend on complexity of individual check. */
4131 unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
4132 (void) add_stmt_cost (target_cost_data, len, scalar_stmt, vect_prologue);
4133 len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
4134 if (len)
4135 /* Count LEN - 1 ANDs and LEN comparisons. */
4136 (void) add_stmt_cost (target_cost_data, len * 2 - 1,
4137 scalar_stmt, vect_prologue);
4138 len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
4139 if (len)
4140 {
4141 /* Count LEN - 1 ANDs and LEN comparisons. */
4142 unsigned int nstmts = len * 2 - 1;
4143 /* +1 for each bias that needs adding. */
4144 for (unsigned int i = 0; i < len; ++i)
4145 if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
4146 nstmts += 1;
4147 (void) add_stmt_cost (target_cost_data, nstmts,
4148 scalar_stmt, vect_prologue);
4149 }
4150 if (dump_enabled_p ())
4151 dump_printf (MSG_NOTE,
4152 "cost model: Adding cost of checks for loop "
4153 "versioning aliasing.\n");
4154 }
4155
4156 /* Requires loop versioning with niter checks. */
4157 if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
4158 {
4159 /* FIXME: Make cost depend on complexity of individual check. */
4160 (void) add_stmt_cost (target_cost_data, 1, vector_stmt,
4161 NULL, NULL, NULL_TREE, 0, vect_prologue);
4162 if (dump_enabled_p ())
4163 dump_printf (MSG_NOTE,
4164 "cost model: Adding cost of checks for loop "
4165 "versioning niters.\n");
4166 }
4167
4168 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
4169 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4170 vect_prologue);
4171
4172 /* Count statements in scalar loop. Using this as scalar cost for a single
4173 iteration for now.
4174
4175 TODO: Add outer loop support.
4176
4177 TODO: Consider assigning different costs to different scalar
4178 statements. */
4179
4180 scalar_single_iter_cost = loop_vinfo->scalar_costs->total_cost ();
4181
4182 /* Add additional cost for the peeled instructions in prologue and epilogue
4183 loop. (For fully-masked loops there will be no peeling.)
4184
4185 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
4186 at compile time, we assume it's vf/2 (the worst would be vf-1).
4187
4188 TODO: Build an expression that represents peel_iters for prologue and
4189 epilogue to be used in a run-time test. */
4190
4191 bool prologue_need_br_taken_cost = false;
4192 bool prologue_need_br_not_taken_cost = false;
4193
4194 /* Calculate peel_iters_prologue. */
4195 if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
4196 peel_iters_prologue = 0;
4197 else if (npeel < 0)
4198 {
4199 peel_iters_prologue = assumed_vf / 2;
4200 if (dump_enabled_p ())
4201 dump_printf (MSG_NOTE, "cost model: "
4202 "prologue peel iters set to vf/2.\n");
4203
4204 /* If peeled iterations are unknown, count a taken branch and a not taken
4205 branch per peeled loop. Even if scalar loop iterations are known,
4206 vector iterations are not known since peeled prologue iterations are
4207 not known. Hence guards remain the same. */
4208 prologue_need_br_taken_cost = true;
4209 prologue_need_br_not_taken_cost = true;
4210 }
4211 else
4212 {
4213 peel_iters_prologue = npeel;
4214 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_prologue > 0)
4215 /* If peeled iterations are known but the number of scalar loop
4216 iterations is unknown, count a taken branch per peeled loop. */
4217 prologue_need_br_taken_cost = true;
4218 }
4219
4220 bool epilogue_need_br_taken_cost = false;
4221 bool epilogue_need_br_not_taken_cost = false;
4222
4223 /* Calculate peel_iters_epilogue. */
4224 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4225 /* We need to peel exactly one iteration for gaps. */
4226 peel_iters_epilogue = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
4227 else if (npeel < 0)
4228 {
4229 /* If peeling for alignment is unknown, the loop bound of the main
4230 loop becomes unknown. */
4231 peel_iters_epilogue = assumed_vf / 2;
4232 if (dump_enabled_p ())
4233 dump_printf (MSG_NOTE, "cost model: "
4234 "epilogue peel iters set to vf/2 because "
4235 "peeling for alignment is unknown.\n");
4236
4237 /* See the same reason above in peel_iters_prologue calculation. */
4238 epilogue_need_br_taken_cost = true;
4239 epilogue_need_br_not_taken_cost = true;
4240 }
4241 else
4242 {
4243 peel_iters_epilogue = vect_get_peel_iters_epilogue (loop_vinfo, npeel);
4244 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_epilogue > 0)
4245 /* If peeled iterations are known but the number of scalar loop
4246 iterations is unknown, count a taken branch per peeled loop. */
4247 epilogue_need_br_taken_cost = true;
4248 }
4249
4250 stmt_info_for_cost *si;
4251 int j;
4252 /* Add costs associated with peel_iters_prologue. */
4253 if (peel_iters_prologue)
4254 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4255 {
4256 (void) add_stmt_cost (target_cost_data,
4257 si->count * peel_iters_prologue, si->kind,
4258 si->stmt_info, si->node, si->vectype,
4259 si->misalign, vect_prologue);
4260 }
4261
4262 /* Add costs associated with peel_iters_epilogue. */
4263 if (peel_iters_epilogue)
4264 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4265 {
4266 (void) add_stmt_cost (target_cost_data,
4267 si->count * peel_iters_epilogue, si->kind,
4268 si->stmt_info, si->node, si->vectype,
4269 si->misalign, vect_epilogue);
4270 }
4271
4272 /* Add possible cond_branch_taken/cond_branch_not_taken cost. */
4273
4274 if (prologue_need_br_taken_cost)
4275 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4276 vect_prologue);
4277
4278 if (prologue_need_br_not_taken_cost)
4279 (void) add_stmt_cost (target_cost_data, 1,
4280 cond_branch_not_taken, vect_prologue);
4281
4282 if (epilogue_need_br_taken_cost)
4283 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4284 vect_epilogue);
4285
4286 if (epilogue_need_br_not_taken_cost)
4287 (void) add_stmt_cost (target_cost_data, 1,
4288 cond_branch_not_taken, vect_epilogue);
4289
4290 /* Take care of special costs for rgroup controls of partial vectors. */
4291 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
4292 {
4293 /* Calculate how many masks we need to generate. */
4294 unsigned int num_masks = 0;
4295 rgroup_controls *rgm;
4296 unsigned int num_vectors_m1;
4297 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), num_vectors_m1, rgm)
4298 if (rgm->type)
4299 num_masks += num_vectors_m1 + 1;
4300 gcc_assert (num_masks > 0);
4301
4302 /* In the worst case, we need to generate each mask in the prologue
4303 and in the loop body. One of the loop body mask instructions
4304 replaces the comparison in the scalar loop, and since we don't
4305 count the scalar comparison against the scalar body, we shouldn't
4306 count that vector instruction against the vector body either.
4307
4308 Sometimes we can use unpacks instead of generating prologue
4309 masks and sometimes the prologue mask will fold to a constant,
4310 so the actual prologue cost might be smaller. However, it's
4311 simpler and safer to use the worst-case cost; if this ends up
4312 being the tie-breaker between vectorizing or not, then it's
4313 probably better not to vectorize. */
4314 (void) add_stmt_cost (target_cost_data, num_masks,
4315 vector_stmt, NULL, NULL, NULL_TREE, 0,
4316 vect_prologue);
4317 (void) add_stmt_cost (target_cost_data, num_masks - 1,
4318 vector_stmt, NULL, NULL, NULL_TREE, 0,
4319 vect_body);
4320 }
4321 else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
4322 {
4323 /* Referring to the functions vect_set_loop_condition_partial_vectors
4324 and vect_set_loop_controls_directly, we need to generate each
4325 length in the prologue and in the loop body if required. Although
4326 there are some possible optimizations, we consider the worst case
4327 here. */
4328
4329 bool niters_known_p = LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo);
4330 signed char partial_load_store_bias
4331 = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
4332 bool need_iterate_p
4333 = (!LOOP_VINFO_EPILOGUE_P (loop_vinfo)
4334 && !vect_known_niters_smaller_than_vf (loop_vinfo));
4335
4336 /* Calculate how many statements to be added. */
4337 unsigned int prologue_stmts = 0;
4338 unsigned int body_stmts = 0;
4339
4340 rgroup_controls *rgc;
4341 unsigned int num_vectors_m1;
4342 FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), num_vectors_m1, rgc)
4343 if (rgc->type)
4344 {
4345 /* May need one SHIFT for nitems_total computation. */
4346 unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
4347 if (nitems != 1 && !niters_known_p)
4348 prologue_stmts += 1;
4349
4350 /* May need one MAX and one MINUS for wrap around. */
4351 if (vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc))
4352 prologue_stmts += 2;
4353
4354 /* Need one MAX and one MINUS for each batch limit except for
4355 the 1st one. */
4356 prologue_stmts += num_vectors_m1 * 2;
4357
4358 unsigned int num_vectors = num_vectors_m1 + 1;
4359
4360 /* Need to set up lengths in prologue, only one MIN required
4361 for each since start index is zero. */
4362 prologue_stmts += num_vectors;
4363
4364 /* If we have a non-zero partial load bias, we need one PLUS
4365 to adjust the load length. */
4366 if (partial_load_store_bias != 0)
4367 body_stmts += 1;
4368
4369 /* Each may need two MINs and one MINUS to update lengths in body
4370 for next iteration. */
4371 if (need_iterate_p)
4372 body_stmts += 3 * num_vectors;
4373 }
4374
4375 (void) add_stmt_cost (target_cost_data, prologue_stmts,
4376 scalar_stmt, vect_prologue);
4377 (void) add_stmt_cost (target_cost_data, body_stmts,
4378 scalar_stmt, vect_body);
4379 }
4380
4381 /* FORNOW: The scalar outside cost is incremented in one of the
4382 following ways:
4383
4384 1. The vectorizer checks for alignment and aliasing and generates
4385 a condition that allows dynamic vectorization. A cost model
4386 check is ANDED with the versioning condition. Hence scalar code
4387 path now has the added cost of the versioning check.
4388
4389 if (cost > th & versioning_check)
4390 jmp to vector code
4391
4392 Hence run-time scalar is incremented by not-taken branch cost.
4393
4394 2. The vectorizer then checks if a prologue is required. If the
4395 cost model check was not done before during versioning, it has to
4396 be done before the prologue check.
4397
4398 if (cost <= th)
4399 prologue = scalar_iters
4400 if (prologue == 0)
4401 jmp to vector code
4402 else
4403 execute prologue
4404 if (prologue == num_iters)
4405 go to exit
4406
4407 Hence the run-time scalar cost is incremented by a taken branch,
4408 plus a not-taken branch, plus a taken branch cost.
4409
4410 3. The vectorizer then checks if an epilogue is required. If the
4411 cost model check was not done before during prologue check, it
4412 has to be done with the epilogue check.
4413
4414 if (prologue == 0)
4415 jmp to vector code
4416 else
4417 execute prologue
4418 if (prologue == num_iters)
4419 go to exit
4420 vector code:
4421 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
4422 jmp to epilogue
4423
4424 Hence the run-time scalar cost should be incremented by 2 taken
4425 branches.
4426
4427 TODO: The back end may reorder the BBS's differently and reverse
4428 conditions/branch directions. Change the estimates below to
4429 something more reasonable. */
4430
4431 /* If the number of iterations is known and we do not do versioning, we can
4432 decide whether to vectorize at compile time. Hence the scalar version
4433 does not carry cost model guard costs. */
4434 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
4435 || LOOP_REQUIRES_VERSIONING (loop_vinfo))
4436 {
4437 /* Cost model check occurs at versioning. */
4438 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
4439 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
4440 else
4441 {
4442 /* Cost model check occurs at prologue generation. */
4443 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
4444 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
4445 + vect_get_stmt_cost (cond_branch_not_taken);
4446 /* Cost model check occurs at epilogue generation. */
4447 else
4448 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
4449 }
4450 }
4451
4452 /* Complete the target-specific cost calculations. */
4453 finish_cost (loop_vinfo->vector_costs, loop_vinfo->scalar_costs,
4454 &vec_prologue_cost, &vec_inside_cost, &vec_epilogue_cost,
4455 suggested_unroll_factor);
4456
4457 if (suggested_unroll_factor && *suggested_unroll_factor > 1
4458 && LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) != MAX_VECTORIZATION_FACTOR
4459 && !known_le (LOOP_VINFO_VECT_FACTOR (loop_vinfo) *
4460 *suggested_unroll_factor,
4461 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo)))
4462 {
4463 if (dump_enabled_p ())
4464 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4465 "can't unroll as the unrolled vectorization factor is larger"
4466 " than maximum vectorization factor: "
4467 HOST_WIDE_INT_PRINT_UNSIGNED "\n",
4468 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo));
4469 *suggested_unroll_factor = 1;
4470 }
4471
4472 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
4473
4474 if (dump_enabled_p ())
4475 {
4476 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
4477 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
4478 vec_inside_cost);
4479 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
4480 vec_prologue_cost);
4481 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
4482 vec_epilogue_cost);
4483 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
4484 scalar_single_iter_cost);
4485 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
4486 scalar_outside_cost);
4487 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
4488 vec_outside_cost);
4489 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
4490 peel_iters_prologue);
4491 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
4492 peel_iters_epilogue);
4493 }
4494
4495 /* Calculate number of iterations required to make the vector version
4496 profitable, relative to the loop bodies only. The following condition
4497 must hold true:
4498 SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC
4499 where
4500 SIC = scalar iteration cost, VIC = vector iteration cost,
4501 VOC = vector outside cost, VF = vectorization factor,
4502 NPEEL = prologue iterations + epilogue iterations,
4503 SOC = scalar outside cost for run time cost model check. */
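/* As a purely illustrative example: with SIC = 4, VIC = 6, VOC = 20,
   SOC = 0, NPEEL = 0 and VF = 4, and treating the division as exact,
   the condition 4 * niters > 6 * (niters / 4) + 20 first holds for
   niters > 8, so roughly nine scalar iterations are needed before the
   vector version pays off.  */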
4504
4505 int saving_per_viter = (scalar_single_iter_cost * assumed_vf
4506 - vec_inside_cost);
4507 if (saving_per_viter <= 0)
4508 {
4509 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
4510 warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
4511 "vectorization did not happen for a simd loop");
4512
4513 if (dump_enabled_p ())
4514 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4515 "cost model: the vector iteration cost = %d "
4516 "divided by the scalar iteration cost = %d "
4517 "is greater than or equal to the vectorization factor = %d"
4518 ".\n",
4519 vec_inside_cost, scalar_single_iter_cost, assumed_vf);
4520 *ret_min_profitable_niters = -1;
4521 *ret_min_profitable_estimate = -1;
4522 return;
4523 }
4524
4525 /* ??? The "if" arm is written to handle all cases; see below for what
4526 we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
4527 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4528 {
4529 /* Rewriting the condition above in terms of the number of
4530 vector iterations (vniters) rather than the number of
4531 scalar iterations (niters) gives:
4532
4533 SIC * (vniters * VF + NPEEL) + SOC > VIC * vniters + VOC
4534
4535 <==> vniters * (SIC * VF - VIC) > VOC - SIC * NPEEL - SOC
4536
4537 For integer N, X and Y when X > 0:
4538
4539 N * X > Y <==> N >= (Y /[floor] X) + 1. */
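/* E.g. Y = 25, X = 10: N >= 25 / 10 + 1 = 3, and indeed 3 * 10 > 25
   while 2 * 10 is not.  */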
4540 int outside_overhead = (vec_outside_cost
4541 - scalar_single_iter_cost * peel_iters_prologue
4542 - scalar_single_iter_cost * peel_iters_epilogue
4543 - scalar_outside_cost);
4544 /* We're only interested in cases that require at least one
4545 vector iteration. */
4546 int min_vec_niters = 1;
4547 if (outside_overhead > 0)
4548 min_vec_niters = outside_overhead / saving_per_viter + 1;
4549
4550 if (dump_enabled_p ())
4551 dump_printf (MSG_NOTE, " Minimum number of vector iterations: %d\n",
4552 min_vec_niters);
4553
4554 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4555 {
4556 /* Now that we know the minimum number of vector iterations,
4557 find the minimum niters for which the scalar cost is larger:
4558
4559 SIC * niters > VIC * vniters + VOC - SOC
4560
4561 We know that the minimum niters is no more than
4562 vniters * VF + NPEEL, but it might be (and often is) less
4563 than that if a partial vector iteration is cheaper than the
4564 equivalent scalar code. */
4565 int threshold = (vec_inside_cost * min_vec_niters
4566 + vec_outside_cost
4567 - scalar_outside_cost);
4568 if (threshold <= 0)
4569 min_profitable_iters = 1;
4570 else
4571 min_profitable_iters = threshold / scalar_single_iter_cost + 1;
4572 }
4573 else
4574 /* Convert the number of vector iterations into a number of
4575 scalar iterations. */
4576 min_profitable_iters = (min_vec_niters * assumed_vf
4577 + peel_iters_prologue
4578 + peel_iters_epilogue);
4579 }
4580 else
4581 {
4582 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
4583 * assumed_vf
4584 - vec_inside_cost * peel_iters_prologue
4585 - vec_inside_cost * peel_iters_epilogue);
4586 if (min_profitable_iters <= 0)
4587 min_profitable_iters = 0;
4588 else
4589 {
4590 min_profitable_iters /= saving_per_viter;
4591
4592 if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
4593 <= (((int) vec_inside_cost * min_profitable_iters)
4594 + (((int) vec_outside_cost - scalar_outside_cost)
4595 * assumed_vf)))
4596 min_profitable_iters++;
4597 }
4598 }
4599
4600 if (dump_enabled_p ())
4601 dump_printf (MSG_NOTE,
4602 " Calculated minimum iters for profitability: %d\n",
4603 min_profitable_iters);
4604
4605 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
4606 && min_profitable_iters < (assumed_vf + peel_iters_prologue))
4607 /* We want the vectorized loop to execute at least once. */
4608 min_profitable_iters = assumed_vf + peel_iters_prologue;
4609 else if (min_profitable_iters < peel_iters_prologue)
4610 /* For LOOP_VINFO_USING_PARTIAL_VECTORS_P, we need to ensure the
4611 vectorized loop executes at least once. */
4612 min_profitable_iters = peel_iters_prologue;
4613
4614 if (dump_enabled_p ())
4615 dump_printf_loc (MSG_NOTE, vect_location,
4616 " Runtime profitability threshold = %d\n",
4617 min_profitable_iters);
4618
4619 *ret_min_profitable_niters = min_profitable_iters;
4620
4621 /* Calculate number of iterations required to make the vector version
4622 profitable, relative to the loop bodies only.
4623
4624 The cost of the non-vectorized variant is SIC * niters and it must
4625 win over the vector variant on the expected loop trip count, i.e.
4626 SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC */
4627
4628 if (vec_outside_cost <= 0)
4629 min_profitable_estimate = 0;
4630 /* ??? This "else if" arm is written to handle all cases; see below for
4631 what we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
4632 else if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4633 {
4634 /* This is a repeat of the code above, but with + SOC rather
4635 than - SOC. */
4636 int outside_overhead = (vec_outside_cost
4637 - scalar_single_iter_cost * peel_iters_prologue
4638 - scalar_single_iter_cost * peel_iters_epilogue
4639 + scalar_outside_cost);
4640 int min_vec_niters = 1;
4641 if (outside_overhead > 0)
4642 min_vec_niters = outside_overhead / saving_per_viter + 1;
4643
4644 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4645 {
4646 int threshold = (vec_inside_cost * min_vec_niters
4647 + vec_outside_cost
4648 + scalar_outside_cost);
4649 min_profitable_estimate = threshold / scalar_single_iter_cost + 1;
4650 }
4651 else
4652 min_profitable_estimate = (min_vec_niters * assumed_vf
4653 + peel_iters_prologue
4654 + peel_iters_epilogue);
4655 }
4656 else
4657 {
4658 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
4659 * assumed_vf
4660 - vec_inside_cost * peel_iters_prologue
4661 - vec_inside_cost * peel_iters_epilogue)
4662 / ((scalar_single_iter_cost * assumed_vf)
4663 - vec_inside_cost);
4664 }
4665 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
4666 if (dump_enabled_p ())
4667 dump_printf_loc (MSG_NOTE, vect_location,
4668 " Static estimate profitability threshold = %d\n",
4669 min_profitable_estimate);
4670
4671 *ret_min_profitable_estimate = min_profitable_estimate;
4672 }
4673
4674 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
4675 vector elements (not bits) for a vector with NELT elements. */
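/* For example, OFFSET 2 and NELT 8 encode the selector
   { 2, 3, 4, 5, 6, 7, 8, 9 }; elements 8 and 9 select from the second
   vec_perm input, which callers typically use to shift in zero or
   don't-care values.  */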
4676 static void
4677 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
4678 vec_perm_builder *sel)
4679 {
4680 /* The encoding is a single stepped pattern. Any wrap-around is handled
4681 by vec_perm_indices. */
4682 sel->new_vector (nelt, 1, 3);
4683 for (unsigned int i = 0; i < 3; i++)
4684 sel->quick_push (i + offset);
4685 }
4686
4687 /* Checks whether the target supports whole-vector shifts for vectors of mode
4688 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
4689 it supports vec_perm_const with masks for all necessary shift amounts. */
4690 static bool
4691 have_whole_vector_shift (machine_mode mode)
4692 {
4693 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
4694 return true;
4695
4696 /* Variable-length vectors should be handled via the optab. */
4697 unsigned int nelt;
4698 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
4699 return false;
4700
4701 vec_perm_builder sel;
4702 vec_perm_indices indices;
4703 for (unsigned int i = nelt / 2; i >= 1; i /= 2)
4704 {
4705 calc_vec_perm_mask_for_shift (i, nelt, &sel);
4706 indices.new_vector (sel, 2, nelt);
4707 if (!can_vec_perm_const_p (mode, mode, indices, false))
4708 return false;
4709 }
4710 return true;
4711 }
4712
4713 /* Return true if (a) STMT_INFO is a DOT_PROD_EXPR reduction whose
4714 multiplication operands have differing signs and (b) we intend
4715 to emulate the operation using a series of signed DOT_PROD_EXPRs.
4716 See vect_emulate_mixed_dot_prod for the actual sequence used. */
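/* A typical source of such a statement is

     int acc = 0;
     for (int i = 0; i < n; ++i)
       acc += s[i] * u[i];

   with s signed char and u unsigned char, on a target that lacks a
   mixed-sign dot-product instruction.  */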
4717
4718 static bool
4719 vect_is_emulated_mixed_dot_prod (loop_vec_info loop_vinfo,
4720 stmt_vec_info stmt_info)
4721 {
4722 gassign *assign = dyn_cast<gassign *> (stmt_info->stmt);
4723 if (!assign || gimple_assign_rhs_code (assign) != DOT_PROD_EXPR)
4724 return false;
4725
4726 tree rhs1 = gimple_assign_rhs1 (assign);
4727 tree rhs2 = gimple_assign_rhs2 (assign);
4728 if (TYPE_SIGN (TREE_TYPE (rhs1)) == TYPE_SIGN (TREE_TYPE (rhs2)))
4729 return false;
4730
4731 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
4732 gcc_assert (reduc_info->is_reduc_info);
4733 return !directly_supported_p (DOT_PROD_EXPR,
4734 STMT_VINFO_REDUC_VECTYPE_IN (reduc_info),
4735 optab_vector_mixed_sign);
4736 }
4737
4738 /* TODO: There is a close dependency between vect_model_*_cost and
4739 vectorizable_* functions. Design better to avoid maintenance issues. */
4740
4741 /* Function vect_model_reduction_cost.
4742
4743 Models cost for a reduction operation, including the vector ops
4744 generated within the strip-mine loop in some cases, the initial
4745 definition before the loop, and the epilogue code that must be generated. */
4746
4747 static void
4748 vect_model_reduction_cost (loop_vec_info loop_vinfo,
4749 stmt_vec_info stmt_info, internal_fn reduc_fn,
4750 vect_reduction_type reduction_type,
4751 int ncopies, stmt_vector_for_cost *cost_vec)
4752 {
4753 int prologue_cost = 0, epilogue_cost = 0, inside_cost = 0;
4754 tree vectype;
4755 machine_mode mode;
4756 class loop *loop = NULL;
4757
4758 if (loop_vinfo)
4759 loop = LOOP_VINFO_LOOP (loop_vinfo);
4760
4761 /* Condition reductions generate two reductions in the loop. */
4762 if (reduction_type == COND_REDUCTION)
4763 ncopies *= 2;
4764
4765 vectype = STMT_VINFO_VECTYPE (stmt_info);
4766 mode = TYPE_MODE (vectype);
4767 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
4768
4769 gimple_match_op op;
4770 if (!gimple_extract_op (orig_stmt_info->stmt, &op))
4771 gcc_unreachable ();
4772
4773 bool emulated_mixed_dot_prod
4774 = vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info);
4775 if (reduction_type == EXTRACT_LAST_REDUCTION)
4776 /* No extra instructions are needed in the prologue. The loop body
4777 operations are costed in vectorizable_condition. */
4778 inside_cost = 0;
4779 else if (reduction_type == FOLD_LEFT_REDUCTION)
4780 {
4781 /* No extra instructions needed in the prologue. */
4782 prologue_cost = 0;
4783
4784 if (reduc_fn != IFN_LAST)
4785 /* Count one reduction-like operation per vector. */
4786 inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
4787 stmt_info, 0, vect_body);
4788 else
4789 {
4790 /* Use NELEMENTS extracts and NELEMENTS scalar ops. */
4791 unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
4792 inside_cost = record_stmt_cost (cost_vec, nelements,
4793 vec_to_scalar, stmt_info, 0,
4794 vect_body);
4795 inside_cost += record_stmt_cost (cost_vec, nelements,
4796 scalar_stmt, stmt_info, 0,
4797 vect_body);
4798 }
4799 }
4800 else
4801 {
4802 /* Add in the cost of the initial definitions. */
4803 int prologue_stmts;
4804 if (reduction_type == COND_REDUCTION)
4805 /* For cond reductions we have four vectors: initial index, step,
4806 initial result of the data reduction, initial value of the index
4807 reduction. */
4808 prologue_stmts = 4;
4809 else if (emulated_mixed_dot_prod)
4810 /* We need the initial reduction value and two invariants:
4811 one that contains the minimum signed value and one that
4812 contains half of its negative. */
4813 prologue_stmts = 3;
4814 else
4815 prologue_stmts = 1;
4816 prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
4817 scalar_to_vec, stmt_info, 0,
4818 vect_prologue);
4819 }
4820
4821 /* Determine cost of epilogue code.
4822
4823 We have a reduction operator that will reduce the vector in one statement.
4824 Also requires scalar extract. */
4825
4826 if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
4827 {
4828 if (reduc_fn != IFN_LAST)
4829 {
4830 if (reduction_type == COND_REDUCTION)
4831 {
4832 /* An EQ stmt and a COND_EXPR stmt. */
4833 epilogue_cost += record_stmt_cost (cost_vec, 2,
4834 vector_stmt, stmt_info, 0,
4835 vect_epilogue);
4836 /* Reduction of the max index and a reduction of the found
4837 values. */
4838 epilogue_cost += record_stmt_cost (cost_vec, 2,
4839 vec_to_scalar, stmt_info, 0,
4840 vect_epilogue);
4841 /* A broadcast of the max value. */
4842 epilogue_cost += record_stmt_cost (cost_vec, 1,
4843 scalar_to_vec, stmt_info, 0,
4844 vect_epilogue);
4845 }
4846 else
4847 {
4848 epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
4849 stmt_info, 0, vect_epilogue);
4850 epilogue_cost += record_stmt_cost (cost_vec, 1,
4851 vec_to_scalar, stmt_info, 0,
4852 vect_epilogue);
4853 }
4854 }
4855 else if (reduction_type == COND_REDUCTION)
4856 {
4857 unsigned estimated_nunits = vect_nunits_for_cost (vectype);
4858 /* Extraction of scalar elements. */
4859 epilogue_cost += record_stmt_cost (cost_vec,
4860 2 * estimated_nunits,
4861 vec_to_scalar, stmt_info, 0,
4862 vect_epilogue);
4863 /* Scalar max reductions via COND_EXPR / MAX_EXPR. */
4864 epilogue_cost += record_stmt_cost (cost_vec,
4865 2 * estimated_nunits - 3,
4866 scalar_stmt, stmt_info, 0,
4867 vect_epilogue);
4868 }
4869 else if (reduction_type == EXTRACT_LAST_REDUCTION
4870 || reduction_type == FOLD_LEFT_REDUCTION)
4871 /* No extra instructions are needed in the epilogue. */
4872 ;
4873 else
4874 {
4875 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4876 tree bitsize = TYPE_SIZE (op.type);
4877 int element_bitsize = tree_to_uhwi (bitsize);
4878 int nelements = vec_size_in_bits / element_bitsize;
4879
4880 if (op.code == COND_EXPR)
4881 op.code = MAX_EXPR;
4882
4883 /* We have a whole vector shift available. */
4884 if (VECTOR_MODE_P (mode)
4885 && directly_supported_p (op.code, vectype)
4886 && have_whole_vector_shift (mode))
4887 {
4888 /* Final reduction via vector shifts and the reduction operator.
4889 Also requires scalar extract. */
4890 epilogue_cost += record_stmt_cost (cost_vec,
4891 exact_log2 (nelements) * 2,
4892 vector_stmt, stmt_info, 0,
4893 vect_epilogue);
4894 epilogue_cost += record_stmt_cost (cost_vec, 1,
4895 vec_to_scalar, stmt_info, 0,
4896 vect_epilogue);
4897 }
4898 else
4899 /* Use extracts and reduction op for final reduction. For N
4900 elements, we have N extracts and N-1 reduction ops. */
4901 epilogue_cost += record_stmt_cost (cost_vec,
4902 nelements + nelements - 1,
4903 vector_stmt, stmt_info, 0,
4904 vect_epilogue);
4905 }
4906 }
4907
4908 if (dump_enabled_p ())
4909 dump_printf (MSG_NOTE,
4910 "vect_model_reduction_cost: inside_cost = %d, "
4911 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
4912 prologue_cost, epilogue_cost);
4913 }
4914
4915 /* SEQ is a sequence of instructions that initialize the reduction
4916 described by REDUC_INFO. Emit them in the appropriate place. */
4917
4918 static void
4919 vect_emit_reduction_init_stmts (loop_vec_info loop_vinfo,
4920 stmt_vec_info reduc_info, gimple *seq)
4921 {
4922 if (reduc_info->reused_accumulator)
4923 {
4924 /* When reusing an accumulator from the main loop, we only need
4925 initialization instructions if the main loop can be skipped.
4926 In that case, emit the initialization instructions at the end
4927 of the guard block that does the skip. */
4928 edge skip_edge = loop_vinfo->skip_main_loop_edge;
4929 gcc_assert (skip_edge);
4930 gimple_stmt_iterator gsi = gsi_last_bb (skip_edge->src);
4931 gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
4932 }
4933 else
4934 {
4935 /* The normal case: emit the initialization instructions on the
4936 preheader edge. */
4937 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4938 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), seq);
4939 }
4940 }
4941
4942 /* Function get_initial_def_for_reduction
4943
4944 Input:
4945 REDUC_INFO - the info_for_reduction
4946 INIT_VAL - the initial value of the reduction variable
4947 NEUTRAL_OP - a value that has no effect on the reduction, as per
4948 neutral_op_for_reduction
4949
4950 Output:
4951 Return a vector variable, initialized according to the reduction that
4952 REDUC_INFO describes. This vector will be used as the initial value
4953 of the vector of partial results.
4954
4955 The value we need is a vector in which element 0 has value INIT_VAL
4956 and every other element has value NEUTRAL_OP. */
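   /* For example (hypothetical values), a PLUS reduction of ints with
      INIT_VAL 10 and a 4-element vector type yields { 10, 0, 0, 0 },
      since 0 is the neutral value for PLUS.  */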
4957
4958 static tree
4959 get_initial_def_for_reduction (loop_vec_info loop_vinfo,
4960 stmt_vec_info reduc_info,
4961 tree init_val, tree neutral_op)
4962 {
4963 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4964 tree scalar_type = TREE_TYPE (init_val);
4965 tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
4966 tree init_def;
4967 gimple_seq stmts = NULL;
4968
4969 gcc_assert (vectype);
4970
4971 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
4972 || SCALAR_FLOAT_TYPE_P (scalar_type));
4973
4974 gcc_assert (nested_in_vect_loop_p (loop, reduc_info)
4975 || loop == (gimple_bb (reduc_info->stmt))->loop_father);
4976
4977 if (operand_equal_p (init_val, neutral_op))
4978 {
4979 /* If both elements are equal then the vector described above is
4980 just a splat. */
4981 neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op);
4982 init_def = gimple_build_vector_from_val (&stmts, vectype, neutral_op);
4983 }
4984 else
4985 {
4986 neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op);
4987 init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
4988 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
4989 {
4990 /* Construct a splat of NEUTRAL_OP and insert INIT_VAL into
4991 element 0. */
4992 init_def = gimple_build_vector_from_val (&stmts, vectype,
4993 neutral_op);
4994 init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
4995 vectype, init_def, init_val);
4996 }
4997 else
4998 {
4999 /* Build {INIT_VAL, NEUTRAL_OP, NEUTRAL_OP, ...}. */
5000 tree_vector_builder elts (vectype, 1, 2);
5001 elts.quick_push (init_val);
5002 elts.quick_push (neutral_op);
5003 init_def = gimple_build_vector (&stmts, &elts);
5004 }
5005 }
5006
5007 if (stmts)
5008 vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, stmts);
5009 return init_def;
5010 }
5011
5012 /* Get at the initial defs for the reduction PHIs for REDUC_INFO,
5013 which performs a reduction involving GROUP_SIZE scalar statements.
5014 NUMBER_OF_VECTORS is the number of vector defs to create. If NEUTRAL_OP
5015 is nonnull, introducing extra elements of that value will not change the
5016 result. */
5017
5018 static void
5019 get_initial_defs_for_reduction (loop_vec_info loop_vinfo,
5020 stmt_vec_info reduc_info,
5021 vec<tree> *vec_oprnds,
5022 unsigned int number_of_vectors,
5023 unsigned int group_size, tree neutral_op)
5024 {
5025 vec<tree> &initial_values = reduc_info->reduc_initial_values;
5026 unsigned HOST_WIDE_INT nunits;
5027 unsigned j, number_of_places_left_in_vector;
5028 tree vector_type = STMT_VINFO_VECTYPE (reduc_info);
5029 unsigned int i;
5030
5031 gcc_assert (group_size == initial_values.length () || neutral_op);
5032
5033 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
5034 created vectors. It is greater than 1 if unrolling is performed.
5035
5036 For example, we have two scalar operands, s1 and s2 (e.g., group of
5037 strided accesses of size two), while NUNITS is four (i.e., four scalars
5038 of this type can be packed in a vector). The output vector will contain
5039 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
5040 will be 2).
5041
5042 If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
5043 vectors containing the operands.
5044
5045 For example, NUNITS is four as before, and the group size is 8
5046 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
5047 {s5, s6, s7, s8}. */
5048
5049 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
5050 nunits = group_size;
5051
5052 number_of_places_left_in_vector = nunits;
5053 bool constant_p = true;
5054 tree_vector_builder elts (vector_type, nunits, 1);
5055 elts.quick_grow (nunits);
5056 gimple_seq ctor_seq = NULL;
5057 for (j = 0; j < nunits * number_of_vectors; ++j)
5058 {
5059 tree op;
5060 i = j % group_size;
5061
5062 /* Get the def before the loop. In a reduction chain we have only
5063 one initial value; otherwise we have as many as there are PHIs in the group. */
5064 if (i >= initial_values.length () || (j > i && neutral_op))
5065 op = neutral_op;
5066 else
5067 op = initial_values[i];
5068
5069 /* Create 'vect_ = {op0,op1,...,opn}'. */
5070 number_of_places_left_in_vector--;
5071 elts[nunits - number_of_places_left_in_vector - 1] = op;
5072 if (!CONSTANT_CLASS_P (op))
5073 constant_p = false;
5074
5075 if (number_of_places_left_in_vector == 0)
5076 {
5077 tree init;
5078 if (constant_p && !neutral_op
5079 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
5080 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
5081 /* Build the vector directly from ELTS. */
5082 init = gimple_build_vector (&ctor_seq, &elts);
5083 else if (neutral_op)
5084 {
5085 /* Build a vector of the neutral value and shift the
5086 other elements into place. */
5087 init = gimple_build_vector_from_val (&ctor_seq, vector_type,
5088 neutral_op);
5089 int k = nunits;
5090 while (k > 0 && elts[k - 1] == neutral_op)
5091 k -= 1;
5092 while (k > 0)
5093 {
5094 k -= 1;
5095 init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
5096 vector_type, init, elts[k]);
5097 }
5098 }
5099 else
5100 {
5101 /* First time round, duplicate ELTS to fill the
5102 required number of vectors. */
5103 duplicate_and_interleave (loop_vinfo, &ctor_seq, vector_type,
5104 elts, number_of_vectors, *vec_oprnds);
5105 break;
5106 }
5107 vec_oprnds->quick_push (init);
5108
5109 number_of_places_left_in_vector = nunits;
5110 elts.new_vector (vector_type, nunits, 1);
5111 elts.quick_grow (nunits);
5112 constant_p = true;
5113 }
5114 }
5115 if (ctor_seq != NULL)
5116 vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, ctor_seq);
5117 }
5118
5119 /* For a statement STMT_INFO taking part in a reduction operation return
5120 the stmt_vec_info the meta information is stored on. */
5121
5122 stmt_vec_info
5123 info_for_reduction (vec_info *vinfo, stmt_vec_info stmt_info)
5124 {
5125 stmt_info = vect_orig_stmt (stmt_info);
5126 gcc_assert (STMT_VINFO_REDUC_DEF (stmt_info));
5127 if (!is_a <gphi *> (stmt_info->stmt)
5128 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
5129 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
5130 gphi *phi = as_a <gphi *> (stmt_info->stmt);
5131 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
5132 {
5133 if (gimple_phi_num_args (phi) == 1)
5134 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
5135 }
5136 else if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
5137 {
5138 stmt_vec_info info = vinfo->lookup_def (vect_phi_initial_value (phi));
5139 if (info && STMT_VINFO_DEF_TYPE (info) == vect_double_reduction_def)
5140 stmt_info = info;
5141 }
5142 return stmt_info;
5143 }
5144
5145 /* See if LOOP_VINFO is an epilogue loop whose main loop had a reduction that
5146 REDUC_INFO can build on. Adjust REDUC_INFO and return true if so, otherwise
5147 return false. */
5148
5149 static bool
5150 vect_find_reusable_accumulator (loop_vec_info loop_vinfo,
5151 stmt_vec_info reduc_info)
5152 {
5153 loop_vec_info main_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
5154 if (!main_loop_vinfo)
5155 return false;
5156
5157 if (STMT_VINFO_REDUC_TYPE (reduc_info) != TREE_CODE_REDUCTION)
5158 return false;
5159
5160 unsigned int num_phis = reduc_info->reduc_initial_values.length ();
5161 auto_vec<tree, 16> main_loop_results (num_phis);
5162 auto_vec<tree, 16> initial_values (num_phis);
5163 if (edge main_loop_edge = loop_vinfo->main_loop_edge)
5164 {
5165 /* The epilogue loop can be entered either from the main loop or
5166 from an earlier guard block. */
5167 edge skip_edge = loop_vinfo->skip_main_loop_edge;
5168 for (tree incoming_value : reduc_info->reduc_initial_values)
5169 {
5170 /* Look for:
5171
5172 INCOMING_VALUE = phi<MAIN_LOOP_RESULT(main loop),
5173 INITIAL_VALUE(guard block)>. */
5174 gcc_assert (TREE_CODE (incoming_value) == SSA_NAME);
5175
5176 gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (incoming_value));
5177 gcc_assert (gimple_bb (phi) == main_loop_edge->dest);
5178
5179 tree from_main_loop = PHI_ARG_DEF_FROM_EDGE (phi, main_loop_edge);
5180 tree from_skip = PHI_ARG_DEF_FROM_EDGE (phi, skip_edge);
5181
5182 main_loop_results.quick_push (from_main_loop);
5183 initial_values.quick_push (from_skip);
5184 }
5185 }
5186 else
5187 /* The main loop dominates the epilogue loop. */
5188 main_loop_results.splice (reduc_info->reduc_initial_values);
5189
5190 /* See if the main loop has the kind of accumulator we need. */
5191 vect_reusable_accumulator *accumulator
5192 = main_loop_vinfo->reusable_accumulators.get (main_loop_results[0]);
5193 if (!accumulator
5194 || num_phis != accumulator->reduc_info->reduc_scalar_results.length ()
5195 || !std::equal (main_loop_results.begin (), main_loop_results.end (),
5196 accumulator->reduc_info->reduc_scalar_results.begin ()))
5197 return false;
5198
5199 /* Handle the case where we can reduce wider vectors to narrower ones. */
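   /* For example (a hypothetical case), if the main loop accumulated in a
      V8SI vector and this epilogue loop uses V4SI, M is 2 and a single
      halving step suffices; the loop below verifies that a V4SI reduction
      operation and a V8SI -> V4SI vec_extract are both available.  */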
5200 tree vectype = STMT_VINFO_VECTYPE (reduc_info);
5201 tree old_vectype = TREE_TYPE (accumulator->reduc_input);
5202 unsigned HOST_WIDE_INT m;
5203 if (!constant_multiple_p (TYPE_VECTOR_SUBPARTS (old_vectype),
5204 TYPE_VECTOR_SUBPARTS (vectype), &m))
5205 return false;
5206 /* Check the intermediate vector types and operations are available. */
5207 tree prev_vectype = old_vectype;
5208 poly_uint64 intermediate_nunits = TYPE_VECTOR_SUBPARTS (old_vectype);
5209 while (known_gt (intermediate_nunits, TYPE_VECTOR_SUBPARTS (vectype)))
5210 {
5211 intermediate_nunits = exact_div (intermediate_nunits, 2);
5212 tree intermediate_vectype = get_related_vectype_for_scalar_type
5213 (TYPE_MODE (vectype), TREE_TYPE (vectype), intermediate_nunits);
5214 if (!intermediate_vectype
5215 || !directly_supported_p (STMT_VINFO_REDUC_CODE (reduc_info),
5216 intermediate_vectype)
5217 || !can_vec_extract (TYPE_MODE (prev_vectype),
5218 TYPE_MODE (intermediate_vectype)))
5219 return false;
5220 prev_vectype = intermediate_vectype;
5221 }
5222
5223 /* Non-SLP reductions might apply an adjustment after the reduction
5224 operation, in order to simplify the initialization of the accumulator.
5225 If the epilogue loop carries on from where the main loop left off,
5226 it should apply the same adjustment to the final reduction result.
5227
5228 If the epilogue loop can also be entered directly (rather than via
5229 the main loop), we need to be able to handle that case in the same way,
5230 with the same adjustment. (In principle we could add a PHI node
5231 to select the correct adjustment, but in practice that shouldn't be
5232 necessary.) */
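   /* For instance (values hypothetical), a sum reduction with scalar
      initial value 10 may have used a zero-initialized accumulator in the
      main loop together with an epilogue adjustment of 10; below we keep
      that adjustment and, for the direct-entry path, replace the recorded
      initial value with the neutral value 0.  */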
5233 tree main_adjustment
5234 = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (accumulator->reduc_info);
5235 if (loop_vinfo->main_loop_edge && main_adjustment)
5236 {
5237 gcc_assert (num_phis == 1);
5238 tree initial_value = initial_values[0];
5239 /* Check that we can use INITIAL_VALUE as the adjustment and
5240 initialize the accumulator with a neutral value instead. */
5241 if (!operand_equal_p (initial_value, main_adjustment))
5242 return false;
5243 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
5244 initial_values[0] = neutral_op_for_reduction (TREE_TYPE (initial_value),
5245 code, initial_value);
5246 }
5247 STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info) = main_adjustment;
5248 reduc_info->reduc_initial_values.truncate (0);
5249 reduc_info->reduc_initial_values.splice (initial_values);
5250 reduc_info->reused_accumulator = accumulator;
5251 return true;
5252 }
5253
5254 /* Reduce the vector VEC_DEF down to VECTYPE with reduction operation
5255 CODE, appending any new statements to SEQ. Returns a vector def of VECTYPE. */
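   /* For example (hypothetical types), reducing a V8SI VEC_DEF to a V4SI
      VECTYPE with PLUS extracts the low and high V4SI halves (directly,
      or via an integer-mode punning fallback) and adds them, repeating
      until the requested number of elements is reached.  */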
5256
5257 static tree
5258 vect_create_partial_epilog (tree vec_def, tree vectype, code_helper code,
5259 gimple_seq *seq)
5260 {
5261 unsigned nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (vec_def)).to_constant ();
5262 unsigned nunits1 = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5263 tree stype = TREE_TYPE (vectype);
5264 tree new_temp = vec_def;
5265 while (nunits > nunits1)
5266 {
5267 nunits /= 2;
5268 tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5269 stype, nunits);
5270 unsigned int bitsize = tree_to_uhwi (TYPE_SIZE (vectype1));
5271
5272 /* The target has to make sure we support lowpart/highpart
5273 extraction, either via direct vector extract or through
5274 an integer mode punning. */
5275 tree dst1, dst2;
5276 gimple *epilog_stmt;
5277 if (convert_optab_handler (vec_extract_optab,
5278 TYPE_MODE (TREE_TYPE (new_temp)),
5279 TYPE_MODE (vectype1))
5280 != CODE_FOR_nothing)
5281 {
5282 /* Extract sub-vectors directly once vec_extract becomes
5283 a conversion optab. */
5284 dst1 = make_ssa_name (vectype1);
5285 epilog_stmt
5286 = gimple_build_assign (dst1, BIT_FIELD_REF,
5287 build3 (BIT_FIELD_REF, vectype1,
5288 new_temp, TYPE_SIZE (vectype1),
5289 bitsize_int (0)));
5290 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5291 dst2 = make_ssa_name (vectype1);
5292 epilog_stmt
5293 = gimple_build_assign (dst2, BIT_FIELD_REF,
5294 build3 (BIT_FIELD_REF, vectype1,
5295 new_temp, TYPE_SIZE (vectype1),
5296 bitsize_int (bitsize)));
5297 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5298 }
5299 else
5300 {
5301 /* Extract via punning to appropriately sized integer mode
5302 vector. */
5303 tree eltype = build_nonstandard_integer_type (bitsize, 1);
5304 tree etype = build_vector_type (eltype, 2);
5305 gcc_assert (convert_optab_handler (vec_extract_optab,
5306 TYPE_MODE (etype),
5307 TYPE_MODE (eltype))
5308 != CODE_FOR_nothing);
5309 tree tem = make_ssa_name (etype);
5310 epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5311 build1 (VIEW_CONVERT_EXPR,
5312 etype, new_temp));
5313 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5314 new_temp = tem;
5315 tem = make_ssa_name (eltype);
5316 epilog_stmt
5317 = gimple_build_assign (tem, BIT_FIELD_REF,
5318 build3 (BIT_FIELD_REF, eltype,
5319 new_temp, TYPE_SIZE (eltype),
5320 bitsize_int (0)));
5321 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5322 dst1 = make_ssa_name (vectype1);
5323 epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5324 build1 (VIEW_CONVERT_EXPR,
5325 vectype1, tem));
5326 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5327 tem = make_ssa_name (eltype);
5328 epilog_stmt
5329 = gimple_build_assign (tem, BIT_FIELD_REF,
5330 build3 (BIT_FIELD_REF, eltype,
5331 new_temp, TYPE_SIZE (eltype),
5332 bitsize_int (bitsize)));
5333 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5334 dst2 = make_ssa_name (vectype1);
5335 epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5336 build1 (VIEW_CONVERT_EXPR,
5337 vectype1, tem));
5338 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5339 }
5340
5341 new_temp = gimple_build (seq, code, vectype1, dst1, dst2);
5342 }
5343
5344 return new_temp;
5345 }
5346
5347 /* Function vect_create_epilog_for_reduction
5348
5349 Create code at the loop-epilog to finalize the result of a reduction
5350 computation.
5351
5352 STMT_INFO is the scalar reduction stmt that is being vectorized.
5353 SLP_NODE is an SLP node containing a group of reduction statements. The
5354 first one in this group is STMT_INFO.
5355 SLP_NODE_INSTANCE is the SLP node instance containing SLP_NODE
5356 REDUC_INDEX says which rhs operand of the STMT_INFO is the reduction phi
5357 (counting from 0)
5358
5359 This function:
5360 1. Completes the reduction def-use cycles.
5361 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
5362 by calling the function specified by REDUC_FN if available, or by
5363 other means (whole-vector shifts or a scalar loop).
5364 The function also creates a new phi node at the loop exit to preserve
5365 loop-closed form, as illustrated below.
5366
5367 The flow at the entry to this function:
5368
5369 loop:
5370 vec_def = phi <vec_init, null> # REDUCTION_PHI
5371 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
5372 s_loop = scalar_stmt # (scalar) STMT_INFO
5373 loop_exit:
5374 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5375 use <s_out0>
5376 use <s_out0>
5377
5378 The above is transformed by this function into:
5379
5380 loop:
5381 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
5382 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
5383 s_loop = scalar_stmt # (scalar) STMT_INFO
5384 loop_exit:
5385 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5386 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5387 v_out2 = reduce <v_out1>
5388 s_out3 = extract_field <v_out2, 0>
5389 s_out4 = adjust_result <s_out3>
5390 use <s_out4>
5391 use <s_out4>
5392 */
5393
5394 static void
5395 vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
5396 stmt_vec_info stmt_info,
5397 slp_tree slp_node,
5398 slp_instance slp_node_instance)
5399 {
5400 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
5401 gcc_assert (reduc_info->is_reduc_info);
5402 /* For double reductions we need to get at the inner loop reduction
5403 stmt which has the meta info attached. Our stmt_info is that of the
5404 loop-closed PHI of the inner loop which we remember as
5405 def for the reduction PHI generation. */
5406 bool double_reduc = false;
5407 stmt_vec_info rdef_info = stmt_info;
5408 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
5409 {
5410 gcc_assert (!slp_node);
5411 double_reduc = true;
5412 stmt_info = loop_vinfo->lookup_def (gimple_phi_arg_def
5413 (stmt_info->stmt, 0));
5414 stmt_info = vect_stmt_to_vectorize (stmt_info);
5415 }
5416 gphi *reduc_def_stmt
5417 = as_a <gphi *> (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))->stmt);
5418 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
5419 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
5420 tree vectype;
5421 machine_mode mode;
5422 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
5423 basic_block exit_bb;
5424 tree scalar_dest;
5425 tree scalar_type;
5426 gimple *new_phi = NULL, *phi;
5427 gimple_stmt_iterator exit_gsi;
5428 tree new_temp = NULL_TREE, new_name, new_scalar_dest;
5429 gimple *epilog_stmt = NULL;
5430 gimple *exit_phi;
5431 tree bitsize;
5432 tree def;
5433 tree orig_name, scalar_result;
5434 imm_use_iterator imm_iter, phi_imm_iter;
5435 use_operand_p use_p, phi_use_p;
5436 gimple *use_stmt;
5437 auto_vec<tree> reduc_inputs;
5438 int j, i;
5439 vec<tree> &scalar_results = reduc_info->reduc_scalar_results;
5440 unsigned int group_size = 1, k;
5441 auto_vec<gimple *> phis;
5442 /* SLP reduction without reduction chain, e.g.,
5443 # a1 = phi <a2, a0>
5444 # b1 = phi <b2, b0>
5445 a2 = operation (a1)
5446 b2 = operation (b1) */
5447 bool slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info));
5448 bool direct_slp_reduc;
5449 tree induction_index = NULL_TREE;
5450
5451 if (slp_node)
5452 group_size = SLP_TREE_LANES (slp_node);
5453
5454 if (nested_in_vect_loop_p (loop, stmt_info))
5455 {
5456 outer_loop = loop;
5457 loop = loop->inner;
5458 gcc_assert (!slp_node && double_reduc);
5459 }
5460
5461 vectype = STMT_VINFO_REDUC_VECTYPE (reduc_info);
5462 gcc_assert (vectype);
5463 mode = TYPE_MODE (vectype);
5464
5465 tree induc_val = NULL_TREE;
5466 tree adjustment_def = NULL;
5467 if (slp_node)
5468 ;
5469 else
5470 {
5471 /* Optimize: for induction condition reduction, if we can't use zero
5472 for induc_val, use initial_def. */
5473 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5474 induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
5475 else if (double_reduc)
5476 ;
5477 else
5478 adjustment_def = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info);
5479 }
5480
5481 stmt_vec_info single_live_out_stmt[] = { stmt_info };
5482 array_slice<const stmt_vec_info> live_out_stmts = single_live_out_stmt;
5483 if (slp_reduc)
5484 /* All statements produce live-out values. */
5485 live_out_stmts = SLP_TREE_SCALAR_STMTS (slp_node);
5486 else if (slp_node)
5487 {
5488 /* The last statement in the reduction chain produces the live-out
5489 value. Note SLP optimization can shuffle scalar stmts to
5490 optimize permutations so we have to search for the last stmt. */
5491 for (k = 0; k < group_size; ++k)
5492 if (!REDUC_GROUP_NEXT_ELEMENT (SLP_TREE_SCALAR_STMTS (slp_node)[k]))
5493 {
5494 single_live_out_stmt[0] = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5495 break;
5496 }
5497 }
5498
5499 unsigned vec_num;
5500 int ncopies;
5501 if (slp_node)
5502 {
5503 vec_num = SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis).length ();
5504 ncopies = 1;
5505 }
5506 else
5507 {
5508 stmt_vec_info reduc_info = loop_vinfo->lookup_stmt (reduc_def_stmt);
5509 vec_num = 1;
5510 ncopies = STMT_VINFO_VEC_STMTS (reduc_info).length ();
5511 }
5512
5513 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
5514 which is updated with the current index of the loop for every match of
5515 the original loop's cond_expr (VEC_STMT). This results in a vector
5516 containing the last time the condition passed for that vector lane.
5517 The first match will be a 1 to allow 0 to be used for non-matching
5518 indexes. If there are no matches at all then the vector will be all
5519 zeroes.
5520
5521 PR92772: This algorithm is broken for architectures that support
5522 masked vectors, but do not provide fold_extract_last. */
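  /* For example (hypothetical values), with four lanes and two vector
     iterations the index induction variable takes the values {1,2,3,4}
     and then {5,6,7,8}; if lane 2 matched only in the first iteration
     and lane 0 only in the second, the final index vector is
     {5, 0, 3, 0}.  */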
5523 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
5524 {
5525 auto_vec<std::pair<tree, bool>, 2> ccompares;
5526 stmt_vec_info cond_info = STMT_VINFO_REDUC_DEF (reduc_info);
5527 cond_info = vect_stmt_to_vectorize (cond_info);
5528 while (cond_info != reduc_info)
5529 {
5530 if (gimple_assign_rhs_code (cond_info->stmt) == COND_EXPR)
5531 {
5532 gimple *vec_stmt = STMT_VINFO_VEC_STMTS (cond_info)[0];
5533 gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
5534 ccompares.safe_push
5535 (std::make_pair (unshare_expr (gimple_assign_rhs1 (vec_stmt)),
5536 STMT_VINFO_REDUC_IDX (cond_info) == 2));
5537 }
5538 cond_info
5539 = loop_vinfo->lookup_def (gimple_op (cond_info->stmt,
5540 1 + STMT_VINFO_REDUC_IDX
5541 (cond_info)));
5542 cond_info = vect_stmt_to_vectorize (cond_info);
5543 }
5544 gcc_assert (ccompares.length () != 0);
5545
5546 tree indx_before_incr, indx_after_incr;
5547 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
5548 int scalar_precision
5549 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
5550 tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
5551 tree cr_index_vector_type = get_related_vectype_for_scalar_type
5552 (TYPE_MODE (vectype), cr_index_scalar_type,
5553 TYPE_VECTOR_SUBPARTS (vectype));
5554
5555 /* First we create a simple vector induction variable which starts
5556 with the values {1,2,3,...} (SERIES_VECT) and increments by the
5557 vector size (STEP). */
5558
5559 /* Create a {1,2,3,...} vector. */
5560 tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
5561
5562 /* Create a vector of the step value. */
5563 tree step = build_int_cst (cr_index_scalar_type, nunits_out);
5564 tree vec_step = build_vector_from_val (cr_index_vector_type, step);
5565
5566 /* Create an induction variable. */
5567 gimple_stmt_iterator incr_gsi;
5568 bool insert_after;
5569 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
5570 create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
5571 insert_after, &indx_before_incr, &indx_after_incr);
5572
5573 /* Next create a new phi node vector (NEW_PHI_TREE) which starts
5574 filled with zeros (VEC_ZERO). */
5575
5576 /* Create a vector of 0s. */
5577 tree zero = build_zero_cst (cr_index_scalar_type);
5578 tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
5579
5580 /* Create a vector phi node. */
5581 tree new_phi_tree = make_ssa_name (cr_index_vector_type);
5582 new_phi = create_phi_node (new_phi_tree, loop->header);
5583 add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
5584 loop_preheader_edge (loop), UNKNOWN_LOCATION);
5585
5586 /* Now take the condition from the loop's original cond_exprs
5587 and produce a new cond_exprs (INDEX_COND_EXPR) which for
5588 every match uses values from the induction variable
5589 (INDEX_BEFORE_INCR) otherwise uses values from the phi node
5590 (NEW_PHI_TREE).
5591 Finally, we update the phi (NEW_PHI_TREE) to take the value of
5592 the new cond_expr (INDEX_COND_EXPR). */
5593 gimple_seq stmts = NULL;
5594 for (int i = ccompares.length () - 1; i != -1; --i)
5595 {
5596 tree ccompare = ccompares[i].first;
5597 if (ccompares[i].second)
5598 new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
5599 cr_index_vector_type,
5600 ccompare,
5601 indx_before_incr, new_phi_tree);
5602 else
5603 new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
5604 cr_index_vector_type,
5605 ccompare,
5606 new_phi_tree, indx_before_incr);
5607 }
5608 gsi_insert_seq_before (&incr_gsi, stmts, GSI_SAME_STMT);
5609
5610 /* Update the phi with the vec cond. */
5611 induction_index = new_phi_tree;
5612 add_phi_arg (as_a <gphi *> (new_phi), induction_index,
5613 loop_latch_edge (loop), UNKNOWN_LOCATION);
5614 }
5615
5616 /* 2. Create epilog code.
5617 The reduction epilog code operates across the elements of the vector
5618 of partial results computed by the vectorized loop.
5619 The reduction epilog code consists of:
5620
5621 step 1: compute the scalar result in a vector (v_out2)
5622 step 2: extract the scalar result (s_out3) from the vector (v_out2)
5623 step 3: adjust the scalar result (s_out3) if needed.
5624
5625 Step 1 can be accomplished using one of the following three schemes:
5626 (scheme 1) using reduc_fn, if available.
5627 (scheme 2) using whole-vector shifts, if available.
5628 (scheme 3) using a scalar loop. In this case steps 1+2 above are
5629 combined.
5630
5631 The overall epilog code looks like this:
5632
5633 s_out0 = phi <s_loop> # original EXIT_PHI
5634 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5635 v_out2 = reduce <v_out1> # step 1
5636 s_out3 = extract_field <v_out2, 0> # step 2
5637 s_out4 = adjust_result <s_out3> # step 3
5638
5639 (step 3 is optional, and steps 1 and 2 may be combined).
5640 Lastly, the uses of s_out0 are replaced by s_out4. */
5641
5642
5643 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
5644 v_out1 = phi <VECT_DEF>
5645 Store them in NEW_PHIS. */
5646 if (double_reduc)
5647 loop = outer_loop;
5648 exit_bb = single_exit (loop)->dest;
5649 exit_gsi = gsi_after_labels (exit_bb);
5650 reduc_inputs.create (slp_node ? vec_num : ncopies);
5651 for (unsigned i = 0; i < vec_num; i++)
5652 {
5653 gimple_seq stmts = NULL;
5654 if (slp_node)
5655 def = vect_get_slp_vect_def (slp_node, i);
5656 else
5657 def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[0]);
5658 for (j = 0; j < ncopies; j++)
5659 {
5660 tree new_def = copy_ssa_name (def);
5661 phi = create_phi_node (new_def, exit_bb);
5662 if (j)
5663 def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[j]);
5664 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
5665 new_def = gimple_convert (&stmts, vectype, new_def);
5666 reduc_inputs.quick_push (new_def);
5667 }
5668 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5669 }
5670
5671 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
5672 (i.e. when reduc_fn is not available) and in the final adjustment
5673 code (if needed). Also get the original scalar reduction variable as
5674 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
5675 represents a reduction pattern), the tree-code and scalar-def are
5676 taken from the original stmt that the pattern-stmt (STMT) replaces.
5677 Otherwise (it is a regular reduction) - the tree-code and scalar-def
5678 are taken from STMT. */
5679
5680 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
5681 if (orig_stmt_info != stmt_info)
5682 {
5683 /* Reduction pattern */
5684 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
5685 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
5686 }
5687
5688 scalar_dest = gimple_get_lhs (orig_stmt_info->stmt);
5689 scalar_type = TREE_TYPE (scalar_dest);
5690 scalar_results.truncate (0);
5691 scalar_results.reserve_exact (group_size);
5692 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
5693 bitsize = TYPE_SIZE (scalar_type);
5694
5695 /* True if we should implement SLP_REDUC using native reduction operations
5696 instead of scalar operations. */
5697 direct_slp_reduc = (reduc_fn != IFN_LAST
5698 && slp_reduc
5699 && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
5700
5701 /* In case of reduction chain, e.g.,
5702 # a1 = phi <a3, a0>
5703 a2 = operation (a1)
5704 a3 = operation (a2),
5705
5706 we may end up with more than one vector result. Here we reduce them
5707 to one vector.
5708
5709 The same is true for a SLP reduction, e.g.,
5710 # a1 = phi <a2, a0>
5711 # b1 = phi <b2, b0>
5712 a2 = operation (a1)
5713 b2 = operation (b1),
5714
5715 where we can end up with more than one vector as well. We can
5716 easily accumulate vectors when the number of vector elements is
5717 a multiple of the SLP group size.
5718
5719 The same is true if we couldn't use a single def-use cycle. */
5720 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info)
5721 || direct_slp_reduc
5722 || (slp_reduc
5723 && constant_multiple_p (TYPE_VECTOR_SUBPARTS (vectype), group_size))
5724 || ncopies > 1)
5725 {
5726 gimple_seq stmts = NULL;
5727 tree single_input = reduc_inputs[0];
5728 for (k = 1; k < reduc_inputs.length (); k++)
5729 single_input = gimple_build (&stmts, code, vectype,
5730 single_input, reduc_inputs[k]);
5731 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5732
5733 reduc_inputs.truncate (0);
5734 reduc_inputs.safe_push (single_input);
5735 }
5736
5737 tree orig_reduc_input = reduc_inputs[0];
5738
5739 /* If this loop is an epilogue loop that can be skipped after the
5740 main loop, we can only share a reduction operation between the
5741 main loop and the epilogue if we put it at the target of the
5742 skip edge.
5743
5744 We can still reuse accumulators if this check fails. Doing so has
5745 the minor(?) benefit of making the epilogue loop's scalar result
5746 independent of the main loop's scalar result. */
5747 bool unify_with_main_loop_p = false;
5748 if (reduc_info->reused_accumulator
5749 && loop_vinfo->skip_this_loop_edge
5750 && single_succ_p (exit_bb)
5751 && single_succ (exit_bb) == loop_vinfo->skip_this_loop_edge->dest)
5752 {
5753 unify_with_main_loop_p = true;
5754
5755 basic_block reduc_block = loop_vinfo->skip_this_loop_edge->dest;
5756 reduc_inputs[0] = make_ssa_name (vectype);
5757 gphi *new_phi = create_phi_node (reduc_inputs[0], reduc_block);
5758 add_phi_arg (new_phi, orig_reduc_input, single_succ_edge (exit_bb),
5759 UNKNOWN_LOCATION);
5760 add_phi_arg (new_phi, reduc_info->reused_accumulator->reduc_input,
5761 loop_vinfo->skip_this_loop_edge, UNKNOWN_LOCATION);
5762 exit_gsi = gsi_after_labels (reduc_block);
5763 }
5764
5765 /* Shouldn't be used beyond this point. */
5766 exit_bb = nullptr;
5767
5768 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
5769 && reduc_fn != IFN_LAST)
5770 {
5771 /* For condition reductions, we have a vector (REDUC_INPUTS 0) containing
5772 various data values where the condition matched and another vector
5773 (INDUCTION_INDEX) containing all the indexes of those matches. We
5774 need to extract the last matching index (which will be the index with
5775 highest value) and use this to index into the data vector.
5776 For the case where there were no matches, the data vector will contain
5777 all default values and the index vector will be all zeros. */
5778
5779 /* Get various versions of the type of the vector of indexes. */
5780 tree index_vec_type = TREE_TYPE (induction_index);
5781 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
5782 tree index_scalar_type = TREE_TYPE (index_vec_type);
5783 tree index_vec_cmp_type = truth_type_for (index_vec_type);
5784
5785 /* Get an unsigned integer version of the type of the data vector. */
5786 int scalar_precision
5787 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
5788 tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
5789 tree vectype_unsigned = get_same_sized_vectype (scalar_type_unsigned,
5790 vectype);
5791
5792 /* First we need to create a vector (ZERO_VEC) of zeros and another
5793 vector (MAX_INDEX_VEC) filled with the last matching index, which we
5794 can create using a MAX reduction and then expanding.
5795 In the case where the loop never made any matches, the max index will
5796 be zero. */
5797
5798 /* Vector of {0, 0, 0,...}. */
5799 tree zero_vec = build_zero_cst (vectype);
5800
5801 /* Find maximum value from the vector of found indexes. */
5802 tree max_index = make_ssa_name (index_scalar_type);
5803 gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5804 1, induction_index);
5805 gimple_call_set_lhs (max_index_stmt, max_index);
5806 gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
5807
5808 /* Vector of {max_index, max_index, max_index,...}. */
5809 tree max_index_vec = make_ssa_name (index_vec_type);
5810 tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
5811 max_index);
5812 gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
5813 max_index_vec_rhs);
5814 gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
5815
5816 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
5817 with the vector (INDUCTION_INDEX) of found indexes, choosing values
5818 from the data vector (REDUC_INPUTS 0) for matches, 0 (ZERO_VEC)
5819 otherwise. Only one value should match, resulting in a vector
5820 (VEC_COND) with one data value and the rest zeros.
5821 In the case where the loop never made any matches, every index will
5822 match, resulting in a vector with all data values (which will all be
5823 the default value). */
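      /* For illustration (hypothetical values): if INDUCTION_INDEX is
         {5, 0, 3, 0} and REDUC_INPUTS[0] is {a, b, c, d}, then MAX_INDEX
         is 5, the comparison selects lane 0 only, VEC_COND becomes
         {a, 0, 0, 0} and the unsigned MAX reduction below yields a.  */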
5824
5825 /* Compare the max index vector to the vector of found indexes to find
5826 the position of the max value. */
5827 tree vec_compare = make_ssa_name (index_vec_cmp_type);
5828 gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
5829 induction_index,
5830 max_index_vec);
5831 gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
5832
5833 /* Use the compare to choose either values from the data vector or
5834 zero. */
5835 tree vec_cond = make_ssa_name (vectype);
5836 gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
5837 vec_compare,
5838 reduc_inputs[0],
5839 zero_vec);
5840 gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
5841
5842 /* Finally we need to extract the data value from the vector (VEC_COND)
5843 into a scalar (MATCHED_DATA_REDUC). Logically we want to do an OR
5844 reduction, but because this doesn't exist, we can use a MAX reduction
5845 instead. The data value might be signed or a float so we need to cast
5846 it first.
5847 In the case where the loop never made any matches, the data values are
5848 all identical, and so will reduce down correctly. */
5849
5850 /* Make the matched data values unsigned. */
5851 tree vec_cond_cast = make_ssa_name (vectype_unsigned);
5852 tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
5853 vec_cond);
5854 gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
5855 VIEW_CONVERT_EXPR,
5856 vec_cond_cast_rhs);
5857 gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
5858
5859 /* Reduce down to a scalar value. */
5860 tree data_reduc = make_ssa_name (scalar_type_unsigned);
5861 gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5862 1, vec_cond_cast);
5863 gimple_call_set_lhs (data_reduc_stmt, data_reduc);
5864 gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
5865
5866 /* Convert the reduced value back to the result type and set as the
5867 result. */
5868 gimple_seq stmts = NULL;
5869 new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
5870 data_reduc);
5871 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5872 scalar_results.safe_push (new_temp);
5873 }
5874 else if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
5875 && reduc_fn == IFN_LAST)
5876 {
5877 /* Condition reduction without supported IFN_REDUC_MAX. Generate
5878 idx = 0;
5879 idx_val = induction_index[0];
5880 val = data_reduc[0];
5881 for (idx = 0, val = init, i = 0; i < nelts; ++i)
5882 if (induction_index[i] > idx_val)
5883 val = data_reduc[i], idx_val = induction_index[i];
5884 return val; */
5885
5886 tree data_eltype = TREE_TYPE (vectype);
5887 tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
5888 unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
5889 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
5890 /* Enforced by vectorizable_reduction, which ensures we have target
5891 support before allowing a conditional reduction on variable-length
5892 vectors. */
5893 unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
5894 tree idx_val = NULL_TREE, val = NULL_TREE;
5895 for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
5896 {
5897 tree old_idx_val = idx_val;
5898 tree old_val = val;
5899 idx_val = make_ssa_name (idx_eltype);
5900 epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
5901 build3 (BIT_FIELD_REF, idx_eltype,
5902 induction_index,
5903 bitsize_int (el_size),
5904 bitsize_int (off)));
5905 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5906 val = make_ssa_name (data_eltype);
5907 epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
5908 build3 (BIT_FIELD_REF,
5909 data_eltype,
5910 reduc_inputs[0],
5911 bitsize_int (el_size),
5912 bitsize_int (off)));
5913 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5914 if (off != 0)
5915 {
5916 tree new_idx_val = idx_val;
5917 if (off != v_size - el_size)
5918 {
5919 new_idx_val = make_ssa_name (idx_eltype);
5920 epilog_stmt = gimple_build_assign (new_idx_val,
5921 MAX_EXPR, idx_val,
5922 old_idx_val);
5923 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5924 }
5925 tree cond = make_ssa_name (boolean_type_node);
5926 epilog_stmt = gimple_build_assign (cond, GT_EXPR,
5927 idx_val, old_idx_val);
5928 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5929 tree new_val = make_ssa_name (data_eltype);
5930 epilog_stmt = gimple_build_assign (new_val, COND_EXPR,
5931 cond, val, old_val);
5932 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5933 idx_val = new_idx_val;
5934 val = new_val;
5935 }
5936 }
5937 /* Convert the reduced value back to the result type and set as the
5938 result. */
5939 gimple_seq stmts = NULL;
5940 val = gimple_convert (&stmts, scalar_type, val);
5941 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5942 scalar_results.safe_push (val);
5943 }
5944
5945 /* 2.3 Create the reduction code, using one of the three schemes described
5946 above. In SLP we simply need to extract all the elements from the
5947 vector (without reducing them), so we use scalar shifts. */
5948 else if (reduc_fn != IFN_LAST && !slp_reduc)
5949 {
5950 tree tmp;
5951 tree vec_elem_type;
5952
5953 /* Case 1: Create:
5954 v_out2 = reduc_expr <v_out1> */
5955
5956 if (dump_enabled_p ())
5957 dump_printf_loc (MSG_NOTE, vect_location,
5958 "Reduce using direct vector reduction.\n");
5959
5960 gimple_seq stmts = NULL;
5961 vec_elem_type = TREE_TYPE (vectype);
5962 new_temp = gimple_build (&stmts, as_combined_fn (reduc_fn),
5963 vec_elem_type, reduc_inputs[0]);
5964 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
5965 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5966
5967 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5968 && induc_val)
5969 {
5970 /* Earlier we set the initial value to be a vector of induc_val
5971 values. Check the result and if it is induc_val then replace
5972 it with the original initial value, unless induc_val is
5973 the same as initial_def already. */
5974 tree zcompare = make_ssa_name (boolean_type_node);
5975 epilog_stmt = gimple_build_assign (zcompare, EQ_EXPR,
5976 new_temp, induc_val);
5977 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5978 tree initial_def = reduc_info->reduc_initial_values[0];
5979 tmp = make_ssa_name (new_scalar_dest);
5980 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5981 initial_def, new_temp);
5982 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5983 new_temp = tmp;
5984 }
5985
5986 scalar_results.safe_push (new_temp);
5987 }
5988 else if (direct_slp_reduc)
5989 {
5990 /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
5991 with the elements for other SLP statements replaced with the
5992 neutral value. We can then do a normal reduction on each vector. */
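      /* A hypothetical example: for a two-statement SLP reduction with
         vectype V4SI, REDUC_INPUTS[0] holds {a0, b0, a1, b1}; masking the
         index vector with GROUP_SIZE - 1 gives {0, 1, 0, 1}, so for i == 0
         the select keeps {a0, id, a1, id} (id being the neutral or initial
         value) and the full-vector reduction yields the first scalar
         result.  */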
5993
5994 /* Enforced by vectorizable_reduction. */
5995 gcc_assert (reduc_inputs.length () == 1);
5996 gcc_assert (pow2p_hwi (group_size));
5997
5998 gimple_seq seq = NULL;
5999
6000 /* Build a vector {0, 1, 2, ...}, with the same number of elements
6001 and the same element size as VECTYPE. */
6002 tree index = build_index_vector (vectype, 0, 1);
6003 tree index_type = TREE_TYPE (index);
6004 tree index_elt_type = TREE_TYPE (index_type);
6005 tree mask_type = truth_type_for (index_type);
6006
6007 /* Create a vector that, for each element, identifies which of
6008 the REDUC_GROUP_SIZE results should use it. */
6009 tree index_mask = build_int_cst (index_elt_type, group_size - 1);
6010 index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
6011 build_vector_from_val (index_type, index_mask));
6012
6013 /* Get a neutral vector value. This is simply a splat of the neutral
6014 scalar value if we have one, otherwise the initial scalar value
6015 is itself a neutral value. */
6016 tree vector_identity = NULL_TREE;
6017 tree neutral_op = NULL_TREE;
6018 if (slp_node)
6019 {
6020 tree initial_value = NULL_TREE;
6021 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6022 initial_value = reduc_info->reduc_initial_values[0];
6023 neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype), code,
6024 initial_value);
6025 }
6026 if (neutral_op)
6027 vector_identity = gimple_build_vector_from_val (&seq, vectype,
6028 neutral_op);
6029 for (unsigned int i = 0; i < group_size; ++i)
6030 {
6031 /* If there's no universal neutral value, we can use the
6032 initial scalar value from the original PHI. This is used
6033 for MIN and MAX reduction, for example. */
6034 if (!neutral_op)
6035 {
6036 tree scalar_value = reduc_info->reduc_initial_values[i];
6037 scalar_value = gimple_convert (&seq, TREE_TYPE (vectype),
6038 scalar_value);
6039 vector_identity = gimple_build_vector_from_val (&seq, vectype,
6040 scalar_value);
6041 }
6042
6043 /* Calculate the equivalent of:
6044
6045 sel[j] = (index[j] == i);
6046
6047 which selects the elements of REDUC_INPUTS[0] that should
6048 be included in the result. */
6049 tree compare_val = build_int_cst (index_elt_type, i);
6050 compare_val = build_vector_from_val (index_type, compare_val);
6051 tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
6052 index, compare_val);
6053
6054 /* Calculate the equivalent of:
6055
6056 vec = sel ? reduc_inputs[0] : vector_identity;
6057
6058 VEC is now suitable for a full vector reduction. */
6059 tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
6060 sel, reduc_inputs[0], vector_identity);
6061
6062 /* Do the reduction and convert it to the appropriate type. */
6063 tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
6064 TREE_TYPE (vectype), vec);
6065 scalar = gimple_convert (&seq, scalar_type, scalar);
6066 scalar_results.safe_push (scalar);
6067 }
6068 gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
6069 }
6070 else
6071 {
6072 bool reduce_with_shift;
6073 tree vec_temp;
6074
6075 gcc_assert (slp_reduc || reduc_inputs.length () == 1);
6076
6077 /* See if the target wants to do the final (shift) reduction
6078 in a vector mode of smaller size and first reduce upper/lower
6079 halves against each other. */
6080 enum machine_mode mode1 = mode;
6081 tree stype = TREE_TYPE (vectype);
6082 unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
6083 unsigned nunits1 = nunits;
6084 if ((mode1 = targetm.vectorize.split_reduction (mode)) != mode
6085 && reduc_inputs.length () == 1)
6086 {
6087 nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
6088 /* For SLP reductions we have to make sure lanes match up, but
6089 since we're doing individual element final reduction, reducing
6090 vector width here is even more important.
6091 ??? We can also separate lanes with permutes, for the common
6092 case of power-of-two group-size odd/even extracts would work. */
6093 if (slp_reduc && nunits != nunits1)
6094 {
6095 nunits1 = least_common_multiple (nunits1, group_size);
6096 gcc_assert (exact_log2 (nunits1) != -1 && nunits1 <= nunits);
6097 }
6098 }
6099 if (!slp_reduc
6100 && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
6101 nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
6102
6103 tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
6104 stype, nunits1);
6105 reduce_with_shift = have_whole_vector_shift (mode1);
6106 if (!VECTOR_MODE_P (mode1)
6107 || !directly_supported_p (code, vectype1))
6108 reduce_with_shift = false;
6109
6110 /* First reduce the vector to the desired vector size on which we
6111 should do the shift reduction, by combining upper and lower halves. */
6112 gimple_seq stmts = NULL;
6113 new_temp = vect_create_partial_epilog (reduc_inputs[0], vectype1,
6114 code, &stmts);
6115 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6116 reduc_inputs[0] = new_temp;
6117
6118 if (reduce_with_shift && !slp_reduc)
6119 {
6120 int element_bitsize = tree_to_uhwi (bitsize);
6121 /* Enforced by vectorizable_reduction, which disallows SLP reductions
6122 for variable-length vectors and also requires direct target support
6123 for loop reductions. */
6124 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
6125 int nelements = vec_size_in_bits / element_bitsize;
6126 vec_perm_builder sel;
6127 vec_perm_indices indices;
6128
6129 int elt_offset;
6130
6131 tree zero_vec = build_zero_cst (vectype1);
6132 /* Case 2: Create:
6133 for (offset = nelements/2; offset >= 1; offset/=2)
6134 {
6135 Create: va' = vec_shift <va, offset>
6136 Create: va = vop <va, va'>
6137 } */
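          /* For example (hypothetical V4SI input {a, b, c, d} reduced with
             PLUS): shifting by 2 and adding gives {a+c, b+d, x, x},
             shifting by 1 and adding gives {a+b+c+d, x, x, x}, and the
             scalar result is then taken from element 0 (lanes written x
             here do not matter).  */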
6138
6139 tree rhs;
6140
6141 if (dump_enabled_p ())
6142 dump_printf_loc (MSG_NOTE, vect_location,
6143 "Reduce using vector shifts\n");
6144
6145 gimple_seq stmts = NULL;
6146 new_temp = gimple_convert (&stmts, vectype1, new_temp);
6147 for (elt_offset = nelements / 2;
6148 elt_offset >= 1;
6149 elt_offset /= 2)
6150 {
6151 calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
6152 indices.new_vector (sel, 2, nelements);
6153 tree mask = vect_gen_perm_mask_any (vectype1, indices);
6154 new_name = gimple_build (&stmts, VEC_PERM_EXPR, vectype1,
6155 new_temp, zero_vec, mask);
6156 new_temp = gimple_build (&stmts, code,
6157 vectype1, new_name, new_temp);
6158 }
6159 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6160
6161 /* 2.4 Extract the final scalar result. Create:
6162 s_out3 = extract_field <v_out2, bitpos> */
6163
6164 if (dump_enabled_p ())
6165 dump_printf_loc (MSG_NOTE, vect_location,
6166 "extract scalar result\n");
6167
6168 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
6169 bitsize, bitsize_zero_node);
6170 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
6171 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
6172 gimple_assign_set_lhs (epilog_stmt, new_temp);
6173 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6174 scalar_results.safe_push (new_temp);
6175 }
6176 else
6177 {
6178 /* Case 3: Create:
6179 s = extract_field <v_out2, 0>
6180 for (offset = element_size;
6181 offset < vector_size;
6182 offset += element_size;)
6183 {
6184 Create: s' = extract_field <v_out2, offset>
6185 Create: s = op <s, s'> // For non SLP cases
6186 } */
6187
6188 if (dump_enabled_p ())
6189 dump_printf_loc (MSG_NOTE, vect_location,
6190 "Reduce using scalar code.\n");
6191
6192 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
6193 int element_bitsize = tree_to_uhwi (bitsize);
6194 tree compute_type = TREE_TYPE (vectype);
6195 gimple_seq stmts = NULL;
6196 FOR_EACH_VEC_ELT (reduc_inputs, i, vec_temp)
6197 {
6198 int bit_offset;
6199 new_temp = gimple_build (&stmts, BIT_FIELD_REF, compute_type,
6200 vec_temp, bitsize, bitsize_zero_node);
6201
6202 /* In SLP we don't need to apply reduction operation, so we just
6203 collect s' values in SCALAR_RESULTS. */
6204 if (slp_reduc)
6205 scalar_results.safe_push (new_temp);
6206
6207 for (bit_offset = element_bitsize;
6208 bit_offset < vec_size_in_bits;
6209 bit_offset += element_bitsize)
6210 {
6211 tree bitpos = bitsize_int (bit_offset);
6212 new_name = gimple_build (&stmts, BIT_FIELD_REF,
6213 compute_type, vec_temp,
6214 bitsize, bitpos);
6215 if (slp_reduc)
6216 {
6217 /* In SLP we don't need to apply reduction operation, so
6218 we just collect s' values in SCALAR_RESULTS. */
6219 new_temp = new_name;
6220 scalar_results.safe_push (new_name);
6221 }
6222 else
6223 new_temp = gimple_build (&stmts, code, compute_type,
6224 new_name, new_temp);
6225 }
6226 }
6227
6228 /* The only case where we need to reduce scalar results in SLP is
6229 unrolling. If the size of SCALAR_RESULTS is greater than
6230 REDUC_GROUP_SIZE, we reduce them by combining elements modulo
6231 REDUC_GROUP_SIZE. */
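              /* For instance (hypothetical), with REDUC_GROUP_SIZE 2 and
                 four extracted scalars r0..r3, r2 is combined into r0 and
                 r3 into r1, leaving the two per-statement results.  */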
6232 if (slp_reduc)
6233 {
6234 tree res, first_res, new_res;
6235
6236 /* Reduce multiple scalar results in case of SLP unrolling. */
6237 for (j = group_size; scalar_results.iterate (j, &res);
6238 j++)
6239 {
6240 first_res = scalar_results[j % group_size];
6241 new_res = gimple_build (&stmts, code, compute_type,
6242 first_res, res);
6243 scalar_results[j % group_size] = new_res;
6244 }
6245 scalar_results.truncate (group_size);
6246 for (k = 0; k < group_size; k++)
6247 scalar_results[k] = gimple_convert (&stmts, scalar_type,
6248 scalar_results[k]);
6249 }
6250 else
6251 {
6252 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
6253 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6254 scalar_results.safe_push (new_temp);
6255 }
6256
6257 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6258 }
6259
6260 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
6261 && induc_val)
6262 {
6263 /* Earlier we set the initial value to be a vector of induc_val
6264 values. Check the result and if it is induc_val then replace
6265 it with the original initial value, unless induc_val is
6266 the same as initial_def already. */
6267 tree zcompare = make_ssa_name (boolean_type_node);
6268 epilog_stmt = gimple_build_assign (zcompare, EQ_EXPR, new_temp,
6269 induc_val);
6270 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6271 tree initial_def = reduc_info->reduc_initial_values[0];
6272 tree tmp = make_ssa_name (new_scalar_dest);
6273 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
6274 initial_def, new_temp);
6275 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6276 scalar_results[0] = tmp;
6277 }
6278 }
6279
6280 /* 2.5 Adjust the final result by the initial value of the reduction
6281 variable. (When such adjustment is not needed, then
6282 'adjustment_def' is zero). For example, if code is PLUS we create:
6283 new_temp = loop_exit_def + adjustment_def */
6284
6285 if (adjustment_def)
6286 {
6287 gcc_assert (!slp_reduc);
6288 gimple_seq stmts = NULL;
6289 if (double_reduc)
6290 {
6291 gcc_assert (VECTOR_TYPE_P (TREE_TYPE (adjustment_def)));
6292 adjustment_def = gimple_convert (&stmts, vectype, adjustment_def);
6293 new_temp = gimple_build (&stmts, code, vectype,
6294 reduc_inputs[0], adjustment_def);
6295 }
6296 else
6297 {
6298 new_temp = scalar_results[0];
6299 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
6300 adjustment_def = gimple_convert (&stmts, scalar_type, adjustment_def);
6301 new_temp = gimple_build (&stmts, code, scalar_type,
6302 new_temp, adjustment_def);
6303 }
6304
6305 epilog_stmt = gimple_seq_last_stmt (stmts);
6306 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6307 scalar_results[0] = new_temp;
6308 }
6309
6310 /* Record this operation if it could be reused by the epilogue loop. */
6311 if (STMT_VINFO_REDUC_TYPE (reduc_info) == TREE_CODE_REDUCTION
6312 && reduc_inputs.length () == 1)
6313 loop_vinfo->reusable_accumulators.put (scalar_results[0],
6314 { orig_reduc_input, reduc_info });
6315
6316 if (double_reduc)
6317 loop = outer_loop;
6318
6319 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
6320 phis with new adjusted scalar results, i.e., replace use <s_out0>
6321 with use <s_out4>.
6322
6323 Transform:
6324 loop_exit:
6325 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
6326 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
6327 v_out2 = reduce <v_out1>
6328 s_out3 = extract_field <v_out2, 0>
6329 s_out4 = adjust_result <s_out3>
6330 use <s_out0>
6331 use <s_out0>
6332
6333 into:
6334
6335 loop_exit:
6336 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
6337 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
6338 v_out2 = reduce <v_out1>
6339 s_out3 = extract_field <v_out2, 0>
6340 s_out4 = adjust_result <s_out3>
6341 use <s_out4>
6342 use <s_out4> */
6343
6344 gcc_assert (live_out_stmts.size () == scalar_results.length ());
6345 for (k = 0; k < live_out_stmts.size (); k++)
6346 {
6347 stmt_vec_info scalar_stmt_info = vect_orig_stmt (live_out_stmts[k]);
6348 scalar_dest = gimple_get_lhs (scalar_stmt_info->stmt);
6349
6350 phis.create (3);
6351 /* Find the loop-closed-use at the loop exit of the original scalar
6352 result. (The reduction result is expected to have two immediate uses,
6353 one at the latch block, and one at the loop exit). For double
6354 reductions we are looking for exit phis of the outer loop. */
6355 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
6356 {
6357 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
6358 {
6359 if (!is_gimple_debug (USE_STMT (use_p)))
6360 phis.safe_push (USE_STMT (use_p));
6361 }
6362 else
6363 {
6364 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
6365 {
6366 tree phi_res = PHI_RESULT (USE_STMT (use_p));
6367
6368 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
6369 {
6370 if (!flow_bb_inside_loop_p (loop,
6371 gimple_bb (USE_STMT (phi_use_p)))
6372 && !is_gimple_debug (USE_STMT (phi_use_p)))
6373 phis.safe_push (USE_STMT (phi_use_p));
6374 }
6375 }
6376 }
6377 }
6378
6379 FOR_EACH_VEC_ELT (phis, i, exit_phi)
6380 {
6381 /* Replace the uses: */
6382 orig_name = PHI_RESULT (exit_phi);
6383
6384 /* Look for a single use at the target of the skip edge. */
6385 if (unify_with_main_loop_p)
6386 {
6387 use_operand_p use_p;
6388 gimple *user;
6389 if (!single_imm_use (orig_name, &use_p, &user))
6390 gcc_unreachable ();
6391 orig_name = gimple_get_lhs (user);
6392 }
6393
6394 scalar_result = scalar_results[k];
6395 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
6396 {
6397 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
6398 SET_USE (use_p, scalar_result);
6399 update_stmt (use_stmt);
6400 }
6401 }
6402
6403 phis.release ();
6404 }
6405 }
6406
6407 /* Return a vector of type VECTYPE that is equal to the vector select
6408 operation "MASK ? VEC : IDENTITY". Insert the select statements
6409 before GSI. */
6410
6411 static tree
6412 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
6413 tree vec, tree identity)
6414 {
6415 tree cond = make_temp_ssa_name (vectype, NULL, "cond");
6416 gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
6417 mask, vec, identity);
6418 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6419 return cond;
6420 }
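
/* Illustrative sketch of the statement built above (GIMPLE-like; SSA
   names invented for the example), with the zero identity used for the
   masked in-order additions further below:

     cond_1 = VEC_COND_EXPR <loop_mask_2, vect_def_3, { 0, ... }>;

   Lanes switched off by MASK therefore feed IDENTITY into the
   reduction.  */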
6421
6422 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
6423 order, starting with LHS. Insert the extraction statements before GSI and
6424 associate the new scalar SSA names with variable SCALAR_DEST.
6425 Return the SSA name for the result. */
6426
6427 static tree
6428 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
6429 tree_code code, tree lhs, tree vector_rhs)
6430 {
6431 tree vectype = TREE_TYPE (vector_rhs);
6432 tree scalar_type = TREE_TYPE (vectype);
6433 tree bitsize = TYPE_SIZE (scalar_type);
6434 unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
6435 unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
6436
6437 for (unsigned HOST_WIDE_INT bit_offset = 0;
6438 bit_offset < vec_size_in_bits;
6439 bit_offset += element_bitsize)
6440 {
6441 tree bitpos = bitsize_int (bit_offset);
6442 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
6443 bitsize, bitpos);
6444
6445 gassign *stmt = gimple_build_assign (scalar_dest, rhs);
6446 rhs = make_ssa_name (scalar_dest, stmt);
6447 gimple_assign_set_lhs (stmt, rhs);
6448 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6449
6450 stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
6451 tree new_name = make_ssa_name (scalar_dest, stmt);
6452 gimple_assign_set_lhs (stmt, new_name);
6453 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6454 lhs = new_name;
6455 }
6456 return lhs;
6457 }
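
/* Illustrative sketch of the expansion above for a four-element vector
   V (statement names invented for the example):

     s_1 = BIT_FIELD_REF <V, bitsize, 0 * bitsize>;  r_1 = CODE (LHS, s_1);
     s_2 = BIT_FIELD_REF <V, bitsize, 1 * bitsize>;  r_2 = CODE (r_1, s_2);
     s_3 = BIT_FIELD_REF <V, bitsize, 2 * bitsize>;  r_3 = CODE (r_2, s_3);
     s_4 = BIT_FIELD_REF <V, bitsize, 3 * bitsize>;  r_4 = CODE (r_3, s_4);

   i.e. the vector elements are folded into LHS strictly left to
   right.  */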
6458
6459 /* Get a masked internal function equivalent to REDUC_FN. VECTYPE_IN is the
6460 type of the vector input. */
6461
6462 static internal_fn
6463 get_masked_reduction_fn (internal_fn reduc_fn, tree vectype_in)
6464 {
6465 internal_fn mask_reduc_fn;
6466
6467 switch (reduc_fn)
6468 {
6469 case IFN_FOLD_LEFT_PLUS:
6470 mask_reduc_fn = IFN_MASK_FOLD_LEFT_PLUS;
6471 break;
6472
6473 default:
6474 return IFN_LAST;
6475 }
6476
6477 if (direct_internal_fn_supported_p (mask_reduc_fn, vectype_in,
6478 OPTIMIZE_FOR_SPEED))
6479 return mask_reduc_fn;
6480 return IFN_LAST;
6481 }
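
/* For example IFN_FOLD_LEFT_PLUS maps to IFN_MASK_FOLD_LEFT_PLUS, which
   as called further below takes the loop mask as a third operand
   (illustrative sketch; SSA names invented for the example):

     acc_2 = .MASK_FOLD_LEFT_PLUS (acc_1, vect_def_3, loop_mask_4);  */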
6482
6483 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT_INFO is the
6484 statement that sets the live-out value. REDUC_DEF_STMT is the phi
6485 statement. CODE is the operation performed by STMT_INFO and OPS are
6486 its scalar operands. REDUC_INDEX is the index of the operand in
6487 OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that
6488 implements in-order reduction, or IFN_LAST if we should open-code it.
6489 VECTYPE_IN is the type of the vector input. MASKS specifies the masks
6490 that should be used to control the operation in a fully-masked loop. */
6491
6492 static bool
6493 vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
6494 stmt_vec_info stmt_info,
6495 gimple_stmt_iterator *gsi,
6496 gimple **vec_stmt, slp_tree slp_node,
6497 gimple *reduc_def_stmt,
6498 tree_code code, internal_fn reduc_fn,
6499 tree ops[3], tree vectype_in,
6500 int reduc_index, vec_loop_masks *masks)
6501 {
6502 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6503 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6504 internal_fn mask_reduc_fn = get_masked_reduction_fn (reduc_fn, vectype_in);
6505
6506 int ncopies;
6507 if (slp_node)
6508 ncopies = 1;
6509 else
6510 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6511
6512 gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
6513 gcc_assert (ncopies == 1);
6514 gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
6515
6516 if (slp_node)
6517 gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
6518 TYPE_VECTOR_SUBPARTS (vectype_in)));
6519
6520 tree op0 = ops[1 - reduc_index];
6521
6522 int group_size = 1;
6523 stmt_vec_info scalar_dest_def_info;
6524 auto_vec<tree> vec_oprnds0;
6525 if (slp_node)
6526 {
6527 auto_vec<vec<tree> > vec_defs (2);
6528 vect_get_slp_defs (loop_vinfo, slp_node, &vec_defs);
6529 vec_oprnds0.safe_splice (vec_defs[1 - reduc_index]);
6530 vec_defs[0].release ();
6531 vec_defs[1].release ();
6532 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
6533 scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
6534 }
6535 else
6536 {
6537 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
6538 op0, &vec_oprnds0);
6539 scalar_dest_def_info = stmt_info;
6540 }
6541
6542 tree scalar_dest = gimple_assign_lhs (scalar_dest_def_info->stmt);
6543 tree scalar_type = TREE_TYPE (scalar_dest);
6544 tree reduc_var = gimple_phi_result (reduc_def_stmt);
6545
6546 int vec_num = vec_oprnds0.length ();
6547 gcc_assert (vec_num == 1 || slp_node);
6548 tree vec_elem_type = TREE_TYPE (vectype_out);
6549 gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
6550
6551 tree vector_identity = NULL_TREE;
6552 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6553 vector_identity = build_zero_cst (vectype_out);
6554
6555 tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
6556 int i;
6557 tree def0;
6558 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
6559 {
6560 gimple *new_stmt;
6561 tree mask = NULL_TREE;
6562 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6563 mask = vect_get_loop_mask (gsi, masks, vec_num, vectype_in, i);
6564
6565 /* Handle MINUS by adding the negative. */
6566 if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
6567 {
6568 tree negated = make_ssa_name (vectype_out);
6569 new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
6570 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6571 def0 = negated;
6572 }
6573
6574 if (mask && mask_reduc_fn == IFN_LAST)
6575 def0 = merge_with_identity (gsi, mask, vectype_out, def0,
6576 vector_identity);
6577
6578 /* On the first iteration the input is simply the scalar phi
6579 result, and for subsequent iterations it is the output of
6580 the preceding operation. */
6581 if (reduc_fn != IFN_LAST || (mask && mask_reduc_fn != IFN_LAST))
6582 {
6583 if (mask && mask_reduc_fn != IFN_LAST)
6584 new_stmt = gimple_build_call_internal (mask_reduc_fn, 3, reduc_var,
6585 def0, mask);
6586 else
6587 new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var,
6588 def0);
6589 /* For chained SLP reductions the output of the previous reduction
6590 operation serves as the input of the next. For the final statement
6591 the output cannot be a temporary - we reuse the original
6592 scalar destination of the last statement. */
6593 if (i != vec_num - 1)
6594 {
6595 gimple_set_lhs (new_stmt, scalar_dest_var);
6596 reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
6597 gimple_set_lhs (new_stmt, reduc_var);
6598 }
6599 }
6600 else
6601 {
6602 reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
6603 reduc_var, def0);
6604 new_stmt = SSA_NAME_DEF_STMT (reduc_var);
6605 /* Remove the statement, so that we can use the same code paths
6606 as for statements that we've just created. */
6607 gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
6608 gsi_remove (&tmp_gsi, true);
6609 }
6610
6611 if (i == vec_num - 1)
6612 {
6613 gimple_set_lhs (new_stmt, scalar_dest);
6614 vect_finish_replace_stmt (loop_vinfo,
6615 scalar_dest_def_info,
6616 new_stmt);
6617 }
6618 else
6619 vect_finish_stmt_generation (loop_vinfo,
6620 scalar_dest_def_info,
6621 new_stmt, gsi);
6622
6623 if (slp_node)
6624 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
6625 else
6626 {
6627 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
6628 *vec_stmt = new_stmt;
6629 }
6630 }
6631
6632 return true;
6633 }
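
/* Illustrative sketch (not generated code): an in-order reduction keeps
   the scalar evaluation order, so for a four-element vector the result
   is equivalent to

     res = (((res + v[0]) + v[1]) + v[2]) + v[3];

   rather than a reassociated form such as
   ((v[0] + v[1]) + (v[2] + v[3])) + res that a tree reduction would
   compute; this is what makes FOLD_LEFT reductions safe for
   non-associative floating-point code.  */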
6634
6635 /* Function is_nonwrapping_integer_induction.
6636
6637 Check if STMT_VINFO (which is part of loop LOOP) both increments and
6638 does not cause overflow. */
6639
6640 static bool
6641 is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, class loop *loop)
6642 {
6643 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
6644 tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
6645 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
6646 tree lhs_type = TREE_TYPE (gimple_phi_result (phi));
6647 widest_int ni, max_loop_value, lhs_max;
6648 wi::overflow_type overflow = wi::OVF_NONE;
6649
6650 /* Make sure the loop is integer based. */
6651 if (TREE_CODE (base) != INTEGER_CST
6652 || TREE_CODE (step) != INTEGER_CST)
6653 return false;
6654
6655 /* Check that the max size of the loop will not wrap. */
6656
6657 if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
6658 return true;
6659
6660 if (! max_stmt_executions (loop, &ni))
6661 return false;
6662
6663 max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
6664 &overflow);
6665 if (overflow)
6666 return false;
6667
6668 max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
6669 TYPE_SIGN (lhs_type), &overflow);
6670 if (overflow)
6671 return false;
6672
6673 return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
6674 <= TYPE_PRECISION (lhs_type));
6675 }
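
/* Worked example of the bound computed above (values chosen for
   illustration): for an induction with base 10 and step 3 in a 16-bit
   unsigned type, and a loop bounded by 1000 iterations, the largest
   value reached is 10 + 3 * 1000 = 3010, which needs 12 bits
   <= TYPE_PRECISION (16), so the induction cannot wrap.  */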
6676
6677 /* Check if masking can be supported by inserting a conditional expression.
6678 CODE is the code for the operation. COND_FN is the conditional internal
6679 function, if it exists. VECTYPE_IN is the type of the vector input. */
6680 static bool
6681 use_mask_by_cond_expr_p (code_helper code, internal_fn cond_fn,
6682 tree vectype_in)
6683 {
6684 if (cond_fn != IFN_LAST
6685 && direct_internal_fn_supported_p (cond_fn, vectype_in,
6686 OPTIMIZE_FOR_SPEED))
6687 return false;
6688
6689 if (code.is_tree_code ())
6690 switch (tree_code (code))
6691 {
6692 case DOT_PROD_EXPR:
6693 case SAD_EXPR:
6694 return true;
6695
6696 default:
6697 break;
6698 }
6699 return false;
6700 }
6701
6702 /* Insert a conditional expression to enable masked vectorization. CODE is the
6703 code for the operation. VOP is the array of operands. MASK is the loop
6704 mask. GSI is a statement iterator used to place the new conditional
6705 expression. */
6706 static void
6707 build_vect_cond_expr (code_helper code, tree vop[3], tree mask,
6708 gimple_stmt_iterator *gsi)
6709 {
6710 switch (tree_code (code))
6711 {
6712 case DOT_PROD_EXPR:
6713 {
6714 tree vectype = TREE_TYPE (vop[1]);
6715 tree zero = build_zero_cst (vectype);
6716 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
6717 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
6718 mask, vop[1], zero);
6719 gsi_insert_before (gsi, select, GSI_SAME_STMT);
6720 vop[1] = masked_op1;
6721 break;
6722 }
6723
6724 case SAD_EXPR:
6725 {
6726 tree vectype = TREE_TYPE (vop[1]);
6727 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
6728 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
6729 mask, vop[1], vop[0]);
6730 gsi_insert_before (gsi, select, GSI_SAME_STMT);
6731 vop[1] = masked_op1;
6732 break;
6733 }
6734
6735 default:
6736 gcc_unreachable ();
6737 }
6738 }
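
/* Illustrative sketch of the effect (GIMPLE-like; SSA names invented
   for the example): for DOT_PROD_EXPR one multiplication operand is
   zeroed in the inactive lanes,

     masked_op1_1 = VEC_COND_EXPR <loop_mask_2, vop1_3, { 0, ... }>;

   so those lanes add 0 to the accumulator; for SAD_EXPR the operand is
   replaced by the other input, making the absolute difference of an
   inactive lane 0.  */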
6739
6740 /* Function vectorizable_reduction.
6741
6742 Check if STMT_INFO performs a reduction operation that can be vectorized.
6743 If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
6744 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
6745 Return true if STMT_INFO is vectorizable in this way.
6746
6747 This function also handles reduction idioms (patterns) that have been
6748 recognized in advance during vect_pattern_recog. In this case, STMT_INFO
6749 may be of this form:
6750 X = pattern_expr (arg0, arg1, ..., X)
6751 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
6752 sequence that had been detected and replaced by the pattern-stmt
6753 (STMT_INFO).
6754
6755 This function also handles reduction of condition expressions, for example:
6756 for (int i = 0; i < N; i++)
6757 if (a[i] < value)
6758 last = a[i];
6759 This is handled by vectorising the loop and creating an additional vector
6760 containing the loop indexes for which "a[i] < value" was true. In the
6761 function epilogue this is reduced to a single max value and then used to
6762 index into the vector of results.
6763
6764 In some cases of reduction patterns, the type of the reduction variable X is
6765 different than the type of the other arguments of STMT_INFO.
6766 In such cases, the vectype that is used when transforming STMT_INFO into
6767 a vector stmt is different than the vectype that is used to determine the
6768 vectorization factor, because it consists of a different number of elements
6769 than the actual number of elements that are being operated upon in parallel.
6770
6771 For example, consider an accumulation of shorts into an int accumulator.
6772 On some targets it's possible to vectorize this pattern operating on 8
6773 shorts at a time (hence, the vectype for purposes of determining the
6774 vectorization factor should be V8HI); on the other hand, the vectype that
6775 is used to create the vector form is actually V4SI (the type of the result).
6776
6777 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
6778 indicates what is the actual level of parallelism (V8HI in the example), so
6779 that the right vectorization factor would be derived. This vectype
6780 corresponds to the type of arguments to the reduction stmt, and should *NOT*
6781 be used to create the vectorized stmt. The right vectype for the vectorized
6782 stmt is obtained from the type of the result X:
6783 get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
6784
6785 This means that, contrary to "regular" reductions (or "regular" stmts in
6786 general), the following equation:
6787 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
6788 does *NOT* necessarily hold for reduction patterns. */
6789
6790 bool
6791 vectorizable_reduction (loop_vec_info loop_vinfo,
6792 stmt_vec_info stmt_info, slp_tree slp_node,
6793 slp_instance slp_node_instance,
6794 stmt_vector_for_cost *cost_vec)
6795 {
6796 tree vectype_in = NULL_TREE;
6797 tree vectype_op[3] = { NULL_TREE, NULL_TREE, NULL_TREE };
6798 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6799 enum vect_def_type cond_reduc_dt = vect_unknown_def_type;
6800 stmt_vec_info cond_stmt_vinfo = NULL;
6801 int i;
6802 int ncopies;
6803 bool single_defuse_cycle = false;
6804 bool nested_cycle = false;
6805 bool double_reduc = false;
6806 int vec_num;
6807 tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
6808 tree cond_reduc_val = NULL_TREE;
6809
6810 /* Make sure it was already recognized as a reduction computation. */
6811 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
6812 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
6813 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
6814 return false;
6815
6816 /* The stmt we store reduction analysis meta on. */
6817 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
6818 reduc_info->is_reduc_info = true;
6819
6820 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
6821 {
6822 if (is_a <gphi *> (stmt_info->stmt))
6823 {
6824 if (slp_node)
6825 {
6826 /* We eventually need to set a vector type on invariant
6827 arguments. */
6828 unsigned j;
6829 slp_tree child;
6830 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
6831 if (!vect_maybe_update_slp_op_vectype
6832 (child, SLP_TREE_VECTYPE (slp_node)))
6833 {
6834 if (dump_enabled_p ())
6835 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6836 "incompatible vector types for "
6837 "invariants\n");
6838 return false;
6839 }
6840 }
6841 /* Analysis for double-reduction is done on the outer
6842 loop PHI, nested cycles have no further restrictions. */
6843 STMT_VINFO_TYPE (stmt_info) = cycle_phi_info_type;
6844 }
6845 else
6846 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6847 return true;
6848 }
6849
6850 stmt_vec_info orig_stmt_of_analysis = stmt_info;
6851 stmt_vec_info phi_info = stmt_info;
6852 if (!is_a <gphi *> (stmt_info->stmt))
6853 {
6854 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6855 return true;
6856 }
6857 if (slp_node)
6858 {
6859 slp_node_instance->reduc_phis = slp_node;
6860 /* ??? We're leaving slp_node to point to the PHIs, we only
6861 need it to get at the number of vector stmts which wasn't
6862 yet initialized for the instance root. */
6863 }
6864 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
6865 {
6866 use_operand_p use_p;
6867 gimple *use_stmt;
6868 bool res = single_imm_use (gimple_phi_result (stmt_info->stmt),
6869 &use_p, &use_stmt);
6870 gcc_assert (res);
6871 phi_info = loop_vinfo->lookup_stmt (use_stmt);
6872 }
6873
6874 /* PHIs should not participate in patterns. */
6875 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
6876 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
6877
6878 /* Verify following REDUC_IDX from the latch def leads us back to the PHI
6879 and compute the reduction chain length. Discover the real
6880 reduction operation stmt on the way (stmt_info and slp_for_stmt_info). */
6881 tree reduc_def
6882 = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
6883 loop_latch_edge
6884 (gimple_bb (reduc_def_phi)->loop_father));
6885 unsigned reduc_chain_length = 0;
6886 bool only_slp_reduc_chain = true;
6887 stmt_info = NULL;
6888 slp_tree slp_for_stmt_info = slp_node ? slp_node_instance->root : NULL;
6889 while (reduc_def != PHI_RESULT (reduc_def_phi))
6890 {
6891 stmt_vec_info def = loop_vinfo->lookup_def (reduc_def);
6892 stmt_vec_info vdef = vect_stmt_to_vectorize (def);
6893 if (STMT_VINFO_REDUC_IDX (vdef) == -1)
6894 {
6895 if (dump_enabled_p ())
6896 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6897 "reduction chain broken by patterns.\n");
6898 return false;
6899 }
6900 if (!REDUC_GROUP_FIRST_ELEMENT (vdef))
6901 only_slp_reduc_chain = false;
6902 /* For epilogue generation live members of the chain need
6903 to point back to the PHI via their original stmt for
6904 info_for_reduction to work. For SLP we need to look at
6905 all lanes here - even though we only will vectorize from
6906 the SLP node with live lane zero the other live lanes also
6907 need to be identified as part of a reduction to be able
6908 to skip code generation for them. */
6909 if (slp_for_stmt_info)
6910 {
6911 for (auto s : SLP_TREE_SCALAR_STMTS (slp_for_stmt_info))
6912 if (STMT_VINFO_LIVE_P (s))
6913 STMT_VINFO_REDUC_DEF (vect_orig_stmt (s)) = phi_info;
6914 }
6915 else if (STMT_VINFO_LIVE_P (vdef))
6916 STMT_VINFO_REDUC_DEF (def) = phi_info;
6917 gimple_match_op op;
6918 if (!gimple_extract_op (vdef->stmt, &op))
6919 {
6920 if (dump_enabled_p ())
6921 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6922 "reduction chain includes unsupported"
6923 " statement type.\n");
6924 return false;
6925 }
6926 if (CONVERT_EXPR_CODE_P (op.code))
6927 {
6928 if (!tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0])))
6929 {
6930 if (dump_enabled_p ())
6931 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6932 "conversion in the reduction chain.\n");
6933 return false;
6934 }
6935 }
6936 else if (!stmt_info)
6937 /* First non-conversion stmt. */
6938 stmt_info = vdef;
6939 reduc_def = op.ops[STMT_VINFO_REDUC_IDX (vdef)];
6940 reduc_chain_length++;
6941 if (!stmt_info && slp_node)
6942 slp_for_stmt_info = SLP_TREE_CHILDREN (slp_for_stmt_info)[0];
6943 }
6944 /* PHIs should not participate in patterns. */
6945 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
6946
6947 if (nested_in_vect_loop_p (loop, stmt_info))
6948 {
6949 loop = loop->inner;
6950 nested_cycle = true;
6951 }
6952
6953 /* STMT_VINFO_REDUC_DEF doesn't point to the first but the last
6954 element. */
6955 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6956 {
6957 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (stmt_info));
6958 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
6959 }
6960 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6961 gcc_assert (slp_node
6962 && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info);
6963
6964 /* 1. Is vectorizable reduction? */
6965 /* Not supportable if the reduction variable is used in the loop, unless
6966 it's a reduction chain. */
6967 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
6968 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6969 return false;
6970
6971 /* Reductions that are not used even in an enclosing outer-loop,
6972 are expected to be "live" (used out of the loop). */
6973 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
6974 && !STMT_VINFO_LIVE_P (stmt_info))
6975 return false;
6976
6977 /* 2. Has this been recognized as a reduction pattern?
6978
6979 Check if STMT represents a pattern that has been recognized
6980 in earlier analysis stages. For stmts that represent a pattern,
6981 the STMT_VINFO_RELATED_STMT field records the last stmt in
6982 the original sequence that constitutes the pattern. */
6983
6984 stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
6985 if (orig_stmt_info)
6986 {
6987 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6988 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
6989 }
6990
6991 /* 3. Check the operands of the operation. The first operands are defined
6992 inside the loop body. The last operand is the reduction variable,
6993 which is defined by the loop-header-phi. */
6994
6995 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6996 STMT_VINFO_REDUC_VECTYPE (reduc_info) = vectype_out;
6997 gimple_match_op op;
6998 if (!gimple_extract_op (stmt_info->stmt, &op))
6999 gcc_unreachable ();
7000 bool lane_reduc_code_p = (op.code == DOT_PROD_EXPR
7001 || op.code == WIDEN_SUM_EXPR
7002 || op.code == SAD_EXPR);
7003
7004 if (!POINTER_TYPE_P (op.type) && !INTEGRAL_TYPE_P (op.type)
7005 && !SCALAR_FLOAT_TYPE_P (op.type))
7006 return false;
7007
7008 /* Do not try to vectorize bit-precision reductions. */
7009 if (!type_has_mode_precision_p (op.type))
7010 return false;
7011
7012 /* For lane-reducing ops we're reducing the number of reduction PHIs,
7013 which means their only use may be in the lane-reducing operation. */
7014 if (lane_reduc_code_p
7015 && reduc_chain_length != 1
7016 && !only_slp_reduc_chain)
7017 {
7018 if (dump_enabled_p ())
7019 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7020 "lane-reducing reduction with extra stmts.\n");
7021 return false;
7022 }
7023
7024 /* All uses but the last are expected to be defined in the loop.
7025 The last use is the reduction variable. In case of nested cycle this
7026 assumption is not true: we use reduc_index to record the index of the
7027 reduction variable. */
7028 slp_tree *slp_op = XALLOCAVEC (slp_tree, op.num_ops);
7029 /* We need to skip an extra operand for COND_EXPRs with embedded
7030 comparison. */
7031 unsigned opno_adjust = 0;
7032 if (op.code == COND_EXPR && COMPARISON_CLASS_P (op.ops[0]))
7033 opno_adjust = 1;
7034 for (i = 0; i < (int) op.num_ops; i++)
7035 {
7036 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
7037 if (i == 0 && op.code == COND_EXPR)
7038 continue;
7039
7040 stmt_vec_info def_stmt_info;
7041 enum vect_def_type dt;
7042 if (!vect_is_simple_use (loop_vinfo, stmt_info, slp_for_stmt_info,
7043 i + opno_adjust, &op.ops[i], &slp_op[i], &dt,
7044 &vectype_op[i], &def_stmt_info))
7045 {
7046 if (dump_enabled_p ())
7047 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7048 "use not simple.\n");
7049 return false;
7050 }
7051 if (i == STMT_VINFO_REDUC_IDX (stmt_info))
7052 continue;
7053
7054 /* There should be only one cycle def in the stmt, the one
7055 leading to reduc_def. */
7056 if (VECTORIZABLE_CYCLE_DEF (dt))
7057 return false;
7058
7059 if (!vectype_op[i])
7060 vectype_op[i]
7061 = get_vectype_for_scalar_type (loop_vinfo,
7062 TREE_TYPE (op.ops[i]), slp_op[i]);
7063
7064 /* To properly compute ncopies we are interested in the widest
7065 non-reduction input type in case we're looking at a widening
7066 accumulation that we later handle in vect_transform_reduction. */
7067 if (lane_reduc_code_p
7068 && vectype_op[i]
7069 && (!vectype_in
7070 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
7071 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_op[i]))))))
7072 vectype_in = vectype_op[i];
7073
7074 if (op.code == COND_EXPR)
7075 {
7076 /* Record how the non-reduction-def value of COND_EXPR is defined. */
7077 if (dt == vect_constant_def)
7078 {
7079 cond_reduc_dt = dt;
7080 cond_reduc_val = op.ops[i];
7081 }
7082 if (dt == vect_induction_def
7083 && def_stmt_info
7084 && is_nonwrapping_integer_induction (def_stmt_info, loop))
7085 {
7086 cond_reduc_dt = dt;
7087 cond_stmt_vinfo = def_stmt_info;
7088 }
7089 }
7090 }
7091 if (!vectype_in)
7092 vectype_in = STMT_VINFO_VECTYPE (phi_info);
7093 STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in;
7094
7095 enum vect_reduction_type v_reduc_type = STMT_VINFO_REDUC_TYPE (phi_info);
7096 STMT_VINFO_REDUC_TYPE (reduc_info) = v_reduc_type;
7097 /* If we have a condition reduction, see if we can simplify it further. */
7098 if (v_reduc_type == COND_REDUCTION)
7099 {
7100 if (slp_node)
7101 return false;
7102
7103 /* When the reduction value is itself used in the condition, fail. */
7104 if (STMT_VINFO_REDUC_IDX (stmt_info) == 0)
7105 {
7106 if (dump_enabled_p ())
7107 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7108 "condition depends on previous iteration\n");
7109 return false;
7110 }
7111
7112 if (reduc_chain_length == 1
7113 && direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST,
7114 vectype_in, OPTIMIZE_FOR_SPEED))
7115 {
7116 if (dump_enabled_p ())
7117 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7118 "optimizing condition reduction with"
7119 " FOLD_EXTRACT_LAST.\n");
7120 STMT_VINFO_REDUC_TYPE (reduc_info) = EXTRACT_LAST_REDUCTION;
7121 }
7122 else if (cond_reduc_dt == vect_induction_def)
7123 {
7124 tree base
7125 = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
7126 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
7127
7128 gcc_assert (TREE_CODE (base) == INTEGER_CST
7129 && TREE_CODE (step) == INTEGER_CST);
7130 cond_reduc_val = NULL_TREE;
7131 enum tree_code cond_reduc_op_code = ERROR_MARK;
7132 tree res = PHI_RESULT (STMT_VINFO_STMT (cond_stmt_vinfo));
7133 if (!types_compatible_p (TREE_TYPE (res), TREE_TYPE (base)))
7134 ;
7135 /* Find a suitable value: below base for MAX_EXPR, above base for
7136 MIN_EXPR; punt for now if base is the minimum value of the type
7137 for MAX_EXPR or the maximum value of the type for MIN_EXPR. */
7138 else if (tree_int_cst_sgn (step) == -1)
7139 {
7140 cond_reduc_op_code = MIN_EXPR;
7141 if (tree_int_cst_sgn (base) == -1)
7142 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
7143 else if (tree_int_cst_lt (base,
7144 TYPE_MAX_VALUE (TREE_TYPE (base))))
7145 cond_reduc_val
7146 = int_const_binop (PLUS_EXPR, base, integer_one_node);
7147 }
7148 else
7149 {
7150 cond_reduc_op_code = MAX_EXPR;
7151 if (tree_int_cst_sgn (base) == 1)
7152 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
7153 else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
7154 base))
7155 cond_reduc_val
7156 = int_const_binop (MINUS_EXPR, base, integer_one_node);
7157 }
7158 if (cond_reduc_val)
7159 {
7160 if (dump_enabled_p ())
7161 dump_printf_loc (MSG_NOTE, vect_location,
7162 "condition expression based on "
7163 "integer induction.\n");
7164 STMT_VINFO_REDUC_CODE (reduc_info) = cond_reduc_op_code;
7165 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info)
7166 = cond_reduc_val;
7167 STMT_VINFO_REDUC_TYPE (reduc_info) = INTEGER_INDUC_COND_REDUCTION;
7168 }
7169 }
7170 else if (cond_reduc_dt == vect_constant_def)
7171 {
7172 enum vect_def_type cond_initial_dt;
7173 tree cond_initial_val = vect_phi_initial_value (reduc_def_phi);
7174 vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
7175 if (cond_initial_dt == vect_constant_def
7176 && types_compatible_p (TREE_TYPE (cond_initial_val),
7177 TREE_TYPE (cond_reduc_val)))
7178 {
7179 tree e = fold_binary (LE_EXPR, boolean_type_node,
7180 cond_initial_val, cond_reduc_val);
7181 if (e && (integer_onep (e) || integer_zerop (e)))
7182 {
7183 if (dump_enabled_p ())
7184 dump_printf_loc (MSG_NOTE, vect_location,
7185 "condition expression based on "
7186 "compile time constant.\n");
7187 /* Record reduction code at analysis stage. */
7188 STMT_VINFO_REDUC_CODE (reduc_info)
7189 = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
7190 STMT_VINFO_REDUC_TYPE (reduc_info) = CONST_COND_REDUCTION;
7191 }
7192 }
7193 }
7194 }
7195
7196 if (STMT_VINFO_LIVE_P (phi_info))
7197 return false;
7198
7199 if (slp_node)
7200 ncopies = 1;
7201 else
7202 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7203
7204 gcc_assert (ncopies >= 1);
7205
7206 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
7207
7208 if (nested_cycle)
7209 {
7210 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info)
7211 == vect_double_reduction_def);
7212 double_reduc = true;
7213 }
7214
7215 /* 4.2. Check support for the epilog operation.
7216
7217 If STMT represents a reduction pattern, then the type of the
7218 reduction variable may be different than the type of the rest
7219 of the arguments. For example, consider the case of accumulation
7220 of shorts into an int accumulator. The original code:
7221 S1: int_a = (int) short_a;
7222 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
7223
7224 was replaced with:
7225 STMT: int_acc = widen_sum <short_a, int_acc>
7226
7227 This means that:
7228 1. The tree-code that is used to create the vector operation in the
7229 epilog code (that reduces the partial results) is not the
7230 tree-code of STMT, but is rather the tree-code of the original
7231 stmt from the pattern that STMT is replacing. I.e, in the example
7232 above we want to use 'widen_sum' in the loop, but 'plus' in the
7233 epilog.
7234 2. The type (mode) we use to check available target support
7235 for the vector operation to be created in the *epilog*, is
7236 determined by the type of the reduction variable (in the example
7237 above we'd check this: optab_handler (plus_optab, vect_int_mode)).
7238 However the type (mode) we use to check available target support
7239 for the vector operation to be created *inside the loop*, is
7240 determined by the type of the other arguments to STMT (in the
7241 example we'd check this: optab_handler (widen_sum_optab,
7242 vect_short_mode)).
7243
7244 This is contrary to "regular" reductions, in which the types of all
7245 the arguments are the same as the type of the reduction variable.
7246 For "regular" reductions we can therefore use the same vector type
7247 (and also the same tree-code) when generating the epilog code and
7248 when generating the code inside the loop. */
7249
7250 code_helper orig_code = STMT_VINFO_REDUC_CODE (phi_info);
7251 STMT_VINFO_REDUC_CODE (reduc_info) = orig_code;
7252
7253 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
7254 if (reduction_type == TREE_CODE_REDUCTION)
7255 {
7256 /* Check whether it's ok to change the order of the computation.
7257 Generally, when vectorizing a reduction we change the order of the
7258 computation. This may change the behavior of the program in some
7259 cases, so we need to check that this is ok. One exception is when
7260 vectorizing an outer-loop: the inner-loop is executed sequentially,
7261 and therefore vectorizing reductions in the inner-loop during
7262 outer-loop vectorization is safe. Likewise when we are vectorizing
7263 a series of reductions using SLP and the VF is one, the reductions
7264 are performed in scalar order. */
7265 if (slp_node
7266 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
7267 && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1u))
7268 ;
7269 else if (needs_fold_left_reduction_p (op.type, orig_code))
7270 {
7271 /* When vectorizing a reduction chain w/o SLP the reduction PHI
7272 is not directly used in the stmt. */
7273 if (!only_slp_reduc_chain
7274 && reduc_chain_length != 1)
7275 {
7276 if (dump_enabled_p ())
7277 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7278 "in-order reduction chain without SLP.\n");
7279 return false;
7280 }
7281 STMT_VINFO_REDUC_TYPE (reduc_info)
7282 = reduction_type = FOLD_LEFT_REDUCTION;
7283 }
7284 else if (!commutative_binary_op_p (orig_code, op.type)
7285 || !associative_binary_op_p (orig_code, op.type))
7286 {
7287 if (dump_enabled_p ())
7288 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7289 "reduction: not commutative/associative");
7290 return false;
7291 }
7292 }
7293
7294 if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
7295 && ncopies > 1)
7296 {
7297 if (dump_enabled_p ())
7298 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7299 "multiple types in double reduction or condition "
7300 "reduction or fold-left reduction.\n");
7301 return false;
7302 }
7303
7304 internal_fn reduc_fn = IFN_LAST;
7305 if (reduction_type == TREE_CODE_REDUCTION
7306 || reduction_type == FOLD_LEFT_REDUCTION
7307 || reduction_type == INTEGER_INDUC_COND_REDUCTION
7308 || reduction_type == CONST_COND_REDUCTION)
7309 {
7310 if (reduction_type == FOLD_LEFT_REDUCTION
7311 ? fold_left_reduction_fn (orig_code, &reduc_fn)
7312 : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
7313 {
7314 if (reduc_fn != IFN_LAST
7315 && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
7316 OPTIMIZE_FOR_SPEED))
7317 {
7318 if (dump_enabled_p ())
7319 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7320 "reduc op not supported by target.\n");
7321
7322 reduc_fn = IFN_LAST;
7323 }
7324 }
7325 else
7326 {
7327 if (!nested_cycle || double_reduc)
7328 {
7329 if (dump_enabled_p ())
7330 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7331 "no reduc code for scalar code.\n");
7332
7333 return false;
7334 }
7335 }
7336 }
7337 else if (reduction_type == COND_REDUCTION)
7338 {
7339 int scalar_precision
7340 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (op.type));
7341 cr_index_scalar_type = make_unsigned_type (scalar_precision);
7342 cr_index_vector_type = get_same_sized_vectype (cr_index_scalar_type,
7343 vectype_out);
7344
7345 if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
7346 OPTIMIZE_FOR_SPEED))
7347 reduc_fn = IFN_REDUC_MAX;
7348 }
7349 STMT_VINFO_REDUC_FN (reduc_info) = reduc_fn;
7350
7351 if (reduction_type != EXTRACT_LAST_REDUCTION
7352 && (!nested_cycle || double_reduc)
7353 && reduc_fn == IFN_LAST
7354 && !nunits_out.is_constant ())
7355 {
7356 if (dump_enabled_p ())
7357 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7358 "missing target support for reduction on"
7359 " variable-length vectors.\n");
7360 return false;
7361 }
7362
7363 /* For SLP reductions, see if there is a neutral value we can use. */
7364 tree neutral_op = NULL_TREE;
7365 if (slp_node)
7366 {
7367 tree initial_value = NULL_TREE;
7368 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL)
7369 initial_value = vect_phi_initial_value (reduc_def_phi);
7370 neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype_out),
7371 orig_code, initial_value);
7372 }
7373
7374 if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
7375 {
7376 /* We can't support in-order reductions of code such as this:
7377
7378 for (int i = 0; i < n1; ++i)
7379 for (int j = 0; j < n2; ++j)
7380 l += a[j];
7381
7382 since GCC effectively transforms the loop when vectorizing:
7383
7384 for (int i = 0; i < n1 / VF; ++i)
7385 for (int j = 0; j < n2; ++j)
7386 for (int k = 0; k < VF; ++k)
7387 l += a[j];
7388
7389 which is a reassociation of the original operation. */
7390 if (dump_enabled_p ())
7391 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7392 "in-order double reduction not supported.\n");
7393
7394 return false;
7395 }
7396
7397 if (reduction_type == FOLD_LEFT_REDUCTION
7398 && slp_node
7399 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7400 {
7401 /* We cannot use in-order reductions in this case because there is
7402 an implicit reassociation of the operations involved. */
7403 if (dump_enabled_p ())
7404 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7405 "in-order unchained SLP reductions not supported.\n");
7406 return false;
7407 }
7408
7409 /* For double reductions, and for SLP reductions with a neutral value,
7410 we construct a variable-length initial vector by loading a vector
7411 full of the neutral value and then shift-and-inserting the start
7412 values into the low-numbered elements. */
7413 if ((double_reduc || neutral_op)
7414 && !nunits_out.is_constant ()
7415 && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
7416 vectype_out, OPTIMIZE_FOR_SPEED))
7417 {
7418 if (dump_enabled_p ())
7419 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7420 "reduction on variable-length vectors requires"
7421 " target support for a vector-shift-and-insert"
7422 " operation.\n");
7423 return false;
7424 }
7425
7426 /* Check extra constraints for variable-length unchained SLP reductions. */
7427 if (slp_node
7428 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
7429 && !nunits_out.is_constant ())
7430 {
7431 /* We checked above that we could build the initial vector when
7432 there's a neutral element value. Check here for the case in
7433 which each SLP statement has its own initial value and in which
7434 that value needs to be repeated for every instance of the
7435 statement within the initial vector. */
7436 unsigned int group_size = SLP_TREE_LANES (slp_node);
7437 if (!neutral_op
7438 && !can_duplicate_and_interleave_p (loop_vinfo, group_size,
7439 TREE_TYPE (vectype_out)))
7440 {
7441 if (dump_enabled_p ())
7442 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7443 "unsupported form of SLP reduction for"
7444 " variable-length vectors: cannot build"
7445 " initial vector.\n");
7446 return false;
7447 }
7448 /* The epilogue code relies on the number of elements being a multiple
7449 of the group size. The duplicate-and-interleave approach to setting
7450 up the initial vector does too. */
7451 if (!multiple_p (nunits_out, group_size))
7452 {
7453 if (dump_enabled_p ())
7454 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7455 "unsupported form of SLP reduction for"
7456 " variable-length vectors: the vector size"
7457 " is not a multiple of the number of results.\n");
7458 return false;
7459 }
7460 }
7461
7462 if (reduction_type == COND_REDUCTION)
7463 {
7464 widest_int ni;
7465
7466 if (! max_loop_iterations (loop, &ni))
7467 {
7468 if (dump_enabled_p ())
7469 dump_printf_loc (MSG_NOTE, vect_location,
7470 "loop count not known, cannot create cond "
7471 "reduction.\n");
7472 return false;
7473 }
7474 /* Convert backedges to iterations. */
7475 ni += 1;
7476
7477 /* The additional index will be the same type as the condition. Check
7478 that the loop can fit into this less one (because we'll use up the
7479 zero slot for when there are no matches). */
7480 tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
7481 if (wi::geu_p (ni, wi::to_widest (max_index)))
7482 {
7483 if (dump_enabled_p ())
7484 dump_printf_loc (MSG_NOTE, vect_location,
7485 "loop size is greater than data size.\n");
7486 return false;
7487 }
7488 }
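
  /* Worked example of the check above (values chosen for illustration):
     with a 32-bit index type, max_index is 0xffffffff, so a loop of at
     most 0xfffffffe iterations is accepted; index 0 stays reserved for
     the "no match" case.  */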
7489
7490 /* In case the vectorization factor (VF) is bigger than the number
7491 of elements that we can fit in a vectype (nunits), we have to generate
7492 more than one vector stmt - i.e., we need to "unroll" the
7493 vector stmt by a factor VF/nunits. For more details see documentation
7494 in vectorizable_operation. */
7495
7496 /* If the reduction is used in an outer loop we need to generate
7497 VF intermediate results, like so (e.g. for ncopies=2):
7498 r0 = phi (init, r0)
7499 r1 = phi (init, r1)
7500 r0 = x0 + r0;
7501 r1 = x1 + r1;
7502 (i.e. we generate VF results in 2 registers).
7503 In this case we have a separate def-use cycle for each copy, and therefore
7504 for each copy we get the vector def for the reduction variable from the
7505 respective phi node created for this copy.
7506
7507 Otherwise (the reduction is unused in the loop nest), we can combine
7508 together intermediate results, like so (e.g. for ncopies=2):
7509 r = phi (init, r)
7510 r = x0 + r;
7511 r = x1 + r;
7512 (i.e. we generate VF/2 results in a single register).
7513 In this case for each copy we get the vector def for the reduction variable
7514 from the vectorized reduction operation generated in the previous iteration.
7515
7516 This only works when we see both the reduction PHI and its only consumer
7517 in vectorizable_reduction and there are no intermediate stmts
7518 participating. When unrolling we want each unrolled iteration to have its
7519 own reduction accumulator since one of the main goals of unrolling a
7520 reduction is to reduce the aggregate loop-carried latency. */
7521 if (ncopies > 1
7522 && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
7523 && reduc_chain_length == 1
7524 && loop_vinfo->suggested_unroll_factor == 1)
7525 single_defuse_cycle = true;
7526
7527 if (single_defuse_cycle || lane_reduc_code_p)
7528 {
7529 gcc_assert (op.code != COND_EXPR);
7530
7531 /* 4. Supportable by target? */
7532 bool ok = true;
7533
7534 /* 4.1. check support for the operation in the loop
7535
7536 This isn't necessary for the lane reduction codes, since they
7537 can only be produced by pattern matching, and it's up to the
7538 pattern matcher to test for support. The main reason for
7539 specifically skipping this step is to avoid rechecking whether
7540 mixed-sign dot-products can be implemented using signed
7541 dot-products. */
7542 machine_mode vec_mode = TYPE_MODE (vectype_in);
7543 if (!lane_reduc_code_p
7544 && !directly_supported_p (op.code, vectype_in, optab_vector))
7545 {
7546 if (dump_enabled_p ())
7547 dump_printf (MSG_NOTE, "op not supported by target.\n");
7548 if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
7549 || !vect_can_vectorize_without_simd_p (op.code))
7550 ok = false;
7551 else
7552 if (dump_enabled_p ())
7553 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
7554 }
7555
7556 if (vect_emulated_vector_p (vectype_in)
7557 && !vect_can_vectorize_without_simd_p (op.code))
7558 {
7559 if (dump_enabled_p ())
7560 dump_printf (MSG_NOTE, "using word mode not possible.\n");
7561 return false;
7562 }
7563
7564 /* lane-reducing operations have to go through vect_transform_reduction.
7565 For the other cases try without the single cycle optimization. */
7566 if (!ok)
7567 {
7568 if (lane_reduc_code_p)
7569 return false;
7570 else
7571 single_defuse_cycle = false;
7572 }
7573 }
7574 STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info) = single_defuse_cycle;
7575
7576 /* If the reduction stmt is one of the patterns that have lane
7577 reduction embedded we cannot handle the case of ! single_defuse_cycle. */
7578 if ((ncopies > 1 && ! single_defuse_cycle)
7579 && lane_reduc_code_p)
7580 {
7581 if (dump_enabled_p ())
7582 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7583 "multi def-use cycle not possible for lane-reducing "
7584 "reduction operation\n");
7585 return false;
7586 }
7587
7588 if (slp_node
7589 && !(!single_defuse_cycle
7590 && !lane_reduc_code_p
7591 && reduction_type != FOLD_LEFT_REDUCTION))
7592 for (i = 0; i < (int) op.num_ops; i++)
7593 if (!vect_maybe_update_slp_op_vectype (slp_op[i], vectype_op[i]))
7594 {
7595 if (dump_enabled_p ())
7596 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7597 "incompatible vector types for invariants\n");
7598 return false;
7599 }
7600
7601 if (slp_node)
7602 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7603 else
7604 vec_num = 1;
7605
7606 vect_model_reduction_cost (loop_vinfo, stmt_info, reduc_fn,
7607 reduction_type, ncopies, cost_vec);
7608 /* Cost the reduction op inside the loop if transformed via
7609 vect_transform_reduction. Otherwise this is costed by the
7610 separate vectorizable_* routines. */
7611 if (single_defuse_cycle || lane_reduc_code_p)
7612 {
7613 int factor = 1;
7614 if (vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info))
7615 /* Three dot-products and a subtraction. */
7616 factor = 4;
7617 record_stmt_cost (cost_vec, ncopies * factor, vector_stmt,
7618 stmt_info, 0, vect_body);
7619 }
7620
7621 if (dump_enabled_p ()
7622 && reduction_type == FOLD_LEFT_REDUCTION)
7623 dump_printf_loc (MSG_NOTE, vect_location,
7624 "using an in-order (fold-left) reduction.\n");
7625 STMT_VINFO_TYPE (orig_stmt_of_analysis) = cycle_phi_info_type;
7626 /* All but single defuse-cycle optimized, lane-reducing and fold-left
7627 reductions go through their own vectorizable_* routines. */
7628 if (!single_defuse_cycle
7629 && !lane_reduc_code_p
7630 && reduction_type != FOLD_LEFT_REDUCTION)
7631 {
7632 stmt_vec_info tem
7633 = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
7634 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (tem))
7635 {
7636 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (tem));
7637 tem = REDUC_GROUP_FIRST_ELEMENT (tem);
7638 }
7639 STMT_VINFO_DEF_TYPE (vect_orig_stmt (tem)) = vect_internal_def;
7640 STMT_VINFO_DEF_TYPE (tem) = vect_internal_def;
7641 }
7642 else if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
7643 {
7644 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
7645 internal_fn cond_fn = get_conditional_internal_fn (op.code, op.type);
7646
7647 if (reduction_type != FOLD_LEFT_REDUCTION
7648 && !use_mask_by_cond_expr_p (op.code, cond_fn, vectype_in)
7649 && (cond_fn == IFN_LAST
7650 || !direct_internal_fn_supported_p (cond_fn, vectype_in,
7651 OPTIMIZE_FOR_SPEED)))
7652 {
7653 if (dump_enabled_p ())
7654 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7655 "can't operate on partial vectors because"
7656 " no conditional operation is available.\n");
7657 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
7658 }
7659 else if (reduction_type == FOLD_LEFT_REDUCTION
7660 && reduc_fn == IFN_LAST
7661 && !expand_vec_cond_expr_p (vectype_in,
7662 truth_type_for (vectype_in),
7663 SSA_NAME))
7664 {
7665 if (dump_enabled_p ())
7666 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7667 "can't operate on partial vectors because"
7668 " no conditional operation is available.\n");
7669 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
7670 }
7671 else
7672 vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
7673 vectype_in, NULL);
7674 }
7675 return true;
7676 }
7677
7678 /* STMT_INFO is a dot-product reduction whose multiplication operands
7679 have different signs. Emit a sequence to emulate the operation
7680 using a series of signed DOT_PROD_EXPRs and return the last
7681 statement generated. VEC_DEST is the result of the vector operation
7682 and VOP lists its inputs. */
7683
7684 static gassign *
7685 vect_emulate_mixed_dot_prod (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
7686 gimple_stmt_iterator *gsi, tree vec_dest,
7687 tree vop[3])
7688 {
7689 tree wide_vectype = signed_type_for (TREE_TYPE (vec_dest));
7690 tree narrow_vectype = signed_type_for (TREE_TYPE (vop[0]));
7691 tree narrow_elttype = TREE_TYPE (narrow_vectype);
7692 gimple *new_stmt;
7693
7694 /* Make VOP[0] the unsigned operand and VOP[1] the signed operand. */
7695 if (!TYPE_UNSIGNED (TREE_TYPE (vop[0])))
7696 std::swap (vop[0], vop[1]);
7697
7698 /* Convert all inputs to signed types. */
7699 for (int i = 0; i < 3; ++i)
7700 if (TYPE_UNSIGNED (TREE_TYPE (vop[i])))
7701 {
7702 tree tmp = make_ssa_name (signed_type_for (TREE_TYPE (vop[i])));
7703 new_stmt = gimple_build_assign (tmp, NOP_EXPR, vop[i]);
7704 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
7705 vop[i] = tmp;
7706 }
7707
7708 /* In the comments below we assume 8-bit inputs for simplicity,
7709 but the approach works for any full integer type. */
7710
7711 /* Create a vector of -128. */
7712 tree min_narrow_elttype = TYPE_MIN_VALUE (narrow_elttype);
7713 tree min_narrow = build_vector_from_val (narrow_vectype,
7714 min_narrow_elttype);
7715
7716 /* Create a vector of 64. */
7717 auto half_wi = wi::lrshift (wi::to_wide (min_narrow_elttype), 1);
7718 tree half_narrow = wide_int_to_tree (narrow_elttype, half_wi);
7719 half_narrow = build_vector_from_val (narrow_vectype, half_narrow);
7720
7721 /* Emit: SUB_RES = VOP[0] - 128. */
7722 tree sub_res = make_ssa_name (narrow_vectype);
7723 new_stmt = gimple_build_assign (sub_res, PLUS_EXPR, vop[0], min_narrow);
7724 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
7725
7726 /* Emit:
7727
7728 STAGE1 = DOT_PROD_EXPR <VOP[1], 64, VOP[2]>;
7729 STAGE2 = DOT_PROD_EXPR <VOP[1], 64, STAGE1>;
7730 STAGE3 = DOT_PROD_EXPR <SUB_RES, -128, STAGE2>;
7731
7732 on the basis that x * y == (x - 128) * y + 64 * y + 64 * y.
7733 Doing the two 64 * y steps first allows more time to compute x. */
7734 tree stage1 = make_ssa_name (wide_vectype);
7735 new_stmt = gimple_build_assign (stage1, DOT_PROD_EXPR,
7736 vop[1], half_narrow, vop[2]);
7737 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
7738
7739 tree stage2 = make_ssa_name (wide_vectype);
7740 new_stmt = gimple_build_assign (stage2, DOT_PROD_EXPR,
7741 vop[1], half_narrow, stage1);
7742 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
7743
7744 tree stage3 = make_ssa_name (wide_vectype);
7745 new_stmt = gimple_build_assign (stage3, DOT_PROD_EXPR,
7746 sub_res, vop[1], stage2);
7747 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
7748
7749 /* Convert STAGE3 to the reduction type. */
7750 return gimple_build_assign (vec_dest, CONVERT_EXPR, stage3);
7751 }
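
/* Numeric check of the identity used above (8-bit example, values
   chosen for illustration): for x = 200 (unsigned) and y = -3 (signed),
     (x - 128) * y + 64 * y + 64 * y = 72 * -3 - 192 - 192 = -600 = x * y,
   and every multiplication operand now lies in the signed range
   [-128, 127] required by the signed DOT_PROD_EXPRs.  */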
7752
7753 /* Transform the definition stmt STMT_INFO of a reduction PHI backedge
7754 value. */
7755
7756 bool
7757 vect_transform_reduction (loop_vec_info loop_vinfo,
7758 stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
7759 gimple **vec_stmt, slp_tree slp_node)
7760 {
7761 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7762 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7763 int i;
7764 int ncopies;
7765 int vec_num;
7766
7767 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
7768 gcc_assert (reduc_info->is_reduc_info);
7769
7770 if (nested_in_vect_loop_p (loop, stmt_info))
7771 {
7772 loop = loop->inner;
7773 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info) == vect_double_reduction_def);
7774 }
7775
7776 gimple_match_op op;
7777 if (!gimple_extract_op (stmt_info->stmt, &op))
7778 gcc_unreachable ();
7779
7780 /* All uses but the last are expected to be defined in the loop.
7781 The last use is the reduction variable. In case of nested cycle this
7782 assumption is not true: we use reduc_index to record the index of the
7783 reduction variable. */
7784 stmt_vec_info phi_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info));
7785 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
7786 int reduc_index = STMT_VINFO_REDUC_IDX (stmt_info);
7787 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
7788
7789 if (slp_node)
7790 {
7791 ncopies = 1;
7792 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7793 }
7794 else
7795 {
7796 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7797 vec_num = 1;
7798 }
7799
7800 code_helper code = canonicalize_code (op.code, op.type);
7801 internal_fn cond_fn = get_conditional_internal_fn (code, op.type);
7802 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
7803 bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
7804
7805 /* Transform. */
7806 tree new_temp = NULL_TREE;
7807 auto_vec<tree> vec_oprnds0;
7808 auto_vec<tree> vec_oprnds1;
7809 auto_vec<tree> vec_oprnds2;
7810 tree def0;
7811
7812 if (dump_enabled_p ())
7813 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
7814
7815 /* FORNOW: Multiple types are not supported for condition. */
7816 if (code == COND_EXPR)
7817 gcc_assert (ncopies == 1);
7818
7819 bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
7820
7821 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
7822 if (reduction_type == FOLD_LEFT_REDUCTION)
7823 {
7824 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
7825 gcc_assert (code.is_tree_code ());
7826 return vectorize_fold_left_reduction
7827 (loop_vinfo, stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi,
7828 tree_code (code), reduc_fn, op.ops, vectype_in, reduc_index, masks);
7829 }
7830
7831 bool single_defuse_cycle = STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info);
7832 gcc_assert (single_defuse_cycle
7833 || code == DOT_PROD_EXPR
7834 || code == WIDEN_SUM_EXPR
7835 || code == SAD_EXPR);
7836
7837 /* Create the destination vector */
7838 tree scalar_dest = gimple_get_lhs (stmt_info->stmt);
7839 tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
7840
7841 vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
7842 single_defuse_cycle && reduc_index == 0
7843 ? NULL_TREE : op.ops[0], &vec_oprnds0,
7844 single_defuse_cycle && reduc_index == 1
7845 ? NULL_TREE : op.ops[1], &vec_oprnds1,
7846 op.num_ops == 3
7847 && !(single_defuse_cycle && reduc_index == 2)
7848 ? op.ops[2] : NULL_TREE, &vec_oprnds2);
7849 if (single_defuse_cycle)
7850 {
7851 gcc_assert (!slp_node);
7852 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
7853 op.ops[reduc_index],
7854 reduc_index == 0 ? &vec_oprnds0
7855 : (reduc_index == 1 ? &vec_oprnds1
7856 : &vec_oprnds2));
7857 }
7858
7859 bool emulated_mixed_dot_prod
7860 = vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info);
7861 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
7862 {
7863 gimple *new_stmt;
7864 tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
7865 if (masked_loop_p && !mask_by_cond_expr)
7866 {
7867 /* No conditional ifns have been defined for dot-product yet. */
7868 gcc_assert (code != DOT_PROD_EXPR);
7869
7870 /* Make sure that the reduction accumulator is vop[0]. */
7871 if (reduc_index == 1)
7872 {
7873 gcc_assert (commutative_binary_op_p (code, op.type));
7874 std::swap (vop[0], vop[1]);
7875 }
7876 tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7877 vectype_in, i);
7878 gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
7879 vop[0], vop[1], vop[0]);
7880 new_temp = make_ssa_name (vec_dest, call);
7881 gimple_call_set_lhs (call, new_temp);
7882 gimple_call_set_nothrow (call, true);
7883 vect_finish_stmt_generation (loop_vinfo, stmt_info, call, gsi);
7884 new_stmt = call;
7885 }
7886 else
7887 {
7888 if (op.num_ops == 3)
7889 vop[2] = vec_oprnds2[i];
7890
7891 if (masked_loop_p && mask_by_cond_expr)
7892 {
7893 tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7894 vectype_in, i);
7895 build_vect_cond_expr (code, vop, mask, gsi);
7896 }
7897
7898 if (emulated_mixed_dot_prod)
7899 new_stmt = vect_emulate_mixed_dot_prod (loop_vinfo, stmt_info, gsi,
7900 vec_dest, vop);
7901 else if (code.is_internal_fn ())
7902 new_stmt = gimple_build_call_internal (internal_fn (code),
7903 op.num_ops,
7904 vop[0], vop[1], vop[2]);
7905 else
7906 new_stmt = gimple_build_assign (vec_dest, tree_code (op.code),
7907 vop[0], vop[1], vop[2]);
7908 new_temp = make_ssa_name (vec_dest, new_stmt);
7909 gimple_set_lhs (new_stmt, new_temp);
7910 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
7911 }
7912
7913 if (slp_node)
7914 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
7915 else if (single_defuse_cycle
7916 && i < ncopies - 1)
7917 {
7918 if (reduc_index == 0)
7919 vec_oprnds0.safe_push (gimple_get_lhs (new_stmt));
7920 else if (reduc_index == 1)
7921 vec_oprnds1.safe_push (gimple_get_lhs (new_stmt));
7922 else if (reduc_index == 2)
7923 vec_oprnds2.safe_push (gimple_get_lhs (new_stmt));
7924 }
7925 else
7926 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
7927 }
7928
7929 if (!slp_node)
7930 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
7931
7932 return true;
7933 }
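
/* Illustrative sketch of the fully-masked path above when a conditional
   internal function is available (GIMPLE-like; SSA names invented for
   the example): a masked vector add into the accumulator becomes

     vect_acc_2 = .COND_ADD (loop_mask_1, vect_acc_1, vect_x_3, vect_acc_1);

   i.e. lanes switched off by the mask keep the previous accumulator
   value, matching the (mask, vop[0], vop[1], vop[0]) operand order used
   above.  */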
7934
7935 /* Transform phase of a cycle PHI. */
7936
7937 bool
7938 vect_transform_cycle_phi (loop_vec_info loop_vinfo,
7939 stmt_vec_info stmt_info, gimple **vec_stmt,
7940 slp_tree slp_node, slp_instance slp_node_instance)
7941 {
7942 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7943 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7944 int i;
7945 int ncopies;
7946 int j;
7947 bool nested_cycle = false;
7948 int vec_num;
7949
7950 if (nested_in_vect_loop_p (loop, stmt_info))
7951 {
7952 loop = loop->inner;
7953 nested_cycle = true;
7954 }
7955
7956 stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
7957 reduc_stmt_info = vect_stmt_to_vectorize (reduc_stmt_info);
7958 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
7959 gcc_assert (reduc_info->is_reduc_info);
7960
7961 if (STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION
7962 || STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION)
7963 /* Leave the scalar phi in place. */
7964 return true;
7965
7966 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
7967 /* For a nested cycle we do not fill the above. */
7968 if (!vectype_in)
7969 vectype_in = STMT_VINFO_VECTYPE (stmt_info);
7970 gcc_assert (vectype_in);
7971
7972 if (slp_node)
7973 {
7974 /* The size vect_schedule_slp_instance computes is off for us. */
7975 vec_num = vect_get_num_vectors (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
7976 * SLP_TREE_LANES (slp_node), vectype_in);
7977 ncopies = 1;
7978 }
7979 else
7980 {
7981 vec_num = 1;
7982 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7983 }
7984
7985 /* Check whether we should use a single PHI node and accumulate
7986 vectors to one before the backedge. */
7987 if (STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info))
7988 ncopies = 1;
7989
7990 /* Create the destination vector */
7991 gphi *phi = as_a <gphi *> (stmt_info->stmt);
7992 tree vec_dest = vect_create_destination_var (gimple_phi_result (phi),
7993 vectype_out);
7994
7995 /* Get the loop-entry arguments. */
7996 tree vec_initial_def = NULL_TREE;
7997 auto_vec<tree> vec_initial_defs;
7998 if (slp_node)
7999 {
8000 vec_initial_defs.reserve (vec_num);
8001 if (nested_cycle)
8002 {
8003 unsigned phi_idx = loop_preheader_edge (loop)->dest_idx;
8004 vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[phi_idx],
8005 &vec_initial_defs);
8006 }
8007 else
8008 {
8009 gcc_assert (slp_node == slp_node_instance->reduc_phis);
8010 vec<tree> &initial_values = reduc_info->reduc_initial_values;
8011 vec<stmt_vec_info> &stmts = SLP_TREE_SCALAR_STMTS (slp_node);
8012
8013 unsigned int num_phis = stmts.length ();
8014 if (REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info))
8015 num_phis = 1;
8016 initial_values.reserve (num_phis);
8017 for (unsigned int i = 0; i < num_phis; ++i)
8018 {
8019 gphi *this_phi = as_a<gphi *> (stmts[i]->stmt);
8020 initial_values.quick_push (vect_phi_initial_value (this_phi));
8021 }
8022 if (vec_num == 1)
8023 vect_find_reusable_accumulator (loop_vinfo, reduc_info);
8024 if (!initial_values.is_empty ())
8025 {
8026 tree initial_value
8027 = (num_phis == 1 ? initial_values[0] : NULL_TREE);
8028 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
8029 tree neutral_op
8030 = neutral_op_for_reduction (TREE_TYPE (vectype_out),
8031 code, initial_value);
8032 get_initial_defs_for_reduction (loop_vinfo, reduc_info,
8033 &vec_initial_defs, vec_num,
8034 stmts.length (), neutral_op);
8035 }
8036 }
8037 }
8038 else
8039 {
8040 /* Get at the scalar def before the loop, that defines the initial
8041 value of the reduction variable. */
8042 tree initial_def = vect_phi_initial_value (phi);
8043 reduc_info->reduc_initial_values.safe_push (initial_def);
8044 /* Optimize: if for REDUC_MAX the initial_def is smaller than the base
8045 and we can't use zero for induc_val, use initial_def. Similarly
8046 for REDUC_MIN and an initial_def larger than the base. */
8047 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
8048 {
8049 tree induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
8050 if (TREE_CODE (initial_def) == INTEGER_CST
8051 && !integer_zerop (induc_val)
8052 && ((STMT_VINFO_REDUC_CODE (reduc_info) == MAX_EXPR
8053 && tree_int_cst_lt (initial_def, induc_val))
8054 || (STMT_VINFO_REDUC_CODE (reduc_info) == MIN_EXPR
8055 && tree_int_cst_lt (induc_val, initial_def))))
8056 {
8057 induc_val = initial_def;
8058 /* Communicate that we used the initial_def to epilogue
8059 generation. */
8060 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info) = NULL_TREE;
8061 }
8062 vec_initial_def = build_vector_from_val (vectype_out, induc_val);
8063 }
8064 else if (nested_cycle)
8065 {
8066 /* Do not use an adjustment def as that case is not supported
8067 correctly if ncopies is not one. */
8068 vect_get_vec_defs_for_operand (loop_vinfo, reduc_stmt_info,
8069 ncopies, initial_def,
8070 &vec_initial_defs);
8071 }
8072 else if (STMT_VINFO_REDUC_TYPE (reduc_info) == CONST_COND_REDUCTION
8073 || STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
8074 /* Fill the initial vector with the initial scalar value. */
8075 vec_initial_def
8076 = get_initial_def_for_reduction (loop_vinfo, reduc_stmt_info,
8077 initial_def, initial_def);
8078 else
8079 {
8080 if (ncopies == 1)
8081 vect_find_reusable_accumulator (loop_vinfo, reduc_info);
8082 if (!reduc_info->reduc_initial_values.is_empty ())
8083 {
8084 initial_def = reduc_info->reduc_initial_values[0];
8085 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
8086 tree neutral_op
8087 = neutral_op_for_reduction (TREE_TYPE (initial_def),
8088 code, initial_def);
8089 gcc_assert (neutral_op);
8090 /* Try to simplify the vector initialization by applying an
8091 adjustment after the reduction has been performed. */
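/* For example (illustrative): for a PLUS reduction with initial value 10
   we can start the vector accumulator from the neutral value 0 and add
   the 10 back as an epilogue adjustment.  */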
8092 if (!reduc_info->reused_accumulator
8093 && STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
8094 && !operand_equal_p (neutral_op, initial_def))
8095 {
8096 STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info)
8097 = initial_def;
8098 initial_def = neutral_op;
8099 }
8100 vec_initial_def
8101 = get_initial_def_for_reduction (loop_vinfo, reduc_info,
8102 initial_def, neutral_op);
8103 }
8104 }
8105 }
8106
8107 if (vec_initial_def)
8108 {
8109 vec_initial_defs.create (ncopies);
8110 for (i = 0; i < ncopies; ++i)
8111 vec_initial_defs.quick_push (vec_initial_def);
8112 }
8113
8114 if (auto *accumulator = reduc_info->reused_accumulator)
8115 {
8116 tree def = accumulator->reduc_input;
8117 if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
8118 {
8119 unsigned int nreduc;
8120 bool res = constant_multiple_p (TYPE_VECTOR_SUBPARTS
8121 (TREE_TYPE (def)),
8122 TYPE_VECTOR_SUBPARTS (vectype_out),
8123 &nreduc);
8124 gcc_assert (res);
8125 gimple_seq stmts = NULL;
8126 /* Reduce the single vector to a smaller one. */
8127 if (nreduc != 1)
8128 {
8129 /* Perform the reduction in the appropriate type. */
8130 tree rvectype = vectype_out;
8131 if (!useless_type_conversion_p (TREE_TYPE (vectype_out),
8132 TREE_TYPE (TREE_TYPE (def))))
8133 rvectype = build_vector_type (TREE_TYPE (TREE_TYPE (def)),
8134 TYPE_VECTOR_SUBPARTS
8135 (vectype_out));
8136 def = vect_create_partial_epilog (def, rvectype,
8137 STMT_VINFO_REDUC_CODE
8138 (reduc_info),
8139 &stmts);
8140 }
8141 /* The epilogue loop might use a different vector mode, like
8142 VNx2DI vs. V2DI. */
8143 if (TYPE_MODE (vectype_out) != TYPE_MODE (TREE_TYPE (def)))
8144 {
8145 tree reduc_type = build_vector_type_for_mode
8146 (TREE_TYPE (TREE_TYPE (def)), TYPE_MODE (vectype_out));
8147 def = gimple_convert (&stmts, reduc_type, def);
8148 }
8149 /* Adjust the input so we pick up the partially reduced value
8150 for the skip edge in vect_create_epilog_for_reduction. */
8151 accumulator->reduc_input = def;
8152 /* And the reduction could be carried out using a different sign. */
8153 if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
8154 def = gimple_convert (&stmts, vectype_out, def);
8155 if (loop_vinfo->main_loop_edge)
8156 {
8157 /* While we'd like to insert on the edge, this would split
8158 blocks and disturb bookkeeping; we will also eventually
8159 need this on the skip edge. Rely on sinking to
8160 fix up the optimal placement and insert in the pred. */
8161 gimple_stmt_iterator gsi
8162 = gsi_last_bb (loop_vinfo->main_loop_edge->src);
8163 /* Insert before a cond that eventually skips the
8164 epilogue. */
8165 if (!gsi_end_p (gsi) && stmt_ends_bb_p (gsi_stmt (gsi)))
8166 gsi_prev (&gsi);
8167 gsi_insert_seq_after (&gsi, stmts, GSI_CONTINUE_LINKING);
8168 }
8169 else
8170 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop),
8171 stmts);
8172 }
8173 if (loop_vinfo->main_loop_edge)
8174 vec_initial_defs[0]
8175 = vect_get_main_loop_result (loop_vinfo, def,
8176 vec_initial_defs[0]);
8177 else
8178 vec_initial_defs.safe_push (def);
8179 }
8180
8181 /* Generate the reduction PHIs upfront. */
8182 for (i = 0; i < vec_num; i++)
8183 {
8184 tree vec_init_def = vec_initial_defs[i];
8185 for (j = 0; j < ncopies; j++)
8186 {
8187 /* Create the reduction-phi that defines the reduction
8188 operand. */
8189 gphi *new_phi = create_phi_node (vec_dest, loop->header);
8190
8191 /* Set the loop-entry arg of the reduction-phi. */
8192 if (j != 0 && nested_cycle)
8193 vec_init_def = vec_initial_defs[j];
8194 add_phi_arg (new_phi, vec_init_def, loop_preheader_edge (loop),
8195 UNKNOWN_LOCATION);
8196
8197 /* The loop-latch arg is set in epilogue processing. */
8198
8199 if (slp_node)
8200 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
8201 else
8202 {
8203 if (j == 0)
8204 *vec_stmt = new_phi;
8205 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
8206 }
8207 }
8208 }
8209
8210 return true;
8211 }
8212
8213 /* Vectorizes LC PHIs. */
8214
8215 bool
8216 vectorizable_lc_phi (loop_vec_info loop_vinfo,
8217 stmt_vec_info stmt_info, gimple **vec_stmt,
8218 slp_tree slp_node)
8219 {
8220 if (!loop_vinfo
8221 || !is_a <gphi *> (stmt_info->stmt)
8222 || gimple_phi_num_args (stmt_info->stmt) != 1)
8223 return false;
8224
8225 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def
8226 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
8227 return false;
8228
8229 if (!vec_stmt) /* transformation not required. */
8230 {
8231 /* Deal with copies from externs or constants that are disguised as
8232 loop-closed PHI nodes (PR97886). */
8233 if (slp_node
8234 && !vect_maybe_update_slp_op_vectype (SLP_TREE_CHILDREN (slp_node)[0],
8235 SLP_TREE_VECTYPE (slp_node)))
8236 {
8237 if (dump_enabled_p ())
8238 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8239 "incompatible vector types for invariants\n");
8240 return false;
8241 }
8242 STMT_VINFO_TYPE (stmt_info) = lc_phi_info_type;
8243 return true;
8244 }
8245
8246 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
8247 tree scalar_dest = gimple_phi_result (stmt_info->stmt);
8248 basic_block bb = gimple_bb (stmt_info->stmt);
8249 edge e = single_pred_edge (bb);
8250 tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
8251 auto_vec<tree> vec_oprnds;
8252 vect_get_vec_defs (loop_vinfo, stmt_info, slp_node,
8253 !slp_node ? vect_get_num_copies (loop_vinfo, vectype) : 1,
8254 gimple_phi_arg_def (stmt_info->stmt, 0), &vec_oprnds);
8255 for (unsigned i = 0; i < vec_oprnds.length (); i++)
8256 {
8257 /* Create the vectorized LC PHI node. */
8258 gphi *new_phi = create_phi_node (vec_dest, bb);
8259 add_phi_arg (new_phi, vec_oprnds[i], e, UNKNOWN_LOCATION);
8260 if (slp_node)
8261 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
8262 else
8263 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
8264 }
8265 if (!slp_node)
8266 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
8267
8268 return true;
8269 }
8270
8271 /* Vectorizes PHIs. */
8272
8273 bool
8274 vectorizable_phi (vec_info *,
8275 stmt_vec_info stmt_info, gimple **vec_stmt,
8276 slp_tree slp_node, stmt_vector_for_cost *cost_vec)
8277 {
8278 if (!is_a <gphi *> (stmt_info->stmt) || !slp_node)
8279 return false;
8280
8281 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def)
8282 return false;
8283
8284 tree vectype = SLP_TREE_VECTYPE (slp_node);
8285
8286 if (!vec_stmt) /* transformation not required. */
8287 {
8288 slp_tree child;
8289 unsigned i;
8290 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), i, child)
8291 if (!child)
8292 {
8293 if (dump_enabled_p ())
8294 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8295 "PHI node with unvectorized backedge def\n");
8296 return false;
8297 }
8298 else if (!vect_maybe_update_slp_op_vectype (child, vectype))
8299 {
8300 if (dump_enabled_p ())
8301 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8302 "incompatible vector types for invariants\n");
8303 return false;
8304 }
8305 else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
8306 && !useless_type_conversion_p (vectype,
8307 SLP_TREE_VECTYPE (child)))
8308 {
8309 /* With bools we can have mask and non-mask precision vectors
8310 or different non-mask precisions. While pattern recognition is
8311 supposed to guarantee consistency here, bugs in it can cause
8312 mismatches (PR103489 and PR103800 for example).
8313 Deal with them here instead of ICEing later. */
8314 if (dump_enabled_p ())
8315 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8316 "incompatible vector type setup from "
8317 "bool pattern detection\n");
8318 return false;
8319 }
8320
8321 /* For single-argument PHIs assume coalescing which means zero cost
8322 for the scalar and the vector PHIs. This avoids artificially
8323 favoring the vector path (but may pessimize it in some cases). */
8324 if (gimple_phi_num_args (as_a <gphi *> (stmt_info->stmt)) > 1)
8325 record_stmt_cost (cost_vec, SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
8326 vector_stmt, stmt_info, vectype, 0, vect_body);
8327 STMT_VINFO_TYPE (stmt_info) = phi_info_type;
8328 return true;
8329 }
8330
8331 tree scalar_dest = gimple_phi_result (stmt_info->stmt);
8332 basic_block bb = gimple_bb (stmt_info->stmt);
8333 tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
8334 auto_vec<gphi *> new_phis;
8335 for (unsigned i = 0; i < gimple_phi_num_args (stmt_info->stmt); ++i)
8336 {
8337 slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
8338
8339 /* Skip not yet vectorized defs. */
8340 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
8341 && SLP_TREE_VEC_STMTS (child).is_empty ())
8342 continue;
8343
8344 auto_vec<tree> vec_oprnds;
8345 vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[i], &vec_oprnds);
8346 if (!new_phis.exists ())
8347 {
8348 new_phis.create (vec_oprnds.length ());
8349 for (unsigned j = 0; j < vec_oprnds.length (); j++)
8350 {
8351 /* Create the vectorized LC PHI node. */
8352 new_phis.quick_push (create_phi_node (vec_dest, bb));
8353 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phis[j]);
8354 }
8355 }
8356 edge e = gimple_phi_arg_edge (as_a <gphi *> (stmt_info->stmt), i);
8357 for (unsigned j = 0; j < vec_oprnds.length (); j++)
8358 add_phi_arg (new_phis[j], vec_oprnds[j], e, UNKNOWN_LOCATION);
8359 }
8360 /* We should have at least one already vectorized child. */
8361 gcc_assert (new_phis.exists ());
8362
8363 return true;
8364 }
8365
8366 /* Vectorizes first order recurrences. An overview of the transformation
8367 is described below. Suppose we have the following loop.
8368
8369 int t = 0;
8370 for (int i = 0; i < n; ++i)
8371 {
8372 b[i] = a[i] - t;
8373 t = a[i];
8374 }
8375
8376 There is a first-order recurrence on 'a'. For this loop, the scalar IR
8377 looks (simplified) like:
8378
8379 scalar.preheader:
8380 init = 0;
8381
8382 scalar.body:
8383 i = PHI <0(scalar.preheader), i+1(scalar.body)>
8384 _2 = PHI <init(scalar.preheader), _1(scalar.body)>
8385 _1 = a[i]
8386 b[i] = _1 - _2
8387 if (i < n) goto scalar.body
8388
8389 In this example, _2 is a recurrence because its value depends on the
8390 previous iteration. We vectorize this as (VF = 4)
8391
8392 vector.preheader:
8393 vect_init = vect_cst(..., ..., ..., 0)
8394
8395 vector.body
8396 i = PHI <0(vector.preheader), i+4(vector.body)>
8397 vect_1 = PHI <vect_init(vector.preheader), v2(vector.body)>
8398 vect_2 = a[i, i+1, i+2, i+3];
8399 vect_3 = vec_perm (vect_1, vect_2, { 3, 4, 5, 6 })
8400 b[i, i+1, i+2, i+3] = vect_2 - vect_3
8401 if (..) goto vector.body
8402
8403 In this function, vectorizable_recurr, we code generate both the
8404 vector PHI node and the permute since those together compute the
8405 vectorized value of the scalar PHI. We do not yet have the
8406 backedge value to fill in there nor into the vec_perm. Those
8407 are filled in maybe_set_vectorized_backedge_value and
8408 vect_schedule_scc.
8409
8410 TODO: Since the scalar loop does not have a use of the recurrence
8411 outside of the loop, the natural way to implement peeling via
8412 vectorizing the live value doesn't work. For now peeling of loops
8413 with a recurrence is not implemented. For SLP the supported cases
8414 are restricted to those requiring a single vector recurrence PHI. */
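/* Illustrative only (not generated code): what the VF = 4 transform
   above computes, written as plain scalar C with the { 3, 4, 5, 6 }
   permute expanded by hand and n assumed to be a multiple of 4:

     void recur (int *a, int *b, int n)
     {
       int t = 0;
       for (int i = 0; i < n; i += 4)
         {
           // vect_2 = { a[i], a[i+1], a[i+2], a[i+3] }
           // vect_3 = vec_perm (vect_1, vect_2, { 3, 4, 5, 6 })
           //        = { t, a[i], a[i+1], a[i+2] }
           b[i]     = a[i]     - t;
           b[i + 1] = a[i + 1] - a[i];
           b[i + 2] = a[i + 2] - a[i + 1];
           b[i + 3] = a[i + 3] - a[i + 2];
           t = a[i + 3];
         }
     }  */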
8415
8416 bool
8417 vectorizable_recurr (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
8418 gimple **vec_stmt, slp_tree slp_node,
8419 stmt_vector_for_cost *cost_vec)
8420 {
8421 if (!loop_vinfo || !is_a<gphi *> (stmt_info->stmt))
8422 return false;
8423
8424 gphi *phi = as_a<gphi *> (stmt_info->stmt);
8425
8426 /* So far we only support first-order recurrence auto-vectorization. */
8427 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_first_order_recurrence)
8428 return false;
8429
8430 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
8431 unsigned ncopies;
8432 if (slp_node)
8433 ncopies = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8434 else
8435 ncopies = vect_get_num_copies (loop_vinfo, vectype);
8436 poly_int64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8437 unsigned dist = slp_node ? SLP_TREE_LANES (slp_node) : 1;
8438 /* We need to be able to make progress with a single vector. */
8439 if (maybe_gt (dist * 2, nunits))
8440 {
8441 if (dump_enabled_p ())
8442 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8443 "first order recurrence exceeds half of "
8444 "a vector\n");
8445 return false;
8446 }
8447
8448 /* First-order recurrence autovectorization needs to handle permutation
8449 with indices = [nunits-1, nunits, nunits+1, ...]. */
8450 vec_perm_builder sel (nunits, 1, 3);
8451 for (int i = 0; i < 3; ++i)
8452 sel.quick_push (nunits - dist + i);
8453 vec_perm_indices indices (sel, 2, nunits);
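/* For example (illustrative): with nunits = 4 and dist = 1 this builds
   the selector { 3, 4, 5, 6 } used in the overview above; for an SLP
   group with dist = 2 it would be { 2, 3, 4, 5 }.  */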
8454
8455 if (!vec_stmt) /* transformation not required. */
8456 {
8457 if (!can_vec_perm_const_p (TYPE_MODE (vectype), TYPE_MODE (vectype),
8458 indices))
8459 return false;
8460
8461 if (slp_node)
8462 {
8463 /* We eventually need to set a vector type on invariant
8464 arguments. */
8465 unsigned j;
8466 slp_tree child;
8467 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
8468 if (!vect_maybe_update_slp_op_vectype
8469 (child, SLP_TREE_VECTYPE (slp_node)))
8470 {
8471 if (dump_enabled_p ())
8472 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8473 "incompatible vector types for "
8474 "invariants\n");
8475 return false;
8476 }
8477 }
8478 /* The recurrence costs the initialization vector and one permute
8479 for each copy. */
8480 unsigned prologue_cost = record_stmt_cost (cost_vec, 1, scalar_to_vec,
8481 stmt_info, 0, vect_prologue);
8482 unsigned inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
8483 stmt_info, 0, vect_body);
8484 if (dump_enabled_p ())
8485 dump_printf_loc (MSG_NOTE, vect_location,
8486 "vectorizable_recurr: inside_cost = %d, "
8487 "prologue_cost = %d .\n", inside_cost,
8488 prologue_cost);
8489
8490 STMT_VINFO_TYPE (stmt_info) = recurr_info_type;
8491 return true;
8492 }
8493
8494 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
8495 basic_block bb = gimple_bb (phi);
8496 tree preheader = PHI_ARG_DEF_FROM_EDGE (phi, pe);
8497 if (!useless_type_conversion_p (TREE_TYPE (vectype), TREE_TYPE (preheader)))
8498 {
8499 gimple_seq stmts = NULL;
8500 preheader = gimple_convert (&stmts, TREE_TYPE (vectype), preheader);
8501 gsi_insert_seq_on_edge_immediate (pe, stmts);
8502 }
8503 tree vec_init = build_vector_from_val (vectype, preheader);
8504 vec_init = vect_init_vector (loop_vinfo, stmt_info, vec_init, vectype, NULL);
8505
8506 /* Create the vectorized first-order PHI node. */
8507 tree vec_dest = vect_get_new_vect_var (vectype,
8508 vect_simple_var, "vec_recur_");
8509 gphi *new_phi = create_phi_node (vec_dest, bb);
8510 add_phi_arg (new_phi, vec_init, pe, UNKNOWN_LOCATION);
8511
8512 /* Insert the shuffles for the first-order recurrence autovectorization:
8513 result = VEC_PERM <vec_recur, vect_1, index[nunits-1, nunits, ...]>. */
8514 tree perm = vect_gen_perm_mask_checked (vectype, indices);
8515
8516 /* Insert the required permute after the latch definition. The
8517 second and later operands are tentative and will be updated when we have
8518 vectorized the latch definition. */
8519 edge le = loop_latch_edge (LOOP_VINFO_LOOP (loop_vinfo));
8520 gimple *latch_def = SSA_NAME_DEF_STMT (PHI_ARG_DEF_FROM_EDGE (phi, le));
8521 gimple_stmt_iterator gsi2 = gsi_for_stmt (latch_def);
8522 gsi_next (&gsi2);
8523
8524 for (unsigned i = 0; i < ncopies; ++i)
8525 {
8526 vec_dest = make_ssa_name (vectype);
8527 gassign *vperm
8528 = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
8529 i == 0 ? gimple_phi_result (new_phi) : NULL,
8530 NULL, perm);
8531 vect_finish_stmt_generation (loop_vinfo, stmt_info, vperm, &gsi2);
8532
8533 if (slp_node)
8534 SLP_TREE_VEC_STMTS (slp_node).quick_push (vperm);
8535 else
8536 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (vperm);
8537 }
8538
8539 if (!slp_node)
8540 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
8541 return true;
8542 }
8543
8544 /* Return true if VECTYPE represents a vector that requires lowering
8545 by the vector lowering pass. */
8546
8547 bool
8548 vect_emulated_vector_p (tree vectype)
8549 {
8550 return (!VECTOR_MODE_P (TYPE_MODE (vectype))
8551 && (!VECTOR_BOOLEAN_TYPE_P (vectype)
8552 || TYPE_PRECISION (TREE_TYPE (vectype)) != 1));
8553 }
8554
8555 /* Return true if we can emulate CODE on an integer mode representation
8556 of a vector. */
8557
8558 bool
8559 vect_can_vectorize_without_simd_p (tree_code code)
8560 {
8561 switch (code)
8562 {
8563 case PLUS_EXPR:
8564 case MINUS_EXPR:
8565 case NEGATE_EXPR:
8566 case BIT_AND_EXPR:
8567 case BIT_IOR_EXPR:
8568 case BIT_XOR_EXPR:
8569 case BIT_NOT_EXPR:
8570 return true;
8571
8572 default:
8573 return false;
8574 }
8575 }
8576
8577 /* Likewise, but taking a code_helper. */
8578
8579 bool
8580 vect_can_vectorize_without_simd_p (code_helper code)
8581 {
8582 return (code.is_tree_code ()
8583 && vect_can_vectorize_without_simd_p (tree_code (code)));
8584 }
8585
8586 /* Create vector init for vectorized iv. */
8587 static tree
8588 vect_create_nonlinear_iv_init (gimple_seq* stmts, tree init_expr,
8589 tree step_expr, poly_uint64 nunits,
8590 tree vectype,
8591 enum vect_induction_op_type induction_type)
8592 {
8593 unsigned HOST_WIDE_INT const_nunits;
8594 tree vec_shift, vec_init, new_name;
8595 unsigned i;
8596 tree itype = TREE_TYPE (vectype);
8597
8598 /* iv_loop is the loop to be vectorized. Create the initial vector for
8599 the nonlinear iv, e.g. [X, X*S, X*S^2, X*S^3] for mult (S = step_expr, X = init_expr). */
8600 new_name = gimple_convert (stmts, itype, init_expr);
8601 switch (induction_type)
8602 {
8603 case vect_step_op_shr:
8604 case vect_step_op_shl:
8605 /* Build the initial value by shifting init_expr by the series [0, S, 2*S, ...]. */
8606 vec_init = gimple_build_vector_from_val (stmts,
8607 vectype,
8608 new_name);
8609 vec_shift = gimple_build (stmts, VEC_SERIES_EXPR, vectype,
8610 build_zero_cst (itype), step_expr);
8611 vec_init = gimple_build (stmts,
8612 (induction_type == vect_step_op_shr
8613 ? RSHIFT_EXPR : LSHIFT_EXPR),
8614 vectype, vec_init, vec_shift);
8615 break;
8616
8617 case vect_step_op_neg:
8618 {
8619 vec_init = gimple_build_vector_from_val (stmts,
8620 vectype,
8621 new_name);
8622 tree vec_neg = gimple_build (stmts, NEGATE_EXPR,
8623 vectype, vec_init);
8624 /* The encoding has 2 interleaved stepped patterns. */
8625 vec_perm_builder sel (nunits, 2, 3);
8626 sel.quick_grow (6);
8627 for (i = 0; i < 3; i++)
8628 {
8629 sel[2 * i] = i;
8630 sel[2 * i + 1] = i + nunits;
8631 }
8632 vec_perm_indices indices (sel, 2, nunits);
8633 /* Don't use vect_gen_perm_mask_checked since can_vec_perm_const_p may
8634 fail when vec_init is const vector. In that situation vec_perm is not
8635 really needed. */
8636 tree perm_mask_even
8637 = vect_gen_perm_mask_any (vectype, indices);
8638 vec_init = gimple_build (stmts, VEC_PERM_EXPR,
8639 vectype,
8640 vec_init, vec_neg,
8641 perm_mask_even);
8642 }
8643 break;
8644
8645 case vect_step_op_mul:
8646 {
8647 /* Use unsigned mult to avoid undefined behavior on signed integer overflow. */
8648 gcc_assert (nunits.is_constant (&const_nunits));
8649 tree utype = unsigned_type_for (itype);
8650 tree uvectype = build_vector_type (utype,
8651 TYPE_VECTOR_SUBPARTS (vectype));
8652 new_name = gimple_convert (stmts, utype, new_name);
8653 vec_init = gimple_build_vector_from_val (stmts,
8654 uvectype,
8655 new_name);
8656 tree_vector_builder elts (uvectype, const_nunits, 1);
8657 tree elt_step = build_one_cst (utype);
8658
8659 elts.quick_push (elt_step);
8660 for (i = 1; i < const_nunits; i++)
8661 {
8662 /* Create: new_name_i = new_name + step_expr. */
8663 elt_step = gimple_build (stmts, MULT_EXPR,
8664 utype, elt_step, step_expr);
8665 elts.quick_push (elt_step);
8666 }
8667 /* Create a vector from [new_name_0, new_name_1, ...,
8668 new_name_nunits-1]. */
8669 tree vec_mul = gimple_build_vector (stmts, &elts);
8670 vec_init = gimple_build (stmts, MULT_EXPR, uvectype,
8671 vec_init, vec_mul);
8672 vec_init = gimple_convert (stmts, vectype, vec_init);
8673 }
8674 break;
8675
8676 default:
8677 gcc_unreachable ();
8678 }
8679
8680 return vec_init;
8681 }
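/* Worked examples (illustrative): with X = 16 and nunits = 4 the
   initial vectors built above are
     shr, S = 1:  [16 >> 0, 16 >> 1, 16 >> 2, 16 >> 3] = [16, 8, 4, 2]
     shl, S = 1:  [16 << 0, 16 << 1, 16 << 2, 16 << 3] = [16, 32, 64, 128]
     neg:         [16, -16, 16, -16]
     mul, S = 2:  [16 * 1, 16 * 2, 16 * 4, 16 * 8]     = [16, 32, 64, 128].  */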
8682
8683 /* Peel init_expr by skip_niters iterations for induction_type. */
8684 tree
8685 vect_peel_nonlinear_iv_init (gimple_seq* stmts, tree init_expr,
8686 tree skip_niters, tree step_expr,
8687 enum vect_induction_op_type induction_type)
8688 {
8689 gcc_assert (TREE_CODE (skip_niters) == INTEGER_CST);
8690 tree type = TREE_TYPE (init_expr);
8691 unsigned prec = TYPE_PRECISION (type);
8692 switch (induction_type)
8693 {
8694 case vect_step_op_neg:
8695 if (TREE_INT_CST_LOW (skip_niters) % 2)
8696 init_expr = gimple_build (stmts, NEGATE_EXPR, type, init_expr);
8697 /* else no change. */
8698 break;
8699
8700 case vect_step_op_shr:
8701 case vect_step_op_shl:
8702 skip_niters = gimple_convert (stmts, type, skip_niters);
8703 step_expr = gimple_build (stmts, MULT_EXPR, type, step_expr, skip_niters);
8704 /* When the shift amount >= precision, we need to avoid undefined behavior.
8705 In the original loop there's no UB, and according to the semantics,
8706 init_expr should be 0 for lshr, ashl, and >>= (prec - 1) for ashr. */
8707 if (!tree_fits_uhwi_p (step_expr)
8708 || tree_to_uhwi (step_expr) >= prec)
8709 {
8710 if (induction_type == vect_step_op_shl
8711 || TYPE_UNSIGNED (type))
8712 init_expr = build_zero_cst (type);
8713 else
8714 init_expr = gimple_build (stmts, RSHIFT_EXPR, type,
8715 init_expr,
8716 wide_int_to_tree (type, prec - 1));
8717 }
8718 else
8719 init_expr = gimple_build (stmts, (induction_type == vect_step_op_shr
8720 ? RSHIFT_EXPR : LSHIFT_EXPR),
8721 type, init_expr, step_expr);
8722 break;
8723
8724 case vect_step_op_mul:
8725 {
8726 tree utype = unsigned_type_for (type);
8727 init_expr = gimple_convert (stmts, utype, init_expr);
8728 unsigned skipn = TREE_INT_CST_LOW (skip_niters);
8729 wide_int begin = wi::to_wide (step_expr);
8730 for (unsigned i = 0; i != skipn - 1; i++)
8731 begin = wi::mul (begin, wi::to_wide (step_expr));
8732 tree mult_expr = wide_int_to_tree (utype, begin);
8733 init_expr = gimple_build (stmts, MULT_EXPR, utype, init_expr, mult_expr);
8734 init_expr = gimple_convert (stmts, type, init_expr);
8735 }
8736 break;
8737
8738 default:
8739 gcc_unreachable ();
8740 }
8741
8742 return init_expr;
8743 }
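/* Worked examples (illustrative): peeling skip_niters = 3 iterations of
   x *= 2 with init 3 yields 3 * 2^3 = 24; peeling 3 iterations of
   x >>= 1 with init 64 yields 64 >> 3 = 8; peeling an odd number of
   iterations of x = -x simply negates the initial value.  */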
8744
8745 /* Create vector step for vectorized iv. */
8746 static tree
8747 vect_create_nonlinear_iv_step (gimple_seq* stmts, tree step_expr,
8748 poly_uint64 vf,
8749 enum vect_induction_op_type induction_type)
8750 {
8751 tree expr = build_int_cst (TREE_TYPE (step_expr), vf);
8752 tree new_name = NULL;
8753 /* Step should be pow (step, vf) for mult induction. */
8754 if (induction_type == vect_step_op_mul)
8755 {
8756 gcc_assert (vf.is_constant ());
8757 wide_int begin = wi::to_wide (step_expr);
8758
8759 for (unsigned i = 0; i != vf.to_constant () - 1; i++)
8760 begin = wi::mul (begin, wi::to_wide (step_expr));
8761
8762 new_name = wide_int_to_tree (TREE_TYPE (step_expr), begin);
8763 }
8764 else if (induction_type == vect_step_op_neg)
8765 /* Do nothing. */
8766 ;
8767 else
8768 new_name = gimple_build (stmts, MULT_EXPR, TREE_TYPE (step_expr),
8769 expr, step_expr);
8770 return new_name;
8771 }
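/* E.g. (illustrative) for VF = 4: a mult induction with step S gets the
   vector step pow (S, 4), shift inductions get 4 * S, and a neg
   induction needs no step at all.  */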
8772
8773 static tree
8774 vect_create_nonlinear_iv_vec_step (loop_vec_info loop_vinfo,
8775 stmt_vec_info stmt_info,
8776 tree new_name, tree vectype,
8777 enum vect_induction_op_type induction_type)
8778 {
8779 /* No step is needed for neg induction. */
8780 if (induction_type == vect_step_op_neg)
8781 return NULL;
8782
8783 tree t = unshare_expr (new_name);
8784 gcc_assert (CONSTANT_CLASS_P (new_name)
8785 || TREE_CODE (new_name) == SSA_NAME);
8786 tree new_vec = build_vector_from_val (vectype, t);
8787 tree vec_step = vect_init_vector (loop_vinfo, stmt_info,
8788 new_vec, vectype, NULL);
8789 return vec_step;
8790 }
8791
8792 /* Update the vectorized iv INDUC_DEF by VEC_STEP according to INDUCTION_TYPE. */
8793 static tree
8794 vect_update_nonlinear_iv (gimple_seq* stmts, tree vectype,
8795 tree induc_def, tree vec_step,
8796 enum vect_induction_op_type induction_type)
8797 {
8798 tree vec_def = induc_def;
8799 switch (induction_type)
8800 {
8801 case vect_step_op_mul:
8802 {
8803 /* Use unsigned mult to avoid undefined behavior on signed integer overflow. */
8804 tree uvectype
8805 = build_vector_type (unsigned_type_for (TREE_TYPE (vectype)),
8806 TYPE_VECTOR_SUBPARTS (vectype));
8807 vec_def = gimple_convert (stmts, uvectype, vec_def);
8808 vec_step = gimple_convert (stmts, uvectype, vec_step);
8809 vec_def = gimple_build (stmts, MULT_EXPR, uvectype,
8810 vec_def, vec_step);
8811 vec_def = gimple_convert (stmts, vectype, vec_def);
8812 }
8813 break;
8814
8815 case vect_step_op_shr:
8816 vec_def = gimple_build (stmts, RSHIFT_EXPR, vectype,
8817 vec_def, vec_step);
8818 break;
8819
8820 case vect_step_op_shl:
8821 vec_def = gimple_build (stmts, LSHIFT_EXPR, vectype,
8822 vec_def, vec_step);
8823 break;
8824 case vect_step_op_neg:
8825 vec_def = induc_def;
8826 /* Do nothing. */
8827 break;
8828 default:
8829 gcc_unreachable ();
8830 }
8831
8832 return vec_def;
8833
8834 }
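/* E.g. (illustrative) with VF = 4: for x >>= 1 each vector iteration
   computes vec_def = vec_iv >> { 4, 4, 4, 4 }, and for x *= S it
   computes vec_def = vec_iv * { S^4, S^4, S^4, S^4 }.  */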
8835
8836 /* Function vectorizable_nonlinear_induction
8837
8838 Check if STMT_INFO performs a nonlinear induction computation that can be
8839 vectorized. If VEC_STMT is also passed, vectorize the induction PHI: create
8840 a vectorized phi to replace it, put it in VEC_STMT, and add it to the same
8841 basic block.
8842 Return true if STMT_INFO is vectorizable in this way. */
8843
8844 static bool
8845 vectorizable_nonlinear_induction (loop_vec_info loop_vinfo,
8846 stmt_vec_info stmt_info,
8847 gimple **vec_stmt, slp_tree slp_node,
8848 stmt_vector_for_cost *cost_vec)
8849 {
8850 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8851 unsigned ncopies;
8852 bool nested_in_vect_loop = false;
8853 class loop *iv_loop;
8854 tree vec_def;
8855 edge pe = loop_preheader_edge (loop);
8856 basic_block new_bb;
8857 tree vec_init, vec_step;
8858 tree new_name;
8859 gimple *new_stmt;
8860 gphi *induction_phi;
8861 tree induc_def, vec_dest;
8862 tree init_expr, step_expr;
8863 tree niters_skip;
8864 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8865 unsigned i;
8866 gimple_stmt_iterator si;
8867
8868 gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
8869
8870 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
8871 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8872 enum vect_induction_op_type induction_type
8873 = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info);
8874
8875 gcc_assert (induction_type > vect_step_op_add);
8876
8877 if (slp_node)
8878 ncopies = 1;
8879 else
8880 ncopies = vect_get_num_copies (loop_vinfo, vectype);
8881 gcc_assert (ncopies >= 1);
8882
8883 /* FORNOW. Only handle nonlinear induction in the same loop. */
8884 if (nested_in_vect_loop_p (loop, stmt_info))
8885 {
8886 if (dump_enabled_p ())
8887 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8888 "nonlinear induction in nested loop.\n");
8889 return false;
8890 }
8891
8892 iv_loop = loop;
8893 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
8894
8895 /* TODO: Support SLP for nonlinear ivs. There should be a separate vector iv
8896 update for each iv and a permutation to generate the wanted vector iv. */
8897 if (slp_node)
8898 {
8899 if (dump_enabled_p ())
8900 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8901 "SLP induction not supported for nonlinear"
8902 " induction.\n");
8903 return false;
8904 }
8905
8906 if (!INTEGRAL_TYPE_P (TREE_TYPE (vectype)))
8907 {
8908 if (dump_enabled_p ())
8909 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8910 "floating point nonlinear induction vectorization"
8911 " not supported.\n");
8912 return false;
8913 }
8914
8915 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
8916 init_expr = vect_phi_initial_value (phi);
8917 gcc_assert (step_expr != NULL_TREE && init_expr != NULL
8918 && TREE_CODE (step_expr) == INTEGER_CST);
8919 /* step_expr should have the same type as init_expr,
8920 i.e. for uint64 a >> 1 the step is int, but a vector<uint64> shift is used. */
8921 step_expr = fold_convert (TREE_TYPE (vectype), step_expr);
8922
8923 if (TREE_CODE (init_expr) == INTEGER_CST)
8924 init_expr = fold_convert (TREE_TYPE (vectype), init_expr);
8925 else
8926 gcc_assert (tree_nop_conversion_p (TREE_TYPE (vectype),
8927 TREE_TYPE (init_expr)));
8928
8929 switch (induction_type)
8930 {
8931 case vect_step_op_neg:
8932 if (TREE_CODE (init_expr) != INTEGER_CST
8933 && TREE_CODE (init_expr) != REAL_CST)
8934 {
8935 /* Check for backend support of NEGATE_EXPR and vec_perm. */
8936 if (!directly_supported_p (NEGATE_EXPR, vectype))
8937 return false;
8938
8939 /* The encoding has 2 interleaved stepped patterns. */
8940 vec_perm_builder sel (nunits, 2, 3);
8941 machine_mode mode = TYPE_MODE (vectype);
8942 sel.quick_grow (6);
8943 for (i = 0; i < 3; i++)
8944 {
8945 sel[i * 2] = i;
8946 sel[i * 2 + 1] = i + nunits;
8947 }
8948 vec_perm_indices indices (sel, 2, nunits);
8949 if (!can_vec_perm_const_p (mode, mode, indices))
8950 return false;
8951 }
8952 break;
8953
8954 case vect_step_op_mul:
8955 {
8956 /* Check for backend support of MULT_EXPR. */
8957 if (!directly_supported_p (MULT_EXPR, vectype))
8958 return false;
8959
8960 /* ??? How to construct the vector step for variable-length vectors:
8961 [ 1, step, pow (step, 2), pow (step, 3), ... ]. */
8962 if (!vf.is_constant ())
8963 return false;
8964 }
8965 break;
8966
8967 case vect_step_op_shr:
8968 /* Check for backend support of RSHIFT_EXPR. */
8969 if (!directly_supported_p (RSHIFT_EXPR, vectype, optab_vector))
8970 return false;
8971
8972 /* Don't shift more than the type precision to avoid undefined behavior. */
8973 if (!tree_fits_uhwi_p (step_expr)
8974 || maybe_ge (nunits * tree_to_uhwi (step_expr),
8975 TYPE_PRECISION (TREE_TYPE (init_expr))))
8976 return false;
8977 break;
8978
8979 case vect_step_op_shl:
8980 /* Check for backend support of LSHIFT_EXPR. */
8981 if (!directly_supported_p (LSHIFT_EXPR, vectype, optab_vector))
8982 return false;
8983
8984 /* Don't shift more than the type precision to avoid undefined behavior. */
8985 if (!tree_fits_uhwi_p (step_expr)
8986 || maybe_ge (nunits * tree_to_uhwi (step_expr),
8987 TYPE_PRECISION (TREE_TYPE (init_expr))))
8988 return false;
8989
8990 break;
8991
8992 default:
8993 gcc_unreachable ();
8994 }
8995
8996 if (!vec_stmt) /* transformation not required. */
8997 {
8998 unsigned inside_cost = 0, prologue_cost = 0;
8999 /* loop cost for vec_loop; the neg-induction special case is
9000 handled just below. */
9001 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
9002 stmt_info, 0, vect_body);
9003
9004 /* loop cost for vec_loop. Neg induction doesn't have any
9005 inside_cost. */
9006 if (induction_type == vect_step_op_neg)
9007 inside_cost = 0;
9008
9009 /* prologue cost for vec_init and vec_step. */
9010 prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
9011 stmt_info, 0, vect_prologue);
9012
9013 if (dump_enabled_p ())
9014 dump_printf_loc (MSG_NOTE, vect_location,
9015 "vect_model_induction_cost: inside_cost = %d, "
9016 "prologue_cost = %d. \n", inside_cost,
9017 prologue_cost);
9018
9019 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
9020 DUMP_VECT_SCOPE ("vectorizable_nonlinear_induction");
9021 return true;
9022 }
9023
9024 /* Transform. */
9025
9026 /* Compute a vector variable, initialized with the first VF values of
9027 the induction variable. E.g., for an iv with IV_PHI='X' and
9028 evolution S, for a vector of 4 units, we want to compute e.g.
9029 [X, X*S, X*S^2, X*S^3] for a mult iv. */
9030
9031 if (dump_enabled_p ())
9032 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
9033
9034 pe = loop_preheader_edge (iv_loop);
9035 /* Find the first insertion point in the BB. */
9036 basic_block bb = gimple_bb (phi);
9037 si = gsi_after_labels (bb);
9038
9039 gimple_seq stmts = NULL;
9040
9041 niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
9042 /* If we are using the loop mask to "peel" for alignment then we need
9043 to adjust the start value here. */
9044 if (niters_skip != NULL_TREE)
9045 init_expr = vect_peel_nonlinear_iv_init (&stmts, init_expr, niters_skip,
9046 step_expr, induction_type);
9047
9048 vec_init = vect_create_nonlinear_iv_init (&stmts, init_expr,
9049 step_expr, nunits, vectype,
9050 induction_type);
9051 if (stmts)
9052 {
9053 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
9054 gcc_assert (!new_bb);
9055 }
9056
9057 stmts = NULL;
9058 new_name = vect_create_nonlinear_iv_step (&stmts, step_expr,
9059 vf, induction_type);
9060 if (stmts)
9061 {
9062 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
9063 gcc_assert (!new_bb);
9064 }
9065
9066 vec_step = vect_create_nonlinear_iv_vec_step (loop_vinfo, stmt_info,
9067 new_name, vectype,
9068 induction_type);
9069 /* Create the following def-use cycle:
9070 loop prolog:
9071 vec_init = ...
9072 vec_step = ...
9073 loop:
9074 vec_iv = PHI <vec_init, vec_loop>
9075 ...
9076 STMT
9077 ...
9078 vec_loop = vec_iv + vec_step; */
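/* Illustrative instance of that cycle for x >>= 1 and VF = 4:
     loop prolog:
       vec_init = { x, x >> 1, x >> 2, x >> 3 }
       vec_step = { 4, 4, 4, 4 }
     loop:
       vec_iv = PHI <vec_init, vec_loop>
       ...
       vec_loop = vec_iv >> vec_step;  */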
9079
9080 /* Create the induction-phi that defines the induction-operand. */
9081 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
9082 induction_phi = create_phi_node (vec_dest, iv_loop->header);
9083 induc_def = PHI_RESULT (induction_phi);
9084
9085 /* Create the iv update inside the loop. */
9086 stmts = NULL;
9087 vec_def = vect_update_nonlinear_iv (&stmts, vectype,
9088 induc_def, vec_step,
9089 induction_type);
9090
9091 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9092 new_stmt = SSA_NAME_DEF_STMT (vec_def);
9093
9094 /* Set the arguments of the phi node: */
9095 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
9096 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
9097 UNKNOWN_LOCATION);
9098
9099 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi);
9100 *vec_stmt = induction_phi;
9101
9102 /* In case that vectorization factor (VF) is bigger than the number
9103 of elements that we can fit in a vectype (nunits), we have to generate
9104 more than one vector stmt - i.e - we need to "unroll" the
9105 vector stmt by a factor VF/nunits. For more details see documentation
9106 in vectorizable_operation. */
9107
9108 if (ncopies > 1)
9109 {
9110 stmts = NULL;
9111 /* FORNOW. This restriction should be relaxed. */
9112 gcc_assert (!nested_in_vect_loop);
9113
9114 new_name = vect_create_nonlinear_iv_step (&stmts, step_expr,
9115 nunits, induction_type);
9116
9117 vec_step = vect_create_nonlinear_iv_vec_step (loop_vinfo, stmt_info,
9118 new_name, vectype,
9119 induction_type);
9120 vec_def = induc_def;
9121 for (i = 1; i < ncopies; i++)
9122 {
9123 /* vec_i = vec_prev + vec_step. */
9124 stmts = NULL;
9125 vec_def = vect_update_nonlinear_iv (&stmts, vectype,
9126 vec_def, vec_step,
9127 induction_type);
9128 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9129 new_stmt = SSA_NAME_DEF_STMT (vec_def);
9130 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
9131 }
9132 }
9133
9134 if (dump_enabled_p ())
9135 dump_printf_loc (MSG_NOTE, vect_location,
9136 "transform induction: created def-use cycle: %G%G",
9137 (gimple *) induction_phi, SSA_NAME_DEF_STMT (vec_def));
9138
9139 return true;
9140 }
9141
9142 /* Function vectorizable_induction
9143
9144 Check if STMT_INFO performs an induction computation that can be vectorized.
9145 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
9146 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
9147 Return true if STMT_INFO is vectorizable in this way. */
9148
9149 bool
9150 vectorizable_induction (loop_vec_info loop_vinfo,
9151 stmt_vec_info stmt_info,
9152 gimple **vec_stmt, slp_tree slp_node,
9153 stmt_vector_for_cost *cost_vec)
9154 {
9155 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9156 unsigned ncopies;
9157 bool nested_in_vect_loop = false;
9158 class loop *iv_loop;
9159 tree vec_def;
9160 edge pe = loop_preheader_edge (loop);
9161 basic_block new_bb;
9162 tree new_vec, vec_init, vec_step, t;
9163 tree new_name;
9164 gimple *new_stmt;
9165 gphi *induction_phi;
9166 tree induc_def, vec_dest;
9167 tree init_expr, step_expr;
9168 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9169 unsigned i;
9170 tree expr;
9171 gimple_stmt_iterator si;
9172 enum vect_induction_op_type induction_type
9173 = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info);
9174
9175 gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
9176 if (!phi)
9177 return false;
9178
9179 if (!STMT_VINFO_RELEVANT_P (stmt_info))
9180 return false;
9181
9182 /* Make sure it was recognized as induction computation. */
9183 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
9184 return false;
9185
9186 /* Handle nonlinear induction in a separate place. */
9187 if (induction_type != vect_step_op_add)
9188 return vectorizable_nonlinear_induction (loop_vinfo, stmt_info,
9189 vec_stmt, slp_node, cost_vec);
9190
9191 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
9192 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
9193
9194 if (slp_node)
9195 ncopies = 1;
9196 else
9197 ncopies = vect_get_num_copies (loop_vinfo, vectype);
9198 gcc_assert (ncopies >= 1);
9199
9200 /* FORNOW. These restrictions should be relaxed. */
9201 if (nested_in_vect_loop_p (loop, stmt_info))
9202 {
9203 imm_use_iterator imm_iter;
9204 use_operand_p use_p;
9205 gimple *exit_phi;
9206 edge latch_e;
9207 tree loop_arg;
9208
9209 if (ncopies > 1)
9210 {
9211 if (dump_enabled_p ())
9212 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9213 "multiple types in nested loop.\n");
9214 return false;
9215 }
9216
9217 exit_phi = NULL;
9218 latch_e = loop_latch_edge (loop->inner);
9219 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
9220 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
9221 {
9222 gimple *use_stmt = USE_STMT (use_p);
9223 if (is_gimple_debug (use_stmt))
9224 continue;
9225
9226 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
9227 {
9228 exit_phi = use_stmt;
9229 break;
9230 }
9231 }
9232 if (exit_phi)
9233 {
9234 stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
9235 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
9236 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
9237 {
9238 if (dump_enabled_p ())
9239 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9240 "inner-loop induction only used outside "
9241 "of the outer vectorized loop.\n");
9242 return false;
9243 }
9244 }
9245
9246 nested_in_vect_loop = true;
9247 iv_loop = loop->inner;
9248 }
9249 else
9250 iv_loop = loop;
9251 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
9252
9253 if (slp_node && !nunits.is_constant ())
9254 {
9255 /* The current SLP code creates the step value element-by-element. */
9256 if (dump_enabled_p ())
9257 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9258 "SLP induction not supported for variable-length"
9259 " vectors.\n");
9260 return false;
9261 }
9262
9263 if (FLOAT_TYPE_P (vectype) && !param_vect_induction_float)
9264 {
9265 if (dump_enabled_p ())
9266 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9267 "floating point induction vectorization disabled\n");
9268 return false;
9269 }
9270
9271 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
9272 gcc_assert (step_expr != NULL_TREE);
9273 tree step_vectype = get_same_sized_vectype (TREE_TYPE (step_expr), vectype);
9274
9275 /* Check for backend support of PLUS/MINUS_EXPR. */
9276 if (!directly_supported_p (PLUS_EXPR, step_vectype)
9277 || !directly_supported_p (MINUS_EXPR, step_vectype))
9278 return false;
9279
9280 if (!vec_stmt) /* transformation not required. */
9281 {
9282 unsigned inside_cost = 0, prologue_cost = 0;
9283 if (slp_node)
9284 {
9285 /* We eventually need to set a vector type on invariant
9286 arguments. */
9287 unsigned j;
9288 slp_tree child;
9289 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
9290 if (!vect_maybe_update_slp_op_vectype
9291 (child, SLP_TREE_VECTYPE (slp_node)))
9292 {
9293 if (dump_enabled_p ())
9294 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9295 "incompatible vector types for "
9296 "invariants\n");
9297 return false;
9298 }
9299 /* loop cost for vec_loop. */
9300 inside_cost
9301 = record_stmt_cost (cost_vec,
9302 SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
9303 vector_stmt, stmt_info, 0, vect_body);
9304 /* prologue cost for vec_init (if not nested) and step. */
9305 prologue_cost = record_stmt_cost (cost_vec, 1 + !nested_in_vect_loop,
9306 scalar_to_vec,
9307 stmt_info, 0, vect_prologue);
9308 }
9309 else /* if (!slp_node) */
9310 {
9311 /* loop cost for vec_loop. */
9312 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
9313 stmt_info, 0, vect_body);
9314 /* prologue cost for vec_init and vec_step. */
9315 prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
9316 stmt_info, 0, vect_prologue);
9317 }
9318 if (dump_enabled_p ())
9319 dump_printf_loc (MSG_NOTE, vect_location,
9320 "vect_model_induction_cost: inside_cost = %d, "
9321 "prologue_cost = %d .\n", inside_cost,
9322 prologue_cost);
9323
9324 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
9325 DUMP_VECT_SCOPE ("vectorizable_induction");
9326 return true;
9327 }
9328
9329 /* Transform. */
9330
9331 /* Compute a vector variable, initialized with the first VF values of
9332 the induction variable. E.g., for an iv with IV_PHI='X' and
9333 evolution S, for a vector of 4 units, we want to compute:
9334 [X, X + S, X + 2*S, X + 3*S]. */
9335
9336 if (dump_enabled_p ())
9337 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
9338
9339 pe = loop_preheader_edge (iv_loop);
9340 /* Find the first insertion point in the BB. */
9341 basic_block bb = gimple_bb (phi);
9342 si = gsi_after_labels (bb);
9343
9344 /* For SLP induction we have to generate several IVs as for example
9345 with group size 3 we need
9346 [i0, i1, i2, i0 + S0] [i1 + S1, i2 + S2, i0 + 2*S0, i1 + 2*S1]
9347 [i2 + 2*S2, i0 + 3*S0, i1 + 3*S1, i2 + 3*S2]. */
9348 if (slp_node)
9349 {
9350 /* Enforced above. */
9351 unsigned int const_nunits = nunits.to_constant ();
9352
9353 /* The initial values are vectorized, but any lanes > group_size
9354 need adjustment. */
9355 slp_tree init_node
9356 = SLP_TREE_CHILDREN (slp_node)[pe->dest_idx];
9357
9358 /* Gather steps. Since we do not vectorize inductions as
9359 cycles we have to reconstruct the step from SCEV data. */
9360 unsigned group_size = SLP_TREE_LANES (slp_node);
9361 tree *steps = XALLOCAVEC (tree, group_size);
9362 tree *inits = XALLOCAVEC (tree, group_size);
9363 stmt_vec_info phi_info;
9364 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (slp_node), i, phi_info)
9365 {
9366 steps[i] = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
9367 if (!init_node)
9368 inits[i] = gimple_phi_arg_def (as_a<gphi *> (phi_info->stmt),
9369 pe->dest_idx);
9370 }
9371
9372 /* Now generate the IVs. */
9373 unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
9374 gcc_assert ((const_nunits * nvects) % group_size == 0);
9375 unsigned nivs;
9376 if (nested_in_vect_loop)
9377 nivs = nvects;
9378 else
9379 {
9380 /* Compute the number of distinct IVs we need. First reduce
9381 group_size if it is a multiple of const_nunits so we get
9382 one IV for a group_size of 4 but const_nunits 2. */
9383 unsigned group_sizep = group_size;
9384 if (group_sizep % const_nunits == 0)
9385 group_sizep = group_sizep / const_nunits;
9386 nivs = least_common_multiple (group_sizep,
9387 const_nunits) / const_nunits;
9388 }
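/* E.g. (illustrative): group_size 3 with const_nunits 4 gives
   nivs = lcm (3, 4) / 4 = 3 as in the comment above; group_size 4
   with const_nunits 2 reduces to group_sizep = 2 and nivs = 1.  */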
9389 tree stept = TREE_TYPE (step_vectype);
9390 tree lupdate_mul = NULL_TREE;
9391 if (!nested_in_vect_loop)
9392 {
9393 /* The number of iterations covered in one vector iteration. */
9394 unsigned lup_mul = (nvects * const_nunits) / group_size;
9395 lupdate_mul
9396 = build_vector_from_val (step_vectype,
9397 SCALAR_FLOAT_TYPE_P (stept)
9398 ? build_real_from_wide (stept, lup_mul,
9399 UNSIGNED)
9400 : build_int_cstu (stept, lup_mul));
9401 }
9402 tree peel_mul = NULL_TREE;
9403 gimple_seq init_stmts = NULL;
9404 if (LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo))
9405 {
9406 if (SCALAR_FLOAT_TYPE_P (stept))
9407 peel_mul = gimple_build (&init_stmts, FLOAT_EXPR, stept,
9408 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
9409 else
9410 peel_mul = gimple_convert (&init_stmts, stept,
9411 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
9412 peel_mul = gimple_build_vector_from_val (&init_stmts,
9413 step_vectype, peel_mul);
9414 }
9415 unsigned ivn;
9416 auto_vec<tree> vec_steps;
9417 for (ivn = 0; ivn < nivs; ++ivn)
9418 {
9419 tree_vector_builder step_elts (step_vectype, const_nunits, 1);
9420 tree_vector_builder init_elts (vectype, const_nunits, 1);
9421 tree_vector_builder mul_elts (step_vectype, const_nunits, 1);
9422 for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
9423 {
9424 /* The scalar steps of the IVs. */
9425 tree elt = steps[(ivn*const_nunits + eltn) % group_size];
9426 elt = gimple_convert (&init_stmts, TREE_TYPE (step_vectype), elt);
9427 step_elts.quick_push (elt);
9428 if (!init_node)
9429 {
9430 /* The scalar inits of the IVs if not vectorized. */
9431 elt = inits[(ivn*const_nunits + eltn) % group_size];
9432 if (!useless_type_conversion_p (TREE_TYPE (vectype),
9433 TREE_TYPE (elt)))
9434 elt = gimple_build (&init_stmts, VIEW_CONVERT_EXPR,
9435 TREE_TYPE (vectype), elt);
9436 init_elts.quick_push (elt);
9437 }
9438 /* The number of steps to add to the initial values. */
9439 unsigned mul_elt = (ivn*const_nunits + eltn) / group_size;
9440 mul_elts.quick_push (SCALAR_FLOAT_TYPE_P (stept)
9441 ? build_real_from_wide (stept,
9442 mul_elt, UNSIGNED)
9443 : build_int_cstu (stept, mul_elt));
9444 }
9445 vec_step = gimple_build_vector (&init_stmts, &step_elts);
9446 vec_steps.safe_push (vec_step);
9447 tree step_mul = gimple_build_vector (&init_stmts, &mul_elts);
9448 if (peel_mul)
9449 step_mul = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
9450 step_mul, peel_mul);
9451 if (!init_node)
9452 vec_init = gimple_build_vector (&init_stmts, &init_elts);
9453
9454 /* Create the induction-phi that defines the induction-operand. */
9455 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var,
9456 "vec_iv_");
9457 induction_phi = create_phi_node (vec_dest, iv_loop->header);
9458 induc_def = PHI_RESULT (induction_phi);
9459
9460 /* Create the iv update inside the loop */
9461 tree up = vec_step;
9462 if (lupdate_mul)
9463 up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
9464 vec_step, lupdate_mul);
9465 gimple_seq stmts = NULL;
9466 vec_def = gimple_convert (&stmts, step_vectype, induc_def);
9467 vec_def = gimple_build (&stmts,
9468 PLUS_EXPR, step_vectype, vec_def, up);
9469 vec_def = gimple_convert (&stmts, vectype, vec_def);
9470 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9471 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
9472 UNKNOWN_LOCATION);
9473
9474 if (init_node)
9475 vec_init = vect_get_slp_vect_def (init_node, ivn);
9476 if (!nested_in_vect_loop
9477 && !integer_zerop (step_mul))
9478 {
9479 vec_def = gimple_convert (&init_stmts, step_vectype, vec_init);
9480 up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
9481 vec_step, step_mul);
9482 vec_def = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
9483 vec_def, up);
9484 vec_init = gimple_convert (&init_stmts, vectype, vec_def);
9485 }
9486
9487 /* Set the arguments of the phi node: */
9488 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
9489
9490 SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi);
9491 }
9492 if (!nested_in_vect_loop)
9493 {
9494 /* Fill up to the number of vectors we need for the whole group. */
9495 nivs = least_common_multiple (group_size,
9496 const_nunits) / const_nunits;
9497 vec_steps.reserve (nivs-ivn);
9498 for (; ivn < nivs; ++ivn)
9499 {
9500 SLP_TREE_VEC_STMTS (slp_node)
9501 .quick_push (SLP_TREE_VEC_STMTS (slp_node)[0]);
9502 vec_steps.quick_push (vec_steps[0]);
9503 }
9504 }
9505
9506 /* Re-use IVs when we can. We are generating further vector
9507 stmts by adding VF' * stride to the IVs generated above. */
9508 if (ivn < nvects)
9509 {
9510 unsigned vfp
9511 = least_common_multiple (group_size, const_nunits) / group_size;
9512 tree lupdate_mul
9513 = build_vector_from_val (step_vectype,
9514 SCALAR_FLOAT_TYPE_P (stept)
9515 ? build_real_from_wide (stept,
9516 vfp, UNSIGNED)
9517 : build_int_cstu (stept, vfp));
9518 for (; ivn < nvects; ++ivn)
9519 {
9520 gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs];
9521 tree def = gimple_get_lhs (iv);
9522 if (ivn < 2*nivs)
9523 vec_steps[ivn - nivs]
9524 = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
9525 vec_steps[ivn - nivs], lupdate_mul);
9526 gimple_seq stmts = NULL;
9527 def = gimple_convert (&stmts, step_vectype, def);
9528 def = gimple_build (&stmts, PLUS_EXPR, step_vectype,
9529 def, vec_steps[ivn % nivs]);
9530 def = gimple_convert (&stmts, vectype, def);
9531 if (gimple_code (iv) == GIMPLE_PHI)
9532 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9533 else
9534 {
9535 gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
9536 gsi_insert_seq_after (&tgsi, stmts, GSI_CONTINUE_LINKING);
9537 }
9538 SLP_TREE_VEC_STMTS (slp_node)
9539 .quick_push (SSA_NAME_DEF_STMT (def));
9540 }
9541 }
9542
9543 new_bb = gsi_insert_seq_on_edge_immediate (pe, init_stmts);
9544 gcc_assert (!new_bb);
9545
9546 return true;
9547 }
9548
9549 init_expr = vect_phi_initial_value (phi);
9550
9551 gimple_seq stmts = NULL;
9552 if (!nested_in_vect_loop)
9553 {
9554 /* Convert the initial value to the IV update type. */
9555 tree new_type = TREE_TYPE (step_expr);
9556 init_expr = gimple_convert (&stmts, new_type, init_expr);
9557
9558 /* If we are using the loop mask to "peel" for alignment then we need
9559 to adjust the start value here. */
9560 tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
9561 if (skip_niters != NULL_TREE)
9562 {
9563 if (FLOAT_TYPE_P (vectype))
9564 skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
9565 skip_niters);
9566 else
9567 skip_niters = gimple_convert (&stmts, new_type, skip_niters);
9568 tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
9569 skip_niters, step_expr);
9570 init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
9571 init_expr, skip_step);
9572 }
9573 }
9574
9575 if (stmts)
9576 {
9577 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
9578 gcc_assert (!new_bb);
9579 }
9580
9581 /* Create the vector that holds the initial_value of the induction. */
9582 if (nested_in_vect_loop)
9583 {
9584 /* iv_loop is nested in the loop to be vectorized. init_expr had already
9585 been created during vectorization of previous stmts. We obtain it
9586 from the STMT_VINFO_VEC_STMT of the defining stmt. */
9587 auto_vec<tree> vec_inits;
9588 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
9589 init_expr, &vec_inits);
9590 vec_init = vec_inits[0];
9591 /* If the initial value is not of proper type, convert it. */
9592 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
9593 {
9594 new_stmt
9595 = gimple_build_assign (vect_get_new_ssa_name (vectype,
9596 vect_simple_var,
9597 "vec_iv_"),
9598 VIEW_CONVERT_EXPR,
9599 build1 (VIEW_CONVERT_EXPR, vectype,
9600 vec_init));
9601 vec_init = gimple_assign_lhs (new_stmt);
9602 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
9603 new_stmt);
9604 gcc_assert (!new_bb);
9605 }
9606 }
9607 else
9608 {
9609 /* iv_loop is the loop to be vectorized. Create:
9610 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
9611 stmts = NULL;
9612 new_name = gimple_convert (&stmts, TREE_TYPE (step_expr), init_expr);
9613
9614 unsigned HOST_WIDE_INT const_nunits;
9615 if (nunits.is_constant (&const_nunits))
9616 {
9617 tree_vector_builder elts (step_vectype, const_nunits, 1);
9618 elts.quick_push (new_name);
9619 for (i = 1; i < const_nunits; i++)
9620 {
9621 /* Create: new_name_i = new_name + step_expr */
9622 new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
9623 new_name, step_expr);
9624 elts.quick_push (new_name);
9625 }
9626 /* Create a vector from [new_name_0, new_name_1, ...,
9627 new_name_nunits-1] */
9628 vec_init = gimple_build_vector (&stmts, &elts);
9629 }
9630 else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
9631 /* Build the initial value directly from a VEC_SERIES_EXPR. */
9632 vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, step_vectype,
9633 new_name, step_expr);
9634 else
9635 {
9636 /* Build:
9637 [base, base, base, ...]
9638 + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */
9639 gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
9640 gcc_assert (flag_associative_math);
9641 tree index = build_index_vector (step_vectype, 0, 1);
9642 tree base_vec = gimple_build_vector_from_val (&stmts, step_vectype,
9643 new_name);
9644 tree step_vec = gimple_build_vector_from_val (&stmts, step_vectype,
9645 step_expr);
9646 vec_init = gimple_build (&stmts, FLOAT_EXPR, step_vectype, index);
9647 vec_init = gimple_build (&stmts, MULT_EXPR, step_vectype,
9648 vec_init, step_vec);
9649 vec_init = gimple_build (&stmts, PLUS_EXPR, step_vectype,
9650 vec_init, base_vec);
9651 }
9652 vec_init = gimple_convert (&stmts, vectype, vec_init);
9653
9654 if (stmts)
9655 {
9656 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
9657 gcc_assert (!new_bb);
9658 }
9659 }
9660
9661
9662 /* Create the vector that holds the step of the induction. */
9663 if (nested_in_vect_loop)
9664 /* iv_loop is nested in the loop to be vectorized. Generate:
9665 vec_step = [S, S, S, S] */
9666 new_name = step_expr;
9667 else
9668 {
9669 /* iv_loop is the loop to be vectorized. Generate:
9670 vec_step = [VF*S, VF*S, VF*S, VF*S] */
9671 gimple_seq seq = NULL;
9672 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
9673 {
9674 expr = build_int_cst (integer_type_node, vf);
9675 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
9676 }
9677 else
9678 expr = build_int_cst (TREE_TYPE (step_expr), vf);
9679 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
9680 expr, step_expr);
9681 if (seq)
9682 {
9683 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
9684 gcc_assert (!new_bb);
9685 }
9686 }
9687
9688 t = unshare_expr (new_name);
9689 gcc_assert (CONSTANT_CLASS_P (new_name)
9690 || TREE_CODE (new_name) == SSA_NAME);
9691 new_vec = build_vector_from_val (step_vectype, t);
9692 vec_step = vect_init_vector (loop_vinfo, stmt_info,
9693 new_vec, step_vectype, NULL);
9694
9695
9696 /* Create the following def-use cycle:
9697 loop prolog:
9698 vec_init = ...
9699 vec_step = ...
9700 loop:
9701 vec_iv = PHI <vec_init, vec_loop>
9702 ...
9703 STMT
9704 ...
9705 vec_loop = vec_iv + vec_step; */
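  /* As a purely illustrative example (assuming VF == nunits == 4, a scalar
     start value X and scalar step S, and no masking or peeling for
     alignment), the cycle created below looks roughly like:

       loop prolog:
	 vec_init = { X, X+S, X+2*S, X+3*S }
	 vec_step = { 4*S, 4*S, 4*S, 4*S }
       loop:
	 vec_iv = PHI <vec_init (preheader), vec_loop (latch)>
	 ...
	 vec_loop = vec_iv + vec_step;  */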
9706
9707 /* Create the induction-phi that defines the induction-operand. */
9708 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
9709 induction_phi = create_phi_node (vec_dest, iv_loop->header);
9710 induc_def = PHI_RESULT (induction_phi);
9711
9712 /* Create the iv update inside the loop */
9713 stmts = NULL;
9714 vec_def = gimple_convert (&stmts, step_vectype, induc_def);
9715 vec_def = gimple_build (&stmts, PLUS_EXPR, step_vectype, vec_def, vec_step);
9716 vec_def = gimple_convert (&stmts, vectype, vec_def);
9717 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9718 new_stmt = SSA_NAME_DEF_STMT (vec_def);
9719
9720 /* Set the arguments of the phi node: */
9721 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
9722 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
9723 UNKNOWN_LOCATION);
9724
9725 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi);
9726 *vec_stmt = induction_phi;
9727
9728   /* If the vectorization factor (VF) is bigger than the number
9729      of elements that we can fit in a vectype (nunits), we have to generate
9730      more than one vector stmt, i.e. we need to "unroll" the
9731      vector stmt by a factor of VF/nunits.  For more details see the
9732      documentation in vectorizable_operation.  */
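  /* A purely illustrative sketch with hypothetical numbers: if VF == 8 but
     the vectype has only 4 lanes, then ncopies == 2.  The step vector built
     below is { 4*S, 4*S, 4*S, 4*S } (nunits * S per lane) and each extra
     copy is the previous copy plus that step:
       vec_iv_0 = PHI <...>                    lanes 0..3
       vec_iv_1 = vec_iv_0 + { 4*S, ... }      lanes 4..7  */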
9733
9734 if (ncopies > 1)
9735 {
9736 gimple_seq seq = NULL;
9737 /* FORNOW. This restriction should be relaxed. */
9738 gcc_assert (!nested_in_vect_loop);
9739
9740 /* Create the vector that holds the step of the induction. */
9741 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
9742 {
9743 expr = build_int_cst (integer_type_node, nunits);
9744 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
9745 }
9746 else
9747 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
9748 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
9749 expr, step_expr);
9750 if (seq)
9751 {
9752 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
9753 gcc_assert (!new_bb);
9754 }
9755
9756 t = unshare_expr (new_name);
9757 gcc_assert (CONSTANT_CLASS_P (new_name)
9758 || TREE_CODE (new_name) == SSA_NAME);
9759 new_vec = build_vector_from_val (step_vectype, t);
9760 vec_step = vect_init_vector (loop_vinfo, stmt_info,
9761 new_vec, step_vectype, NULL);
9762
9763 vec_def = induc_def;
9764 for (i = 1; i < ncopies; i++)
9765 {
9766 /* vec_i = vec_prev + vec_step */
9767 gimple_seq stmts = NULL;
9768 vec_def = gimple_convert (&stmts, step_vectype, vec_def);
9769 vec_def = gimple_build (&stmts,
9770 PLUS_EXPR, step_vectype, vec_def, vec_step);
9771 vec_def = gimple_convert (&stmts, vectype, vec_def);
9772
9773 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9774 new_stmt = SSA_NAME_DEF_STMT (vec_def);
9775 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
9776 }
9777 }
9778
9779 if (dump_enabled_p ())
9780 dump_printf_loc (MSG_NOTE, vect_location,
9781 "transform induction: created def-use cycle: %G%G",
9782 (gimple *) induction_phi, SSA_NAME_DEF_STMT (vec_def));
9783
9784 return true;
9785 }
9786
9787 /* Function vectorizable_live_operation.
9788
9789 STMT_INFO computes a value that is used outside the loop. Check if
9790 it can be supported. */
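
/* A minimal, hypothetical example of such a live statement:

     for (i = 0; i < n; i++)
       last = a[i];
     ... = last;    <-- 'last' is used after the loop

   After vectorization the scalar value is recovered by extracting a lane
   from the vectorized definition, either with a BIT_FIELD_REF or, for
   fully-masked loops, with an EXTRACT_LAST using the final loop mask.  */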
9791
9792 bool
9793 vectorizable_live_operation (vec_info *vinfo,
9794 stmt_vec_info stmt_info,
9795 gimple_stmt_iterator *gsi,
9796 slp_tree slp_node, slp_instance slp_node_instance,
9797 int slp_index, bool vec_stmt_p,
9798 stmt_vector_for_cost *cost_vec)
9799 {
9800 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
9801 imm_use_iterator imm_iter;
9802 tree lhs, lhs_type, bitsize;
9803 tree vectype = (slp_node
9804 ? SLP_TREE_VECTYPE (slp_node)
9805 : STMT_VINFO_VECTYPE (stmt_info));
9806 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
9807 int ncopies;
9808 gimple *use_stmt;
9809 auto_vec<tree> vec_oprnds;
9810 int vec_entry = 0;
9811 poly_uint64 vec_index = 0;
9812
9813 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
9814
9815 /* If a stmt of a reduction is live, vectorize it via
9816 vect_create_epilog_for_reduction. vectorizable_reduction assessed
9817 validity so just trigger the transform here. */
9818 if (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info)))
9819 {
9820 if (!vec_stmt_p)
9821 return true;
9822 if (slp_node)
9823 {
9824 /* For reduction chains the meta-info is attached to
9825 the group leader. */
9826 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
9827 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
9828 /* For SLP reductions we vectorize the epilogue for
9829 all involved stmts together. */
9830 else if (slp_index != 0)
9831 return true;
9832 }
9833 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
9834 gcc_assert (reduc_info->is_reduc_info);
9835 if (STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION
9836 || STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION)
9837 return true;
9838 vect_create_epilog_for_reduction (loop_vinfo, stmt_info, slp_node,
9839 slp_node_instance);
9840 return true;
9841 }
9842
9843 /* If STMT is not relevant and it is a simple assignment and its inputs are
9844 invariant then it can remain in place, unvectorized. The original last
9845 scalar value that it computes will be used. */
9846 if (!STMT_VINFO_RELEVANT_P (stmt_info))
9847 {
9848 gcc_assert (is_simple_and_all_uses_invariant (stmt_info, loop_vinfo));
9849 if (dump_enabled_p ())
9850 dump_printf_loc (MSG_NOTE, vect_location,
9851 "statement is simple and uses invariant. Leaving in "
9852 "place.\n");
9853 return true;
9854 }
9855
9856 if (slp_node)
9857 ncopies = 1;
9858 else
9859 ncopies = vect_get_num_copies (loop_vinfo, vectype);
9860
9861 if (slp_node)
9862 {
9863 gcc_assert (slp_index >= 0);
9864
9865 /* Get the last occurrence of the scalar index from the concatenation of
9866 all the slp vectors. Calculate which slp vector it is and the index
9867 within. */
9868 int num_scalar = SLP_TREE_LANES (slp_node);
9869 int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
9870 poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
9871
9872 /* Calculate which vector contains the result, and which lane of
9873 that vector we need. */
9874 if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
9875 {
9876 if (dump_enabled_p ())
9877 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9878 "Cannot determine which vector holds the"
9879 " final result.\n");
9880 return false;
9881 }
9882 }
9883
9884 if (!vec_stmt_p)
9885 {
9886 /* No transformation required. */
9887 if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
9888 {
9889 if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
9890 OPTIMIZE_FOR_SPEED))
9891 {
9892 if (dump_enabled_p ())
9893 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9894 "can't operate on partial vectors "
9895 "because the target doesn't support extract "
9896 "last reduction.\n");
9897 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
9898 }
9899 else if (slp_node)
9900 {
9901 if (dump_enabled_p ())
9902 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9903 "can't operate on partial vectors "
9904 "because an SLP statement is live after "
9905 "the loop.\n");
9906 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
9907 }
9908 else if (ncopies > 1)
9909 {
9910 if (dump_enabled_p ())
9911 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9912 "can't operate on partial vectors "
9913 "because ncopies is greater than 1.\n");
9914 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
9915 }
9916 else
9917 {
9918 gcc_assert (ncopies == 1 && !slp_node);
9919 vect_record_loop_mask (loop_vinfo,
9920 &LOOP_VINFO_MASKS (loop_vinfo),
9921 1, vectype, NULL);
9922 }
9923 }
9924 /* ??? Enable for loop costing as well. */
9925 if (!loop_vinfo)
9926 record_stmt_cost (cost_vec, 1, vec_to_scalar, stmt_info, NULL_TREE,
9927 0, vect_epilogue);
9928 return true;
9929 }
9930
9931 /* Use the lhs of the original scalar statement. */
9932 gimple *stmt = vect_orig_stmt (stmt_info)->stmt;
9933 if (dump_enabled_p ())
9934 dump_printf_loc (MSG_NOTE, vect_location, "extracting lane for live "
9935 "stmt %G", stmt);
9936
9937 lhs = gimple_get_lhs (stmt);
9938 lhs_type = TREE_TYPE (lhs);
9939
9940 bitsize = vector_element_bits_tree (vectype);
9941
9942 /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
9943 tree vec_lhs, bitstart;
9944 gimple *vec_stmt;
9945 if (slp_node)
9946 {
9947 gcc_assert (!loop_vinfo || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
9948
9949 /* Get the correct slp vectorized stmt. */
9950 vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry];
9951 vec_lhs = gimple_get_lhs (vec_stmt);
9952
9953 /* Get entry to use. */
9954 bitstart = bitsize_int (vec_index);
9955 bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
9956 }
9957 else
9958 {
9959 /* For multiple copies, get the last copy. */
9960 vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info).last ();
9961 vec_lhs = gimple_get_lhs (vec_stmt);
9962
9963 /* Get the last lane in the vector. */
9964 bitstart = int_const_binop (MULT_EXPR, bitsize, bitsize_int (nunits - 1));
9965 }
9966
9967 if (loop_vinfo)
9968 {
9969       /* To ensure that VEC_LHS for the lane-extraction stmts satisfies the
9970 	 loop-closed PHI requirement, insert one phi node for it.  It looks like:
9971 loop;
9972 BB:
9973 # lhs' = PHI <lhs>
9974 ==>
9975 loop;
9976 BB:
9977 # vec_lhs' = PHI <vec_lhs>
9978 new_tree = lane_extract <vec_lhs', ...>;
9979 lhs' = new_tree; */
9980
9981 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9982 basic_block exit_bb = single_exit (loop)->dest;
9983 gcc_assert (single_pred_p (exit_bb));
9984
9985 tree vec_lhs_phi = copy_ssa_name (vec_lhs);
9986 gimple *phi = create_phi_node (vec_lhs_phi, exit_bb);
9987 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, vec_lhs);
9988
9989 gimple_seq stmts = NULL;
9990 tree new_tree;
9991 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
9992 {
9993 /* Emit:
9994
9995 SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
9996
9997 where VEC_LHS is the vectorized live-out result and MASK is
9998 the loop mask for the final iteration. */
9999 gcc_assert (ncopies == 1 && !slp_node);
10000 tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
10001 tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo),
10002 1, vectype, 0);
10003 tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
10004 mask, vec_lhs_phi);
10005
10006 /* Convert the extracted vector element to the scalar type. */
10007 new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
10008 }
10009 else
10010 {
10011 tree bftype = TREE_TYPE (vectype);
10012 if (VECTOR_BOOLEAN_TYPE_P (vectype))
10013 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
10014 new_tree = build3 (BIT_FIELD_REF, bftype,
10015 vec_lhs_phi, bitsize, bitstart);
10016 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
10017 &stmts, true, NULL_TREE);
10018 }
10019
10020 if (stmts)
10021 {
10022 gimple_stmt_iterator exit_gsi = gsi_after_labels (exit_bb);
10023 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
10024
10025 	  /* Remove the existing phi that defines lhs and create one copy from new_tree.  */
10026 tree lhs_phi = NULL_TREE;
10027 gimple_stmt_iterator gsi;
10028 for (gsi = gsi_start_phis (exit_bb);
10029 !gsi_end_p (gsi); gsi_next (&gsi))
10030 {
10031 gimple *phi = gsi_stmt (gsi);
10032 	      if (gimple_phi_arg_def (phi, 0) == lhs)
10033 {
10034 remove_phi_node (&gsi, false);
10035 lhs_phi = gimple_phi_result (phi);
10036 gimple *copy = gimple_build_assign (lhs_phi, new_tree);
10037 gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT);
10038 break;
10039 }
10040 }
10041 }
10042
10043       /* Replace uses of lhs with the newly computed result.  If the use stmt is
10044 	 a single-argument PHI, just replace all uses of the PHI result.  This is
10045 	 necessary because the LC SSA PHI defining lhs may precede the new stmt.  */
10046 use_operand_p use_p;
10047 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
10048 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
10049 && !is_gimple_debug (use_stmt))
10050 {
10051 if (gimple_code (use_stmt) == GIMPLE_PHI
10052 && gimple_phi_num_args (use_stmt) == 1)
10053 {
10054 replace_uses_by (gimple_phi_result (use_stmt), new_tree);
10055 }
10056 else
10057 {
10058 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
10059 SET_USE (use_p, new_tree);
10060 }
10061 update_stmt (use_stmt);
10062 }
10063 }
10064 else
10065 {
10066 /* For basic-block vectorization simply insert the lane-extraction. */
10067 tree bftype = TREE_TYPE (vectype);
10068 if (VECTOR_BOOLEAN_TYPE_P (vectype))
10069 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
10070 tree new_tree = build3 (BIT_FIELD_REF, bftype,
10071 vec_lhs, bitsize, bitstart);
10072 gimple_seq stmts = NULL;
10073 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
10074 &stmts, true, NULL_TREE);
10075 if (TREE_CODE (new_tree) == SSA_NAME
10076 && SSA_NAME_OCCURS_IN_ABNORMAL_PHI (lhs))
10077 SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_tree) = 1;
10078 if (is_a <gphi *> (vec_stmt))
10079 {
10080 gimple_stmt_iterator si = gsi_after_labels (gimple_bb (vec_stmt));
10081 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10082 }
10083 else
10084 {
10085 gimple_stmt_iterator si = gsi_for_stmt (vec_stmt);
10086 gsi_insert_seq_after (&si, stmts, GSI_SAME_STMT);
10087 }
10088
10089       /* Replace uses of lhs with the newly computed result.  If the use stmt is
10090 	 a single-argument PHI, just replace all uses of the PHI result.  This is
10091 	 necessary because the LC SSA PHI defining lhs may precede the new stmt.  */
10092 use_operand_p use_p;
10093 stmt_vec_info use_stmt_info;
10094 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
10095 if (!is_gimple_debug (use_stmt)
10096 && (!(use_stmt_info = vinfo->lookup_stmt (use_stmt))
10097 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info))))
10098 {
10099 /* ??? This can happen when the live lane ends up being
10100 used in a vector construction code-generated by an
10101 external SLP node (and code-generation for that already
10102 happened). See gcc.dg/vect/bb-slp-47.c.
10103 Doing this is what would happen if that vector CTOR
10104 were not code-generated yet so it is not too bad.
10105 ??? In fact we'd likely want to avoid this situation
10106 in the first place. */
10107 if (TREE_CODE (new_tree) == SSA_NAME
10108 && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
10109 && gimple_code (use_stmt) != GIMPLE_PHI
10110 && !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (new_tree),
10111 use_stmt))
10112 {
10113 enum tree_code code = gimple_assign_rhs_code (use_stmt);
10114 gcc_assert (code == CONSTRUCTOR
10115 || code == VIEW_CONVERT_EXPR
10116 || CONVERT_EXPR_CODE_P (code));
10117 if (dump_enabled_p ())
10118 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10119 "Using original scalar computation for "
10120 				     "live lane because use precedes vector "
10121 "def\n");
10122 continue;
10123 }
10124 /* ??? It can also happen that we end up pulling a def into
10125 a loop where replacing out-of-loop uses would require
10126 a new LC SSA PHI node. Retain the original scalar in
10127 those cases as well. PR98064. */
10128 if (TREE_CODE (new_tree) == SSA_NAME
10129 && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
10130 && (gimple_bb (use_stmt)->loop_father
10131 != gimple_bb (vec_stmt)->loop_father)
10132 && !flow_loop_nested_p (gimple_bb (vec_stmt)->loop_father,
10133 gimple_bb (use_stmt)->loop_father))
10134 {
10135 if (dump_enabled_p ())
10136 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10137 "Using original scalar computation for "
10138 "live lane because there is an out-of-loop "
10139 "definition for it\n");
10140 continue;
10141 }
10142 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
10143 SET_USE (use_p, new_tree);
10144 update_stmt (use_stmt);
10145 }
10146 }
10147
10148 return true;
10149 }
10150
10151 /* Kill any debug uses outside LOOP of SSA names defined in STMT_INFO. */
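
/* For illustration (hypothetical names): a debug bind outside LOOP such as

     # DEBUG x => i_7

   where i_7 is defined by a scalar statement that will not survive
   vectorization gets its value reset, becoming "# DEBUG x => NULL", so
   debug consumers simply see the variable as optimized away.  */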
10152
10153 static void
10154 vect_loop_kill_debug_uses (class loop *loop, stmt_vec_info stmt_info)
10155 {
10156 ssa_op_iter op_iter;
10157 imm_use_iterator imm_iter;
10158 def_operand_p def_p;
10159 gimple *ustmt;
10160
10161 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt_info->stmt, op_iter, SSA_OP_DEF)
10162 {
10163 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
10164 {
10165 basic_block bb;
10166
10167 if (!is_gimple_debug (ustmt))
10168 continue;
10169
10170 bb = gimple_bb (ustmt);
10171
10172 if (!flow_bb_inside_loop_p (loop, bb))
10173 {
10174 if (gimple_debug_bind_p (ustmt))
10175 {
10176 if (dump_enabled_p ())
10177 dump_printf_loc (MSG_NOTE, vect_location,
10178 "killing debug use\n");
10179
10180 gimple_debug_bind_reset_value (ustmt);
10181 update_stmt (ustmt);
10182 }
10183 else
10184 gcc_unreachable ();
10185 }
10186 }
10187 }
10188 }
10189
10190 /* Given the loop represented by LOOP_VINFO, return true if the computation of
10191 LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
10192 otherwise. */
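
/* For instance (illustrative only): if the niters type is 32-bit unsigned
   and the latch may execute 0xffffffff times, then NITERSM1 == 0xffffffff
   and NITERS == NITERSM1 + 1 wraps to 0, so neither check below succeeds
   and we return false.  With NITERSM1 == 999 and NITERS == 1000 the
   constant-case comparison succeeds and we return true.  */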
10193
10194 static bool
10195 loop_niters_no_overflow (loop_vec_info loop_vinfo)
10196 {
10197 /* Constant case. */
10198 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
10199 {
10200 tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
10201 tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
10202
10203 gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
10204 gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
10205 if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
10206 return true;
10207 }
10208
10209 widest_int max;
10210 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
10211 /* Check the upper bound of loop niters. */
10212 if (get_max_loop_iterations (loop, &max))
10213 {
10214 tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
10215 signop sgn = TYPE_SIGN (type);
10216 widest_int type_max = widest_int::from (wi::max_value (type), sgn);
10217 if (max < type_max)
10218 return true;
10219 }
10220 return false;
10221 }
10222
10223 /* Return a mask type with half as many elements as OLD_TYPE,
10224    given that it should have mode NEW_MODE.  */
10225
10226 tree
10227 vect_halve_mask_nunits (tree old_type, machine_mode new_mode)
10228 {
10229 poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (old_type), 2);
10230 return build_truth_vector_type_for_mode (nunits, new_mode);
10231 }
10232
10233 /* Return a mask type with twice as many elements as OLD_TYPE,
10234 given that it should have mode NEW_MODE. */
10235
10236 tree
10237 vect_double_mask_nunits (tree old_type, machine_mode new_mode)
10238 {
10239 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (old_type) * 2;
10240 return build_truth_vector_type_for_mode (nunits, new_mode);
10241 }
10242
10243 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
10244 contain a sequence of NVECTORS masks that each control a vector of type
10245 VECTYPE. If SCALAR_MASK is nonnull, the fully-masked loop would AND
10246 these vector masks with the vector version of SCALAR_MASK. */
10247
10248 void
10249 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
10250 unsigned int nvectors, tree vectype, tree scalar_mask)
10251 {
10252 gcc_assert (nvectors != 0);
10253 if (masks->length () < nvectors)
10254 masks->safe_grow_cleared (nvectors, true);
10255 rgroup_controls *rgm = &(*masks)[nvectors - 1];
10256 /* The number of scalars per iteration and the number of vectors are
10257 both compile-time constants. */
10258 unsigned int nscalars_per_iter
10259 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
10260 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
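  /* Worked example with hypothetical numbers: for a vectorization factor
     of 4 and an rgroup that needs 2 masks controlling vectors of 8
     elements each, nscalars_per_iter == 2 * 8 / 4 == 4, i.e. the rgroup's
     masks control 4 scalar values per iteration of the original loop.  */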
10261
10262 if (scalar_mask)
10263 {
10264 scalar_cond_masked_key cond (scalar_mask, nvectors);
10265 loop_vinfo->scalar_cond_masked_set.add (cond);
10266 }
10267
10268 if (rgm->max_nscalars_per_iter < nscalars_per_iter)
10269 {
10270 rgm->max_nscalars_per_iter = nscalars_per_iter;
10271 rgm->type = truth_type_for (vectype);
10272 rgm->factor = 1;
10273 }
10274 }
10275
10276 /* Given a complete set of masks MASKS, extract mask number INDEX
10277 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
10278 where 0 <= INDEX < NVECTORS. Insert any set-up statements before GSI.
10279
10280 See the comment above vec_loop_masks for more details about the mask
10281 arrangement. */
10282
10283 tree
10284 vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
10285 unsigned int nvectors, tree vectype, unsigned int index)
10286 {
10287 rgroup_controls *rgm = &(*masks)[nvectors - 1];
10288 tree mask_type = rgm->type;
10289
10290 /* Populate the rgroup's mask array, if this is the first time we've
10291 used it. */
10292 if (rgm->controls.is_empty ())
10293 {
10294 rgm->controls.safe_grow_cleared (nvectors, true);
10295 for (unsigned int i = 0; i < nvectors; ++i)
10296 {
10297 tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
10298 /* Provide a dummy definition until the real one is available. */
10299 SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
10300 rgm->controls[i] = mask;
10301 }
10302 }
10303
10304 tree mask = rgm->controls[index];
10305 if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
10306 TYPE_VECTOR_SUBPARTS (vectype)))
10307 {
10308 /* A loop mask for data type X can be reused for data type Y
10309 if X has N times more elements than Y and if Y's elements
10310 are N times bigger than X's. In this case each sequence
10311 of N elements in the loop mask will be all-zero or all-one.
10312 We can then view-convert the mask so that each sequence of
10313 N elements is replaced by a single element. */
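      /* For illustration: a mask created for 16 QImode elements can be
	 reused for 8 HImode elements of the same total size (N == 2).
	 Every pair of adjacent elements in the 16-element mask is known
	 to be all-zero or all-one, so the VIEW_CONVERT_EXPR below simply
	 collapses each pair into a single element of the 8-element
	 mask type.  */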
10314 gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
10315 TYPE_VECTOR_SUBPARTS (vectype)));
10316 gimple_seq seq = NULL;
10317 mask_type = truth_type_for (vectype);
10318 mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
10319 if (seq)
10320 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
10321 }
10322 return mask;
10323 }
10324
10325 /* Record that LOOP_VINFO would need LENS to contain a sequence of NVECTORS
10326 lengths for controlling an operation on VECTYPE. The operation splits
10327 each element of VECTYPE into FACTOR separate subelements, measuring the
10328 length as a number of these subelements. */
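
/* For illustration with hypothetical numbers: a load of 4 SImode elements
   whose length has to be measured in bytes would be recorded with
   FACTOR == 4, so a length of 8 subelements (bytes) covers the first
   2 SImode elements.  */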
10329
10330 void
10331 vect_record_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
10332 unsigned int nvectors, tree vectype, unsigned int factor)
10333 {
10334 gcc_assert (nvectors != 0);
10335 if (lens->length () < nvectors)
10336 lens->safe_grow_cleared (nvectors, true);
10337 rgroup_controls *rgl = &(*lens)[nvectors - 1];
10338
10339   /* The number of scalars per iteration, the number of bytes each scalar
10340      occupies and the number of vectors are all compile-time constants.  */
10341 unsigned int nscalars_per_iter
10342 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
10343 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
10344
10345 if (rgl->max_nscalars_per_iter < nscalars_per_iter)
10346 {
10347 /* For now, we only support cases in which all loads and stores fall back
10348 to VnQI or none do. */
10349 gcc_assert (!rgl->max_nscalars_per_iter
10350 || (rgl->factor == 1 && factor == 1)
10351 || (rgl->max_nscalars_per_iter * rgl->factor
10352 == nscalars_per_iter * factor));
10353 rgl->max_nscalars_per_iter = nscalars_per_iter;
10354 rgl->type = vectype;
10355 rgl->factor = factor;
10356 }
10357 }
10358
10359 /* Given a complete set of lengths LENS, extract length number INDEX for an
10360 rgroup that operates on NVECTORS vectors, where 0 <= INDEX < NVECTORS. */
10361
10362 tree
10363 vect_get_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
10364 unsigned int nvectors, unsigned int index)
10365 {
10366 rgroup_controls *rgl = &(*lens)[nvectors - 1];
10367 bool use_bias_adjusted_len =
10368 LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) != 0;
10369
10370 /* Populate the rgroup's len array, if this is the first time we've
10371 used it. */
10372 if (rgl->controls.is_empty ())
10373 {
10374 rgl->controls.safe_grow_cleared (nvectors, true);
10375 for (unsigned int i = 0; i < nvectors; ++i)
10376 {
10377 tree len_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
10378 gcc_assert (len_type != NULL_TREE);
10379
10380 tree len = make_temp_ssa_name (len_type, NULL, "loop_len");
10381
10382 /* Provide a dummy definition until the real one is available. */
10383 SSA_NAME_DEF_STMT (len) = gimple_build_nop ();
10384 rgl->controls[i] = len;
10385
10386 if (use_bias_adjusted_len)
10387 {
10388 gcc_assert (i == 0);
10389 tree adjusted_len =
10390 make_temp_ssa_name (len_type, NULL, "adjusted_loop_len");
10391 SSA_NAME_DEF_STMT (adjusted_len) = gimple_build_nop ();
10392 rgl->bias_adjusted_ctrl = adjusted_len;
10393 }
10394 }
10395 }
10396
10397 if (use_bias_adjusted_len)
10398 return rgl->bias_adjusted_ctrl;
10399 else
10400 return rgl->controls[index];
10401 }
10402
10403 /* Scale the profiling counters of LOOP, which is vectorized by factor VF,
10404    according to the estimated number of iterations.  */
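
/* Rough numeric illustration with made-up counts: if the preheader is
   entered 100 times, the header count is 1000 (about 10 iterations per
   entry) and VF == 4, then the new estimated iteration count is on the
   order of 2, the loop body counts are scaled by 100 * (2 + 1) / 1000
   (down to roughly 300) and the exit edge probability becomes 1 / (2 + 1).  */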
10405
10406 static void
10407 scale_profile_for_vect_loop (class loop *loop, unsigned vf)
10408 {
10409 edge preheader = loop_preheader_edge (loop);
10410 /* Reduce loop iterations by the vectorization factor. */
10411 gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
10412 profile_count freq_h = loop->header->count, freq_e = preheader->count ();
10413
10414 if (freq_h.nonzero_p ())
10415 {
10416 profile_probability p;
10417
10418 /* Avoid dropping loop body profile counter to 0 because of zero count
10419 in loop's preheader. */
10420 if (!(freq_e == profile_count::zero ()))
10421 freq_e = freq_e.force_nonzero ();
10422 p = (freq_e * (new_est_niter + 1)).probability_in (freq_h);
10423 scale_loop_frequencies (loop, p);
10424 }
10425
10426 edge exit_e = single_exit (loop);
10427 exit_e->probability = profile_probability::always () / (new_est_niter + 1);
10428
10429 edge exit_l = single_pred_edge (loop->latch);
10430 profile_probability prob = exit_l->probability;
10431 exit_l->probability = exit_e->probability.invert ();
10432 if (prob.initialized_p () && exit_l->probability.initialized_p ())
10433 scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
10434 }
10435
10436 /* For a vectorized stmt DEF_STMT_INFO, adjust all vectorized PHI
10437    latch-edge values that were originally defined by it.  */
10438
10439 static void
10440 maybe_set_vectorized_backedge_value (loop_vec_info loop_vinfo,
10441 stmt_vec_info def_stmt_info)
10442 {
10443 tree def = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
10444 if (!def || TREE_CODE (def) != SSA_NAME)
10445 return;
10446 stmt_vec_info phi_info;
10447 imm_use_iterator iter;
10448 use_operand_p use_p;
10449 FOR_EACH_IMM_USE_FAST (use_p, iter, def)
10450 {
10451 gphi *phi = dyn_cast <gphi *> (USE_STMT (use_p));
10452 if (!phi)
10453 continue;
10454 if (!(gimple_bb (phi)->loop_father->header == gimple_bb (phi)
10455 && (phi_info = loop_vinfo->lookup_stmt (phi))
10456 && STMT_VINFO_RELEVANT_P (phi_info)))
10457 continue;
10458 loop_p loop = gimple_bb (phi)->loop_father;
10459 edge e = loop_latch_edge (loop);
10460 if (PHI_ARG_DEF_FROM_EDGE (phi, e) != def)
10461 continue;
10462
10463 if (VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (phi_info))
10464 && STMT_VINFO_REDUC_TYPE (phi_info) != FOLD_LEFT_REDUCTION
10465 && STMT_VINFO_REDUC_TYPE (phi_info) != EXTRACT_LAST_REDUCTION)
10466 {
10467 vec<gimple *> &phi_defs = STMT_VINFO_VEC_STMTS (phi_info);
10468 vec<gimple *> &latch_defs = STMT_VINFO_VEC_STMTS (def_stmt_info);
10469 gcc_assert (phi_defs.length () == latch_defs.length ());
10470 for (unsigned i = 0; i < phi_defs.length (); ++i)
10471 add_phi_arg (as_a <gphi *> (phi_defs[i]),
10472 gimple_get_lhs (latch_defs[i]), e,
10473 gimple_phi_arg_location (phi, e->dest_idx));
10474 }
10475 else if (STMT_VINFO_DEF_TYPE (phi_info) == vect_first_order_recurrence)
10476 {
10477 /* For first order recurrences we have to update both uses of
10478 the latch definition, the one in the PHI node and the one
10479 in the generated VEC_PERM_EXPR. */
10480 vec<gimple *> &phi_defs = STMT_VINFO_VEC_STMTS (phi_info);
10481 vec<gimple *> &latch_defs = STMT_VINFO_VEC_STMTS (def_stmt_info);
10482 gcc_assert (phi_defs.length () == latch_defs.length ());
10483 tree phidef = gimple_assign_rhs1 (phi_defs[0]);
10484 gphi *vphi = as_a <gphi *> (SSA_NAME_DEF_STMT (phidef));
10485 for (unsigned i = 0; i < phi_defs.length (); ++i)
10486 {
10487 gassign *perm = as_a <gassign *> (phi_defs[i]);
10488 if (i > 0)
10489 gimple_assign_set_rhs1 (perm, gimple_get_lhs (latch_defs[i-1]));
10490 gimple_assign_set_rhs2 (perm, gimple_get_lhs (latch_defs[i]));
10491 update_stmt (perm);
10492 }
10493 add_phi_arg (vphi, gimple_get_lhs (latch_defs.last ()), e,
10494 gimple_phi_arg_location (phi, e->dest_idx));
10495 }
10496 }
10497 }
10498
10499 /* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI.
10500 When vectorizing STMT_INFO as a store, set *SEEN_STORE to its
10501 stmt_vec_info. */
10502
10503 static bool
10504 vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
10505 gimple_stmt_iterator *gsi, stmt_vec_info *seen_store)
10506 {
10507 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
10508 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
10509
10510 if (dump_enabled_p ())
10511 dump_printf_loc (MSG_NOTE, vect_location,
10512 "------>vectorizing statement: %G", stmt_info->stmt);
10513
10514 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
10515 vect_loop_kill_debug_uses (loop, stmt_info);
10516
10517 if (!STMT_VINFO_RELEVANT_P (stmt_info)
10518 && !STMT_VINFO_LIVE_P (stmt_info))
10519 return false;
10520
10521 if (STMT_VINFO_VECTYPE (stmt_info))
10522 {
10523 poly_uint64 nunits
10524 = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
10525 if (!STMT_SLP_TYPE (stmt_info)
10526 && maybe_ne (nunits, vf)
10527 && dump_enabled_p ())
10528 	/* For SLP, VF is set according to the unrolling factor and not to
10529 	   the vector size, hence this print is not valid for SLP.  */
10530 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
10531 }
10532
10533 /* Pure SLP statements have already been vectorized. We still need
10534 to apply loop vectorization to hybrid SLP statements. */
10535 if (PURE_SLP_STMT (stmt_info))
10536 return false;
10537
10538 if (dump_enabled_p ())
10539 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
10540
10541 if (vect_transform_stmt (loop_vinfo, stmt_info, gsi, NULL, NULL))
10542 *seen_store = stmt_info;
10543
10544 return true;
10545 }
10546
10547 /* Helper function to pass to simplify_replace_tree to enable replacing trees
10548    in the hash_map with their corresponding values.  */
10549
10550 static tree
10551 find_in_mapping (tree t, void *context)
10552 {
10553 hash_map<tree,tree>* mapping = (hash_map<tree, tree>*) context;
10554
10555 tree *value = mapping->get (t);
10556 return value ? *value : t;
10557 }
10558
10559 /* Update EPILOGUE's loop_vec_info. EPILOGUE was constructed as a copy of the
10560 original loop that has now been vectorized.
10561
10562 The inits of the data_references need to be advanced with the number of
10563 iterations of the main loop. This has been computed in vect_do_peeling and
10564 is stored in parameter ADVANCE. We first restore the data_references
10565    initial offset with the values recorded in ORIG_DRS_INIT.
10566
10567 Since the loop_vec_info of this EPILOGUE was constructed for the original
10568 loop, its stmt_vec_infos all point to the original statements. These need
10569 to be updated to point to their corresponding copies as well as the SSA_NAMES
10570 in their PATTERN_DEF_SEQs and RELATED_STMTs.
10571
10572    The data_references' connections also need to be updated.  Their
10573    corresponding dr_vec_infos need to be reconnected to the EPILOGUE's
10574    stmt_vec_infos and their statements need to point to their corresponding
10575    copy.  If they are gather loads or scatter stores, their reference also
10576    needs to be updated to point to its corresponding copy.  Finally, we set
10577    'base_misaligned' to false, as we have already peeled for alignment in
10578    the prologue of the main loop.  */
10579
10580 static void
10581 update_epilogue_loop_vinfo (class loop *epilogue, tree advance)
10582 {
10583 loop_vec_info epilogue_vinfo = loop_vec_info_for_loop (epilogue);
10584 auto_vec<gimple *> stmt_worklist;
10585 hash_map<tree,tree> mapping;
10586 gimple *orig_stmt, *new_stmt;
10587 gimple_stmt_iterator epilogue_gsi;
10588 gphi_iterator epilogue_phi_gsi;
10589 stmt_vec_info stmt_vinfo = NULL, related_vinfo;
10590 basic_block *epilogue_bbs = get_loop_body (epilogue);
10591 unsigned i;
10592
10593 free (LOOP_VINFO_BBS (epilogue_vinfo));
10594 LOOP_VINFO_BBS (epilogue_vinfo) = epilogue_bbs;
10595
10596 /* Advance data_reference's with the number of iterations of the previous
10597 loop and its prologue. */
10598 vect_update_inits_of_drs (epilogue_vinfo, advance, PLUS_EXPR);
10599
10600
10601 /* The EPILOGUE loop is a copy of the original loop so they share the same
10602 gimple UIDs. In this loop we update the loop_vec_info of the EPILOGUE to
10603 point to the copied statements. We also create a mapping of all LHS' in
10604 the original loop and all the LHS' in the EPILOGUE and create worklists to
10605      update the STMT_VINFO_PATTERN_DEF_SEQs and STMT_VINFO_RELATED_STMTs.  */
10606 for (unsigned i = 0; i < epilogue->num_nodes; ++i)
10607 {
10608 for (epilogue_phi_gsi = gsi_start_phis (epilogue_bbs[i]);
10609 !gsi_end_p (epilogue_phi_gsi); gsi_next (&epilogue_phi_gsi))
10610 {
10611 new_stmt = epilogue_phi_gsi.phi ();
10612
10613 gcc_assert (gimple_uid (new_stmt) > 0);
10614 stmt_vinfo
10615 = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
10616
10617 orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
10618 STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
10619
10620 mapping.put (gimple_phi_result (orig_stmt),
10621 gimple_phi_result (new_stmt));
10622 /* PHI nodes can not have patterns or related statements. */
10623 gcc_assert (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo) == NULL
10624 && STMT_VINFO_RELATED_STMT (stmt_vinfo) == NULL);
10625 }
10626
10627 for (epilogue_gsi = gsi_start_bb (epilogue_bbs[i]);
10628 !gsi_end_p (epilogue_gsi); gsi_next (&epilogue_gsi))
10629 {
10630 new_stmt = gsi_stmt (epilogue_gsi);
10631 if (is_gimple_debug (new_stmt))
10632 continue;
10633
10634 gcc_assert (gimple_uid (new_stmt) > 0);
10635 stmt_vinfo
10636 = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
10637
10638 orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
10639 STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
10640
10641 if (tree old_lhs = gimple_get_lhs (orig_stmt))
10642 mapping.put (old_lhs, gimple_get_lhs (new_stmt));
10643
10644 if (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo))
10645 {
10646 gimple_seq seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo);
10647 for (gimple_stmt_iterator gsi = gsi_start (seq);
10648 !gsi_end_p (gsi); gsi_next (&gsi))
10649 stmt_worklist.safe_push (gsi_stmt (gsi));
10650 }
10651
10652 related_vinfo = STMT_VINFO_RELATED_STMT (stmt_vinfo);
10653 if (related_vinfo != NULL && related_vinfo != stmt_vinfo)
10654 {
10655 gimple *stmt = STMT_VINFO_STMT (related_vinfo);
10656 stmt_worklist.safe_push (stmt);
10657 /* Set BB such that the assert in
10658 'get_initial_def_for_reduction' is able to determine that
10659 the BB of the related stmt is inside this loop. */
10660 gimple_set_bb (stmt,
10661 gimple_bb (new_stmt));
10662 related_vinfo = STMT_VINFO_RELATED_STMT (related_vinfo);
10663 gcc_assert (related_vinfo == NULL
10664 || related_vinfo == stmt_vinfo);
10665 }
10666 }
10667 }
10668
10669 /* The PATTERN_DEF_SEQs and RELATED_STMTs in the epilogue were constructed
10670 using the original main loop and thus need to be updated to refer to the
10671 cloned variables used in the epilogue. */
10672 for (unsigned i = 0; i < stmt_worklist.length (); ++i)
10673 {
10674 gimple *stmt = stmt_worklist[i];
10675 tree *new_op;
10676
10677 for (unsigned j = 1; j < gimple_num_ops (stmt); ++j)
10678 {
10679 tree op = gimple_op (stmt, j);
10680 if ((new_op = mapping.get(op)))
10681 gimple_set_op (stmt, j, *new_op);
10682 else
10683 {
10684 /* PR92429: The last argument of simplify_replace_tree disables
10685 folding when replacing arguments. This is required as
10686 otherwise you might end up with different statements than the
10687 ones analyzed in vect_loop_analyze, leading to different
10688 vectorization. */
10689 op = simplify_replace_tree (op, NULL_TREE, NULL_TREE,
10690 &find_in_mapping, &mapping, false);
10691 gimple_set_op (stmt, j, op);
10692 }
10693 }
10694 }
10695
10696 struct data_reference *dr;
10697 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (epilogue_vinfo);
10698 FOR_EACH_VEC_ELT (datarefs, i, dr)
10699 {
10700 orig_stmt = DR_STMT (dr);
10701 gcc_assert (gimple_uid (orig_stmt) > 0);
10702 stmt_vinfo = epilogue_vinfo->stmt_vec_infos[gimple_uid (orig_stmt) - 1];
10703       /* Data references for gather loads and scatter stores do not use the
10704 	 updated offset we set using ADVANCE.  Instead we have to make sure the
10705 	 reference in the data reference points to the corresponding copy of
10706 	 the original in the epilogue.  */
10707 if (STMT_VINFO_MEMORY_ACCESS_TYPE (vect_stmt_to_vectorize (stmt_vinfo))
10708 == VMAT_GATHER_SCATTER)
10709 {
10710 DR_REF (dr)
10711 = simplify_replace_tree (DR_REF (dr), NULL_TREE, NULL_TREE,
10712 &find_in_mapping, &mapping);
10713 DR_BASE_ADDRESS (dr)
10714 = simplify_replace_tree (DR_BASE_ADDRESS (dr), NULL_TREE, NULL_TREE,
10715 &find_in_mapping, &mapping);
10716 }
10717 DR_STMT (dr) = STMT_VINFO_STMT (stmt_vinfo);
10718 stmt_vinfo->dr_aux.stmt = stmt_vinfo;
10719       /* The vector size of the epilogue is smaller than that of the main loop,
10720 	 so the alignment is either the same or lower.  This means the DR will
10721 	 by definition be aligned.  */
10722 STMT_VINFO_DR_INFO (stmt_vinfo)->base_misaligned = false;
10723 }
10724
10725 epilogue_vinfo->shared->datarefs_copy.release ();
10726 epilogue_vinfo->shared->save_datarefs ();
10727 }
10728
10729 /* Function vect_transform_loop.
10730
10731 The analysis phase has determined that the loop is vectorizable.
10732    Vectorize the loop - create vectorized stmts to replace the scalar
10733    stmts in the loop, and update the loop exit condition.
10734    Returns the scalar epilogue loop, if any.  */
10735
10736 class loop *
10737 vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
10738 {
10739 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
10740 class loop *epilogue = NULL;
10741 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
10742 int nbbs = loop->num_nodes;
10743 int i;
10744 tree niters_vector = NULL_TREE;
10745 tree step_vector = NULL_TREE;
10746 tree niters_vector_mult_vf = NULL_TREE;
10747 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
10748 unsigned int lowest_vf = constant_lower_bound (vf);
10749 gimple *stmt;
10750 bool check_profitability = false;
10751 unsigned int th;
10752
10753 DUMP_VECT_SCOPE ("vec_transform_loop");
10754
10755 loop_vinfo->shared->check_datarefs ();
10756
10757   /* Use the more conservative vectorization threshold.  If the number
10758      of iterations is constant, assume the cost check has been performed
10759      by our caller.  If the threshold makes all loops profitable that
10760      run at least the (estimated) vectorization factor number of times,
10761      checking is pointless, too.  */
10762 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
10763 if (vect_apply_runtime_profitability_check_p (loop_vinfo))
10764 {
10765 if (dump_enabled_p ())
10766 dump_printf_loc (MSG_NOTE, vect_location,
10767 "Profitability threshold is %d loop iterations.\n",
10768 th);
10769 check_profitability = true;
10770 }
10771
10772 /* Make sure there exists a single-predecessor exit bb. Do this before
10773 versioning. */
10774 edge e = single_exit (loop);
10775 if (! single_pred_p (e->dest))
10776 {
10777 split_loop_exit_edge (e, true);
10778 if (dump_enabled_p ())
10779 dump_printf (MSG_NOTE, "split exit edge\n");
10780 }
10781
10782 /* Version the loop first, if required, so the profitability check
10783 comes first. */
10784
10785 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
10786 {
10787 class loop *sloop
10788 = vect_loop_versioning (loop_vinfo, loop_vectorized_call);
10789 sloop->force_vectorize = false;
10790 check_profitability = false;
10791 }
10792
10793   /* Make sure there exists a single-predecessor exit bb also on the
10794      scalar loop copy.  Do this after versioning but before peeling,
10795      so the CFG structure is fine for both the scalar and the if-converted
10796      loop and slpeel_duplicate_current_defs_from_edges sees matched
10797      loop-closed PHI nodes on the exit.  */
10798 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
10799 {
10800 e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
10801 if (! single_pred_p (e->dest))
10802 {
10803 split_loop_exit_edge (e, true);
10804 if (dump_enabled_p ())
10805 dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
10806 }
10807 }
10808
10809 tree niters = vect_build_loop_niters (loop_vinfo);
10810 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
10811 tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
10812 bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
10813 tree advance;
10814 drs_init_vec orig_drs_init;
10815
10816 epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
10817 &step_vector, &niters_vector_mult_vf, th,
10818 check_profitability, niters_no_overflow,
10819 &advance);
10820
10821 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)
10822 && LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo).initialized_p ())
10823 scale_loop_frequencies (LOOP_VINFO_SCALAR_LOOP (loop_vinfo),
10824 LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
10825
10826 if (niters_vector == NULL_TREE)
10827 {
10828 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
10829 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
10830 && known_eq (lowest_vf, vf))
10831 {
10832 niters_vector
10833 = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
10834 LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
10835 step_vector = build_one_cst (TREE_TYPE (niters));
10836 }
10837 else if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
10838 vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
10839 &step_vector, niters_no_overflow);
10840 else
10841 /* vect_do_peeling subtracted the number of peeled prologue
10842 iterations from LOOP_VINFO_NITERS. */
10843 vect_gen_vector_loop_niters (loop_vinfo, LOOP_VINFO_NITERS (loop_vinfo),
10844 &niters_vector, &step_vector,
10845 niters_no_overflow);
10846 }
10847
10848 /* 1) Make sure the loop header has exactly two entries
10849 2) Make sure we have a preheader basic block. */
10850
10851 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
10852
10853 split_edge (loop_preheader_edge (loop));
10854
10855 if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
10856 /* This will deal with any possible peeling. */
10857 vect_prepare_for_masked_peels (loop_vinfo);
10858
10859 /* Schedule the SLP instances first, then handle loop vectorization
10860 below. */
10861 if (!loop_vinfo->slp_instances.is_empty ())
10862 {
10863 DUMP_VECT_SCOPE ("scheduling SLP instances");
10864 vect_schedule_slp (loop_vinfo, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
10865 }
10866
10867   /* FORNOW: the vectorizer supports only loops whose body consists
10868      of one basic block (header + empty latch).  When the vectorizer
10869      supports more involved loop forms, the order in which the BBs are
10870      traversed needs to be reconsidered.  */
10871
10872 for (i = 0; i < nbbs; i++)
10873 {
10874 basic_block bb = bbs[i];
10875 stmt_vec_info stmt_info;
10876
10877 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
10878 gsi_next (&si))
10879 {
10880 gphi *phi = si.phi ();
10881 if (dump_enabled_p ())
10882 dump_printf_loc (MSG_NOTE, vect_location,
10883 "------>vectorizing phi: %G", (gimple *) phi);
10884 stmt_info = loop_vinfo->lookup_stmt (phi);
10885 if (!stmt_info)
10886 continue;
10887
10888 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
10889 vect_loop_kill_debug_uses (loop, stmt_info);
10890
10891 if (!STMT_VINFO_RELEVANT_P (stmt_info)
10892 && !STMT_VINFO_LIVE_P (stmt_info))
10893 continue;
10894
10895 if (STMT_VINFO_VECTYPE (stmt_info)
10896 && (maybe_ne
10897 (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
10898 && dump_enabled_p ())
10899 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
10900
10901 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
10902 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
10903 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
10904 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
10905 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_first_order_recurrence
10906 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
10907 && ! PURE_SLP_STMT (stmt_info))
10908 {
10909 if (dump_enabled_p ())
10910 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
10911 vect_transform_stmt (loop_vinfo, stmt_info, NULL, NULL, NULL);
10912 }
10913 }
10914
10915 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
10916 gsi_next (&si))
10917 {
10918 gphi *phi = si.phi ();
10919 stmt_info = loop_vinfo->lookup_stmt (phi);
10920 if (!stmt_info)
10921 continue;
10922
10923 if (!STMT_VINFO_RELEVANT_P (stmt_info)
10924 && !STMT_VINFO_LIVE_P (stmt_info))
10925 continue;
10926
10927 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
10928 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
10929 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
10930 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
10931 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
10932 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_first_order_recurrence)
10933 && ! PURE_SLP_STMT (stmt_info))
10934 maybe_set_vectorized_backedge_value (loop_vinfo, stmt_info);
10935 }
10936
10937 for (gimple_stmt_iterator si = gsi_start_bb (bb);
10938 !gsi_end_p (si);)
10939 {
10940 stmt = gsi_stmt (si);
10941 /* During vectorization remove existing clobber stmts. */
10942 if (gimple_clobber_p (stmt))
10943 {
10944 unlink_stmt_vdef (stmt);
10945 gsi_remove (&si, true);
10946 release_defs (stmt);
10947 }
10948 else
10949 {
10950 /* Ignore vector stmts created in the outer loop. */
10951 stmt_info = loop_vinfo->lookup_stmt (stmt);
10952
10953 /* vector stmts created in the outer-loop during vectorization of
10954 stmts in an inner-loop may not have a stmt_info, and do not
10955 need to be vectorized. */
10956 stmt_vec_info seen_store = NULL;
10957 if (stmt_info)
10958 {
10959 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
10960 {
10961 gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
10962 for (gimple_stmt_iterator subsi = gsi_start (def_seq);
10963 !gsi_end_p (subsi); gsi_next (&subsi))
10964 {
10965 stmt_vec_info pat_stmt_info
10966 = loop_vinfo->lookup_stmt (gsi_stmt (subsi));
10967 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
10968 &si, &seen_store);
10969 }
10970 stmt_vec_info pat_stmt_info
10971 = STMT_VINFO_RELATED_STMT (stmt_info);
10972 if (vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
10973 &si, &seen_store))
10974 maybe_set_vectorized_backedge_value (loop_vinfo,
10975 pat_stmt_info);
10976 }
10977 else
10978 {
10979 if (vect_transform_loop_stmt (loop_vinfo, stmt_info, &si,
10980 &seen_store))
10981 maybe_set_vectorized_backedge_value (loop_vinfo,
10982 stmt_info);
10983 }
10984 }
10985 gsi_next (&si);
10986 if (seen_store)
10987 {
10988 if (STMT_VINFO_GROUPED_ACCESS (seen_store))
10989 		/* Interleaving.  The vectorization of the
10990 		   interleaving chain was completed; free all
10991 		   the stores in the chain.  */
10992 vect_remove_stores (loop_vinfo,
10993 DR_GROUP_FIRST_ELEMENT (seen_store));
10994 else
10995 /* Free the attached stmt_vec_info and remove the stmt. */
10996 loop_vinfo->remove_stmt (stmt_info);
10997 }
10998 }
10999 }
11000
11001 /* Stub out scalar statements that must not survive vectorization.
11002 Doing this here helps with grouped statements, or statements that
11003 are involved in patterns. */
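      /* For instance (illustrative GIMPLE): a scalar call such as

	   _1 = .MASK_LOAD (ptr_2, 32B, mask_3);

	 whose value was only needed by vectorized code is replaced by the
	 dummy assignment "_1 = 0;", and a scalar conditional internal
	 function call such as .COND_ADD is replaced by an assignment of
	 its "else" argument.  */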
11004 for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
11005 !gsi_end_p (gsi); gsi_next (&gsi))
11006 {
11007 gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
11008 if (!call || !gimple_call_internal_p (call))
11009 continue;
11010 internal_fn ifn = gimple_call_internal_fn (call);
11011 if (ifn == IFN_MASK_LOAD)
11012 {
11013 tree lhs = gimple_get_lhs (call);
11014 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
11015 {
11016 tree zero = build_zero_cst (TREE_TYPE (lhs));
11017 gimple *new_stmt = gimple_build_assign (lhs, zero);
11018 gsi_replace (&gsi, new_stmt, true);
11019 }
11020 }
11021 else if (conditional_internal_fn_code (ifn) != ERROR_MARK)
11022 {
11023 tree lhs = gimple_get_lhs (call);
11024 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
11025 {
11026 tree else_arg
11027 = gimple_call_arg (call, gimple_call_num_args (call) - 1);
11028 gimple *new_stmt = gimple_build_assign (lhs, else_arg);
11029 gsi_replace (&gsi, new_stmt, true);
11030 }
11031 }
11032 }
11033 } /* BBs in loop */
11034
11035   /* The vectorization factor is always > 1, so if we use an IV increment of 1,
11036      a zero NITERS becomes a nonzero NITERS_VECTOR.  */
11037 if (integer_onep (step_vector))
11038 niters_no_overflow = true;
11039 vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector,
11040 niters_vector_mult_vf, !niters_no_overflow);
11041
11042 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
11043 scale_profile_for_vect_loop (loop, assumed_vf);
11044
11045 /* True if the final iteration might not handle a full vector's
11046 worth of scalar iterations. */
11047 bool final_iter_may_be_partial
11048 = LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo);
11049 /* The minimum number of iterations performed by the epilogue. This
11050 is 1 when peeling for gaps because we always need a final scalar
11051 iteration. */
11052 int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
11053 /* +1 to convert latch counts to loop iteration counts,
11054 -min_epilogue_iters to remove iterations that cannot be performed
11055 by the vector code. */
11056 int bias_for_lowest = 1 - min_epilogue_iters;
11057 int bias_for_assumed = bias_for_lowest;
11058 int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
11059 if (alignment_npeels && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
11060 {
11061 /* When the amount of peeling is known at compile time, the first
11062 iteration will have exactly alignment_npeels active elements.
11063 In the worst case it will have at least one. */
11064 int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
11065 bias_for_lowest += lowest_vf - min_first_active;
11066 bias_for_assumed += assumed_vf - min_first_active;
11067 }
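  /* Rough worked example with hypothetical values: with lowest_vf == 4,
     no peeling for gaps or alignment and no partial vectors,
     bias_for_lowest == 1, so a scalar latch-count bound of 11 (12
     iterations) becomes floor ((11 + 1) / 4) - 1 == 2 below, i.e. at most
     3 iterations of the vector loop.  */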
11068 /* In these calculations the "- 1" converts loop iteration counts
11069 back to latch counts. */
11070 if (loop->any_upper_bound)
11071 {
11072 loop_vec_info main_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
11073 loop->nb_iterations_upper_bound
11074 = (final_iter_may_be_partial
11075 ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
11076 lowest_vf) - 1
11077 : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
11078 lowest_vf) - 1);
11079 if (main_vinfo
11080 /* Both peeling for alignment and peeling for gaps can end up
11081 with the scalar epilogue running for more than VF-1 iterations. */
11082 && !main_vinfo->peeling_for_alignment
11083 && !main_vinfo->peeling_for_gaps)
11084 {
11085 unsigned int bound;
11086 poly_uint64 main_iters
11087 = upper_bound (LOOP_VINFO_VECT_FACTOR (main_vinfo),
11088 LOOP_VINFO_COST_MODEL_THRESHOLD (main_vinfo));
11089 main_iters
11090 = upper_bound (main_iters,
11091 LOOP_VINFO_VERSIONING_THRESHOLD (main_vinfo));
11092 if (can_div_away_from_zero_p (main_iters,
11093 LOOP_VINFO_VECT_FACTOR (loop_vinfo),
11094 &bound))
11095 loop->nb_iterations_upper_bound
11096 = wi::umin ((widest_int) (bound - 1),
11097 loop->nb_iterations_upper_bound);
11098 }
11099 }
11100 if (loop->any_likely_upper_bound)
11101 loop->nb_iterations_likely_upper_bound
11102 = (final_iter_may_be_partial
11103 ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
11104 + bias_for_lowest, lowest_vf) - 1
11105 : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
11106 + bias_for_lowest, lowest_vf) - 1);
11107 if (loop->any_estimate)
11108 loop->nb_iterations_estimate
11109 = (final_iter_may_be_partial
11110 ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
11111 assumed_vf) - 1
11112 : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
11113 assumed_vf) - 1);
11114
11115 if (dump_enabled_p ())
11116 {
11117 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
11118 {
11119 dump_printf_loc (MSG_NOTE, vect_location,
11120 "LOOP VECTORIZED\n");
11121 if (loop->inner)
11122 dump_printf_loc (MSG_NOTE, vect_location,
11123 "OUTER LOOP VECTORIZED\n");
11124 dump_printf (MSG_NOTE, "\n");
11125 }
11126 else
11127 dump_printf_loc (MSG_NOTE, vect_location,
11128 "LOOP EPILOGUE VECTORIZED (MODE=%s)\n",
11129 GET_MODE_NAME (loop_vinfo->vector_mode));
11130 }
11131
11132 /* Loops vectorized with a variable factor won't benefit from
11133 unrolling/peeling. */
11134 if (!vf.is_constant ())
11135 {
11136 loop->unroll = 1;
11137 if (dump_enabled_p ())
11138 dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to"
11139 " variable-length vectorization factor\n");
11140 }
11141 /* Free SLP instances here because otherwise stmt reference counting
11142 won't work. */
11143 slp_instance instance;
11144 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
11145 vect_free_slp_instance (instance);
11146 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
11147   /* Clear the safelen field since its value is invalid after vectorization:
11148      the vectorized loop can have loop-carried dependencies.  */
11149 loop->safelen = 0;
11150
11151 if (epilogue)
11152 {
11153 update_epilogue_loop_vinfo (epilogue, advance);
11154
11155 epilogue->simduid = loop->simduid;
11156 epilogue->force_vectorize = loop->force_vectorize;
11157 epilogue->dont_vectorize = false;
11158 }
11159
11160 return epilogue;
11161 }
11162
11163 /* The code below tries to perform a simple optimization - reverting
11164    if-conversion for masked stores, i.e. if the mask of a store is zero,
11165    do not perform the store and, if possible, skip the producers of the
11166    stored values as well.  For example,
11167 for (i=0; i<n; i++)
11168 if (c[i])
11169 {
11170 p1[i] += 1;
11171 p2[i] = p3[i] + 2;
11172 }
11173 this transformation will produce the following semi-hammock:
11174
11175 if (mask__ifc__42.18_165 != { 0, 0, 0, 0, 0, 0, 0, 0 })
11176 {
11177 vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
11178 vect__12.22_172 = vect__11.19_170 + vect_cst__171;
11179 MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
11180 vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
11181 vect__19.28_184 = vect__18.25_182 + vect_cst__183;
11182 MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
11183 }
11184 */
11185
11186 void
11187 optimize_mask_stores (class loop *loop)
11188 {
11189 basic_block *bbs = get_loop_body (loop);
11190 unsigned nbbs = loop->num_nodes;
11191 unsigned i;
11192 basic_block bb;
11193 class loop *bb_loop;
11194 gimple_stmt_iterator gsi;
11195 gimple *stmt;
11196 auto_vec<gimple *> worklist;
11197 auto_purge_vect_location sentinel;
11198
11199 vect_location = find_loop_location (loop);
11200 /* Pick up all masked stores in loop if any. */
11201 for (i = 0; i < nbbs; i++)
11202 {
11203 bb = bbs[i];
11204 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
11205 gsi_next (&gsi))
11206 {
11207 stmt = gsi_stmt (gsi);
11208 if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
11209 worklist.safe_push (stmt);
11210 }
11211 }
11212
11213 free (bbs);
11214 if (worklist.is_empty ())
11215 return;
11216
11217 /* Loop has masked stores. */
11218 while (!worklist.is_empty ())
11219 {
11220 gimple *last, *last_store;
11221 edge e, efalse;
11222 tree mask;
11223 basic_block store_bb, join_bb;
11224 gimple_stmt_iterator gsi_to;
11225 tree vdef, new_vdef;
11226 gphi *phi;
11227 tree vectype;
11228 tree zero;
11229
11230 last = worklist.pop ();
11231 mask = gimple_call_arg (last, 2);
11232 bb = gimple_bb (last);
11233 /* Create then_bb and an if-then structure in the CFG; then_bb belongs
11234 to the same loop as if_bb. That loop can differ from LOOP when a
11235 two-level loop nest is vectorized and the mask_store belongs to the
11236 inner loop. */
11237 e = split_block (bb, last);
11238 bb_loop = bb->loop_father;
11239 gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
11240 join_bb = e->dest;
11241 store_bb = create_empty_bb (bb);
11242 add_bb_to_loop (store_bb, bb_loop);
11243 e->flags = EDGE_TRUE_VALUE;
11244 efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
11245 /* Put STORE_BB on the unlikely path. */
11246 efalse->probability = profile_probability::unlikely ();
11247 store_bb->count = efalse->count ();
11248 make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
11249 if (dom_info_available_p (CDI_DOMINATORS))
11250 set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
11251 if (dump_enabled_p ())
11252 dump_printf_loc (MSG_NOTE, vect_location,
11253 "Create new block %d to sink mask stores.",
11254 store_bb->index);
11255 /* Create vector comparison with boolean result. */
11256 vectype = TREE_TYPE (mask);
11257 zero = build_zero_cst (vectype);
11258 stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
11259 gsi = gsi_last_bb (bb);
11260 gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
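	  /* Sketch of the structure just built (MASK stands for the full
	     SSA mask name):

	       bb:
		 ...
		 if (MASK == { 0, ... })
		   goto join_bb;		<- EDGE_TRUE_VALUE
		 else
		   goto store_bb;		<- EDGE_FALSE_VALUE (unlikely)

	       store_bb:			<- masked stores are sunk below
		 goto join_bb;			<- EDGE_FALLTHRU

	       join_bb:
		 ...				*/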
11261 /* Create a new PHI node for the vdef of the last masked store:
11262 .MEM_2 = VDEF <.MEM_1>
11263 will be converted to
11264 .MEM_3 = VDEF <.MEM_1>
11265 and a new PHI node will be created in the join bb
11266 .MEM_2 = PHI <.MEM_1, .MEM_3>
11267 */
11268 vdef = gimple_vdef (last);
11269 new_vdef = make_ssa_name (gimple_vop (cfun), last);
11270 gimple_set_vdef (last, new_vdef);
11271 phi = create_phi_node (vdef, join_bb);
11272 add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
11273
11274 /* Put all masked stores with the same mask to STORE_BB if possible. */
11275 while (true)
11276 {
11277 gimple_stmt_iterator gsi_from;
11278 gimple *stmt1 = NULL;
11279
11280 /* Move masked store to STORE_BB. */
11281 last_store = last;
11282 gsi = gsi_for_stmt (last);
11283 gsi_from = gsi;
11284 /* Shift GSI to the previous stmt for further traversal. */
11285 gsi_prev (&gsi);
11286 gsi_to = gsi_start_bb (store_bb);
11287 gsi_move_before (&gsi_from, &gsi_to);
11288 /* Set up GSI_TO at the start of the now non-empty STORE_BB. */
11289 gsi_to = gsi_start_bb (store_bb);
11290 if (dump_enabled_p ())
11291 dump_printf_loc (MSG_NOTE, vect_location,
11292 "Move stmt to created bb\n%G", last);
11293 /* Move all stored value producers if possible. */
11294 while (!gsi_end_p (gsi))
11295 {
11296 tree lhs;
11297 imm_use_iterator imm_iter;
11298 use_operand_p use_p;
11299 bool res;
11300
11301 /* Skip debug statements. */
11302 if (is_gimple_debug (gsi_stmt (gsi)))
11303 {
11304 gsi_prev (&gsi);
11305 continue;
11306 }
11307 stmt1 = gsi_stmt (gsi);
11308 /* Do not consider statements writing to memory or having
11309 a volatile operand. */
11310 if (gimple_vdef (stmt1)
11311 || gimple_has_volatile_ops (stmt1))
11312 break;
11313 gsi_from = gsi;
11314 gsi_prev (&gsi);
11315 lhs = gimple_get_lhs (stmt1);
11316 if (!lhs)
11317 break;
11318
11319 /* LHS of vectorized stmt must be SSA_NAME. */
11320 if (TREE_CODE (lhs) != SSA_NAME)
11321 break;
11322
11323 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
11324 {
11325 /* Remove dead scalar statement. */
11326 if (has_zero_uses (lhs))
11327 {
11328 gsi_remove (&gsi_from, true);
11329 continue;
11330 }
11331 }
11332
11333 /* Check that LHS does not have uses outside of STORE_BB. */
11334 res = true;
11335 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
11336 {
11337 gimple *use_stmt;
11338 use_stmt = USE_STMT (use_p);
11339 if (is_gimple_debug (use_stmt))
11340 continue;
11341 if (gimple_bb (use_stmt) != store_bb)
11342 {
11343 res = false;
11344 break;
11345 }
11346 }
11347 if (!res)
11348 break;
11349
11350 if (gimple_vuse (stmt1)
11351 && gimple_vuse (stmt1) != gimple_vuse (last_store))
11352 break;
11353
11354 /* Can move STMT1 to STORE_BB. */
11355 if (dump_enabled_p ())
11356 dump_printf_loc (MSG_NOTE, vect_location,
11357 "Move stmt to created bb\n%G", stmt1);
11358 gsi_move_before (&gsi_from, &gsi_to);
11359 /* Shift GSI_TO for further insertion. */
11360 gsi_prev (&gsi_to);
11361 }
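	  /* In the example from the function comment above, the vector adds
	     and the MASK_LOADs feeding them are sunk here as well: they have
	     vector SSA lhs's, no vdef, the loads share the store's vuse, and
	     their only non-debug uses are already in STORE_BB.  */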
11362 /* Sink any further masked stores that use the same mask into STORE_BB, provided the next one immediately precedes the statements already sunk. */
11363 if (worklist.is_empty ()
11364 || gimple_call_arg (worklist.last (), 2) != mask
11365 || worklist.last () != stmt1)
11366 break;
11367 last = worklist.pop ();
11368 }
11369 add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
11370 }
11371 }
11372
11373 /* Decide whether it is possible to use a zero-based induction variable
11374 when vectorizing LOOP_VINFO with partial vectors. If it is, return
11375 the value that the induction variable must be able to hold in order
11376 to ensure that the rgroups eventually have no active vector elements.
11377 Return -1 otherwise. */
11378
11379 widest_int
11380 vect_iv_limit_for_partial_vectors (loop_vec_info loop_vinfo)
11381 {
11382 tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
11383 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
11384 unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
11385
11386 /* Calculate the value that the induction variable must be able
11387 to hit in order to ensure that we end the loop with an all-false mask.
11388 This involves adding the maximum number of inactive trailing scalar
11389 iterations. */
11390 widest_int iv_limit = -1;
11391 if (max_loop_iterations (loop, &iv_limit))
11392 {
11393 if (niters_skip)
11394 {
11395 /* Add the maximum number of skipped iterations to the
11396 maximum iteration count. */
11397 if (TREE_CODE (niters_skip) == INTEGER_CST)
11398 iv_limit += wi::to_widest (niters_skip);
11399 else
11400 iv_limit += max_vf - 1;
11401 }
11402 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
11403 /* Make a conservatively-correct assumption. */
11404 iv_limit += max_vf - 1;
11405
11406 /* IV_LIMIT is the maximum number of latch iterations, which is also
11407 the maximum in-range IV value. Round this value down to the previous
11408 vector alignment boundary and then add an extra full iteration. */
11409 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
11410 iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
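      /* Worked example (illustrative numbers only): with a constant VF of 16,
	 known_alignment (vf) == 16 and max_vf == 16.  If the loop's maximum
	 latch count is 100 and there are no skipped iterations and no peeling
	 for alignment, the rounding gives (100 & -16) + 16 == 96 + 16 == 112,
	 so the IV must be able to reach 112 before all lanes are guaranteed
	 inactive.  */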
11411 }
11412 return iv_limit;
11413 }
11414
11415 /* For the given rgroup_controls RGC, check whether an induction variable
11416 would ever hit a value that produces a set of all-false masks or zero
11417 lengths before wrapping around. Return true if it's possible to wrap
11418 around before hitting the desired value, otherwise return false. */
11419
11420 bool
11421 vect_rgroup_iv_might_wrap_p (loop_vec_info loop_vinfo, rgroup_controls *rgc)
11422 {
11423 widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
11424
11425 if (iv_limit == -1)
11426 return true;
11427
11428 tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
11429 unsigned int compare_precision = TYPE_PRECISION (compare_type);
11430 unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
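  /* For example (illustrative numbers only): with iv_limit == 112, as in the
     sketch above, and an rgroup with max_nscalars_per_iter == 2 and
     factor == 1, the IV counts up to 112 * 2 == 224 scalar items, which
     needs 8 bits; if the chosen compare type is 16 bits wide the test below
     is false and the IV cannot wrap.  */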
11431
11432 if (wi::min_precision (iv_limit * nitems, UNSIGNED) > compare_precision)
11433 return true;
11434
11435 return false;
11436 }