1 /* Loop Vectorization
2 Copyright (C) 2003-2024 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #define INCLUDE_ALGORITHM
23 #include "config.h"
24 #include "system.h"
25 #include "coretypes.h"
26 #include "backend.h"
27 #include "target.h"
28 #include "rtl.h"
29 #include "tree.h"
30 #include "gimple.h"
31 #include "cfghooks.h"
32 #include "tree-pass.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "memmodel.h"
36 #include "optabs.h"
37 #include "diagnostic-core.h"
38 #include "fold-const.h"
39 #include "stor-layout.h"
40 #include "cfganal.h"
41 #include "gimplify.h"
42 #include "gimple-iterator.h"
43 #include "gimplify-me.h"
44 #include "tree-ssa-loop-ivopts.h"
45 #include "tree-ssa-loop-manip.h"
46 #include "tree-ssa-loop-niter.h"
47 #include "tree-ssa-loop.h"
48 #include "cfgloop.h"
49 #include "tree-scalar-evolution.h"
50 #include "tree-vectorizer.h"
51 #include "gimple-fold.h"
52 #include "cgraph.h"
53 #include "tree-cfg.h"
54 #include "tree-if-conv.h"
55 #include "internal-fn.h"
56 #include "tree-vector-builder.h"
57 #include "vec-perm-indices.h"
58 #include "tree-eh.h"
59 #include "case-cfn-macros.h"
60 #include "langhooks.h"
61
62 /* Loop Vectorization Pass.
63
64 This pass tries to vectorize loops.
65
66 For example, the vectorizer transforms the following simple loop:
67
68 short a[N]; short b[N]; short c[N]; int i;
69
70 for (i=0; i<N; i++){
71 a[i] = b[i] + c[i];
72 }
73
74 as if it were manually vectorized by rewriting the source code into:
75
76 typedef int __attribute__((mode(V8HI))) v8hi;
77 short a[N]; short b[N]; short c[N]; int i;
78 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
79 v8hi va, vb, vc;
80
81 for (i=0; i<N/8; i++){
82 vb = pb[i];
83 vc = pc[i];
84 va = vb + vc;
85 pa[i] = va;
86 }
87
88 The main entry to this pass is vectorize_loops(), in which
89 the vectorizer applies a set of analyses on a given set of loops,
90 followed by the actual vectorization transformation for the loops that
91 had successfully passed the analysis phase.
92 Throughout this pass we make a distinction between two types of
93 data: scalars (which are represented by SSA_NAMES), and memory references
94 ("data-refs"). These two types of data require different handling both
95 during analysis and transformation. The types of data-refs that the
96 vectorizer currently supports are ARRAY_REFS whose base is an array DECL
97 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
98 accesses are required to have a simple (consecutive) access pattern.
99
100 Analysis phase:
101 ===============
102 The driver for the analysis phase is vect_analyze_loop().
103 It applies a set of analyses, some of which rely on the scalar evolution
104 analyzer (scev) developed by Sebastian Pop.
105
106 During the analysis phase the vectorizer records some information
107 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
108 loop, as well as general information about the loop as a whole, which is
109 recorded in a "loop_vec_info" struct attached to each loop.
110
111 Transformation phase:
112 =====================
113 The loop transformation phase scans all the stmts in the loop, and
114 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
115 the loop that needs to be vectorized. It inserts the vector code sequence
116 just before the scalar stmt S, and records a pointer to the vector code
117 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
118 attached to S). This pointer will be used for the vectorization of following
119 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
120 otherwise, we rely on dead code elimination for removing it.
121
122 For example, say stmt S1 was vectorized into stmt VS1:
123
124 VS1: vb = px[i];
125 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
126 S2: a = b;
127
128 To vectorize stmt S2, the vectorizer first finds the stmt that defines
129 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
130 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
131 resulting sequence would be:
132
133 VS1: vb = px[i];
134 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
135 VS2: va = vb;
136 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
137
138 Operands that are not SSA_NAMEs are data-refs that appear in
139 load/store operations (like 'x[i]' in S1), and are handled differently.
140
141 Target modeling:
142 =================
143 Currently the only target specific information that is used is the
144 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
145 Targets that can support different sizes of vectors will, for now, need
146 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
147 flexibility will be added in the future.
148
149 Since we only vectorize operations whose vector form can be
150 expressed using existing tree codes, to verify that an operation is
151 supported, the vectorizer checks the relevant optab at the relevant
152 machine_mode (e.g., optab_handler (add_optab, V8HImode)). If
153 the value found is CODE_FOR_nothing, then there's no target support, and
154 we can't vectorize the stmt.
155
156 For additional information on this project see:
157 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
158 */
159
160 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *,
161 unsigned *);
162 static stmt_vec_info vect_is_simple_reduction (loop_vec_info, stmt_vec_info,
163 bool *, bool *, bool);
164
165 /* Subroutine of vect_determine_vf_for_stmt that handles only one
166 statement. VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
167 may already be set for general statements (not just data refs). */
168
169 static opt_result
170 vect_determine_vf_for_stmt_1 (vec_info *vinfo, stmt_vec_info stmt_info,
171 bool vectype_maybe_set_p,
172 poly_uint64 *vf)
173 {
174 gimple *stmt = stmt_info->stmt;
175
176 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
177 && !STMT_VINFO_LIVE_P (stmt_info))
178 || gimple_clobber_p (stmt))
179 {
180 if (dump_enabled_p ())
181 dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
182 return opt_result::success ();
183 }
184
185 tree stmt_vectype, nunits_vectype;
186 opt_result res = vect_get_vector_types_for_stmt (vinfo, stmt_info,
187 &stmt_vectype,
188 &nunits_vectype);
189 if (!res)
190 return res;
191
192 if (stmt_vectype)
193 {
194 if (STMT_VINFO_VECTYPE (stmt_info))
195 /* The only case when a vectype had been already set is for stmts
196 that contain a data ref, or for "pattern-stmts" (stmts generated
197 by the vectorizer to represent/replace a certain idiom). */
198 gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
199 || vectype_maybe_set_p)
200 && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
201 else
202 STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
203 }
204
205 if (nunits_vectype)
206 vect_update_max_nunits (vf, nunits_vectype);
207
208 return opt_result::success ();
209 }
210
211 /* Subroutine of vect_determine_vectorization_factor. Set the vector
212 types of STMT_INFO and all attached pattern statements and update
213 the vectorization factor VF accordingly. Return true on success
214 or false if something prevented vectorization. */
215
216 static opt_result
217 vect_determine_vf_for_stmt (vec_info *vinfo,
218 stmt_vec_info stmt_info, poly_uint64 *vf)
219 {
220 if (dump_enabled_p ())
221 dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: %G",
222 stmt_info->stmt);
223 opt_result res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, false, vf);
224 if (!res)
225 return res;
226
227 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
228 && STMT_VINFO_RELATED_STMT (stmt_info))
229 {
230 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
231 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
232
233 /* If a pattern statement has def stmts, analyze them too. */
234 for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
235 !gsi_end_p (si); gsi_next (&si))
236 {
237 stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
238 if (dump_enabled_p ())
239 dump_printf_loc (MSG_NOTE, vect_location,
240 "==> examining pattern def stmt: %G",
241 def_stmt_info->stmt);
242 res = vect_determine_vf_for_stmt_1 (vinfo, def_stmt_info, true, vf);
243 if (!res)
244 return res;
245 }
246
247 if (dump_enabled_p ())
248 dump_printf_loc (MSG_NOTE, vect_location,
249 "==> examining pattern statement: %G",
250 stmt_info->stmt);
251 res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, true, vf);
252 if (!res)
253 return res;
254 }
255
256 return opt_result::success ();
257 }
258
259 /* Function vect_determine_vectorization_factor
260
261 Determine the vectorization factor (VF). VF is the number of data elements
262 that are operated upon in parallel in a single iteration of the vectorized
263 loop. For example, when vectorizing a loop that operates on 4-byte elements,
264 on a target with a vector size (VS) of 16 bytes, the VF is set to 4, since 4
265 elements can fit in a single vector register.
266
267 We currently support vectorization of loops in which all types operated upon
268 are of the same size. Therefore this function currently sets VF according to
269 the size of the types operated upon, and fails if there are multiple sizes
270 in the loop.
271
272 VF is also the factor by which the loop iterations are strip-mined, e.g.:
273 original loop:
274 for (i=0; i<N; i++){
275 a[i] = b[i] + c[i];
276 }
277
278 vectorized loop:
279 for (i=0; i<N; i+=VF){
280 a[i:VF] = b[i:VF] + c[i:VF];
281 }
282 */
283
284 static opt_result
285 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
286 {
287 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
288 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
289 unsigned nbbs = loop->num_nodes;
290 poly_uint64 vectorization_factor = 1;
291 tree scalar_type = NULL_TREE;
292 gphi *phi;
293 tree vectype;
294 stmt_vec_info stmt_info;
295 unsigned i;
296
297 DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");
298
299 for (i = 0; i < nbbs; i++)
300 {
301 basic_block bb = bbs[i];
302
303 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
304 gsi_next (&si))
305 {
306 phi = si.phi ();
307 stmt_info = loop_vinfo->lookup_stmt (phi);
308 if (dump_enabled_p ())
309 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: %G",
310 (gimple *) phi);
311
312 gcc_assert (stmt_info);
313
314 if (STMT_VINFO_RELEVANT_P (stmt_info)
315 || STMT_VINFO_LIVE_P (stmt_info))
316 {
317 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
318 scalar_type = TREE_TYPE (PHI_RESULT (phi));
319
320 if (dump_enabled_p ())
321 dump_printf_loc (MSG_NOTE, vect_location,
322 "get vectype for scalar type: %T\n",
323 scalar_type);
324
325 vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
326 if (!vectype)
327 return opt_result::failure_at (phi,
328 "not vectorized: unsupported "
329 "data-type %T\n",
330 scalar_type);
331 STMT_VINFO_VECTYPE (stmt_info) = vectype;
332
333 if (dump_enabled_p ())
334 dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n",
335 vectype);
336
337 if (dump_enabled_p ())
338 {
339 dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
340 dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
341 dump_printf (MSG_NOTE, "\n");
342 }
343
344 vect_update_max_nunits (&vectorization_factor, vectype);
345 }
346 }
347
348 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
349 gsi_next (&si))
350 {
351 if (is_gimple_debug (gsi_stmt (si)))
352 continue;
353 stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
354 opt_result res
355 = vect_determine_vf_for_stmt (loop_vinfo,
356 stmt_info, &vectorization_factor);
357 if (!res)
358 return res;
359 }
360 }
361
362 /* TODO: Analyze cost. Decide if worth while to vectorize. */
363 if (dump_enabled_p ())
364 {
365 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
366 dump_dec (MSG_NOTE, vectorization_factor);
367 dump_printf (MSG_NOTE, "\n");
368 }
369
370 if (known_le (vectorization_factor, 1U))
371 return opt_result::failure_at (vect_location,
372 "not vectorized: unsupported data-type\n");
373 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
374 return opt_result::success ();
375 }
376
377
378 /* Function vect_is_simple_iv_evolution.
379
380 FORNOW: A simple evolution of an induction variable in the loop is
381 considered a polynomial evolution. */
382
383 static bool
384 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
385 tree * step)
386 {
387 tree init_expr;
388 tree step_expr;
389 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
390 basic_block bb;
391
392 /* When there is no evolution in this loop, the evolution function
393 is not "simple". */
394 if (evolution_part == NULL_TREE)
395 return false;
396
397 /* When the evolution is a polynomial of degree >= 2
398 the evolution function is not "simple". */
399 if (tree_is_chrec (evolution_part))
400 return false;
401
402 step_expr = evolution_part;
403 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
404
405 if (dump_enabled_p ())
406 dump_printf_loc (MSG_NOTE, vect_location, "step: %T, init: %T\n",
407 step_expr, init_expr);
408
409 *init = init_expr;
410 *step = step_expr;
411
412 if (TREE_CODE (step_expr) != INTEGER_CST
413 && (TREE_CODE (step_expr) != SSA_NAME
414 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
415 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
416 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
417 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
418 || !flag_associative_math)))
419 && (TREE_CODE (step_expr) != REAL_CST
420 || !flag_associative_math))
421 {
422 if (dump_enabled_p ())
423 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
424 "step unknown.\n");
425 return false;
426 }
427
428 return true;
429 }
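
/* Illustrative example (an editorial sketch, not taken from the GCC
   sources): for a loop such as

     for (i = 0; i < n; i++)
       p = p + 4;

   the scalar evolution of 'p' is the affine chrec {p_0, +, 4}_loop, so the
   routine above returns true with *INIT = p_0 and *STEP = 4.  A step that is
   merely loop invariant (an SSA name defined outside the loop) is also
   accepted for integral types, and a floating-point or REAL_CST step only
   under -fassociative-math.  */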
430
431 /* Function vect_is_nonlinear_iv_evolution
432
433 Only support nonlinear induction for integer type
434 1. neg
435 2. mul by constant
436 3. lshift/rshift by constant.
437
438 For neg induction, return a fake step as integer -1. */
439 static bool
440 vect_is_nonlinear_iv_evolution (class loop* loop, stmt_vec_info stmt_info,
441 gphi* loop_phi_node, tree *init, tree *step)
442 {
443 tree init_expr, ev_expr, result, op1, op2;
444 gimple* def;
445
446 if (gimple_phi_num_args (loop_phi_node) != 2)
447 return false;
448
449 init_expr = PHI_ARG_DEF_FROM_EDGE (loop_phi_node, loop_preheader_edge (loop));
450 ev_expr = PHI_ARG_DEF_FROM_EDGE (loop_phi_node, loop_latch_edge (loop));
451
452 /* Support nonlinear induction only for integer type. */
453 if (!INTEGRAL_TYPE_P (TREE_TYPE (init_expr)))
454 return false;
455
456 *init = init_expr;
457 result = PHI_RESULT (loop_phi_node);
458
459 if (TREE_CODE (ev_expr) != SSA_NAME
460 || ((def = SSA_NAME_DEF_STMT (ev_expr)), false)
461 || !is_gimple_assign (def))
462 return false;
463
464 enum tree_code t_code = gimple_assign_rhs_code (def);
465 switch (t_code)
466 {
467 case NEGATE_EXPR:
468 if (gimple_assign_rhs1 (def) != result)
469 return false;
470 *step = build_int_cst (TREE_TYPE (init_expr), -1);
471 STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_neg;
472 break;
473
474 case RSHIFT_EXPR:
475 case LSHIFT_EXPR:
476 case MULT_EXPR:
477 op1 = gimple_assign_rhs1 (def);
478 op2 = gimple_assign_rhs2 (def);
479 if (TREE_CODE (op2) != INTEGER_CST
480 || op1 != result)
481 return false;
482 *step = op2;
483 if (t_code == LSHIFT_EXPR)
484 STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_shl;
485 else if (t_code == RSHIFT_EXPR)
486 STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_shr;
487 /* NEGATE_EXPR and MULT_EXPR are both vect_step_op_mul. */
488 else
489 STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_mul;
490 break;
491
492 default:
493 return false;
494 }
495
496 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_info) = *init;
497 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info) = *step;
498
499 return true;
500 }
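
/* Illustrative examples (editorial sketch, not taken from the GCC sources)
   of the nonlinear inductions recognized above, where 'x' has an integer
   type and feeds the loop-header PHI:

     x = -x;        // vect_step_op_neg, fake step -1
     x = x * 3;     // vect_step_op_mul
     x = x << 1;    // vect_step_op_shl
     x = x >> 1;    // vect_step_op_shr

   The multiplier or shift amount must be an INTEGER_CST and the
   non-constant operand must be the PHI result itself.  */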
501
502 /* Return true if PHI, described by STMT_INFO, is the inner PHI in
503 what we are assuming is a double reduction. For example, given
504 a structure like this:
505
506 outer1:
507 x_1 = PHI <x_4(outer2), ...>;
508 ...
509
510 inner:
511 x_2 = PHI <x_1(outer1), ...>;
512 ...
513 x_3 = ...;
514 ...
515
516 outer2:
517 x_4 = PHI <x_3(inner)>;
518 ...
519
520 outer loop analysis would treat x_1 as a double reduction phi and
521 this function would then return true for x_2. */
522
523 static bool
524 vect_inner_phi_in_double_reduction_p (loop_vec_info loop_vinfo, gphi *phi)
525 {
526 use_operand_p use_p;
527 ssa_op_iter op_iter;
528 FOR_EACH_PHI_ARG (use_p, phi, op_iter, SSA_OP_USE)
529 if (stmt_vec_info def_info = loop_vinfo->lookup_def (USE_FROM_PTR (use_p)))
530 if (STMT_VINFO_DEF_TYPE (def_info) == vect_double_reduction_def)
531 return true;
532 return false;
533 }
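
/* Illustrative source form of the double reduction sketched above
   (editorial example, not taken from the GCC sources):

     s = 0;
     for (i = 0; i < n; i++)      // outer1 / outer2
       for (j = 0; j < m; j++)    // inner
         s += a[i][j];

   x_1/x_4 correspond to the accumulator PHIs of the outer loop and
   x_2/x_3 to those of the inner loop.  */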
534
535 /* Returns true if Phi is a first-order recurrence. A first-order
536 recurrence is a non-reduction recurrence relation in which the value of
537 the recurrence in the current loop iteration equals a value defined in
538 the previous iteration. */
539
540 static bool
541 vect_phi_first_order_recurrence_p (loop_vec_info loop_vinfo, class loop *loop,
542 gphi *phi)
543 {
544 /* A nested cycle isn't vectorizable as first order recurrence. */
545 if (LOOP_VINFO_LOOP (loop_vinfo) != loop)
546 return false;
547
548 /* Ensure the loop latch definition is from within the loop. */
549 edge latch = loop_latch_edge (loop);
550 tree ldef = PHI_ARG_DEF_FROM_EDGE (phi, latch);
551 if (TREE_CODE (ldef) != SSA_NAME
552 || SSA_NAME_IS_DEFAULT_DEF (ldef)
553 || is_a <gphi *> (SSA_NAME_DEF_STMT (ldef))
554 || !flow_bb_inside_loop_p (loop, gimple_bb (SSA_NAME_DEF_STMT (ldef))))
555 return false;
556
557 tree def = gimple_phi_result (phi);
558
559 /* Ensure every use_stmt of the phi node is dominated by the latch
560 definition. */
561 imm_use_iterator imm_iter;
562 use_operand_p use_p;
563 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, def)
564 if (!is_gimple_debug (USE_STMT (use_p))
565 && (SSA_NAME_DEF_STMT (ldef) == USE_STMT (use_p)
566 || !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (ldef),
567 USE_STMT (use_p))))
568 return false;
569
570 /* First-order recurrence autovectorization needs shuffle vector. */
571 tree scalar_type = TREE_TYPE (def);
572 tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
573 if (!vectype)
574 return false;
575
576 return true;
577 }
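
/* Illustrative example of a first-order recurrence (editorial sketch, not
   taken from the GCC sources):

     t = 0;
     for (i = 0; i < n; i++)
       {
         b[i] = a[i] - t;   // 't' carries a[i-1] from the previous iteration
         t = a[i];
       }

   In GIMPLE the uses of the PHI for 't' are dominated by the latch
   definition (the load of a[i]), which is what the check above requires;
   vectorizing it needs a vector shuffle that shifts the previous vector's
   last lane into the current one.  */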
578
579 /* Function vect_analyze_scalar_cycles_1.
580
581 Examine the cross iteration def-use cycles of scalar variables
582 in LOOP. LOOP_VINFO represents the loop that is now being
583 considered for vectorization (can be LOOP, or an outer-loop
584 enclosing LOOP). SLP indicates whether there will be subsequent
585 SLP analyses. */
586
587 static void
588 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, class loop *loop,
589 bool slp)
590 {
591 basic_block bb = loop->header;
592 tree init, step;
593 auto_vec<stmt_vec_info, 64> worklist;
594 gphi_iterator gsi;
595 bool double_reduc, reduc_chain;
596
597 DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
598
599 /* First - identify all inductions. Reduction detection assumes that all the
600 inductions have been identified, therefore, this order must not be
601 changed. */
602 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
603 {
604 gphi *phi = gsi.phi ();
605 tree access_fn = NULL;
606 tree def = PHI_RESULT (phi);
607 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);
608
609 if (dump_enabled_p ())
610 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
611 (gimple *) phi);
612
613 /* Skip virtual phi's. The data dependences that are associated with
614 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
615 if (virtual_operand_p (def))
616 continue;
617
618 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
619
620 /* Analyze the evolution function. */
621 access_fn = analyze_scalar_evolution (loop, def);
622 if (access_fn)
623 {
624 STRIP_NOPS (access_fn);
625 if (dump_enabled_p ())
626 dump_printf_loc (MSG_NOTE, vect_location,
627 "Access function of PHI: %T\n", access_fn);
628 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
629 = initial_condition_in_loop_num (access_fn, loop->num);
630 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
631 = evolution_part_in_loop_num (access_fn, loop->num);
632 }
633
634 if ((!access_fn
635 || vect_inner_phi_in_double_reduction_p (loop_vinfo, phi)
636 || !vect_is_simple_iv_evolution (loop->num, access_fn,
637 &init, &step)
638 || (LOOP_VINFO_LOOP (loop_vinfo) != loop
639 && TREE_CODE (step) != INTEGER_CST))
640 /* Only handle nonlinear iv for same loop. */
641 && (LOOP_VINFO_LOOP (loop_vinfo) != loop
642 || !vect_is_nonlinear_iv_evolution (loop, stmt_vinfo,
643 phi, &init, &step)))
644 {
645 worklist.safe_push (stmt_vinfo);
646 continue;
647 }
648
649 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
650 != NULL_TREE);
651 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
652
653 if (dump_enabled_p ())
654 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
655 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
656 }
657
658
659 /* Second - identify all reductions and nested cycles. */
660 while (worklist.length () > 0)
661 {
662 stmt_vec_info stmt_vinfo = worklist.pop ();
663 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
664 tree def = PHI_RESULT (phi);
665
666 if (dump_enabled_p ())
667 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
668 (gimple *) phi);
669
670 gcc_assert (!virtual_operand_p (def)
671 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
672
673 stmt_vec_info reduc_stmt_info
674 = vect_is_simple_reduction (loop_vinfo, stmt_vinfo, &double_reduc,
675 &reduc_chain, slp);
676 if (reduc_stmt_info)
677 {
678 STMT_VINFO_REDUC_DEF (stmt_vinfo) = reduc_stmt_info;
679 STMT_VINFO_REDUC_DEF (reduc_stmt_info) = stmt_vinfo;
680 if (double_reduc)
681 {
682 if (dump_enabled_p ())
683 dump_printf_loc (MSG_NOTE, vect_location,
684 "Detected double reduction.\n");
685
686 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
687 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_double_reduction_def;
688 }
689 else
690 {
691 if (loop != LOOP_VINFO_LOOP (loop_vinfo))
692 {
693 if (dump_enabled_p ())
694 dump_printf_loc (MSG_NOTE, vect_location,
695 "Detected vectorizable nested cycle.\n");
696
697 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
698 }
699 else
700 {
701 if (dump_enabled_p ())
702 dump_printf_loc (MSG_NOTE, vect_location,
703 "Detected reduction.\n");
704
705 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
706 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
707 /* Store the reduction cycles for possible vectorization in
708 loop-aware SLP if it was not detected as reduction
709 chain. */
710 if (! reduc_chain)
711 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push
712 (reduc_stmt_info);
713 }
714 }
715 }
716 else if (vect_phi_first_order_recurrence_p (loop_vinfo, loop, phi))
717 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_first_order_recurrence;
718 else
719 if (dump_enabled_p ())
720 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
721 "Unknown def-use cycle pattern.\n");
722 }
723 }
724
725
726 /* Function vect_analyze_scalar_cycles.
727
728 Examine the cross iteration def-use cycles of scalar variables, by
729 analyzing the loop-header PHIs of scalar variables. Classify each
730 cycle as one of the following: invariant, induction, reduction, unknown.
731 We do that for the loop represented by LOOP_VINFO, and also for its
732 inner-loop, if it exists.
733 Examples for scalar cycles:
734
735 Example1: reduction:
736
737 loop1:
738 for (i=0; i<N; i++)
739 sum += a[i];
740
741 Example2: induction:
742
743 loop2:
744 for (i=0; i<N; i++)
745 a[i] = i; */
746
747 static void
748 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo, bool slp)
749 {
750 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
751
752 vect_analyze_scalar_cycles_1 (loop_vinfo, loop, slp);
753
754 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
755 Reductions in such inner-loop therefore have different properties than
756 the reductions in the nest that gets vectorized:
757 1. When vectorized, they are executed in the same order as in the original
758 scalar loop, so we can't change the order of computation when
759 vectorizing them.
760 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
761 current checks are too strict. */
762
763 if (loop->inner)
764 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner, slp);
765 }
766
767 /* Transfer group and reduction information from STMT_INFO to its
768 pattern stmt. */
769
770 static void
771 vect_fixup_reduc_chain (stmt_vec_info stmt_info)
772 {
773 stmt_vec_info firstp = STMT_VINFO_RELATED_STMT (stmt_info);
774 stmt_vec_info stmtp;
775 gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp)
776 && REDUC_GROUP_FIRST_ELEMENT (stmt_info));
777 REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info);
778 do
779 {
780 stmtp = STMT_VINFO_RELATED_STMT (stmt_info);
781 gcc_checking_assert (STMT_VINFO_DEF_TYPE (stmtp)
782 == STMT_VINFO_DEF_TYPE (stmt_info));
783 REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp;
784 stmt_info = REDUC_GROUP_NEXT_ELEMENT (stmt_info);
785 if (stmt_info)
786 REDUC_GROUP_NEXT_ELEMENT (stmtp)
787 = STMT_VINFO_RELATED_STMT (stmt_info);
788 }
789 while (stmt_info);
790 }
791
792 /* Fixup scalar cycles that now have their stmts detected as patterns. */
793
794 static void
795 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
796 {
797 stmt_vec_info first;
798 unsigned i;
799
800 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
801 {
802 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
803 while (next)
804 {
805 if ((STMT_VINFO_IN_PATTERN_P (next)
806 != STMT_VINFO_IN_PATTERN_P (first))
807 || STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (next)) == -1)
808 break;
809 next = REDUC_GROUP_NEXT_ELEMENT (next);
810 }
811 /* If all reduction chain members are well-formed patterns adjust
812 the group to group the pattern stmts instead. */
813 if (! next
814 && STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (first)) != -1)
815 {
816 if (STMT_VINFO_IN_PATTERN_P (first))
817 {
818 vect_fixup_reduc_chain (first);
819 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
820 = STMT_VINFO_RELATED_STMT (first);
821 }
822 }
823 /* If not all stmts in the chain are patterns, or if we failed
824 to update STMT_VINFO_REDUC_IDX, dissolve the chain and handle
825 it as a regular reduction instead. */
826 else
827 {
828 stmt_vec_info vinfo = first;
829 stmt_vec_info last = NULL;
830 while (vinfo)
831 {
832 next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
833 REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
834 REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
835 last = vinfo;
836 vinfo = next;
837 }
838 STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize (first))
839 = vect_internal_def;
840 loop_vinfo->reductions.safe_push (vect_stmt_to_vectorize (last));
841 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).unordered_remove (i);
842 --i;
843 }
844 }
845 }
846
847 /* Function vect_get_loop_niters.
848
849 Determine how many iterations the loop is executed and place it
850 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
851 in NUMBER_OF_ITERATIONSM1. Place the condition under which the
852 niter information holds in ASSUMPTIONS.
853
854 Return the loop exit conditions. */
855
856
857 static vec<gcond *>
858 vect_get_loop_niters (class loop *loop, const_edge main_exit, tree *assumptions,
859 tree *number_of_iterations, tree *number_of_iterationsm1)
860 {
861 auto_vec<edge> exits = get_loop_exit_edges (loop);
862 vec<gcond *> conds;
863 conds.create (exits.length ());
864 class tree_niter_desc niter_desc;
865 tree niter_assumptions, niter, may_be_zero;
866
867 *assumptions = boolean_true_node;
868 *number_of_iterationsm1 = chrec_dont_know;
869 *number_of_iterations = chrec_dont_know;
870
871 DUMP_VECT_SCOPE ("get_loop_niters");
872
873 if (exits.is_empty ())
874 return conds;
875
876 if (dump_enabled_p ())
877 dump_printf_loc (MSG_NOTE, vect_location, "Loop has %d exits.\n",
878 exits.length ());
879
880 edge exit;
881 unsigned int i;
882 FOR_EACH_VEC_ELT (exits, i, exit)
883 {
884 gcond *cond = get_loop_exit_condition (exit);
885 if (cond)
886 conds.safe_push (cond);
887
888 if (dump_enabled_p ())
889 dump_printf_loc (MSG_NOTE, vect_location, "Analyzing exit %d...\n", i);
890
891 if (exit != main_exit)
892 continue;
893
894 may_be_zero = NULL_TREE;
895 if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
896 || chrec_contains_undetermined (niter_desc.niter))
897 continue;
898
899 niter_assumptions = niter_desc.assumptions;
900 may_be_zero = niter_desc.may_be_zero;
901 niter = niter_desc.niter;
902
903 if (may_be_zero && integer_zerop (may_be_zero))
904 may_be_zero = NULL_TREE;
905
906 if (may_be_zero)
907 {
908 if (COMPARISON_CLASS_P (may_be_zero))
909 {
910 /* Try to combine may_be_zero with assumptions; this can simplify
911 the computation of the niter expression. */
912 if (niter_assumptions && !integer_nonzerop (niter_assumptions))
913 niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
914 niter_assumptions,
915 fold_build1 (TRUTH_NOT_EXPR,
916 boolean_type_node,
917 may_be_zero));
918 else
919 niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
920 build_int_cst (TREE_TYPE (niter), 0),
921 rewrite_to_non_trapping_overflow (niter));
922
923 may_be_zero = NULL_TREE;
924 }
925 else if (integer_nonzerop (may_be_zero))
926 {
927 *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
928 *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
929 continue;
930 }
931 else
932 continue;
933 }
934
935 /* Loop assumptions are based off the normal exit. */
936 *assumptions = niter_assumptions;
937 *number_of_iterationsm1 = niter;
938
939 /* We want the number of loop header executions which is the number
940 of latch executions plus one.
941 ??? For UINT_MAX latch executions this number overflows to zero
942 for loops like do { n++; } while (n != 0); */
943 if (niter && !chrec_contains_undetermined (niter))
944 {
945 niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter),
946 unshare_expr (niter),
947 build_int_cst (TREE_TYPE (niter), 1));
948 if (TREE_CODE (niter) == INTEGER_CST
949 && TREE_CODE (*number_of_iterationsm1) != INTEGER_CST)
950 {
951 /* If we manage to fold niter + 1 into INTEGER_CST even when
952 niter is some complex expression, ensure back
953 *number_of_iterationsm1 is an INTEGER_CST as well. See
954 PR113210. */
955 *number_of_iterationsm1
956 = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), niter,
957 build_minus_one_cst (TREE_TYPE (niter)));
958 }
959 }
960 *number_of_iterations = niter;
961 }
962
963 if (dump_enabled_p ())
964 dump_printf_loc (MSG_NOTE, vect_location, "All loop exits successfully analyzed.\n");
965
966 return conds;
967 }
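
/* Worked example (editorial, not taken from the GCC sources): for

     for (i = 0; i < n; i++)
       ...

   with n known to be non-zero, the latch executes n - 1 times, so
   NUMBER_OF_ITERATIONSM1 is n - 1 while NUMBER_OF_ITERATIONS (the number of
   header executions) is n.  As noted above, the +1 can wrap to zero when the
   latch executes UINT_MAX times.  */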
968
969 /* Determine the main loop exit for the vectorizer. */
970
971 edge
972 vec_init_loop_exit_info (class loop *loop)
973 {
974 /* Before we begin we must first determine which exit is the main one and
975 which are auxiliary exits. */
976 auto_vec<edge> exits = get_loop_exit_edges (loop);
977 if (exits.length () == 1)
978 return exits[0];
979
980 /* If we have multiple exits we only support counting IV at the moment.
981 Analyze all exits and return the last one we can analyze. */
982 class tree_niter_desc niter_desc;
983 edge candidate = NULL;
984 for (edge exit : exits)
985 {
986 if (!get_loop_exit_condition (exit))
987 continue;
988
989 if (number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
990 && !chrec_contains_undetermined (niter_desc.niter))
991 {
992 tree may_be_zero = niter_desc.may_be_zero;
993 if ((integer_zerop (may_be_zero)
994 /* As we handle a may_be_zero that is not known to be false by
995 rewriting niter to may_be_zero ? 0 : niter, we require
996 an empty latch. */
997 || (single_pred_p (loop->latch)
998 && exit->src == single_pred (loop->latch)
999 && (integer_nonzerop (may_be_zero)
1000 || COMPARISON_CLASS_P (may_be_zero))))
1001 && (!candidate
1002 || dominated_by_p (CDI_DOMINATORS, exit->src,
1003 candidate->src)))
1004 candidate = exit;
1005 }
1006 }
1007
1008 return candidate;
1009 }
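
/* Illustrative example (editorial, not taken from the GCC sources): for an
   early-break loop such as

     for (i = 0; i < n; i++)
       if (a[i] == key)
         break;

   both the counting 'i < n' exit and the data-dependent 'a[i] == key' exit
   are collected, but only the former has an analyzable niter, so it is the
   one selected as the main exit.  */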
1010
1011 /* Function bb_in_loop_p
1012
1013 Used as predicate for dfs order traversal of the loop bbs. */
1014
1015 static bool
1016 bb_in_loop_p (const_basic_block bb, const void *data)
1017 {
1018 const class loop *const loop = (const class loop *)data;
1019 if (flow_bb_inside_loop_p (loop, bb))
1020 return true;
1021 return false;
1022 }
1023
1024
1025 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
1026 stmt_vec_info structs for all the stmts in LOOP_IN. */
1027
1028 _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
1029 : vec_info (vec_info::loop, shared),
1030 loop (loop_in),
1031 bbs (XCNEWVEC (basic_block, loop->num_nodes)),
1032 num_itersm1 (NULL_TREE),
1033 num_iters (NULL_TREE),
1034 num_iters_unchanged (NULL_TREE),
1035 num_iters_assumptions (NULL_TREE),
1036 vector_costs (nullptr),
1037 scalar_costs (nullptr),
1038 th (0),
1039 versioning_threshold (0),
1040 vectorization_factor (0),
1041 main_loop_edge (nullptr),
1042 skip_main_loop_edge (nullptr),
1043 skip_this_loop_edge (nullptr),
1044 reusable_accumulators (),
1045 suggested_unroll_factor (1),
1046 max_vectorization_factor (0),
1047 mask_skip_niters (NULL_TREE),
1048 rgroup_compare_type (NULL_TREE),
1049 simd_if_cond (NULL_TREE),
1050 partial_vector_style (vect_partial_vectors_none),
1051 unaligned_dr (NULL),
1052 peeling_for_alignment (0),
1053 ptr_mask (0),
1054 ivexpr_map (NULL),
1055 scan_map (NULL),
1056 slp_unrolling_factor (1),
1057 inner_loop_cost_factor (param_vect_inner_loop_cost_factor),
1058 vectorizable (false),
1059 can_use_partial_vectors_p (param_vect_partial_vector_usage != 0),
1060 using_partial_vectors_p (false),
1061 using_decrementing_iv_p (false),
1062 using_select_vl_p (false),
1063 epil_using_partial_vectors_p (false),
1064 partial_load_store_bias (0),
1065 peeling_for_gaps (false),
1066 peeling_for_niter (false),
1067 early_breaks (false),
1068 no_data_dependencies (false),
1069 has_mask_store (false),
1070 scalar_loop_scaling (profile_probability::uninitialized ()),
1071 scalar_loop (NULL),
1072 orig_loop_info (NULL),
1073 vec_loop_iv_exit (NULL),
1074 vec_epilogue_loop_iv_exit (NULL),
1075 scalar_loop_iv_exit (NULL)
1076 {
1077 /* CHECKME: We want to visit all BBs before their successors (except for
1078 latch blocks, for which this assertion wouldn't hold). In the simple
1079 case of the loop forms we allow, a dfs order of the BBs would be the same
1080 as reversed postorder traversal, so we are safe. */
1081
1082 unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
1083 bbs, loop->num_nodes, loop);
1084 gcc_assert (nbbs == loop->num_nodes);
1085
1086 for (unsigned int i = 0; i < nbbs; i++)
1087 {
1088 basic_block bb = bbs[i];
1089 gimple_stmt_iterator si;
1090
1091 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1092 {
1093 gimple *phi = gsi_stmt (si);
1094 gimple_set_uid (phi, 0);
1095 add_stmt (phi);
1096 }
1097
1098 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1099 {
1100 gimple *stmt = gsi_stmt (si);
1101 gimple_set_uid (stmt, 0);
1102 if (is_gimple_debug (stmt))
1103 continue;
1104 add_stmt (stmt);
1105 /* If a .GOMP_SIMD_LANE call for the current loop has 3 arguments, the
1106 third argument is the #pragma omp simd if (x) condition: when it is 0
1107 the loop shouldn't be vectorized, when it is a non-zero constant it
1108 should be vectorized normally, and otherwise the loop is versioned,
1109 with the vectorized copy used when the condition is non-zero at runtime. */
1110 if (loop_in->simduid
1111 && is_gimple_call (stmt)
1112 && gimple_call_internal_p (stmt)
1113 && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
1114 && gimple_call_num_args (stmt) >= 3
1115 && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
1116 && (loop_in->simduid
1117 == SSA_NAME_VAR (gimple_call_arg (stmt, 0))))
1118 {
1119 tree arg = gimple_call_arg (stmt, 2);
1120 if (integer_zerop (arg) || TREE_CODE (arg) == SSA_NAME)
1121 simd_if_cond = arg;
1122 else
1123 gcc_assert (integer_nonzerop (arg));
1124 }
1125 }
1126 }
1127
1128 epilogue_vinfos.create (6);
1129 }
1130
1131 /* Free all levels of rgroup CONTROLS. */
1132
1133 void
1134 release_vec_loop_controls (vec<rgroup_controls> *controls)
1135 {
1136 rgroup_controls *rgc;
1137 unsigned int i;
1138 FOR_EACH_VEC_ELT (*controls, i, rgc)
1139 rgc->controls.release ();
1140 controls->release ();
1141 }
1142
1143 /* Free all memory used by the _loop_vec_info, as well as all the
1144 stmt_vec_info structs of all the stmts in the loop. */
1145
1146 _loop_vec_info::~_loop_vec_info ()
1147 {
1148 free (bbs);
1149
1150 release_vec_loop_controls (&masks.rgc_vec);
1151 release_vec_loop_controls (&lens);
1152 delete ivexpr_map;
1153 delete scan_map;
1154 epilogue_vinfos.release ();
1155 delete scalar_costs;
1156 delete vector_costs;
1157
1158 /* When we release an epilogue vinfo that we do not intend to use
1159 avoid clearing AUX of the main loop which should continue to
1160 point to the main loop vinfo since otherwise we'll leak that. */
1161 if (loop->aux == this)
1162 loop->aux = NULL;
1163 }
1164
1165 /* Return an invariant or register for EXPR and emit necessary
1166 computations in the LOOP_VINFO loop preheader. */
1167
1168 tree
1169 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
1170 {
1171 if (is_gimple_reg (expr)
1172 || is_gimple_min_invariant (expr))
1173 return expr;
1174
1175 if (! loop_vinfo->ivexpr_map)
1176 loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
1177 tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
1178 if (! cached)
1179 {
1180 gimple_seq stmts = NULL;
1181 cached = force_gimple_operand (unshare_expr (expr),
1182 &stmts, true, NULL_TREE);
1183 if (stmts)
1184 {
1185 edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1186 gsi_insert_seq_on_edge_immediate (e, stmts);
1187 }
1188 }
1189 return cached;
1190 }
1191
1192 /* Return true if we can use CMP_TYPE as the comparison type to produce
1193 all masks required to mask LOOP_VINFO. */
1194
1195 static bool
1196 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
1197 {
1198 rgroup_controls *rgm;
1199 unsigned int i;
1200 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec, i, rgm)
1201 if (rgm->type != NULL_TREE
1202 && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
1203 cmp_type, rgm->type,
1204 OPTIMIZE_FOR_SPEED))
1205 return false;
1206 return true;
1207 }
1208
1209 /* Calculate the maximum number of scalars per iteration for every
1210 rgroup in LOOP_VINFO. */
1211
1212 static unsigned int
1213 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
1214 {
1215 unsigned int res = 1;
1216 unsigned int i;
1217 rgroup_controls *rgm;
1218 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec, i, rgm)
1219 res = MAX (res, rgm->max_nscalars_per_iter);
1220 return res;
1221 }
1222
1223 /* Calculate the minimum precision necessary to represent:
1224
1225 MAX_NITERS * FACTOR
1226
1227 as an unsigned integer, where MAX_NITERS is the maximum number of
1228 loop header iterations for the original scalar form of LOOP_VINFO. */
1229
1230 static unsigned
1231 vect_min_prec_for_max_niters (loop_vec_info loop_vinfo, unsigned int factor)
1232 {
1233 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1234
1235 /* Get the maximum number of iterations that is representable
1236 in the counter type. */
1237 tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
1238 widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
1239
1240 /* Get a more refined estimate for the number of iterations. */
1241 widest_int max_back_edges;
1242 if (max_loop_iterations (loop, &max_back_edges))
1243 max_ni = wi::smin (max_ni, max_back_edges + 1);
1244
1245 /* Work out how many bits we need to represent the limit. */
1246 return wi::min_precision (max_ni * factor, UNSIGNED);
1247 }
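
/* Worked example (editorial, not taken from the GCC sources): if the niters
   type is 32-bit unsigned but max_loop_iterations proves the loop runs at
   most 1000 times, then with FACTOR == 2 the limit is 2000 and
   wi::min_precision (2000, UNSIGNED) == 11, so any counter of 11 or more
   bits is wide enough.  */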
1248
1249 /* True if the loop needs peeling or partial vectors when vectorized. */
1250
1251 static bool
1252 vect_need_peeling_or_partial_vectors_p (loop_vec_info loop_vinfo)
1253 {
1254 unsigned HOST_WIDE_INT const_vf;
1255 HOST_WIDE_INT max_niter
1256 = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1257
1258 unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
1259 if (!th && LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
1260 th = LOOP_VINFO_COST_MODEL_THRESHOLD (LOOP_VINFO_ORIG_LOOP_INFO
1261 (loop_vinfo));
1262
1263 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1264 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
1265 {
1266 /* Work out the (constant) number of iterations that need to be
1267 peeled for reasons other than niters. */
1268 unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1269 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
1270 peel_niter += 1;
1271 if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
1272 LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1273 return true;
1274 }
1275 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1276 /* ??? When peeling for gaps but not alignment, we could
1277 try to check whether the (variable) niters is known to be
1278 VF * N + 1. That's something of a niche case though. */
1279 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1280 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
1281 || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
1282 < (unsigned) exact_log2 (const_vf))
1283 /* In case of versioning, check if the maximum number of
1284 iterations is greater than th. If they are identical,
1285 the epilogue is unnecessary. */
1286 && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
1287 || ((unsigned HOST_WIDE_INT) max_niter
1288 /* We'd like to use LOOP_VINFO_VERSIONING_THRESHOLD
1289 but that's only computed later based on our result.
1290 The following is the most conservative approximation. */
1291 > (std::max ((unsigned HOST_WIDE_INT) th,
1292 const_vf) / const_vf) * const_vf))))
1293 return true;
1294
1295 return false;
1296 }
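
/* Worked example (editorial, not taken from the GCC sources): with a known
   niters of 100, VF == 8 and no peeling for alignment or gaps, 100 is not a
   multiple of 8, so the function returns true and an epilogue (or partial
   vectors) is required; with a niters of 96 the main vector loop covers
   every iteration and the function returns false.  */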
1297
1298 /* Each statement in LOOP_VINFO can be masked where necessary. Check
1299 whether we can actually generate the masks required. Return true if so,
1300 storing the type of the scalar IV in LOOP_VINFO_RGROUP_COMPARE_TYPE. */
1301
1302 static bool
1303 vect_verify_full_masking (loop_vec_info loop_vinfo)
1304 {
1305 unsigned int min_ni_width;
1306
1307 /* Use a normal loop if there are no statements that need masking.
1308 This only happens in rare degenerate cases: it means that the loop
1309 has no loads, no stores, and no live-out values. */
1310 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1311 return false;
1312
1313 /* Produce the rgroup controls. */
1314 for (auto mask : LOOP_VINFO_MASKS (loop_vinfo).mask_set)
1315 {
1316 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
1317 tree vectype = mask.first;
1318 unsigned nvectors = mask.second;
1319
1320 if (masks->rgc_vec.length () < nvectors)
1321 masks->rgc_vec.safe_grow_cleared (nvectors, true);
1322 rgroup_controls *rgm = &(*masks).rgc_vec[nvectors - 1];
1323 /* The number of scalars per iteration and the number of vectors are
1324 both compile-time constants. */
1325 unsigned int nscalars_per_iter
1326 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
1327 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
1328
1329 if (rgm->max_nscalars_per_iter < nscalars_per_iter)
1330 {
1331 rgm->max_nscalars_per_iter = nscalars_per_iter;
1332 rgm->type = truth_type_for (vectype);
1333 rgm->factor = 1;
1334 }
1335 }
1336
1337 unsigned int max_nscalars_per_iter
1338 = vect_get_max_nscalars_per_iter (loop_vinfo);
1339
1340 /* Work out how many bits we need to represent the limit. */
1341 min_ni_width
1342 = vect_min_prec_for_max_niters (loop_vinfo, max_nscalars_per_iter);
1343
1344 /* Find a scalar mode for which WHILE_ULT is supported. */
1345 opt_scalar_int_mode cmp_mode_iter;
1346 tree cmp_type = NULL_TREE;
1347 tree iv_type = NULL_TREE;
1348 widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
1349 unsigned int iv_precision = UINT_MAX;
1350
1351 if (iv_limit != -1)
1352 iv_precision = wi::min_precision (iv_limit * max_nscalars_per_iter,
1353 UNSIGNED);
1354
1355 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1356 {
1357 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1358 if (cmp_bits >= min_ni_width
1359 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1360 {
1361 tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1362 if (this_type
1363 && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1364 {
1365 /* Although we could stop as soon as we find a valid mode,
1366 there are at least two reasons why that's not always the
1367 best choice:
1368
1369 - An IV that's Pmode or wider is more likely to be reusable
1370 in address calculations than an IV that's narrower than
1371 Pmode.
1372
1373 - Doing the comparison in IV_PRECISION or wider allows
1374 a natural 0-based IV, whereas using a narrower comparison
1375 type requires mitigations against wrap-around.
1376
1377 Conversely, if the IV limit is variable, doing the comparison
1378 in a wider type than the original type can introduce
1379 unnecessary extensions, so picking the widest valid mode
1380 is not always a good choice either.
1381
1382 Here we prefer the first IV type that's Pmode or wider,
1383 and the first comparison type that's IV_PRECISION or wider.
1384 (The comparison type must be no wider than the IV type,
1385 to avoid extensions in the vector loop.)
1386
1387 ??? We might want to try continuing beyond Pmode for ILP32
1388 targets if CMP_BITS < IV_PRECISION. */
1389 iv_type = this_type;
1390 if (!cmp_type || iv_precision > TYPE_PRECISION (cmp_type))
1391 cmp_type = this_type;
1392 if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1393 break;
1394 }
1395 }
1396 }
1397
1398 if (!cmp_type)
1399 {
1400 LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.release ();
1401 return false;
1402 }
1403
1404 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = cmp_type;
1405 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1406 LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_while_ult;
1407 return true;
1408 }
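
/* Illustrative sketch (editorial, not taken from the GCC sources) of the
   WHILE_ULT style of full masking validated above, roughly:

     for (i = 0; i < n; i += VF)
       {
         mask = .WHILE_ULT (i, n);            // lane k active iff i + k < n
         vb = .MASK_LOAD (&b[i], mask);
         vc = .MASK_LOAD (&c[i], mask);
         .MASK_STORE (&a[i], mask, vb + vc);
       }

   The scalar type of 'i' and 'n' in that comparison is the rgroup compare
   type chosen above.  */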
1409
1410 /* Each statement in LOOP_VINFO can be masked where necessary. Check
1411 whether we can actually generate AVX512 style masks. Return true if so,
1412 storing the type of the scalar IV in LOOP_VINFO_RGROUP_IV_TYPE. */
1413
1414 static bool
1415 vect_verify_full_masking_avx512 (loop_vec_info loop_vinfo)
1416 {
1417 /* Produce a differently organized rgc_vec and check in a different
1418 way that we can produce the required masks. */
1419
1420 /* Use a normal loop if there are no statements that need masking.
1421 This only happens in rare degenerate cases: it means that the loop
1422 has no loads, no stores, and no live-out values. */
1423 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1424 return false;
1425
1426 /* For the decrementing IV we need to represent all values in
1427 [0, niter + niter_skip] where niter_skip is the elements we
1428 skip in the first iteration for prologue peeling. */
1429 tree iv_type = NULL_TREE;
1430 widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
1431 unsigned int iv_precision = UINT_MAX;
1432 if (iv_limit != -1)
1433 iv_precision = wi::min_precision (iv_limit, UNSIGNED);
1434
1435 /* First compute the type for the IV we use to track the remaining
1436 scalar iterations. */
1437 opt_scalar_int_mode cmp_mode_iter;
1438 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1439 {
1440 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1441 if (cmp_bits >= iv_precision
1442 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1443 {
1444 iv_type = build_nonstandard_integer_type (cmp_bits, true);
1445 if (iv_type)
1446 break;
1447 }
1448 }
1449 if (!iv_type)
1450 return false;
1451
1452 /* Produce the rgroup controls. */
1453 for (auto const &mask : LOOP_VINFO_MASKS (loop_vinfo).mask_set)
1454 {
1455 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
1456 tree vectype = mask.first;
1457 unsigned nvectors = mask.second;
1458
1459 /* The number of scalars per iteration and the number of vectors are
1460 both compile-time constants. */
1461 unsigned int nscalars_per_iter
1462 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
1463 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
1464
1465 /* We index the rgroup_controls vector with nscalars_per_iter
1466 which we keep constant and instead have a varying nvectors,
1467 remembering the vector mask with the fewest nV. */
1468 if (masks->rgc_vec.length () < nscalars_per_iter)
1469 masks->rgc_vec.safe_grow_cleared (nscalars_per_iter, true);
1470 rgroup_controls *rgm = &(*masks).rgc_vec[nscalars_per_iter - 1];
1471
1472 if (!rgm->type || rgm->factor > nvectors)
1473 {
1474 rgm->type = truth_type_for (vectype);
1475 rgm->compare_type = NULL_TREE;
1476 rgm->max_nscalars_per_iter = nscalars_per_iter;
1477 rgm->factor = nvectors;
1478 rgm->bias_adjusted_ctrl = NULL_TREE;
1479 }
1480 }
1481
1482 /* There is no fixed compare type we are going to use but we have to
1483 be able to get at one for each mask group. */
1484 unsigned int min_ni_width
1485 = wi::min_precision (vect_max_vf (loop_vinfo), UNSIGNED);
1486
1487 bool ok = true;
1488 for (auto &rgc : LOOP_VINFO_MASKS (loop_vinfo).rgc_vec)
1489 {
1490 tree mask_type = rgc.type;
1491 if (!mask_type)
1492 continue;
1493
1494 /* For now vect_get_loop_mask only supports integer mode masks
1495 when we need to split it. */
1496 if (GET_MODE_CLASS (TYPE_MODE (mask_type)) != MODE_INT
1497 || TYPE_PRECISION (TREE_TYPE (mask_type)) != 1)
1498 {
1499 ok = false;
1500 break;
1501 }
1502
1503 /* If iv_type is usable as compare type use that - we can elide the
1504 saturation in that case. */
1505 if (TYPE_PRECISION (iv_type) >= min_ni_width)
1506 {
1507 tree cmp_vectype
1508 = build_vector_type (iv_type, TYPE_VECTOR_SUBPARTS (mask_type));
1509 if (expand_vec_cmp_expr_p (cmp_vectype, mask_type, LT_EXPR))
1510 rgc.compare_type = cmp_vectype;
1511 }
1512 if (!rgc.compare_type)
1513 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1514 {
1515 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1516 if (cmp_bits >= min_ni_width
1517 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1518 {
1519 tree cmp_type = build_nonstandard_integer_type (cmp_bits, true);
1520 if (!cmp_type)
1521 continue;
1522
1523 /* Check whether we can produce the mask with cmp_type. */
1524 tree cmp_vectype
1525 = build_vector_type (cmp_type, TYPE_VECTOR_SUBPARTS (mask_type));
1526 if (expand_vec_cmp_expr_p (cmp_vectype, mask_type, LT_EXPR))
1527 {
1528 rgc.compare_type = cmp_vectype;
1529 break;
1530 }
1531 }
1532 }
1533 if (!rgc.compare_type)
1534 {
1535 ok = false;
1536 break;
1537 }
1538 }
1539 if (!ok)
1540 {
1541 release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo).rgc_vec);
1542 return false;
1543 }
1544
1545 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = error_mark_node;
1546 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1547 LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_avx512;
1548 return true;
1549 }
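
/* Illustrative sketch (editorial, not taken from the GCC sources): in the
   AVX512 style validated above the masks are not produced by WHILE_ULT but
   by a vector comparison of a step vector against the remaining scalar
   iteration count, roughly

     mask = { 0, 1, ..., N-1 } < { remain, remain, ..., remain };

   which is why each mask group needs a vector compare type (compare_type)
   for which the target can expand such an LT_EXPR producing the mask.  */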
1550
1551 /* Check whether we can use vector access with length based on precision
1552 comparison. So far, to keep it simple, we only allow the case that the
1553 precision of the target supported length is larger than the precision
1554 required by loop niters. */
1555
1556 static bool
1557 vect_verify_loop_lens (loop_vec_info loop_vinfo)
1558 {
1559 if (LOOP_VINFO_LENS (loop_vinfo).is_empty ())
1560 return false;
1561
1562 machine_mode len_load_mode, len_store_mode;
1563 if (!get_len_load_store_mode (loop_vinfo->vector_mode, true)
1564 .exists (&len_load_mode))
1565 return false;
1566 if (!get_len_load_store_mode (loop_vinfo->vector_mode, false)
1567 .exists (&len_store_mode))
1568 return false;
1569
1570 signed char partial_load_bias = internal_len_load_store_bias
1571 (IFN_LEN_LOAD, len_load_mode);
1572
1573 signed char partial_store_bias = internal_len_load_store_bias
1574 (IFN_LEN_STORE, len_store_mode);
1575
1576 gcc_assert (partial_load_bias == partial_store_bias);
1577
1578 if (partial_load_bias == VECT_PARTIAL_BIAS_UNSUPPORTED)
1579 return false;
1580
1581 /* If the backend requires a bias of -1 for LEN_LOAD, we must not emit
1582 len_loads with a length of zero. In order to avoid that we prohibit
1583 more than one loop length here. */
1584 if (partial_load_bias == -1
1585 && LOOP_VINFO_LENS (loop_vinfo).length () > 1)
1586 return false;
1587
1588 LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) = partial_load_bias;
1589
1590 unsigned int max_nitems_per_iter = 1;
1591 unsigned int i;
1592 rgroup_controls *rgl;
1593 /* Find the maximum number of items per iteration for every rgroup. */
1594 FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), i, rgl)
1595 {
1596 unsigned nitems_per_iter = rgl->max_nscalars_per_iter * rgl->factor;
1597 max_nitems_per_iter = MAX (max_nitems_per_iter, nitems_per_iter);
1598 }
1599
1600 /* Work out how many bits we need to represent the length limit. */
1601 unsigned int min_ni_prec
1602 = vect_min_prec_for_max_niters (loop_vinfo, max_nitems_per_iter);
1603
1604 /* Now use the maximum of below precisions for one suitable IV type:
1605 - the IV's natural precision
1606 - the precision needed to hold: the maximum number of scalar
1607 iterations multiplied by the scale factor (min_ni_prec above)
1608 - the Pmode precision
1609
1610 If min_ni_prec is less than the precision of the current niters,
1611 we prefer to still use the niters type. Prefer to use Pmode and
1612 wider IV to avoid narrow conversions. */
1613
1614 unsigned int ni_prec
1615 = TYPE_PRECISION (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)));
1616 min_ni_prec = MAX (min_ni_prec, ni_prec);
1617 min_ni_prec = MAX (min_ni_prec, GET_MODE_BITSIZE (Pmode));
1618
1619 tree iv_type = NULL_TREE;
1620 opt_scalar_int_mode tmode_iter;
1621 FOR_EACH_MODE_IN_CLASS (tmode_iter, MODE_INT)
1622 {
1623 scalar_mode tmode = tmode_iter.require ();
1624 unsigned int tbits = GET_MODE_BITSIZE (tmode);
1625
1626 /* ??? Do we really want to construct one IV whose precision exceeds
1627 BITS_PER_WORD? */
1628 if (tbits > BITS_PER_WORD)
1629 break;
1630
1631 /* Find the first available standard integral type. */
1632 if (tbits >= min_ni_prec && targetm.scalar_mode_supported_p (tmode))
1633 {
1634 iv_type = build_nonstandard_integer_type (tbits, true);
1635 break;
1636 }
1637 }
1638
1639 if (!iv_type)
1640 {
1641 if (dump_enabled_p ())
1642 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1643 "can't vectorize with length-based partial vectors"
1644 " because there is no suitable iv type.\n");
1645 return false;
1646 }
1647
1648 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = iv_type;
1649 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1650 LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_len;
1651
1652 return true;
1653 }
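
/* Illustrative sketch (editorial, not taken from the GCC sources) of
   length-based partial vectors as validated above, using the IV type chosen
   from min_ni_prec; schematically:

     remain = n;
     do
       {
         len = MIN (remain, VF);
         vb = .LEN_LOAD (&b[i], len, bias);
         ...
         .LEN_STORE (&a[i], len, bias, va);
         remain -= len;
       }
     while (remain != 0);

   BIAS is the target-reported LEN_LOAD/LEN_STORE bias checked above
   (0 or -1).  */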
1654
1655 /* Calculate the cost of one scalar iteration of the loop. */
1656 static void
1657 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1658 {
1659 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1660 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1661 int nbbs = loop->num_nodes, factor;
1662 int innerloop_iters, i;
1663
1664 DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");
1665
1666 /* Gather costs for statements in the scalar loop. */
1667
1668 /* FORNOW. */
1669 innerloop_iters = 1;
1670 if (loop->inner)
1671 innerloop_iters = LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo);
1672
1673 for (i = 0; i < nbbs; i++)
1674 {
1675 gimple_stmt_iterator si;
1676 basic_block bb = bbs[i];
1677
1678 if (bb->loop_father == loop->inner)
1679 factor = innerloop_iters;
1680 else
1681 factor = 1;
1682
1683 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1684 {
1685 gimple *stmt = gsi_stmt (si);
1686 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
1687
1688 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1689 continue;
1690
1691 /* Skip stmts that are not vectorized inside the loop. */
1692 stmt_vec_info vstmt_info = vect_stmt_to_vectorize (stmt_info);
1693 if (!STMT_VINFO_RELEVANT_P (vstmt_info)
1694 && (!STMT_VINFO_LIVE_P (vstmt_info)
1695 || !VECTORIZABLE_CYCLE_DEF
1696 (STMT_VINFO_DEF_TYPE (vstmt_info))))
1697 continue;
1698
1699 vect_cost_for_stmt kind;
1700 if (STMT_VINFO_DATA_REF (stmt_info))
1701 {
1702 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1703 kind = scalar_load;
1704 else
1705 kind = scalar_store;
1706 }
1707 else if (vect_nop_conversion_p (stmt_info))
1708 continue;
1709 else
1710 kind = scalar_stmt;
1711
1712 /* We are using vect_prologue here to avoid scaling twice
1713 by the inner loop factor. */
1714 record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1715 factor, kind, stmt_info, 0, vect_prologue);
1716 }
1717 }
1718
1719 /* Now accumulate cost. */
1720 loop_vinfo->scalar_costs = init_cost (loop_vinfo, true);
1721 add_stmt_costs (loop_vinfo->scalar_costs,
1722 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo));
1723 loop_vinfo->scalar_costs->finish_cost (nullptr);
1724 }
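/* Illustrative sketch only (guarded out, not vectorizer API): conceptually
   the function above forms a weighted sum of per-statement scalar costs,
   with statements in the inner loop weighted by the inner-loop cost
   factor.  The struct and helper below are assumptions made for the
   example.  */
#if 0
struct example_scalar_stmt
{
  int cost;		/* Cost of one scalar execution.  */
  bool in_inner_loop;	/* Executed innerloop_iters times per outer iter.  */
};

static int
example_scalar_iteration_cost (const example_scalar_stmt *stmts, int n,
			       int inner_loop_factor)
{
  int total = 0;
  for (int i = 0; i < n; i++)
    total += stmts[i].cost * (stmts[i].in_inner_loop ? inner_loop_factor : 1);
  return total;
}
#endif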
1725
1726 /* Function vect_analyze_loop_form.
1727
1728 Verify that certain CFG restrictions hold, including:
1729 - the loop has a pre-header
1730 - the loop has a single entry
1731 - nested loops can have only a single exit
1732 - the loop exit condition is simple enough
1733 - the number of iterations can be analyzed, i.e., a countable loop. The
1734 niter could be analyzed under some assumptions. */
1735
1736 opt_result
1737 vect_analyze_loop_form (class loop *loop, vect_loop_form_info *info)
1738 {
1739 DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1740
1741 edge exit_e = vec_init_loop_exit_info (loop);
1742 if (!exit_e)
1743 return opt_result::failure_at (vect_location,
1744 "not vectorized:"
1745 " could not determine main exit from"
1746 " loop with multiple exits.\n");
1747 info->loop_exit = exit_e;
1748 if (dump_enabled_p ())
1749 dump_printf_loc (MSG_NOTE, vect_location,
1750 "using as main loop exit: %d -> %d [AUX: %p]\n",
1751 exit_e->src->index, exit_e->dest->index, exit_e->aux);
1752
1753 /* Check if we have any control flow that doesn't leave the loop. */
1754 class loop *v_loop = loop->inner ? loop->inner : loop;
1755 basic_block *bbs = get_loop_body (v_loop);
1756 for (unsigned i = 0; i < v_loop->num_nodes; i++)
1757 if (EDGE_COUNT (bbs[i]->succs) != 1
1758 && (EDGE_COUNT (bbs[i]->succs) != 2
1759 || !loop_exits_from_bb_p (bbs[i]->loop_father, bbs[i])))
1760 {
1761 free (bbs);
1762 return opt_result::failure_at (vect_location,
1763 "not vectorized:"
1764 " unsupported control flow in loop.\n");
1765 }
1766 free (bbs);
1767
1768 /* Different restrictions apply when we are considering an inner-most loop,
1769 vs. an outer (nested) loop.
1770 (FORNOW. May want to relax some of these restrictions in the future). */
1771
1772 info->inner_loop_cond = NULL;
1773 if (!loop->inner)
1774 {
1775 /* Inner-most loop. */
1776
1777 if (empty_block_p (loop->header))
1778 return opt_result::failure_at (vect_location,
1779 "not vectorized: empty loop.\n");
1780 }
1781 else
1782 {
1783 class loop *innerloop = loop->inner;
1784 edge entryedge;
1785
1786 /* Nested loop. We currently require that the loop is doubly-nested and
1787 contains a single inner loop with a single exit to the block
1788 holding the single exit condition of the outer loop.
1789 Vectorizable outer-loops look like this:
1790
1791 (pre-header)
1792 |
1793 header <---+
1794 | |
1795 inner-loop |
1796 | |
1797 tail ------+
1798 |
1799 (exit-bb)
1800
1801 The inner-loop also has the properties expected of inner-most loops
1802 as described above. */
1803
1804 if ((loop->inner)->inner || (loop->inner)->next)
1805 return opt_result::failure_at (vect_location,
1806 "not vectorized:"
1807 " multiple nested loops.\n");
1808
1809 entryedge = loop_preheader_edge (innerloop);
1810 if (entryedge->src != loop->header
1811 || !single_exit (innerloop)
1812 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1813 return opt_result::failure_at (vect_location,
1814 "not vectorized:"
1815 " unsupported outerloop form.\n");
1816
1817 /* Analyze the inner-loop. */
1818 vect_loop_form_info inner;
1819 opt_result res = vect_analyze_loop_form (loop->inner, &inner);
1820 if (!res)
1821 {
1822 if (dump_enabled_p ())
1823 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1824 "not vectorized: Bad inner loop.\n");
1825 return res;
1826 }
1827
1828 /* Don't support analyzing niter under assumptions for inner
1829 loop. */
1830 if (!integer_onep (inner.assumptions))
1831 return opt_result::failure_at (vect_location,
1832 "not vectorized: Bad inner loop.\n");
1833
1834 if (!expr_invariant_in_loop_p (loop, inner.number_of_iterations))
1835 return opt_result::failure_at (vect_location,
1836 "not vectorized: inner-loop count not"
1837 " invariant.\n");
1838
1839 if (dump_enabled_p ())
1840 dump_printf_loc (MSG_NOTE, vect_location,
1841 "Considering outer-loop vectorization.\n");
1842 info->inner_loop_cond = inner.conds[0];
1843 }
1844
1845 if (EDGE_COUNT (loop->header->preds) != 2)
1846 return opt_result::failure_at (vect_location,
1847 "not vectorized:"
1848 " too many incoming edges.\n");
1849
1850 /* We assume that the latch is empty. */
1851 if (!empty_block_p (loop->latch)
1852 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1853 return opt_result::failure_at (vect_location,
1854 "not vectorized: latch block not empty.\n");
1855
1856 /* Make sure there is no abnormal exit. */
1857 auto_vec<edge> exits = get_loop_exit_edges (loop);
1858 for (edge e : exits)
1859 {
1860 if (e->flags & EDGE_ABNORMAL)
1861 return opt_result::failure_at (vect_location,
1862 "not vectorized:"
1863 " abnormal loop exit edge.\n");
1864 }
1865
1866 info->conds
1867 = vect_get_loop_niters (loop, exit_e, &info->assumptions,
1868 &info->number_of_iterations,
1869 &info->number_of_iterationsm1);
1870 if (info->conds.is_empty ())
1871 return opt_result::failure_at
1872 (vect_location,
1873 "not vectorized: complicated exit condition.\n");
1874
1875 /* Determine what the primary and alternate exit conds are. */
1876 for (unsigned i = 0; i < info->conds.length (); i++)
1877 {
1878 gcond *cond = info->conds[i];
1879 if (exit_e->src == gimple_bb (cond))
1880 std::swap (info->conds[0], info->conds[i]);
1881 }
1882
1883 if (integer_zerop (info->assumptions)
1884 || !info->number_of_iterations
1885 || chrec_contains_undetermined (info->number_of_iterations))
1886 return opt_result::failure_at
1887 (info->conds[0],
1888 "not vectorized: number of iterations cannot be computed.\n");
1889
1890 if (integer_zerop (info->number_of_iterations))
1891 return opt_result::failure_at
1892 (info->conds[0],
1893 "not vectorized: number of iterations = 0.\n");
1894
1895 if (!(tree_fits_shwi_p (info->number_of_iterations)
1896 && tree_to_shwi (info->number_of_iterations) > 0))
1897 {
1898 if (dump_enabled_p ())
1899 {
1900 dump_printf_loc (MSG_NOTE, vect_location,
1901 "Symbolic number of iterations is ");
1902 dump_generic_expr (MSG_NOTE, TDF_DETAILS, info->number_of_iterations);
1903 dump_printf (MSG_NOTE, "\n");
1904 }
1905 }
1906
1907 return opt_result::success ();
1908 }
1909
1910 /* Create a loop_vec_info for LOOP with SHARED and the
1911 vect_analyze_loop_form result. */
1912
1913 loop_vec_info
1914 vect_create_loop_vinfo (class loop *loop, vec_info_shared *shared,
1915 const vect_loop_form_info *info,
1916 loop_vec_info main_loop_info)
1917 {
1918 loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
1919 LOOP_VINFO_NITERSM1 (loop_vinfo) = info->number_of_iterationsm1;
1920 LOOP_VINFO_NITERS (loop_vinfo) = info->number_of_iterations;
1921 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = info->number_of_iterations;
1922 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = main_loop_info;
1923 /* Also record the assumptions for versioning. */
1924 if (!integer_onep (info->assumptions) && !main_loop_info)
1925 LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = info->assumptions;
1926
1927 for (gcond *cond : info->conds)
1928 {
1929 stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (cond);
1930 STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type;
1931 /* Mark the statement as a condition. */
1932 STMT_VINFO_DEF_TYPE (loop_cond_info) = vect_condition_def;
1933 }
1934
1935 for (unsigned i = 1; i < info->conds.length (); i ++)
1936 LOOP_VINFO_LOOP_CONDS (loop_vinfo).safe_push (info->conds[i]);
1937 LOOP_VINFO_LOOP_IV_COND (loop_vinfo) = info->conds[0];
1938
1939 LOOP_VINFO_IV_EXIT (loop_vinfo) = info->loop_exit;
1940
1941 /* Check to see if we're vectorizing multiple exits. */
1942 LOOP_VINFO_EARLY_BREAKS (loop_vinfo)
1943 = !LOOP_VINFO_LOOP_CONDS (loop_vinfo).is_empty ();
1944
1945 if (info->inner_loop_cond)
1946 {
1947 stmt_vec_info inner_loop_cond_info
1948 = loop_vinfo->lookup_stmt (info->inner_loop_cond);
1949 STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type;
1950 /* If we have an estimate on the number of iterations of the inner
1951 loop use that to limit the scale for costing, otherwise use
1952 --param vect-inner-loop-cost-factor literally. */
1953 widest_int nit;
1954 if (estimated_stmt_executions (loop->inner, &nit))
1955 LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo)
1956 = wi::smin (nit, param_vect_inner_loop_cost_factor).to_uhwi ();
1957 }
1958
1959 return loop_vinfo;
1960 }
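/* Worked example for the inner-loop cost factor above (the numbers are
   made up for illustration): if estimated_stmt_executions reports 7
   iterations for the inner loop and --param vect-inner-loop-cost-factor
   is, say, 50, the factor used for costing is MIN (7, 50) == 7; without
   an estimate the parameter value is used literally.  */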
1961
1962
1963
1964 /* Scan the loop stmts and, depending on whether there are any (non-)SLP
1965 statements, update the vectorization factor. */
1966
1967 static void
1968 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1969 {
1970 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1971 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1972 int nbbs = loop->num_nodes;
1973 poly_uint64 vectorization_factor;
1974 int i;
1975
1976 DUMP_VECT_SCOPE ("vect_update_vf_for_slp");
1977
1978 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1979 gcc_assert (known_ne (vectorization_factor, 0U));
1980
1981 /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1982 vectorization factor of the loop is the unrolling factor required by
1983 the SLP instances. If that unrolling factor is 1, we say, that we
1984 perform pure SLP on loop - cross iteration parallelism is not
1985 exploited. */
1986 bool only_slp_in_loop = true;
1987 for (i = 0; i < nbbs; i++)
1988 {
1989 basic_block bb = bbs[i];
1990 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1991 gsi_next (&si))
1992 {
1993 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (si.phi ());
1994 if (!stmt_info)
1995 continue;
1996 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1997 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1998 && !PURE_SLP_STMT (stmt_info))
1999 /* STMT needs both SLP and loop-based vectorization. */
2000 only_slp_in_loop = false;
2001 }
2002 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
2003 gsi_next (&si))
2004 {
2005 if (is_gimple_debug (gsi_stmt (si)))
2006 continue;
2007 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2008 stmt_info = vect_stmt_to_vectorize (stmt_info);
2009 if ((STMT_VINFO_RELEVANT_P (stmt_info)
2010 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
2011 && !PURE_SLP_STMT (stmt_info))
2012 /* STMT needs both SLP and loop-based vectorization. */
2013 only_slp_in_loop = false;
2014 }
2015 }
2016
2017 if (only_slp_in_loop)
2018 {
2019 if (dump_enabled_p ())
2020 dump_printf_loc (MSG_NOTE, vect_location,
2021 "Loop contains only SLP stmts\n");
2022 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
2023 }
2024 else
2025 {
2026 if (dump_enabled_p ())
2027 dump_printf_loc (MSG_NOTE, vect_location,
2028 "Loop contains SLP and non-SLP stmts\n");
2029 /* Both the vectorization factor and unroll factor have the form
2030 GET_MODE_SIZE (loop_vinfo->vector_mode) * X for some rational X,
2031 so they must have a common multiple. */
2032 vectorization_factor
2033 = force_common_multiple (vectorization_factor,
2034 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
2035 }
2036
2037 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
2038 if (dump_enabled_p ())
2039 {
2040 dump_printf_loc (MSG_NOTE, vect_location,
2041 "Updating vectorization factor to ");
2042 dump_dec (MSG_NOTE, vectorization_factor);
2043 dump_printf (MSG_NOTE, ".\n");
2044 }
2045 }
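/* Illustrative sketch only (guarded out): for constant factors,
   force_common_multiple as used above is simply the least common multiple
   of the loop vectorization factor and the SLP unrolling factor.  The
   helper below is an assumption made for the example and works on plain
   integers rather than poly_uint64.  */
#if 0
static unsigned long
example_least_common_multiple (unsigned long a, unsigned long b)
{
  /* Assumes a and b are positive.  */
  unsigned long x = a, y = b;
  while (y != 0)	/* Euclid's algorithm; x becomes gcd (a, b).  */
    {
      unsigned long t = x % y;
      x = y;
      y = t;
    }
  return a / x * b;	/* lcm (a, b) == a / gcd (a, b) * b.  */
}
#endif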
2046
2047 /* Return true if STMT_INFO describes a double reduction phi and if
2048 the other phi in the reduction is also relevant for vectorization.
2049 This rejects cases such as:
2050
2051 outer1:
2052 x_1 = PHI <x_3(outer2), ...>;
2053 ...
2054
2055 inner:
2056 x_2 = ...;
2057 ...
2058
2059 outer2:
2060 x_3 = PHI <x_2(inner)>;
2061
2062 if nothing in x_2 or elsewhere makes x_1 relevant. */
2063
2064 static bool
2065 vect_active_double_reduction_p (stmt_vec_info stmt_info)
2066 {
2067 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
2068 return false;
2069
2070 return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info));
2071 }
2072
2073 /* Function vect_analyze_loop_operations.
2074
2075 Scan the loop stmts and make sure they are all vectorizable. */
2076
2077 static opt_result
2078 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
2079 {
2080 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2081 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
2082 int nbbs = loop->num_nodes;
2083 int i;
2084 stmt_vec_info stmt_info;
2085 bool need_to_vectorize = false;
2086 bool ok;
2087
2088 DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
2089
2090 auto_vec<stmt_info_for_cost> cost_vec;
2091
2092 for (i = 0; i < nbbs; i++)
2093 {
2094 basic_block bb = bbs[i];
2095
2096 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
2097 gsi_next (&si))
2098 {
2099 gphi *phi = si.phi ();
2100 ok = true;
2101
2102 stmt_info = loop_vinfo->lookup_stmt (phi);
2103 if (dump_enabled_p ())
2104 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: %G",
2105 (gimple *) phi);
2106 if (virtual_operand_p (gimple_phi_result (phi)))
2107 continue;
2108
2109 /* Inner-loop loop-closed exit phi in outer-loop vectorization
2110 (i.e., a phi in the tail of the outer-loop). */
2111 if (! is_loop_header_bb_p (bb))
2112 {
2113 /* FORNOW: we currently don't support the case where these phis
2114 are not used in the outer loop (unless it is a double reduction,
2115 i.e., this phi is a vect_reduction_def), because this case
2116 would require us to actually do something here. */
2117 if (STMT_VINFO_LIVE_P (stmt_info)
2118 && !vect_active_double_reduction_p (stmt_info))
2119 return opt_result::failure_at (phi,
2120 "Unsupported loop-closed phi"
2121 " in outer-loop.\n");
2122
2123 /* If PHI is used in the outer loop, we check that its operand
2124 is defined in the inner loop. */
2125 if (STMT_VINFO_RELEVANT_P (stmt_info))
2126 {
2127 tree phi_op;
2128
2129 if (gimple_phi_num_args (phi) != 1)
2130 return opt_result::failure_at (phi, "unsupported phi");
2131
2132 phi_op = PHI_ARG_DEF (phi, 0);
2133 stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op);
2134 if (!op_def_info)
2135 return opt_result::failure_at (phi, "unsupported phi\n");
2136
2137 if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer
2138 && (STMT_VINFO_RELEVANT (op_def_info)
2139 != vect_used_in_outer_by_reduction))
2140 return opt_result::failure_at (phi, "unsupported phi\n");
2141
2142 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
2143 || (STMT_VINFO_DEF_TYPE (stmt_info)
2144 == vect_double_reduction_def))
2145 && !vectorizable_lc_phi (loop_vinfo,
2146 stmt_info, NULL, NULL))
2147 return opt_result::failure_at (phi, "unsupported phi\n");
2148 }
2149
2150 continue;
2151 }
2152
2153 gcc_assert (stmt_info);
2154
2155 if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
2156 || STMT_VINFO_LIVE_P (stmt_info))
2157 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def
2158 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_first_order_recurrence)
2159 /* A scalar-dependence cycle that we don't support. */
2160 return opt_result::failure_at (phi,
2161 "not vectorized:"
2162 " scalar dependence cycle.\n");
2163
2164 if (STMT_VINFO_RELEVANT_P (stmt_info))
2165 {
2166 need_to_vectorize = true;
2167 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
2168 && ! PURE_SLP_STMT (stmt_info))
2169 ok = vectorizable_induction (loop_vinfo,
2170 stmt_info, NULL, NULL,
2171 &cost_vec);
2172 else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
2173 || (STMT_VINFO_DEF_TYPE (stmt_info)
2174 == vect_double_reduction_def)
2175 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
2176 && ! PURE_SLP_STMT (stmt_info))
2177 ok = vectorizable_reduction (loop_vinfo,
2178 stmt_info, NULL, NULL, &cost_vec);
2179 else if ((STMT_VINFO_DEF_TYPE (stmt_info)
2180 == vect_first_order_recurrence)
2181 && ! PURE_SLP_STMT (stmt_info))
2182 ok = vectorizable_recurr (loop_vinfo, stmt_info, NULL, NULL,
2183 &cost_vec);
2184 }
2185
2186 /* SLP PHIs are tested by vect_slp_analyze_node_operations. */
2187 if (ok
2188 && STMT_VINFO_LIVE_P (stmt_info)
2189 && !PURE_SLP_STMT (stmt_info))
2190 ok = vectorizable_live_operation (loop_vinfo, stmt_info, NULL, NULL,
2191 -1, false, &cost_vec);
2192
2193 if (!ok)
2194 return opt_result::failure_at (phi,
2195 "not vectorized: relevant phi not "
2196 "supported: %G",
2197 static_cast <gimple *> (phi));
2198 }
2199
2200 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
2201 gsi_next (&si))
2202 {
2203 gimple *stmt = gsi_stmt (si);
2204 if (!gimple_clobber_p (stmt)
2205 && !is_gimple_debug (stmt))
2206 {
2207 opt_result res
2208 = vect_analyze_stmt (loop_vinfo,
2209 loop_vinfo->lookup_stmt (stmt),
2210 &need_to_vectorize,
2211 NULL, NULL, &cost_vec);
2212 if (!res)
2213 return res;
2214 }
2215 }
2216 } /* bbs */
2217
2218 add_stmt_costs (loop_vinfo->vector_costs, &cost_vec);
2219
2220 /* All operations in the loop are either irrelevant (they deal with
2221 loop control, or are dead), or are used only outside the loop and
2222 can be moved out of it (e.g. invariants, inductions). The loop
2223 can be optimized away by scalar optimizations. We're better off
2224 not touching this loop. */
2225 if (!need_to_vectorize)
2226 {
2227 if (dump_enabled_p ())
2228 dump_printf_loc (MSG_NOTE, vect_location,
2229 "All the computation can be taken out of the loop.\n");
2230 return opt_result::failure_at
2231 (vect_location,
2232 "not vectorized: redundant loop. no profit to vectorize.\n");
2233 }
2234
2235 return opt_result::success ();
2236 }
2237
2238 /* Return true if we know that the iteration count is smaller than the
2239 vectorization factor. Return false if it isn't, or if we can't be sure
2240 either way. */
2241
2242 static bool
2243 vect_known_niters_smaller_than_vf (loop_vec_info loop_vinfo)
2244 {
2245 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2246
2247 HOST_WIDE_INT max_niter;
2248 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2249 max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
2250 else
2251 max_niter = max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
2252
2253 if (max_niter != -1 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
2254 return true;
2255
2256 return false;
2257 }
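/* Worked example for the check above (the numbers are made up): with an
   assumed vectorization factor of 8, a loop known to execute at most 5
   iterations can never fill a full vector, so the function returns true
   and callers such as vect_analyze_loop_costing below reject
   vectorization unless partial vectors are in use.  */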
2258
2259 /* Analyze the cost of the loop described by LOOP_VINFO. Decide if it
2260 is worthwhile to vectorize. Return 1 if definitely yes, 0 if
2261 definitely no, or -1 if it's worth retrying. */
2262
2263 static int
2264 vect_analyze_loop_costing (loop_vec_info loop_vinfo,
2265 unsigned *suggested_unroll_factor)
2266 {
2267 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2268 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2269
2270 /* Only loops that can handle partially-populated vectors can have iteration
2271 counts less than the vectorization factor. */
2272 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2273 && vect_known_niters_smaller_than_vf (loop_vinfo))
2274 {
2275 if (dump_enabled_p ())
2276 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2277 "not vectorized: iteration count smaller than "
2278 "vectorization factor.\n");
2279 return 0;
2280 }
2281
2282 /* If we know the number of iterations we can do better: for the
2283 epilogue we can also decide whether the main loop leaves us
2284 with enough iterations, preferring a smaller vector epilogue that
2285 is then also possibly used for the case we skip the vector loop. */
2286 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2287 {
2288 widest_int scalar_niters
2289 = wi::to_widest (LOOP_VINFO_NITERSM1 (loop_vinfo)) + 1;
2290 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2291 {
2292 loop_vec_info orig_loop_vinfo
2293 = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2294 unsigned lowest_vf
2295 = constant_lower_bound (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo));
2296 int prolog_peeling = 0;
2297 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2298 prolog_peeling = LOOP_VINFO_PEELING_FOR_ALIGNMENT (orig_loop_vinfo);
2299 if (prolog_peeling >= 0
2300 && known_eq (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo),
2301 lowest_vf))
2302 {
2303 unsigned gap
2304 = LOOP_VINFO_PEELING_FOR_GAPS (orig_loop_vinfo) ? 1 : 0;
2305 scalar_niters = ((scalar_niters - gap - prolog_peeling)
2306 % lowest_vf + gap);
2307 }
2308 }
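/* Worked example for the epilogue iteration count above (the numbers are
   made up): with scalar_niters == 100, prolog_peeling == 3, lowest_vf == 16
   and one gap iteration, the epilogue is left with
   ((100 - 1 - 3) % 16) + 1 == 1 scalar iteration, which the
   single-scalar-iteration check just below then rejects.  */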
2309 /* Reject vectorizing for a single scalar iteration, even if
2310 we could in principle implement that using partial vectors. */
2311 unsigned peeling_gap = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo);
2312 if (scalar_niters <= peeling_gap + 1)
2313 {
2314 if (dump_enabled_p ())
2315 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2316 "not vectorized: loop only has a single "
2317 "scalar iteration.\n");
2318 return 0;
2319 }
2320
2321 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2322 {
2323 /* Check that the loop processes at least one full vector. */
2324 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2325 if (known_lt (scalar_niters, vf))
2326 {
2327 if (dump_enabled_p ())
2328 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2329 "loop does not have enough iterations "
2330 "to support vectorization.\n");
2331 return 0;
2332 }
2333
2334 /* If we need to peel an extra epilogue iteration to handle data
2335 accesses with gaps, check that there are enough scalar iterations
2336 available.
2337
2338 The check above is redundant with this one when peeling for gaps,
2339 but the distinction is useful for diagnostics. */
2340 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2341 && known_le (scalar_niters, vf))
2342 {
2343 if (dump_enabled_p ())
2344 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2345 "loop does not have enough iterations "
2346 "to support peeling for gaps.\n");
2347 return 0;
2348 }
2349 }
2350 }
2351
2352 /* If using the "very cheap" model, reject cases in which we'd keep
2353 a copy of the scalar code (even if we might be able to vectorize it). */
2354 if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
2355 && (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2356 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2357 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)))
2358 {
2359 if (dump_enabled_p ())
2360 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2361 "some scalar iterations would need to be peeled\n");
2362 return 0;
2363 }
2364
2365 int min_profitable_iters, min_profitable_estimate;
2366 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
2367 &min_profitable_estimate,
2368 suggested_unroll_factor);
2369
2370 if (min_profitable_iters < 0)
2371 {
2372 if (dump_enabled_p ())
2373 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2374 "not vectorized: vectorization not profitable.\n");
2375 if (dump_enabled_p ())
2376 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2377 "not vectorized: vector version will never be "
2378 "profitable.\n");
2379 return -1;
2380 }
2381
2382 int min_scalar_loop_bound = (param_min_vect_loop_bound
2383 * assumed_vf);
2384
2385 /* Use the cost model only if it is more conservative than user specified
2386 threshold. */
2387 unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
2388 min_profitable_iters);
2389
2390 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
2391
2392 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2393 && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
2394 {
2395 if (dump_enabled_p ())
2396 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2397 "not vectorized: vectorization not profitable.\n");
2398 if (dump_enabled_p ())
2399 dump_printf_loc (MSG_NOTE, vect_location,
2400 "not vectorized: iteration count smaller than user "
2401 "specified loop bound parameter or minimum profitable "
2402 "iterations (whichever is more conservative).\n");
2403 return 0;
2404 }
2405
2406 /* The static profitability threshold min_profitable_estimate includes
2407 the cost of having to check at runtime whether the scalar loop
2408 should be used instead. If it turns out that we don't need or want
2409 such a check, the threshold we should use for the static estimate
2410 is simply the point at which the vector loop becomes more profitable
2411 than the scalar loop. */
2412 if (min_profitable_estimate > min_profitable_iters
2413 && !LOOP_REQUIRES_VERSIONING (loop_vinfo)
2414 && !LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2415 && !LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2416 && !vect_apply_runtime_profitability_check_p (loop_vinfo))
2417 {
2418 if (dump_enabled_p ())
2419 dump_printf_loc (MSG_NOTE, vect_location, "no need for a runtime"
2420 " choice between the scalar and vector loops\n");
2421 min_profitable_estimate = min_profitable_iters;
2422 }
2423
2424 /* If the vector loop needs multiple iterations to be beneficial then
2425 things are probably too close to call, and the conservative thing
2426 would be to stick with the scalar code. */
2427 if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
2428 && min_profitable_estimate > (int) vect_vf_for_cost (loop_vinfo))
2429 {
2430 if (dump_enabled_p ())
2431 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2432 "one iteration of the vector loop would be"
2433 " more expensive than the equivalent number of"
2434 " iterations of the scalar loop\n");
2435 return 0;
2436 }
2437
2438 HOST_WIDE_INT estimated_niter;
2439
2440 /* If we are vectorizing an epilogue then we know the maximum number of
2441 scalar iterations it will cover is at least one lower than the
2442 vectorization factor of the main loop. */
2443 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2444 estimated_niter
2445 = vect_vf_for_cost (LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo)) - 1;
2446 else
2447 {
2448 estimated_niter = estimated_stmt_executions_int (loop);
2449 if (estimated_niter == -1)
2450 estimated_niter = likely_max_stmt_executions_int (loop);
2451 }
2452 if (estimated_niter != -1
2453 && ((unsigned HOST_WIDE_INT) estimated_niter
2454 < MAX (th, (unsigned) min_profitable_estimate)))
2455 {
2456 if (dump_enabled_p ())
2457 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2458 "not vectorized: estimated iteration count too "
2459 "small.\n");
2460 if (dump_enabled_p ())
2461 dump_printf_loc (MSG_NOTE, vect_location,
2462 "not vectorized: estimated iteration count smaller "
2463 "than specified loop bound parameter or minimum "
2464 "profitable iterations (whichever is more "
2465 "conservative).\n");
2466 return -1;
2467 }
2468
2469 return 1;
2470 }
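/* Worked example for the profitability thresholds above (the numbers are
   made up): with param_min_vect_loop_bound == 0, an assumed VF of 4 and
   min_profitable_iters == 10, the runtime threshold is
   MAX (0 * 4, 10) == 10; a loop whose estimated iteration count is only 8
   is then rejected as "estimated iteration count too small".  */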
2471
2472 static opt_result
2473 vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
2474 vec<data_reference_p> *datarefs,
2475 unsigned int *n_stmts)
2476 {
2477 *n_stmts = 0;
2478 for (unsigned i = 0; i < loop->num_nodes; i++)
2479 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
2480 !gsi_end_p (gsi); gsi_next (&gsi))
2481 {
2482 gimple *stmt = gsi_stmt (gsi);
2483 if (is_gimple_debug (stmt))
2484 continue;
2485 ++(*n_stmts);
2486 opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs,
2487 NULL, 0);
2488 if (!res)
2489 {
2490 if (is_gimple_call (stmt) && loop->safelen)
2491 {
2492 tree fndecl = gimple_call_fndecl (stmt), op;
2493 if (fndecl == NULL_TREE
2494 && gimple_call_internal_p (stmt, IFN_MASK_CALL))
2495 {
2496 fndecl = gimple_call_arg (stmt, 0);
2497 gcc_checking_assert (TREE_CODE (fndecl) == ADDR_EXPR);
2498 fndecl = TREE_OPERAND (fndecl, 0);
2499 gcc_checking_assert (TREE_CODE (fndecl) == FUNCTION_DECL);
2500 }
2501 if (fndecl != NULL_TREE)
2502 {
2503 cgraph_node *node = cgraph_node::get (fndecl);
2504 if (node != NULL && node->simd_clones != NULL)
2505 {
2506 unsigned int j, n = gimple_call_num_args (stmt);
2507 for (j = 0; j < n; j++)
2508 {
2509 op = gimple_call_arg (stmt, j);
2510 if (DECL_P (op)
2511 || (REFERENCE_CLASS_P (op)
2512 && get_base_address (op)))
2513 break;
2514 }
2515 op = gimple_call_lhs (stmt);
2516 /* Ignore #pragma omp declare simd functions
2517 if they don't have data references in the
2518 call stmt itself. */
2519 if (j == n
2520 && !(op
2521 && (DECL_P (op)
2522 || (REFERENCE_CLASS_P (op)
2523 && get_base_address (op)))))
2524 continue;
2525 }
2526 }
2527 }
2528 return res;
2529 }
2530 /* If dependence analysis would give up due to the limit on the
2531 number of datarefs, stop here and fail fatally. */
2532 if (datarefs->length ()
2533 > (unsigned)param_loop_max_datarefs_for_datadeps)
2534 return opt_result::failure_at (stmt, "exceeded param "
2535 "loop-max-datarefs-for-datadeps\n");
2536 }
2537 return opt_result::success ();
2538 }
2539
2540 /* Look for SLP-only access groups and turn each individual access into its own
2541 group. */
2542 static void
2543 vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo)
2544 {
2545 unsigned int i;
2546 struct data_reference *dr;
2547
2548 DUMP_VECT_SCOPE ("vect_dissolve_slp_only_groups");
2549
2550 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2551 FOR_EACH_VEC_ELT (datarefs, i, dr)
2552 {
2553 gcc_assert (DR_REF (dr));
2554 stmt_vec_info stmt_info
2555 = vect_stmt_to_vectorize (loop_vinfo->lookup_stmt (DR_STMT (dr)));
2556
2557 /* Check if the load is a part of an interleaving chain. */
2558 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2559 {
2560 stmt_vec_info first_element = DR_GROUP_FIRST_ELEMENT (stmt_info);
2561 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_element);
2562 unsigned int group_size = DR_GROUP_SIZE (first_element);
2563
2564 /* Check if SLP-only groups. */
2565 if (!STMT_SLP_TYPE (stmt_info)
2566 && STMT_VINFO_SLP_VECT_ONLY (first_element))
2567 {
2568 /* Dissolve the group. */
2569 STMT_VINFO_SLP_VECT_ONLY (first_element) = false;
2570
2571 stmt_vec_info vinfo = first_element;
2572 while (vinfo)
2573 {
2574 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (vinfo);
2575 DR_GROUP_FIRST_ELEMENT (vinfo) = vinfo;
2576 DR_GROUP_NEXT_ELEMENT (vinfo) = NULL;
2577 DR_GROUP_SIZE (vinfo) = 1;
2578 if (STMT_VINFO_STRIDED_P (first_element)
2579 /* We cannot handle stores with gaps. */
2580 || DR_IS_WRITE (dr_info->dr))
2581 {
2582 STMT_VINFO_STRIDED_P (vinfo) = true;
2583 DR_GROUP_GAP (vinfo) = 0;
2584 }
2585 else
2586 DR_GROUP_GAP (vinfo) = group_size - 1;
2587 /* Duplicate and adjust alignment info, it needs to
2588 be present on each group leader, see dr_misalignment. */
2589 if (vinfo != first_element)
2590 {
2591 dr_vec_info *dr_info2 = STMT_VINFO_DR_INFO (vinfo);
2592 dr_info2->target_alignment = dr_info->target_alignment;
2593 int misalignment = dr_info->misalignment;
2594 if (misalignment != DR_MISALIGNMENT_UNKNOWN)
2595 {
2596 HOST_WIDE_INT diff
2597 = (TREE_INT_CST_LOW (DR_INIT (dr_info2->dr))
2598 - TREE_INT_CST_LOW (DR_INIT (dr_info->dr)));
2599 unsigned HOST_WIDE_INT align_c
2600 = dr_info->target_alignment.to_constant ();
2601 misalignment = (misalignment + diff) % align_c;
2602 }
2603 dr_info2->misalignment = misalignment;
2604 }
2605 vinfo = next;
2606 }
2607 }
2608 }
2609 }
2610 }
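/* Worked example for the misalignment adjustment above (the numbers are
   made up): with a target alignment of 16 bytes, a group leader misaligned
   by 4 bytes and a dissolved group member whose DR_INIT lies 8 bytes
   further on, the member's recorded misalignment becomes
   (4 + 8) % 16 == 12.  */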
2611
2612 /* Determine if operating on full vectors for LOOP_VINFO might leave
2613 some scalar iterations still to do. If so, decide how we should
2614 handle those scalar iterations. The possibilities are:
2615
2616 (1) Make LOOP_VINFO operate on partial vectors instead of full vectors.
2617 In this case:
2618
2619 LOOP_VINFO_USING_PARTIAL_VECTORS_P == true
2620 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2621 LOOP_VINFO_PEELING_FOR_NITER == false
2622
2623 (2) Make LOOP_VINFO operate on full vectors and use an epilogue loop
2624 to handle the remaining scalar iterations. In this case:
2625
2626 LOOP_VINFO_USING_PARTIAL_VECTORS_P == false
2627 LOOP_VINFO_PEELING_FOR_NITER == true
2628
2629 There are two choices:
2630
2631 (2a) Consider vectorizing the epilogue loop at the same VF as the
2632 main loop, but using partial vectors instead of full vectors.
2633 In this case:
2634
2635 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == true
2636
2637 (2b) Consider vectorizing the epilogue loop at lower VFs only.
2638 In this case:
2639
2640 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2641 */
2642
2643 opt_result
2644 vect_determine_partial_vectors_and_peeling (loop_vec_info loop_vinfo)
2645 {
2646 /* Determine whether there would be any scalar iterations left over. */
2647 bool need_peeling_or_partial_vectors_p
2648 = vect_need_peeling_or_partial_vectors_p (loop_vinfo);
2649
2650 /* Decide whether to vectorize the loop with partial vectors. */
2651 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2652 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2653 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2654 && need_peeling_or_partial_vectors_p)
2655 {
2656 /* For partial-vector-usage=1, try to push the handling of partial
2657 vectors to the epilogue, with the main loop continuing to operate
2658 on full vectors.
2659
2660 If we are unrolling we also do not want to use partial vectors. This
2661 is to avoid the overhead of generating multiple masks and also to
2662 avoid having to execute entire iterations of FALSE masked instructions
2663 when dealing with one or fewer full iterations.
2664
2665 ??? We could then end up failing to use partial vectors if we
2666 decide to peel iterations into a prologue, and if the main loop
2667 then ends up processing fewer than VF iterations. */
2668 if ((param_vect_partial_vector_usage == 1
2669 || loop_vinfo->suggested_unroll_factor > 1)
2670 && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2671 && !vect_known_niters_smaller_than_vf (loop_vinfo))
2672 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2673 else
2674 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2675 }
2676
2677 if (dump_enabled_p ())
2678 dump_printf_loc (MSG_NOTE, vect_location,
2679 "operating on %s vectors%s.\n",
2680 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2681 ? "partial" : "full",
2682 LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2683 ? " for epilogue loop" : "");
2684
2685 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2686 = (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2687 && need_peeling_or_partial_vectors_p);
2688
2689 /* We set LOOP_VINFO_USING_SELECT_VL_P to true before the loop
2690 vectorization analysis, at which point we don't yet know whether the
2691 loop will be vectorized with partial vectors (see tree-vect-loop-manip.cc).
2692 
2693 However, the SELECT_VL vectorization style should only be applied to
2694 partial vectorization, since SELECT_VL is the GIMPLE IR that calculates
2695 the number of elements to be processed in each iteration.
2696 
2697 After the loop vectorization analysis, clear LOOP_VINFO_USING_SELECT_VL_P
2698 if the loop is not vectorized with partial vectors. */
2699 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2700 LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo) = false;
2701
2702 return opt_result::success ();
2703 }
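/* Illustrative sketch only (guarded out): the choice above between cases
   (1) and (2) can be summarised as below.  The helper and its flag names
   are assumptions made for the example, not vectorizer API, and they
   gloss over the epilogue-specific details.  */
#if 0
/* Returns true for partial vectors in the main loop, false for full
   vectors plus an epilogue / peeling for the leftover iterations.  */
static bool
example_use_partial_vectors_p (bool can_use_partial_p,
			       bool leftover_scalar_iters_p,
			       bool push_partial_to_epilogue_p)
{
  if (!can_use_partial_p || !leftover_scalar_iters_p)
    return false;
  /* partial-vector-usage=1 (or unrolling) keeps the main loop on full
     vectors and leaves partial vectors to the epilogue.  */
  return !push_partial_to_epilogue_p;
}
#endif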
2704
2705 /* Function vect_analyze_loop_2.
2706
2707 Apply a set of analyses on the LOOP specified by LOOP_VINFO; the
2708 different analyses will record information in some members of LOOP_VINFO.
2709 FATAL indicates whether some analysis hit a fatal error. If a non-NULL
2710 pointer SUGGESTED_UNROLL_FACTOR is provided, it is intended to be filled
2711 with the suggested unroll factor that was worked out, while a NULL pointer
2712 means the suggested unroll factor is being applied. SLP_DONE_FOR_SUGGESTED_UF
2713 holds the SLP decision made when the suggested unroll factor was worked
2714 out. */
2715 static opt_result
2716 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal,
2717 unsigned *suggested_unroll_factor,
2718 bool& slp_done_for_suggested_uf)
2719 {
2720 opt_result ok = opt_result::success ();
2721 int res;
2722 unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
2723 poly_uint64 min_vf = 2;
2724 loop_vec_info orig_loop_vinfo = NULL;
2725
2726 /* If we are dealing with an epilogue then orig_loop_vinfo points to the
2727 loop_vec_info of the first vectorized loop. */
2728 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2729 orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2730 else
2731 orig_loop_vinfo = loop_vinfo;
2732 gcc_assert (orig_loop_vinfo);
2733
2734 /* The first group of checks is independent of the vector size. */
2735 fatal = true;
2736
2737 if (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)
2738 && integer_zerop (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)))
2739 return opt_result::failure_at (vect_location,
2740 "not vectorized: simd if(0)\n");
2741
2742 /* Find all data references in the loop (which correspond to vdefs/vuses)
2743 and analyze their evolution in the loop. */
2744
2745 loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
2746
2747 /* Gather the data references and count stmts in the loop. */
2748 if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
2749 {
2750 opt_result res
2751 = vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
2752 &LOOP_VINFO_DATAREFS (loop_vinfo),
2753 &LOOP_VINFO_N_STMTS (loop_vinfo));
2754 if (!res)
2755 {
2756 if (dump_enabled_p ())
2757 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2758 "not vectorized: loop contains function "
2759 "calls or data references that cannot "
2760 "be analyzed\n");
2761 return res;
2762 }
2763 loop_vinfo->shared->save_datarefs ();
2764 }
2765 else
2766 loop_vinfo->shared->check_datarefs ();
2767
2768 /* Analyze the data references and also adjust the minimal
2769 vectorization factor according to the loads and stores. */
2770
2771 ok = vect_analyze_data_refs (loop_vinfo, &min_vf, &fatal);
2772 if (!ok)
2773 {
2774 if (dump_enabled_p ())
2775 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2776 "bad data references.\n");
2777 return ok;
2778 }
2779
2780 /* Check if we are applying unroll factor now. */
2781 bool applying_suggested_uf = loop_vinfo->suggested_unroll_factor > 1;
2782 gcc_assert (!applying_suggested_uf || !suggested_unroll_factor);
2783
2784 /* If the SLP decision was false when the suggested unroll factor was
2785 worked out, and we are applying the suggested unroll factor, we can
2786 simply skip all SLP-related analyses this time. */
2787 bool slp = !applying_suggested_uf || slp_done_for_suggested_uf;
2788
2789 /* Classify all cross-iteration scalar data-flow cycles.
2790 Cross-iteration cycles caused by virtual phis are analyzed separately. */
2791 vect_analyze_scalar_cycles (loop_vinfo, slp);
2792
2793 vect_pattern_recog (loop_vinfo);
2794
2795 vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
2796
2797 /* Analyze the access patterns of the data-refs in the loop (consecutive,
2798 complex, etc.). FORNOW: Only handle consecutive access pattern. */
2799
2800 ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
2801 if (!ok)
2802 {
2803 if (dump_enabled_p ())
2804 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2805 "bad data access.\n");
2806 return ok;
2807 }
2808
2809 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
2810
2811 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo, &fatal);
2812 if (!ok)
2813 {
2814 if (dump_enabled_p ())
2815 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2816 "unexpected pattern.\n");
2817 return ok;
2818 }
2819
2820 /* While the rest of the analysis below depends on it in some way. */
2821 fatal = false;
2822
2823 /* Analyze data dependences between the data-refs in the loop
2824 and adjust the maximum vectorization factor according to
2825 the dependences.
2826 FORNOW: fail at the first data dependence that we encounter. */
2827
2828 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
2829 if (!ok)
2830 {
2831 if (dump_enabled_p ())
2832 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2833 "bad data dependence.\n");
2834 return ok;
2835 }
2836 if (max_vf != MAX_VECTORIZATION_FACTOR
2837 && maybe_lt (max_vf, min_vf))
2838 return opt_result::failure_at (vect_location, "bad data dependence.\n");
2839 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
2840
2841 ok = vect_determine_vectorization_factor (loop_vinfo);
2842 if (!ok)
2843 {
2844 if (dump_enabled_p ())
2845 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2846 "can't determine vectorization factor.\n");
2847 return ok;
2848 }
2849
2850 /* Compute the scalar iteration cost. */
2851 vect_compute_single_scalar_iteration_cost (loop_vinfo);
2852
2853 poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2854
2855 if (slp)
2856 {
2857 /* Check the SLP opportunities in the loop, analyze and build
2858 SLP trees. */
2859 ok = vect_analyze_slp (loop_vinfo, LOOP_VINFO_N_STMTS (loop_vinfo));
2860 if (!ok)
2861 return ok;
2862
2863 /* If there are any SLP instances mark them as pure_slp. */
2864 slp = vect_make_slp_decision (loop_vinfo);
2865 if (slp)
2866 {
2867 /* Find stmts that need to be both vectorized and SLPed. */
2868 vect_detect_hybrid_slp (loop_vinfo);
2869
2870 /* Update the vectorization factor based on the SLP decision. */
2871 vect_update_vf_for_slp (loop_vinfo);
2872
2873 /* Optimize the SLP graph with the vectorization factor fixed. */
2874 vect_optimize_slp (loop_vinfo);
2875
2876 /* Gather the loads reachable from the SLP graph entries. */
2877 vect_gather_slp_loads (loop_vinfo);
2878 }
2879 }
2880
2881 bool saved_can_use_partial_vectors_p
2882 = LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo);
2883
2884 /* We don't expect to have to roll back to anything other than an empty
2885 set of rgroups. */
2886 gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
2887
2888 /* This is the point where we can re-start analysis with SLP forced off. */
2889 start_over:
2890
2891 /* Apply the suggested unrolling factor; this was determined by the backend
2892 during finish_cost the first time we ran the analysis for this
2893 vector mode. */
2894 if (applying_suggested_uf)
2895 LOOP_VINFO_VECT_FACTOR (loop_vinfo) *= loop_vinfo->suggested_unroll_factor;
2896
2897 /* Now the vectorization factor is final. */
2898 poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2899 gcc_assert (known_ne (vectorization_factor, 0U));
2900
2901 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
2902 {
2903 dump_printf_loc (MSG_NOTE, vect_location,
2904 "vectorization_factor = ");
2905 dump_dec (MSG_NOTE, vectorization_factor);
2906 dump_printf (MSG_NOTE, ", niters = %wd\n",
2907 LOOP_VINFO_INT_NITERS (loop_vinfo));
2908 }
2909
2910 if (max_vf != MAX_VECTORIZATION_FACTOR
2911 && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2912 return opt_result::failure_at (vect_location, "bad data dependence.\n");
2913
2914 loop_vinfo->vector_costs = init_cost (loop_vinfo, false);
2915
2916 /* Analyze the alignment of the data-refs in the loop.
2917 Fail if a data reference is found that cannot be vectorized. */
2918
2919 ok = vect_analyze_data_refs_alignment (loop_vinfo);
2920 if (!ok)
2921 {
2922 if (dump_enabled_p ())
2923 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2924 "bad data alignment.\n");
2925 return ok;
2926 }
2927
2928 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2929 It is important to call pruning after vect_analyze_data_ref_accesses,
2930 since we use grouping information gathered by interleaving analysis. */
2931 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2932 if (!ok)
2933 return ok;
2934
2935 /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2936 vectorization, since we do not want to add extra peeling or
2937 add versioning for alignment. */
2938 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2939 /* This pass will decide on using loop versioning and/or loop peeling in
2940 order to enhance the alignment of data references in the loop. */
2941 ok = vect_enhance_data_refs_alignment (loop_vinfo);
2942 if (!ok)
2943 return ok;
2944
2945 if (slp)
2946 {
2947 /* Analyze operations in the SLP instances. Note this may
2948 remove unsupported SLP instances which makes the above
2949 SLP kind detection invalid. */
2950 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2951 vect_slp_analyze_operations (loop_vinfo);
2952 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2953 {
2954 ok = opt_result::failure_at (vect_location,
2955 "unsupported SLP instances\n");
2956 goto again;
2957 }
2958
2959 /* Check whether any load in ALL SLP instances is possibly permuted. */
2960 slp_tree load_node, slp_root;
2961 unsigned i, x;
2962 slp_instance instance;
2963 bool can_use_lanes = true;
2964 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), x, instance)
2965 {
2966 slp_root = SLP_INSTANCE_TREE (instance);
2967 int group_size = SLP_TREE_LANES (slp_root);
2968 tree vectype = SLP_TREE_VECTYPE (slp_root);
2969 bool loads_permuted = false;
2970 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
2971 {
2972 if (!SLP_TREE_LOAD_PERMUTATION (load_node).exists ())
2973 continue;
2974 unsigned j;
2975 stmt_vec_info load_info;
2976 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load_info)
2977 if (SLP_TREE_LOAD_PERMUTATION (load_node)[j] != j)
2978 {
2979 loads_permuted = true;
2980 break;
2981 }
2982 }
2983
2984 /* If the loads and stores can be handled with load/store-lane
2985 instructions record it and move on to the next instance. */
2986 if (loads_permuted
2987 && SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
2988 && vect_store_lanes_supported (vectype, group_size, false)
2989 != IFN_LAST)
2990 {
2991 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
2992 if (STMT_VINFO_GROUPED_ACCESS
2993 (SLP_TREE_REPRESENTATIVE (load_node)))
2994 {
2995 stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT
2996 (SLP_TREE_REPRESENTATIVE (load_node));
2997 /* Use SLP for strided accesses (or if we can't
2998 load-lanes). */
2999 if (STMT_VINFO_STRIDED_P (stmt_vinfo)
3000 || vect_load_lanes_supported
3001 (STMT_VINFO_VECTYPE (stmt_vinfo),
3002 DR_GROUP_SIZE (stmt_vinfo), false) == IFN_LAST)
3003 break;
3004 }
3005
3006 can_use_lanes
3007 = can_use_lanes && i == SLP_INSTANCE_LOADS (instance).length ();
3008
3009 if (can_use_lanes && dump_enabled_p ())
3010 dump_printf_loc (MSG_NOTE, vect_location,
3011 "SLP instance %p can use load/store-lanes\n",
3012 (void *) instance);
3013 }
3014 else
3015 {
3016 can_use_lanes = false;
3017 break;
3018 }
3019 }
3020
3021 /* If all SLP instances can use load/store-lanes abort SLP and try again
3022 with SLP disabled. */
3023 if (can_use_lanes)
3024 {
3025 ok = opt_result::failure_at (vect_location,
3026 "Built SLP cancelled: can use "
3027 "load/store-lanes\n");
3028 if (dump_enabled_p ())
3029 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3030 "Built SLP cancelled: all SLP instances support "
3031 "load/store-lanes\n");
3032 goto again;
3033 }
3034 }
3035
3036 /* Dissolve SLP-only groups. */
3037 vect_dissolve_slp_only_groups (loop_vinfo);
3038
3039 /* Scan all the remaining operations in the loop that are not subject
3040 to SLP and make sure they are vectorizable. */
3041 ok = vect_analyze_loop_operations (loop_vinfo);
3042 if (!ok)
3043 {
3044 if (dump_enabled_p ())
3045 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3046 "bad operation or unsupported loop bound.\n");
3047 return ok;
3048 }
3049
3050 /* For now, we don't expect to mix both masking and length approaches for
3051 one loop; disable partial vectors if both are recorded. */
3052 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
3053 && !LOOP_VINFO_MASKS (loop_vinfo).is_empty ()
3054 && !LOOP_VINFO_LENS (loop_vinfo).is_empty ())
3055 {
3056 if (dump_enabled_p ())
3057 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3058 "can't vectorize a loop with partial vectors"
3059 " because we don't expect to mix different"
3060 " approaches with partial vectors for the"
3061 " same loop.\n");
3062 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
3063 }
3064
3065 /* If we still have the option of using partial vectors,
3066 check whether we can generate the necessary loop controls. */
3067 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
3068 {
3069 if (!LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
3070 {
3071 if (!vect_verify_full_masking (loop_vinfo)
3072 && !vect_verify_full_masking_avx512 (loop_vinfo))
3073 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
3074 }
3075 else /* !LOOP_VINFO_LENS (loop_vinfo).is_empty () */
3076 if (!vect_verify_loop_lens (loop_vinfo))
3077 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
3078 }
3079
3080 /* If we're vectorizing a loop that uses length "controls" and
3081 can iterate more than once, we apply the decrementing IV approach
3082 to the loop control. */
3083 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
3084 && LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) == vect_partial_vectors_len
3085 && LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) == 0
3086 && !(LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3087 && known_le (LOOP_VINFO_INT_NITERS (loop_vinfo),
3088 LOOP_VINFO_VECT_FACTOR (loop_vinfo))))
3089 LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo) = true;
3090
3091 /* If a loop uses length controls and has a decrementing loop control IV,
3092 we will normally pass that IV through a MIN_EXPR to calculate the
3093 basis for the length controls. E.g. in a loop that processes one
3094 element per scalar iteration, the number of elements would be
3095 MIN_EXPR <N, VF>, where N is the number of scalar iterations left.
3096
3097 This MIN_EXPR approach allows us to use pointer IVs with an invariant
3098 step, since only the final iteration of the vector loop can have
3099 inactive lanes.
3100
3101 However, some targets have a dedicated instruction for calculating the
3102 preferred length, given the total number of elements that still need to
3103 be processed. This is encapsulated in the SELECT_VL internal function.
3104
3105 If the target supports SELECT_VL, we can use it instead of MIN_EXPR
3106 to determine the basis for the length controls. However, unlike the
3107 MIN_EXPR calculation, the SELECT_VL calculation can decide to make
3108 lanes inactive in any iteration of the vector loop, not just the last
3109 iteration. This SELECT_VL approach therefore requires us to use pointer
3110 IVs with variable steps.
3111
3112 Once we've decided how many elements should be processed by one
3113 iteration of the vector loop, we need to populate the rgroup controls.
3114 If a loop has multiple rgroups, we need to make sure that those rgroups
3115 "line up" (that is, they must be consistent about which elements are
3116 active and which aren't). This is done by vect_adjust_loop_lens_control.
3117
3118 In principle, it would be possible to use vect_adjust_loop_lens_control
3119 on either the result of a MIN_EXPR or the result of a SELECT_VL.
3120 However:
3121
3122 (1) In practice, it only makes sense to use SELECT_VL when a vector
3123 operation will be controlled directly by the result. It is not
3124 worth using SELECT_VL if it would only be the input to other
3125 calculations.
3126
3127 (2) If we use SELECT_VL for an rgroup that has N controls, each associated
3128 pointer IV will need N updates by a variable amount (N-1 updates
3129 within the iteration and 1 update to move to the next iteration).
3130
3131 Because of this, we prefer to use the MIN_EXPR approach whenever there
3132 is more than one length control.
3133
3134 In addition, SELECT_VL always operates to a granularity of 1 unit.
3135 If we wanted to use it to control an SLP operation on N consecutive
3136 elements, we would need to make the SELECT_VL inputs measure scalar
3137 iterations (rather than elements) and then multiply the SELECT_VL
3138 result by N. But using SELECT_VL this way is inefficient because
3139 of (1) above.
3140
3141 Finally, we don't apply SELECT_VL to a single-rgroup loop when both of
3142 the following are satisfied:
3143 
3144 (1) LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) is true.
3145 (2) LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant () is true.
3146 
3147 In that case SELECT_VL (with its variable step) would make SCEV analysis
3148 fail, and we would then lose the benefit of subsequent unrolling
3149 optimizations, so we prefer the MIN_EXPR approach in this situation. */
3150 if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo))
3151 {
3152 tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
3153 if (direct_internal_fn_supported_p (IFN_SELECT_VL, iv_type,
3154 OPTIMIZE_FOR_SPEED)
3155 && LOOP_VINFO_LENS (loop_vinfo).length () == 1
3156 && LOOP_VINFO_LENS (loop_vinfo)[0].factor == 1 && !slp
3157 && (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3158 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant ()))
3159 LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo) = true;
3160 }
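/* Illustrative sketch only (guarded out): the contrast described above
   between the MIN_EXPR and SELECT_VL styles, written as plain C rather
   than the GIMPLE the vectorizer emits.  The helper name is an assumption
   made for the example.  */
#if 0
/* MIN_EXPR style: only the final iteration can be partial, so pointer IVs
   can keep an invariant step of VF elements.  */
static unsigned
example_min_expr_length (unsigned remaining, unsigned vf)
{
  return remaining < vf ? remaining : vf;
}

/* SELECT_VL style: the target may return any length in [1, VF] (for
   instance to balance the last two iterations), so pointer IVs must
   advance by whatever length was chosen in that iteration.  */
#endif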
3161
3162 /* Decide whether this loop_vinfo should use partial vectors or peeling,
3163 assuming that the loop will be used as a main loop. We will redo
3164 this analysis later if we instead decide to use the loop as an
3165 epilogue loop. */
3166 ok = vect_determine_partial_vectors_and_peeling (loop_vinfo);
3167 if (!ok)
3168 return ok;
3169
3170 /* If we're vectorizing an epilogue loop, the vectorized loop either needs
3171 to be able to handle fewer than VF scalars, or needs to have a lower VF
3172 than the main loop. */
3173 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
3174 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
3175 {
3176 poly_uint64 unscaled_vf
3177 = exact_div (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo),
3178 orig_loop_vinfo->suggested_unroll_factor);
3179 if (maybe_ge (LOOP_VINFO_VECT_FACTOR (loop_vinfo), unscaled_vf))
3180 return opt_result::failure_at (vect_location,
3181 "Vectorization factor too high for"
3182 " epilogue loop.\n");
3183 }
3184
3185 /* Check the costings of the loop make vectorizing worthwhile. */
3186 res = vect_analyze_loop_costing (loop_vinfo, suggested_unroll_factor);
3187 if (res < 0)
3188 {
3189 ok = opt_result::failure_at (vect_location,
3190 "Loop costings may not be worthwhile.\n");
3191 goto again;
3192 }
3193 if (!res)
3194 return opt_result::failure_at (vect_location,
3195 "Loop costings not worthwhile.\n");
3196
3197 /* If an epilogue loop is required make sure we can create one. */
3198 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
3199 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
3200 || LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
3201 {
3202 if (dump_enabled_p ())
3203 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
3204 if (!vect_can_advance_ivs_p (loop_vinfo)
3205 || !slpeel_can_duplicate_loop_p (loop,
3206 LOOP_VINFO_IV_EXIT (loop_vinfo),
3207 LOOP_VINFO_IV_EXIT (loop_vinfo)))
3208 {
3209 ok = opt_result::failure_at (vect_location,
3210 "not vectorized: can't create required "
3211 "epilog loop\n");
3212 goto again;
3213 }
3214 }
3215
3216 /* During peeling, we need to check if the number of loop iterations is
3217 enough for both the peeled prolog loop and the vector loop. This check
3218 can be merged along with threshold check of loop versioning, so
3219 increase threshold for this case if necessary.
3220
3221 If we are analyzing an epilogue we still want to check what its
3222 versioning threshold would be. If we decide to vectorize the epilogues we
3223 will want to use the lowest versioning threshold of all epilogues and main
3224 loop. This will enable us to enter a vectorized epilogue even when
3225 versioning the loop. We can't simply check whether the epilogue requires
3226 versioning though since we may have skipped some versioning checks when
3227 analyzing the epilogue. For instance, checks for alias versioning will be
3228 skipped when dealing with epilogues as we assume we already checked them
3229 for the main loop. So instead we always check the 'orig_loop_vinfo'. */
3230 if (LOOP_REQUIRES_VERSIONING (orig_loop_vinfo))
3231 {
3232 poly_uint64 niters_th = 0;
3233 unsigned int th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
3234
3235 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
3236 {
3237 /* Niters for peeled prolog loop. */
3238 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3239 {
3240 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
3241 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
3242 niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
3243 }
3244 else
3245 niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3246 }
3247
3248 /* Niters for at least one iteration of vectorized loop. */
3249 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
3250 niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3251 /* One additional iteration because of peeling for gap. */
3252 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
3253 niters_th += 1;
3254
3255 /* Use the same condition as vect_transform_loop to decide when to use
3256 the cost to determine a versioning threshold. */
3257 if (vect_apply_runtime_profitability_check_p (loop_vinfo)
3258 && ordered_p (th, niters_th))
3259 niters_th = ordered_max (poly_uint64 (th), niters_th);
3260
3261 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
3262 }
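/* Illustrative example with assumed numbers (not from any particular
   target): with no partial vectors in use, a known prologue peel of 2
   iterations, VF == 4, peeling for gaps required and a runtime
   profitability threshold of 10, the block above computes
   niters_th = 2 + 4 + 1 = 7 and then raises it to
   ordered_max (10, 7) == 10 before recording it as the versioning
   threshold.  */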
3263
3264 gcc_assert (known_eq (vectorization_factor,
3265 LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
3266
3267 slp_done_for_suggested_uf = slp;
3268
3269 /* Ok to vectorize! */
3270 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
3271 return opt_result::success ();
3272
3273 again:
3274 /* Ensure that "ok" is false (with an opt_problem if dumping is enabled). */
3275 gcc_assert (!ok);
3276
3277 /* Try again with SLP forced off but if we didn't do any SLP there is
3278 no point in re-trying. */
3279 if (!slp)
3280 return ok;
3281
3282 /* If SLP was enabled when the suggested unroll factor was worked
3283 out, and we are applying the suggested unroll factor, there is no
3284 point in re-trying. */
3285 if (applying_suggested_uf && slp_done_for_suggested_uf)
3286 return ok;
3287
3288 /* If there are reduction chains re-trying will fail anyway. */
3289 if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
3290 return ok;
3291
3292 /* Likewise if the grouped loads or stores in the SLP cannot be handled
3293 via interleaving or lane instructions. */
3294 slp_instance instance;
3295 slp_tree node;
3296 unsigned i, j;
3297 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
3298 {
3299 stmt_vec_info vinfo;
3300 vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
3301 if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
3302 continue;
3303 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
3304 unsigned int size = DR_GROUP_SIZE (vinfo);
3305 tree vectype = STMT_VINFO_VECTYPE (vinfo);
3306 if (vect_store_lanes_supported (vectype, size, false) == IFN_LAST
3307 && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
3308 && ! vect_grouped_store_supported (vectype, size))
3309 return opt_result::failure_at (vinfo->stmt,
3310 "unsupported grouped store\n");
3311 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
3312 {
3313 vinfo = SLP_TREE_REPRESENTATIVE (node);
3314 if (STMT_VINFO_GROUPED_ACCESS (vinfo))
3315 {
3316 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
3317 bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
3318 size = DR_GROUP_SIZE (vinfo);
3319 vectype = STMT_VINFO_VECTYPE (vinfo);
3320 if (vect_load_lanes_supported (vectype, size, false) == IFN_LAST
3321 && ! vect_grouped_load_supported (vectype, single_element_p,
3322 size))
3323 return opt_result::failure_at (vinfo->stmt,
3324 "unsupported grouped load\n");
3325 }
3326 }
3327 }
3328
3329 if (dump_enabled_p ())
3330 dump_printf_loc (MSG_NOTE, vect_location,
3331 "re-trying with SLP disabled\n");
3332
3333 /* Roll back state appropriately. No SLP this time. */
3334 slp = false;
3335 /* Restore the vectorization factor as it was without SLP. */
3336 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
3337 /* Free the SLP instances. */
3338 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
3339 vect_free_slp_instance (instance);
3340 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
3341 /* Reset SLP type to loop_vect on all stmts. */
3342 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
3343 {
3344 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
3345 for (gimple_stmt_iterator si = gsi_start_phis (bb);
3346 !gsi_end_p (si); gsi_next (&si))
3347 {
3348 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
3349 STMT_SLP_TYPE (stmt_info) = loop_vect;
3350 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
3351 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
3352 {
3353 /* vectorizable_reduction adjusts reduction stmt def-types,
3354 restore them to that of the PHI. */
3355 STMT_VINFO_DEF_TYPE (STMT_VINFO_REDUC_DEF (stmt_info))
3356 = STMT_VINFO_DEF_TYPE (stmt_info);
3357 STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize
3358 (STMT_VINFO_REDUC_DEF (stmt_info)))
3359 = STMT_VINFO_DEF_TYPE (stmt_info);
3360 }
3361 }
3362 for (gimple_stmt_iterator si = gsi_start_bb (bb);
3363 !gsi_end_p (si); gsi_next (&si))
3364 {
3365 if (is_gimple_debug (gsi_stmt (si)))
3366 continue;
3367 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
3368 STMT_SLP_TYPE (stmt_info) = loop_vect;
3369 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
3370 {
3371 stmt_vec_info pattern_stmt_info
3372 = STMT_VINFO_RELATED_STMT (stmt_info);
3373 if (STMT_VINFO_SLP_VECT_ONLY_PATTERN (pattern_stmt_info))
3374 STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
3375
3376 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
3377 STMT_SLP_TYPE (pattern_stmt_info) = loop_vect;
3378 for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
3379 !gsi_end_p (pi); gsi_next (&pi))
3380 STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi)))
3381 = loop_vect;
3382 }
3383 }
3384 }
3385 /* Free optimized alias test DDRS. */
3386 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
3387 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
3388 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
3389 /* Reset target cost data. */
3390 delete loop_vinfo->vector_costs;
3391 loop_vinfo->vector_costs = nullptr;
3392 /* Reset accumulated rgroup information. */
3393 LOOP_VINFO_MASKS (loop_vinfo).mask_set.empty ();
3394 release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo).rgc_vec);
3395 release_vec_loop_controls (&LOOP_VINFO_LENS (loop_vinfo));
3396 /* Reset assorted flags. */
3397 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
3398 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
3399 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
3400 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
3401 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
3402 = saved_can_use_partial_vectors_p;
3403
3404 goto start_over;
3405 }
3406
3407 /* Return true if vectorizing a loop using NEW_LOOP_VINFO appears
3408 to be better than vectorizing it using OLD_LOOP_VINFO. Assume that
3409 OLD_LOOP_VINFO is better unless something specifically indicates
3410 otherwise.
3411
3412 Note that this deliberately isn't a partial order. */
3413
3414 static bool
3415 vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo,
3416 loop_vec_info old_loop_vinfo)
3417 {
3418 struct loop *loop = LOOP_VINFO_LOOP (new_loop_vinfo);
3419 gcc_assert (LOOP_VINFO_LOOP (old_loop_vinfo) == loop);
3420
3421 poly_int64 new_vf = LOOP_VINFO_VECT_FACTOR (new_loop_vinfo);
3422 poly_int64 old_vf = LOOP_VINFO_VECT_FACTOR (old_loop_vinfo);
3423
3424 /* Always prefer a VF of loop->simdlen over any other VF. */
3425 if (loop->simdlen)
3426 {
3427 bool new_simdlen_p = known_eq (new_vf, loop->simdlen);
3428 bool old_simdlen_p = known_eq (old_vf, loop->simdlen);
3429 if (new_simdlen_p != old_simdlen_p)
3430 return new_simdlen_p;
3431 }
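/* E.g. (hypothetical): with #pragma omp simd simdlen(8) on the loop,
   a candidate loop_vinfo whose VF is exactly 8 is preferred over any
   candidate with a different VF, regardless of their relative costs.  */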
3432
3433 const auto *old_costs = old_loop_vinfo->vector_costs;
3434 const auto *new_costs = new_loop_vinfo->vector_costs;
3435 if (loop_vec_info main_loop = LOOP_VINFO_ORIG_LOOP_INFO (old_loop_vinfo))
3436 return new_costs->better_epilogue_loop_than_p (old_costs, main_loop);
3437
3438 return new_costs->better_main_loop_than_p (old_costs);
3439 }
3440
3441 /* Decide whether to replace OLD_LOOP_VINFO with NEW_LOOP_VINFO. Return
3442 true if we should. */
3443
3444 static bool
3445 vect_joust_loop_vinfos (loop_vec_info new_loop_vinfo,
3446 loop_vec_info old_loop_vinfo)
3447 {
3448 if (!vect_better_loop_vinfo_p (new_loop_vinfo, old_loop_vinfo))
3449 return false;
3450
3451 if (dump_enabled_p ())
3452 dump_printf_loc (MSG_NOTE, vect_location,
3453 "***** Preferring vector mode %s to vector mode %s\n",
3454 GET_MODE_NAME (new_loop_vinfo->vector_mode),
3455 GET_MODE_NAME (old_loop_vinfo->vector_mode));
3456 return true;
3457 }
3458
3459 /* Analyze LOOP with VECTOR_MODES[MODE_I], as an epilogue loop if
3460 MAIN_LOOP_VINFO is not NULL. Set AUTODETECTED_VECTOR_MODE if it is
3461 still VOIDmode and advance MODE_I to the next mode worth analyzing.
3462 Return the loop_vinfo on success and a wrapped null on failure. */
3463
3464 static opt_loop_vec_info
3465 vect_analyze_loop_1 (class loop *loop, vec_info_shared *shared,
3466 const vect_loop_form_info *loop_form_info,
3467 loop_vec_info main_loop_vinfo,
3468 const vector_modes &vector_modes, unsigned &mode_i,
3469 machine_mode &autodetected_vector_mode,
3470 bool &fatal)
3471 {
3472 loop_vec_info loop_vinfo
3473 = vect_create_loop_vinfo (loop, shared, loop_form_info, main_loop_vinfo);
3474
3475 machine_mode vector_mode = vector_modes[mode_i];
3476 loop_vinfo->vector_mode = vector_mode;
3477 unsigned int suggested_unroll_factor = 1;
3478 bool slp_done_for_suggested_uf = false;
3479
3480 /* Run the main analysis. */
3481 opt_result res = vect_analyze_loop_2 (loop_vinfo, fatal,
3482 &suggested_unroll_factor,
3483 slp_done_for_suggested_uf);
3484 if (dump_enabled_p ())
3485 dump_printf_loc (MSG_NOTE, vect_location,
3486 "***** Analysis %s with vector mode %s\n",
3487 res ? "succeeded" : "failed",
3488 GET_MODE_NAME (loop_vinfo->vector_mode));
3489
3490 if (res && !main_loop_vinfo && suggested_unroll_factor > 1)
3491 {
3492 if (dump_enabled_p ())
3493 dump_printf_loc (MSG_NOTE, vect_location,
3494 "***** Re-trying analysis for unrolling"
3495 " with unroll factor %d and slp %s.\n",
3496 suggested_unroll_factor,
3497 slp_done_for_suggested_uf ? "on" : "off");
3498 loop_vec_info unroll_vinfo
3499 = vect_create_loop_vinfo (loop, shared, loop_form_info, main_loop_vinfo);
3500 unroll_vinfo->vector_mode = vector_mode;
3501 unroll_vinfo->suggested_unroll_factor = suggested_unroll_factor;
3502 opt_result new_res = vect_analyze_loop_2 (unroll_vinfo, fatal, NULL,
3503 slp_done_for_suggested_uf);
3504 if (new_res)
3505 {
3506 delete loop_vinfo;
3507 loop_vinfo = unroll_vinfo;
3508 }
3509 else
3510 delete unroll_vinfo;
3511 }
3512
3513 /* Remember the autodetected vector mode. */
3514 if (vector_mode == VOIDmode)
3515 autodetected_vector_mode = loop_vinfo->vector_mode;
3516
3517 /* Advance mode_i, first skipping modes that would result in the
3518 same analysis result. */
3519 while (mode_i + 1 < vector_modes.length ()
3520 && vect_chooses_same_modes_p (loop_vinfo,
3521 vector_modes[mode_i + 1]))
3522 {
3523 if (dump_enabled_p ())
3524 dump_printf_loc (MSG_NOTE, vect_location,
3525 "***** The result for vector mode %s would"
3526 " be the same\n",
3527 GET_MODE_NAME (vector_modes[mode_i + 1]));
3528 mode_i += 1;
3529 }
3530 if (mode_i + 1 < vector_modes.length ()
3531 && VECTOR_MODE_P (autodetected_vector_mode)
3532 && (related_vector_mode (vector_modes[mode_i + 1],
3533 GET_MODE_INNER (autodetected_vector_mode))
3534 == autodetected_vector_mode)
3535 && (related_vector_mode (autodetected_vector_mode,
3536 GET_MODE_INNER (vector_modes[mode_i + 1]))
3537 == vector_modes[mode_i + 1]))
3538 {
3539 if (dump_enabled_p ())
3540 dump_printf_loc (MSG_NOTE, vect_location,
3541 "***** Skipping vector mode %s, which would"
3542 " repeat the analysis for %s\n",
3543 GET_MODE_NAME (vector_modes[mode_i + 1]),
3544 GET_MODE_NAME (autodetected_vector_mode));
3545 mode_i += 1;
3546 }
3547 mode_i++;
3548
3549 if (!res)
3550 {
3551 delete loop_vinfo;
3552 if (fatal)
3553 gcc_checking_assert (main_loop_vinfo == NULL);
3554 return opt_loop_vec_info::propagate_failure (res);
3555 }
3556
3557 return opt_loop_vec_info::success (loop_vinfo);
3558 }
3559
3560 /* Function vect_analyze_loop.
3561
3562 Apply a set of analyses on LOOP, and create a loop_vec_info struct
3563 for it. The different analyses will record information in the
3564 loop_vec_info struct. */
3565 opt_loop_vec_info
3566 vect_analyze_loop (class loop *loop, vec_info_shared *shared)
3567 {
3568 DUMP_VECT_SCOPE ("analyze_loop_nest");
3569
3570 if (loop_outer (loop)
3571 && loop_vec_info_for_loop (loop_outer (loop))
3572 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
3573 return opt_loop_vec_info::failure_at (vect_location,
3574 "outer-loop already vectorized.\n");
3575
3576 if (!find_loop_nest (loop, &shared->loop_nest))
3577 return opt_loop_vec_info::failure_at
3578 (vect_location,
3579 "not vectorized: loop nest containing two or more consecutive inner"
3580 " loops cannot be vectorized\n");
3581
3582 /* Analyze the loop form. */
3583 vect_loop_form_info loop_form_info;
3584 opt_result res = vect_analyze_loop_form (loop, &loop_form_info);
3585 if (!res)
3586 {
3587 if (dump_enabled_p ())
3588 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3589 "bad loop form.\n");
3590 return opt_loop_vec_info::propagate_failure (res);
3591 }
3592 if (!integer_onep (loop_form_info.assumptions))
3593 {
3594 /* We consider to vectorize this loop by versioning it under
3595 some assumptions. In order to do this, we need to clear
3596 existing information computed by scev and niter analyzer. */
3597 scev_reset_htab ();
3598 free_numbers_of_iterations_estimates (loop);
3599 /* Also set a flag for this loop so that the following scev and niter
3600 analyses are done under the assumptions. */
3601 loop_constraint_set (loop, LOOP_C_FINITE);
3602 }
3603 else
3604 /* Clear the existing niter information to make sure the nonwrapping flag
3605 will be calculated and set properly. */
3606 free_numbers_of_iterations_estimates (loop);
3607
3608 auto_vector_modes vector_modes;
3609 /* Autodetect first vector size we try. */
3610 vector_modes.safe_push (VOIDmode);
3611 unsigned int autovec_flags
3612 = targetm.vectorize.autovectorize_vector_modes (&vector_modes,
3613 loop->simdlen != 0);
3614 bool pick_lowest_cost_p = ((autovec_flags & VECT_COMPARE_COSTS)
3615 && !unlimited_cost_model (loop));
3616 machine_mode autodetected_vector_mode = VOIDmode;
3617 opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL);
3618 unsigned int mode_i = 0;
3619 unsigned HOST_WIDE_INT simdlen = loop->simdlen;
3620
3621 /* Keep track of the VF for each mode. Initialize all to 0 which indicates
3622 a mode has not been analyzed. */
3623 auto_vec<poly_uint64, 8> cached_vf_per_mode;
3624 for (unsigned i = 0; i < vector_modes.length (); ++i)
3625 cached_vf_per_mode.safe_push (0);
3626
3627 /* First determine the main loop vectorization mode, either the first
3628 one that works, starting with auto-detecting the vector mode and then
3629 following the target's order of preference, or the one with the
3630 lowest cost if pick_lowest_cost_p. */
3631 while (1)
3632 {
3633 bool fatal;
3634 unsigned int last_mode_i = mode_i;
3635 /* Set cached VF to -1 prior to analysis, which indicates a mode has
3636 failed. */
3637 cached_vf_per_mode[last_mode_i] = -1;
3638 opt_loop_vec_info loop_vinfo
3639 = vect_analyze_loop_1 (loop, shared, &loop_form_info,
3640 NULL, vector_modes, mode_i,
3641 autodetected_vector_mode, fatal);
3642 if (fatal)
3643 break;
3644
3645 if (loop_vinfo)
3646 {
3647 /* Analysis has been successful, so update the VF value. The
3648 VF should always be a multiple of unroll_factor and we want to
3649 capture the original VF here. */
3650 cached_vf_per_mode[last_mode_i]
3651 = exact_div (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
3652 loop_vinfo->suggested_unroll_factor);
3653 /* Once we hit the desired simdlen for the first time,
3654 discard any previous attempts. */
3655 if (simdlen
3656 && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), simdlen))
3657 {
3658 delete first_loop_vinfo;
3659 first_loop_vinfo = opt_loop_vec_info::success (NULL);
3660 simdlen = 0;
3661 }
3662 else if (pick_lowest_cost_p
3663 && first_loop_vinfo
3664 && vect_joust_loop_vinfos (loop_vinfo, first_loop_vinfo))
3665 {
3666 /* Pick loop_vinfo over first_loop_vinfo. */
3667 delete first_loop_vinfo;
3668 first_loop_vinfo = opt_loop_vec_info::success (NULL);
3669 }
3670 if (first_loop_vinfo == NULL)
3671 first_loop_vinfo = loop_vinfo;
3672 else
3673 {
3674 delete loop_vinfo;
3675 loop_vinfo = opt_loop_vec_info::success (NULL);
3676 }
3677
3678 /* Commit to first_loop_vinfo if we have no reason to try
3679 alternatives. */
3680 if (!simdlen && !pick_lowest_cost_p)
3681 break;
3682 }
3683 if (mode_i == vector_modes.length ()
3684 || autodetected_vector_mode == VOIDmode)
3685 break;
3686
3687 /* Try the next biggest vector size. */
3688 if (dump_enabled_p ())
3689 dump_printf_loc (MSG_NOTE, vect_location,
3690 "***** Re-trying analysis with vector mode %s\n",
3691 GET_MODE_NAME (vector_modes[mode_i]));
3692 }
3693 if (!first_loop_vinfo)
3694 return opt_loop_vec_info::propagate_failure (res);
3695
3696 if (dump_enabled_p ())
3697 dump_printf_loc (MSG_NOTE, vect_location,
3698 "***** Choosing vector mode %s\n",
3699 GET_MODE_NAME (first_loop_vinfo->vector_mode));
3700
3701 /* Only vectorize epilogues if PARAM_VECT_EPILOGUES_NOMASK is
3702 enabled, SIMDUID is not set, it is the innermost loop and we have
3703 either already found the loop's SIMDLEN or there was no SIMDLEN to
3704 begin with.
3705 TODO: Enable epilogue vectorization for loops with SIMDUID set. */
3706 bool vect_epilogues = (!simdlen
3707 && loop->inner == NULL
3708 && param_vect_epilogues_nomask
3709 && LOOP_VINFO_PEELING_FOR_NITER (first_loop_vinfo)
3710 /* No code motion support for multiple epilogues, so for now
3711 this is not supported when there are multiple exits. */
3712 && !LOOP_VINFO_EARLY_BREAKS (first_loop_vinfo)
3713 && !loop->simduid);
3714 if (!vect_epilogues)
3715 return first_loop_vinfo;
3716
3717 /* Now analyze first_loop_vinfo for epilogue vectorization. */
3718 poly_uint64 lowest_th = LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo);
3719
3720 /* For epilogues start the analysis from the first mode. The motivation
3721 behind starting from the beginning comes from cases where the VECTOR_MODES
3722 array may contain length-agnostic and length-specific modes. Their
3723 ordering is not guaranteed, so we could end up picking a mode for the main
3724 loop that is after the epilogue's optimal mode. */
3725 vector_modes[0] = autodetected_vector_mode;
3726 mode_i = 0;
3727
3728 bool supports_partial_vectors =
3729 partial_vectors_supported_p () && param_vect_partial_vector_usage != 0;
3730 poly_uint64 first_vinfo_vf = LOOP_VINFO_VECT_FACTOR (first_loop_vinfo);
3731
3732 while (1)
3733 {
3734 /* If the target does not support partial vectors we can shorten the
3735 number of modes to analyze for the epilogue as we know we can't pick a
3736 mode that would lead to a VF at least as big as the
3737 FIRST_VINFO_VF. */
3738 if (!supports_partial_vectors
3739 && maybe_ge (cached_vf_per_mode[mode_i], first_vinfo_vf))
3740 {
3741 mode_i++;
3742 if (mode_i == vector_modes.length ())
3743 break;
3744 continue;
3745 }
3746
3747 if (dump_enabled_p ())
3748 dump_printf_loc (MSG_NOTE, vect_location,
3749 "***** Re-trying epilogue analysis with vector "
3750 "mode %s\n", GET_MODE_NAME (vector_modes[mode_i]));
3751
3752 bool fatal;
3753 opt_loop_vec_info loop_vinfo
3754 = vect_analyze_loop_1 (loop, shared, &loop_form_info,
3755 first_loop_vinfo,
3756 vector_modes, mode_i,
3757 autodetected_vector_mode, fatal);
3758 if (fatal)
3759 break;
3760
3761 if (loop_vinfo)
3762 {
3763 if (pick_lowest_cost_p)
3764 {
3765 /* Keep trying to roll back vectorization attempts while the
3766 loop_vec_infos they produced were worse than this one. */
3767 vec<loop_vec_info> &vinfos = first_loop_vinfo->epilogue_vinfos;
3768 while (!vinfos.is_empty ()
3769 && vect_joust_loop_vinfos (loop_vinfo, vinfos.last ()))
3770 {
3771 gcc_assert (vect_epilogues);
3772 delete vinfos.pop ();
3773 }
3774 }
3775 /* For now only allow one epilogue loop. */
3776 if (first_loop_vinfo->epilogue_vinfos.is_empty ())
3777 {
3778 first_loop_vinfo->epilogue_vinfos.safe_push (loop_vinfo);
3779 poly_uint64 th = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
3780 gcc_assert (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
3781 || maybe_ne (lowest_th, 0U));
3782 /* Keep track of the known smallest versioning
3783 threshold. */
3784 if (ordered_p (lowest_th, th))
3785 lowest_th = ordered_min (lowest_th, th);
3786 }
3787 else
3788 {
3789 delete loop_vinfo;
3790 loop_vinfo = opt_loop_vec_info::success (NULL);
3791 }
3792
3793 /* For now only allow one epilogue loop, but allow
3794 pick_lowest_cost_p to replace it, so commit to the
3795 first epilogue if we have no reason to try alternatives. */
3796 if (!pick_lowest_cost_p)
3797 break;
3798 }
3799
3800 if (mode_i == vector_modes.length ())
3801 break;
3802
3803 }
3804
3805 if (!first_loop_vinfo->epilogue_vinfos.is_empty ())
3806 {
3807 LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo) = lowest_th;
3808 if (dump_enabled_p ())
3809 dump_printf_loc (MSG_NOTE, vect_location,
3810 "***** Choosing epilogue vector mode %s\n",
3811 GET_MODE_NAME
3812 (first_loop_vinfo->epilogue_vinfos[0]->vector_mode));
3813 }
3814
3815 return first_loop_vinfo;
3816 }
3817
3818 /* Return true if there is an in-order reduction function for CODE, storing
3819 it in *REDUC_FN if so. */
3820
3821 static bool
3822 fold_left_reduction_fn (code_helper code, internal_fn *reduc_fn)
3823 {
3824 /* We support MINUS_EXPR by negating the operand. This also preserves an
3825 initial -0.0 since -0.0 - 0.0 (neutral op for MINUS_EXPR) == -0.0 +
3826 (-0.0) = -0.0. */
3827 if (code == PLUS_EXPR || code == MINUS_EXPR)
3828 {
3829 *reduc_fn = IFN_FOLD_LEFT_PLUS;
3830 return true;
3831 }
3832 return false;
3833 }
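/* As a sketch of the MINUS_EXPR case above (illustrative only):

     double s = 0.0;
     for (int i = 0; i < n; i++)
       s -= a[i];

   is reduced with IFN_FOLD_LEFT_PLUS by negating the loop operand,
   i.e. it is treated as s += -a[i] while keeping the original
   left-to-right evaluation order.  */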
3834
3835 /* Function reduction_fn_for_scalar_code
3836
3837 Input:
3838 CODE - tree_code of a reduction operation.
3839
3840 Output:
3841 REDUC_FN - the corresponding internal function to be used to reduce the
3842 vector of partial results into a single scalar result, or IFN_LAST
3843 if the operation is a supported reduction operation, but does not have
3844 such an internal function.
3845
3846 Return FALSE if CODE currently cannot be vectorized as reduction. */
3847
3848 bool
3849 reduction_fn_for_scalar_code (code_helper code, internal_fn *reduc_fn)
3850 {
3851 if (code.is_tree_code ())
3852 switch (tree_code (code))
3853 {
3854 case MAX_EXPR:
3855 *reduc_fn = IFN_REDUC_MAX;
3856 return true;
3857
3858 case MIN_EXPR:
3859 *reduc_fn = IFN_REDUC_MIN;
3860 return true;
3861
3862 case PLUS_EXPR:
3863 *reduc_fn = IFN_REDUC_PLUS;
3864 return true;
3865
3866 case BIT_AND_EXPR:
3867 *reduc_fn = IFN_REDUC_AND;
3868 return true;
3869
3870 case BIT_IOR_EXPR:
3871 *reduc_fn = IFN_REDUC_IOR;
3872 return true;
3873
3874 case BIT_XOR_EXPR:
3875 *reduc_fn = IFN_REDUC_XOR;
3876 return true;
3877
3878 case MULT_EXPR:
3879 case MINUS_EXPR:
3880 *reduc_fn = IFN_LAST;
3881 return true;
3882
3883 default:
3884 return false;
3885 }
3886 else
3887 switch (combined_fn (code))
3888 {
3889 CASE_CFN_FMAX:
3890 *reduc_fn = IFN_REDUC_FMAX;
3891 return true;
3892
3893 CASE_CFN_FMIN:
3894 *reduc_fn = IFN_REDUC_FMIN;
3895 return true;
3896
3897 default:
3898 return false;
3899 }
3900 }
3901
3902 /* If there is a neutral value X such that a reduction would not be affected
3903 by the introduction of additional X elements, return that X, otherwise
3904 return null. CODE is the code of the reduction and SCALAR_TYPE is type
3905 of the scalar elements. If the reduction has just a single initial value
3906 then INITIAL_VALUE is that value, otherwise it is null.
3907 If AS_INITIAL is TRUE the value is supposed to be used as initial value.
3908 In that case no signed zero is returned. */
3909
3910 tree
3911 neutral_op_for_reduction (tree scalar_type, code_helper code,
3912 tree initial_value, bool as_initial)
3913 {
3914 if (code.is_tree_code ())
3915 switch (tree_code (code))
3916 {
3917 case DOT_PROD_EXPR:
3918 case SAD_EXPR:
3919 case MINUS_EXPR:
3920 case BIT_IOR_EXPR:
3921 case BIT_XOR_EXPR:
3922 return build_zero_cst (scalar_type);
3923 case WIDEN_SUM_EXPR:
3924 case PLUS_EXPR:
3925 if (!as_initial && HONOR_SIGNED_ZEROS (scalar_type))
3926 return build_real (scalar_type, dconstm0);
3927 else
3928 return build_zero_cst (scalar_type);
3929
3930 case MULT_EXPR:
3931 return build_one_cst (scalar_type);
3932
3933 case BIT_AND_EXPR:
3934 return build_all_ones_cst (scalar_type);
3935
3936 case MAX_EXPR:
3937 case MIN_EXPR:
3938 return initial_value;
3939
3940 default:
3941 return NULL_TREE;
3942 }
3943 else
3944 switch (combined_fn (code))
3945 {
3946 CASE_CFN_FMIN:
3947 CASE_CFN_FMAX:
3948 return initial_value;
3949
3950 default:
3951 return NULL_TREE;
3952 }
3953 }
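/* Worked examples of the neutral values above (illustrative):
   appending extra 0 elements leaves a sum or a bitwise IOR/XOR
   unchanged, extra 1 elements leave a product unchanged, extra
   all-ones elements leave a bitwise AND unchanged, and for MIN/MAX
   the single initial value itself is neutral since
   min (x, x) == max (x, x) == x.  */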
3954
3955 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
3956 STMT is printed with a message MSG. */
3957
3958 static void
3959 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
3960 {
3961 dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt);
3962 }
3963
3964 /* Return true if we need an in-order (fold-left) reduction for
3965 operation CODE on type TYPE, i.e. if the scalar evaluation order
3966 must be preserved. */
3967
3968 bool
3969 needs_fold_left_reduction_p (tree type, code_helper code)
3970 {
3971 /* CHECKME: check for !flag_finite_math_only too? */
3972 if (SCALAR_FLOAT_TYPE_P (type))
3973 {
3974 if (code.is_tree_code ())
3975 switch (tree_code (code))
3976 {
3977 case MIN_EXPR:
3978 case MAX_EXPR:
3979 return false;
3980
3981 default:
3982 return !flag_associative_math;
3983 }
3984 else
3985 switch (combined_fn (code))
3986 {
3987 CASE_CFN_FMIN:
3988 CASE_CFN_FMAX:
3989 return false;
3990
3991 default:
3992 return !flag_associative_math;
3993 }
3994 }
3995
3996 if (INTEGRAL_TYPE_P (type))
3997 return (!code.is_tree_code ()
3998 || !operation_no_trapping_overflow (type, tree_code (code)));
3999
4000 if (SAT_FIXED_POINT_TYPE_P (type))
4001 return true;
4002
4003 return false;
4004 }
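/* For instance (illustrative), with a float sum and without
   -fassociative-math, (a + b) + c can round differently from
   a + (b + c), so the function above requests a fold-left reduction
   to preserve the scalar evaluation order; with -fassociative-math
   the reassociation is allowed and a tree reduction can be used.  */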
4005
4006 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG
4007 has a handled computation expression. Store the main reduction
4008 operation in *CODE. */
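/* For example (hypothetical SSA names), given

     s_1 = PHI <s_0 (preheader), s_3 (latch)>
     s_2 = s_1 + a[i];
     s_3 = s_2 + b[i];

   the path walked from the latch value back to the PHI result is
   s_3 -> s_2 -> s_1 and *CODE is set to PLUS_EXPR.  */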
4009
4010 static bool
4011 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
4012 tree loop_arg, code_helper *code,
4013 vec<std::pair<ssa_op_iter, use_operand_p> > &path)
4014 {
4015 auto_bitmap visited;
4016 tree lookfor = PHI_RESULT (phi);
4017 ssa_op_iter curri;
4018 use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
4019 while (USE_FROM_PTR (curr) != loop_arg)
4020 curr = op_iter_next_use (&curri);
4021 curri.i = curri.numops;
4022 do
4023 {
4024 path.safe_push (std::make_pair (curri, curr));
4025 tree use = USE_FROM_PTR (curr);
4026 if (use == lookfor)
4027 break;
4028 gimple *def = SSA_NAME_DEF_STMT (use);
4029 if (gimple_nop_p (def)
4030 || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
4031 {
4032 pop:
4033 do
4034 {
4035 std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
4036 curri = x.first;
4037 curr = x.second;
4038 do
4039 curr = op_iter_next_use (&curri);
4040 /* Skip already visited or non-SSA operands (from iterating
4041 over PHI args). */
4042 while (curr != NULL_USE_OPERAND_P
4043 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
4044 || ! bitmap_set_bit (visited,
4045 SSA_NAME_VERSION
4046 (USE_FROM_PTR (curr)))));
4047 }
4048 while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
4049 if (curr == NULL_USE_OPERAND_P)
4050 break;
4051 }
4052 else
4053 {
4054 if (gimple_code (def) == GIMPLE_PHI)
4055 curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
4056 else
4057 curr = op_iter_init_use (&curri, def, SSA_OP_USE);
4058 while (curr != NULL_USE_OPERAND_P
4059 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
4060 || ! bitmap_set_bit (visited,
4061 SSA_NAME_VERSION
4062 (USE_FROM_PTR (curr)))))
4063 curr = op_iter_next_use (&curri);
4064 if (curr == NULL_USE_OPERAND_P)
4065 goto pop;
4066 }
4067 }
4068 while (1);
4069 if (dump_file && (dump_flags & TDF_DETAILS))
4070 {
4071 dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
4072 unsigned i;
4073 std::pair<ssa_op_iter, use_operand_p> *x;
4074 FOR_EACH_VEC_ELT (path, i, x)
4075 dump_printf (MSG_NOTE, "%T ", USE_FROM_PTR (x->second));
4076 dump_printf (MSG_NOTE, "\n");
4077 }
4078
4079 /* Check whether the reduction path detected is valid. */
4080 bool fail = path.length () == 0;
4081 bool neg = false;
4082 int sign = -1;
4083 *code = ERROR_MARK;
4084 for (unsigned i = 1; i < path.length (); ++i)
4085 {
4086 gimple *use_stmt = USE_STMT (path[i].second);
4087 gimple_match_op op;
4088 if (!gimple_extract_op (use_stmt, &op))
4089 {
4090 fail = true;
4091 break;
4092 }
4093 unsigned int opi = op.num_ops;
4094 if (gassign *assign = dyn_cast<gassign *> (use_stmt))
4095 {
4096 /* The following makes sure we can compute the operand index
4097 easily; it also mostly disallows chaining via COND_EXPR condition
4098 operands. */
4099 for (opi = 0; opi < op.num_ops; ++opi)
4100 if (gimple_assign_rhs1_ptr (assign) + opi == path[i].second->use)
4101 break;
4102 }
4103 else if (gcall *call = dyn_cast<gcall *> (use_stmt))
4104 {
4105 for (opi = 0; opi < op.num_ops; ++opi)
4106 if (gimple_call_arg_ptr (call, opi) == path[i].second->use)
4107 break;
4108 }
4109 if (opi == op.num_ops)
4110 {
4111 fail = true;
4112 break;
4113 }
4114 op.code = canonicalize_code (op.code, op.type);
4115 if (op.code == MINUS_EXPR)
4116 {
4117 op.code = PLUS_EXPR;
4118 /* Track whether we negate the reduction value each iteration. */
4119 if (op.ops[1] == op.ops[opi])
4120 neg = ! neg;
4121 }
4122 else if (op.code == IFN_COND_SUB)
4123 {
4124 op.code = IFN_COND_ADD;
4125 /* Track whether we negate the reduction value each iteration. */
4126 if (op.ops[2] == op.ops[opi])
4127 neg = ! neg;
4128 }
4129 if (CONVERT_EXPR_CODE_P (op.code)
4130 && tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0])))
4131 ;
4132 else if (*code == ERROR_MARK)
4133 {
4134 *code = op.code;
4135 sign = TYPE_SIGN (op.type);
4136 }
4137 else if (op.code != *code)
4138 {
4139 fail = true;
4140 break;
4141 }
4142 else if ((op.code == MIN_EXPR
4143 || op.code == MAX_EXPR)
4144 && sign != TYPE_SIGN (op.type))
4145 {
4146 fail = true;
4147 break;
4148 }
4149 /* Check there's only a single stmt the op is used on. For the
4150 non-value-changing tail and the last stmt allow out-of-loop uses.
4151 ??? We could relax this and handle arbitrary live stmts by
4152 forcing a scalar epilogue for example. */
4153 imm_use_iterator imm_iter;
4154 use_operand_p use_p;
4155 gimple *op_use_stmt;
4156 unsigned cnt = 0;
4157 bool cond_fn_p = op.code.is_internal_fn ()
4158 && (conditional_internal_fn_code (internal_fn (op.code))
4159 != ERROR_MARK);
4160
4161 FOR_EACH_IMM_USE_STMT (op_use_stmt, imm_iter, op.ops[opi])
4162 {
4163 /* In case of a COND_OP (mask, op1, op2, op1) reduction we might have
4164 op1 twice (once as definition, once as else) in the same operation.
4165 Allow this. */
4166 if (cond_fn_p && op_use_stmt == use_stmt)
4167 {
4168 gcall *call = as_a<gcall *> (use_stmt);
4169 unsigned else_pos
4170 = internal_fn_else_index (internal_fn (op.code));
4171
4172 for (unsigned int j = 0; j < gimple_call_num_args (call); ++j)
4173 {
4174 if (j == else_pos)
4175 continue;
4176 if (gimple_call_arg (call, j) == op.ops[opi])
4177 cnt++;
4178 }
4179 }
4180 else if (!is_gimple_debug (op_use_stmt)
4181 && (*code != ERROR_MARK
4182 || flow_bb_inside_loop_p (loop,
4183 gimple_bb (op_use_stmt))))
4184 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
4185 cnt++;
4186 }
4187
4188 if (cnt != 1)
4189 {
4190 fail = true;
4191 break;
4192 }
4193 }
4194 return ! fail && ! neg && *code != ERROR_MARK;
4195 }
4196
4197 bool
4198 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
4199 tree loop_arg, enum tree_code code)
4200 {
4201 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
4202 code_helper code_;
4203 return (check_reduction_path (loc, loop, phi, loop_arg, &code_, path)
4204 && code_ == code);
4205 }
4206
4207
4208
4209 /* Function vect_is_simple_reduction
4210
4211 (1) Detect a cross-iteration def-use cycle that represents a simple
4212 reduction computation. We look for the following pattern:
4213
4214 loop_header:
4215 a1 = phi < a0, a2 >
4216 a3 = ...
4217 a2 = operation (a3, a1)
4218
4219 or
4220
4221 a3 = ...
4222 loop_header:
4223 a1 = phi < a0, a2 >
4224 a2 = operation (a3, a1)
4225
4226 such that:
4227 1. operation is commutative and associative and it is safe to
4228 change the order of the computation
4229 2. no uses for a2 in the loop (a2 is used out of the loop)
4230 3. no uses of a1 in the loop besides the reduction operation
4231 4. no uses of a1 outside the loop.
4232
4233 Conditions 1,4 are tested here.
4234 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
4235
4236 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
4237 nested cycles.
4238
4239 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
4240 reductions:
4241
4242 a1 = phi < a0, a2 >
4243 inner loop (def of a3)
4244 a2 = phi < a3 >
4245
4246 (4) Detect condition expressions, i.e.:
4247 for (int i = 0; i < N; i++)
4248 if (a[i] < val)
4249 ret_val = a[i];
4250
4251 */
4252
4253 static stmt_vec_info
4254 vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
4255 bool *double_reduc, bool *reduc_chain_p, bool slp)
4256 {
4257 gphi *phi = as_a <gphi *> (phi_info->stmt);
4258 gimple *phi_use_stmt = NULL;
4259 imm_use_iterator imm_iter;
4260 use_operand_p use_p;
4261
4262 *double_reduc = false;
4263 *reduc_chain_p = false;
4264 STMT_VINFO_REDUC_TYPE (phi_info) = TREE_CODE_REDUCTION;
4265
4266 tree phi_name = PHI_RESULT (phi);
4267 /* ??? If there are no uses of the PHI result the inner loop reduction
4268 won't be detected as possibly double-reduction by vectorizable_reduction
4269 because that tries to walk the PHI arg from the preheader edge which
4270 can be constant. See PR60382. */
4271 if (has_zero_uses (phi_name))
4272 return NULL;
4273 class loop *loop = (gimple_bb (phi))->loop_father;
4274 unsigned nphi_def_loop_uses = 0;
4275 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
4276 {
4277 gimple *use_stmt = USE_STMT (use_p);
4278 if (is_gimple_debug (use_stmt))
4279 continue;
4280
4281 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
4282 {
4283 if (dump_enabled_p ())
4284 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4285 "intermediate value used outside loop.\n");
4286
4287 return NULL;
4288 }
4289
4290 /* In case of a COND_OP (mask, op1, op2, op1) reduction we might have
4291 op1 twice (once as definition, once as else) in the same operation.
4292 Only count it as one. */
4293 if (use_stmt != phi_use_stmt)
4294 {
4295 nphi_def_loop_uses++;
4296 phi_use_stmt = use_stmt;
4297 }
4298 }
4299
4300 tree latch_def = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
4301 if (TREE_CODE (latch_def) != SSA_NAME)
4302 {
4303 if (dump_enabled_p ())
4304 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4305 "reduction: not ssa_name: %T\n", latch_def);
4306 return NULL;
4307 }
4308
4309 stmt_vec_info def_stmt_info = loop_info->lookup_def (latch_def);
4310 if (!def_stmt_info
4311 || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)))
4312 return NULL;
4313
4314 bool nested_in_vect_loop
4315 = flow_loop_nested_p (LOOP_VINFO_LOOP (loop_info), loop);
4316 unsigned nlatch_def_loop_uses = 0;
4317 auto_vec<gphi *, 3> lcphis;
4318 bool inner_loop_of_double_reduc = false;
4319 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, latch_def)
4320 {
4321 gimple *use_stmt = USE_STMT (use_p);
4322 if (is_gimple_debug (use_stmt))
4323 continue;
4324 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
4325 nlatch_def_loop_uses++;
4326 else
4327 {
4328 /* We can have more than one loop-closed PHI. */
4329 lcphis.safe_push (as_a <gphi *> (use_stmt));
4330 if (nested_in_vect_loop
4331 && (STMT_VINFO_DEF_TYPE (loop_info->lookup_stmt (use_stmt))
4332 == vect_double_reduction_def))
4333 inner_loop_of_double_reduc = true;
4334 }
4335 }
4336
4337 /* If we are vectorizing an inner reduction we are executing that
4338 in the original order only in case we are not dealing with a
4339 double reduction. */
4340 if (nested_in_vect_loop && !inner_loop_of_double_reduc)
4341 {
4342 if (dump_enabled_p ())
4343 report_vect_op (MSG_NOTE, def_stmt_info->stmt,
4344 "detected nested cycle: ");
4345 return def_stmt_info;
4346 }
4347
4348 /* When the inner loop of a double reduction ends up with more than
4349 one loop-closed PHI we have failed to classify alternate such
4350 PHIs as double reduction, leading to wrong code. See PR103237. */
4351 if (inner_loop_of_double_reduc && lcphis.length () != 1)
4352 {
4353 if (dump_enabled_p ())
4354 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4355 "unhandle double reduction\n");
4356 return NULL;
4357 }
4358
4359 /* If this isn't a nested cycle or if the nested cycle reduction value
4360 is used outside of the inner loop we cannot handle uses of the reduction
4361 value. */
4362 if (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1)
4363 {
4364 if (dump_enabled_p ())
4365 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4366 "reduction used in loop.\n");
4367 return NULL;
4368 }
4369
4370 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
4371 defined in the inner loop. */
4372 if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
4373 {
4374 tree op1 = PHI_ARG_DEF (def_stmt, 0);
4375 if (gimple_phi_num_args (def_stmt) != 1
4376 || TREE_CODE (op1) != SSA_NAME)
4377 {
4378 if (dump_enabled_p ())
4379 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4380 "unsupported phi node definition.\n");
4381
4382 return NULL;
4383 }
4384
4385 /* Verify there is an inner cycle composed of the PHI phi_use_stmt
4386 and the latch definition op1. */
4387 gimple *def1 = SSA_NAME_DEF_STMT (op1);
4388 if (gimple_bb (def1)
4389 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
4390 && loop->inner
4391 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
4392 && (is_gimple_assign (def1) || is_gimple_call (def1))
4393 && is_a <gphi *> (phi_use_stmt)
4394 && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt))
4395 && (op1 == PHI_ARG_DEF_FROM_EDGE (phi_use_stmt,
4396 loop_latch_edge (loop->inner))))
4397 {
4398 if (dump_enabled_p ())
4399 report_vect_op (MSG_NOTE, def_stmt,
4400 "detected double reduction: ");
4401
4402 *double_reduc = true;
4403 return def_stmt_info;
4404 }
4405
4406 return NULL;
4407 }
4408
4409 /* Look for the expression computing latch_def from the loop PHI result. */
4410 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
4411 code_helper code;
4412 if (check_reduction_path (vect_location, loop, phi, latch_def, &code,
4413 path))
4414 {
4415 STMT_VINFO_REDUC_CODE (phi_info) = code;
4416 if (code == COND_EXPR && !nested_in_vect_loop)
4417 STMT_VINFO_REDUC_TYPE (phi_info) = COND_REDUCTION;
4418
4419 /* Fill in STMT_VINFO_REDUC_IDX and gather stmts for an SLP
4420 reduction chain for which the additional restriction is that
4421 all operations in the chain are the same. */
4422 auto_vec<stmt_vec_info, 8> reduc_chain;
4423 unsigned i;
4424 bool is_slp_reduc = !nested_in_vect_loop && code != COND_EXPR;
4425 for (i = path.length () - 1; i >= 1; --i)
4426 {
4427 gimple *stmt = USE_STMT (path[i].second);
4428 stmt_vec_info stmt_info = loop_info->lookup_stmt (stmt);
4429 gimple_match_op op;
4430 if (!gimple_extract_op (stmt, &op))
4431 gcc_unreachable ();
4432 if (gassign *assign = dyn_cast<gassign *> (stmt))
4433 STMT_VINFO_REDUC_IDX (stmt_info)
4434 = path[i].second->use - gimple_assign_rhs1_ptr (assign);
4435 else
4436 {
4437 gcall *call = as_a<gcall *> (stmt);
4438 STMT_VINFO_REDUC_IDX (stmt_info)
4439 = path[i].second->use - gimple_call_arg_ptr (call, 0);
4440 }
4441 bool leading_conversion = (CONVERT_EXPR_CODE_P (op.code)
4442 && (i == 1 || i == path.length () - 1));
4443 if ((op.code != code && !leading_conversion)
4444 /* We can only handle the final value in epilogue
4445 generation for reduction chains. */
4446 || (i != 1 && !has_single_use (gimple_get_lhs (stmt))))
4447 is_slp_reduc = false;
4448 /* For reduction chains we support trailing/leading
4449 conversions. We do not store those in the actual chain. */
4450 if (leading_conversion)
4451 continue;
4452 reduc_chain.safe_push (stmt_info);
4453 }
4454 if (slp && is_slp_reduc && reduc_chain.length () > 1)
4455 {
4456 for (unsigned i = 0; i < reduc_chain.length () - 1; ++i)
4457 {
4458 REDUC_GROUP_FIRST_ELEMENT (reduc_chain[i]) = reduc_chain[0];
4459 REDUC_GROUP_NEXT_ELEMENT (reduc_chain[i]) = reduc_chain[i+1];
4460 }
4461 REDUC_GROUP_FIRST_ELEMENT (reduc_chain.last ()) = reduc_chain[0];
4462 REDUC_GROUP_NEXT_ELEMENT (reduc_chain.last ()) = NULL;
4463
4464 /* Save the chain for further analysis in SLP detection. */
4465 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (reduc_chain[0]);
4466 REDUC_GROUP_SIZE (reduc_chain[0]) = reduc_chain.length ();
4467
4468 *reduc_chain_p = true;
4469 if (dump_enabled_p ())
4470 dump_printf_loc (MSG_NOTE, vect_location,
4471 "reduction: detected reduction chain\n");
4472 }
4473 else if (dump_enabled_p ())
4474 dump_printf_loc (MSG_NOTE, vect_location,
4475 "reduction: detected reduction\n");
4476
4477 return def_stmt_info;
4478 }
4479
4480 if (dump_enabled_p ())
4481 dump_printf_loc (MSG_NOTE, vect_location,
4482 "reduction: unknown pattern\n");
4483
4484 return NULL;
4485 }
4486
4487 /* Estimate the number of peeled epilogue iterations for LOOP_VINFO.
4488 PEEL_ITERS_PROLOGUE is the number of peeled prologue iterations,
4489 or -1 if not known. */
4490
4491 static int
4492 vect_get_peel_iters_epilogue (loop_vec_info loop_vinfo, int peel_iters_prologue)
4493 {
4494 int assumed_vf = vect_vf_for_cost (loop_vinfo);
4495 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) || peel_iters_prologue == -1)
4496 {
4497 if (dump_enabled_p ())
4498 dump_printf_loc (MSG_NOTE, vect_location,
4499 "cost model: epilogue peel iters set to vf/2 "
4500 "because loop iterations are unknown .\n");
4501 return assumed_vf / 2;
4502 }
4503 else
4504 {
4505 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
4506 peel_iters_prologue = MIN (niters, peel_iters_prologue);
4507 int peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
4508 /* If we need to peel for gaps, but no peeling is required, we have to
4509 peel VF iterations. */
4510 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !peel_iters_epilogue)
4511 peel_iters_epilogue = assumed_vf;
4512 return peel_iters_epilogue;
4513 }
4514 }
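/* Illustrative numbers (assumed): with NITERS == 102, a prologue peel
   of 2 and an assumed VF of 8, the epilogue peels (102 - 2) % 8 == 4
   iterations; if that remainder were 0 but peeling for gaps is
   required, a full VF of 8 iterations would be peeled instead.  */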
4515
4516 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
4517 int
4518 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
4519 int *peel_iters_epilogue,
4520 stmt_vector_for_cost *scalar_cost_vec,
4521 stmt_vector_for_cost *prologue_cost_vec,
4522 stmt_vector_for_cost *epilogue_cost_vec)
4523 {
4524 int retval = 0;
4525
4526 *peel_iters_epilogue
4527 = vect_get_peel_iters_epilogue (loop_vinfo, peel_iters_prologue);
4528
4529 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
4530 {
4531 /* If peeled iterations are known but the number of scalar loop
4532 iterations is unknown, count a taken branch per peeled loop. */
4533 if (peel_iters_prologue > 0)
4534 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
4535 vect_prologue);
4536 if (*peel_iters_epilogue > 0)
4537 retval += record_stmt_cost (epilogue_cost_vec, 1, cond_branch_taken,
4538 vect_epilogue);
4539 }
4540
4541 stmt_info_for_cost *si;
4542 int j;
4543 if (peel_iters_prologue)
4544 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
4545 retval += record_stmt_cost (prologue_cost_vec,
4546 si->count * peel_iters_prologue,
4547 si->kind, si->stmt_info, si->misalign,
4548 vect_prologue);
4549 if (*peel_iters_epilogue)
4550 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
4551 retval += record_stmt_cost (epilogue_cost_vec,
4552 si->count * *peel_iters_epilogue,
4553 si->kind, si->stmt_info, si->misalign,
4554 vect_epilogue);
4555
4556 return retval;
4557 }
4558
4559 /* Function vect_estimate_min_profitable_iters
4560
4561 Return the number of iterations required for the vector version of the
4562 loop to be profitable relative to the cost of the scalar version of the
4563 loop.
4564
4565 *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
4566 of iterations for vectorization. -1 value means loop vectorization
4567 is not profitable. This returned value may be used for dynamic
4568 profitability check.
4569
4570 *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
4571 for static check against estimated number of iterations. */
4572
4573 static void
4574 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
4575 int *ret_min_profitable_niters,
4576 int *ret_min_profitable_estimate,
4577 unsigned *suggested_unroll_factor)
4578 {
4579 int min_profitable_iters;
4580 int min_profitable_estimate;
4581 int peel_iters_prologue;
4582 int peel_iters_epilogue;
4583 unsigned vec_inside_cost = 0;
4584 int vec_outside_cost = 0;
4585 unsigned vec_prologue_cost = 0;
4586 unsigned vec_epilogue_cost = 0;
4587 int scalar_single_iter_cost = 0;
4588 int scalar_outside_cost = 0;
4589 int assumed_vf = vect_vf_for_cost (loop_vinfo);
4590 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
4591 vector_costs *target_cost_data = loop_vinfo->vector_costs;
4592
4593 /* Cost model disabled. */
4594 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
4595 {
4596 if (dump_enabled_p ())
4597 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
4598 *ret_min_profitable_niters = 0;
4599 *ret_min_profitable_estimate = 0;
4600 return;
4601 }
4602
4603 /* Requires loop versioning tests to handle misalignment. */
4604 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
4605 {
4606 /* FIXME: Make cost depend on complexity of individual check. */
4607 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
4608 (void) add_stmt_cost (target_cost_data, len, scalar_stmt, vect_prologue);
4609 if (dump_enabled_p ())
4610 dump_printf (MSG_NOTE,
4611 "cost model: Adding cost of checks for loop "
4612 "versioning to treat misalignment.\n");
4613 }
4614
4615 /* Requires loop versioning with alias checks. */
4616 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
4617 {
4618 /* FIXME: Make cost depend on complexity of individual check. */
4619 unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
4620 (void) add_stmt_cost (target_cost_data, len, scalar_stmt, vect_prologue);
4621 len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
4622 if (len)
4623 /* Count LEN - 1 ANDs and LEN comparisons. */
4624 (void) add_stmt_cost (target_cost_data, len * 2 - 1,
4625 scalar_stmt, vect_prologue);
4626 len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
4627 if (len)
4628 {
4629 /* Count LEN - 1 ANDs and LEN comparisons. */
4630 unsigned int nstmts = len * 2 - 1;
4631 /* +1 for each bias that needs adding. */
4632 for (unsigned int i = 0; i < len; ++i)
4633 if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
4634 nstmts += 1;
4635 (void) add_stmt_cost (target_cost_data, nstmts,
4636 scalar_stmt, vect_prologue);
4637 }
4638 if (dump_enabled_p ())
4639 dump_printf (MSG_NOTE,
4640 "cost model: Adding cost of checks for loop "
4641 "versioning aliasing.\n");
4642 }
4643
4644 /* Requires loop versioning with niter checks. */
4645 if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
4646 {
4647 /* FIXME: Make cost depend on complexity of individual check. */
4648 (void) add_stmt_cost (target_cost_data, 1, vector_stmt,
4649 NULL, NULL, NULL_TREE, 0, vect_prologue);
4650 if (dump_enabled_p ())
4651 dump_printf (MSG_NOTE,
4652 "cost model: Adding cost of checks for loop "
4653 "versioning niters.\n");
4654 }
4655
4656 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
4657 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4658 vect_prologue);
4659
4660 /* Count statements in scalar loop. Using this as scalar cost for a single
4661 iteration for now.
4662
4663 TODO: Add outer loop support.
4664
4665 TODO: Consider assigning different costs to different scalar
4666 statements. */
4667
4668 scalar_single_iter_cost = loop_vinfo->scalar_costs->total_cost ();
4669
4670 /* Add additional cost for the peeled instructions in prologue and epilogue
4671 loop. (For fully-masked loops there will be no peeling.)
4672
4673 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
4674 at compile-time - we assume it's vf/2 (the worst would be vf-1).
4675
4676 TODO: Build an expression that represents peel_iters for prologue and
4677 epilogue to be used in a run-time test. */
4678
4679 bool prologue_need_br_taken_cost = false;
4680 bool prologue_need_br_not_taken_cost = false;
4681
4682 /* Calculate peel_iters_prologue. */
4683 if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
4684 peel_iters_prologue = 0;
4685 else if (npeel < 0)
4686 {
4687 peel_iters_prologue = assumed_vf / 2;
4688 if (dump_enabled_p ())
4689 dump_printf (MSG_NOTE, "cost model: "
4690 "prologue peel iters set to vf/2.\n");
4691
4692 /* If peeled iterations are unknown, count a taken branch and a not taken
4693 branch per peeled loop. Even if scalar loop iterations are known,
4694 vector iterations are not known since peeled prologue iterations are
4695 not known. Hence guards remain the same. */
4696 prologue_need_br_taken_cost = true;
4697 prologue_need_br_not_taken_cost = true;
4698 }
4699 else
4700 {
4701 peel_iters_prologue = npeel;
4702 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_prologue > 0)
4703 /* If peeled iterations are known but the number of scalar loop
4704 iterations is unknown, count a taken branch per peeled loop. */
4705 prologue_need_br_taken_cost = true;
4706 }
4707
4708 bool epilogue_need_br_taken_cost = false;
4709 bool epilogue_need_br_not_taken_cost = false;
4710
4711 /* Calculate peel_iters_epilogue. */
4712 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4713 /* We need to peel exactly one iteration for gaps. */
4714 peel_iters_epilogue = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
4715 else if (npeel < 0)
4716 {
4717 /* If peeling for alignment is unknown, the loop bound of the main
4718 loop becomes unknown. */
4719 peel_iters_epilogue = assumed_vf / 2;
4720 if (dump_enabled_p ())
4721 dump_printf (MSG_NOTE, "cost model: "
4722 "epilogue peel iters set to vf/2 because "
4723 "peeling for alignment is unknown.\n");
4724
4725 /* See the same reason above in peel_iters_prologue calculation. */
4726 epilogue_need_br_taken_cost = true;
4727 epilogue_need_br_not_taken_cost = true;
4728 }
4729 else
4730 {
4731 peel_iters_epilogue = vect_get_peel_iters_epilogue (loop_vinfo, npeel);
4732 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_epilogue > 0)
4733 /* If peeled iterations are known but the number of scalar loop
4734 iterations is unknown, count a taken branch per peeled loop. */
4735 epilogue_need_br_taken_cost = true;
4736 }
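/* Assumed example: with an assumed VF of 8 and unknown alignment
   peeling (npeel < 0), both peel_iters_prologue and
   peel_iters_epilogue are estimated above as 8 / 2 == 4, and taken /
   not-taken branch costs are added for the corresponding guards.  */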
4737
4738 stmt_info_for_cost *si;
4739 int j;
4740 /* Add costs associated with peel_iters_prologue. */
4741 if (peel_iters_prologue)
4742 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4743 {
4744 (void) add_stmt_cost (target_cost_data,
4745 si->count * peel_iters_prologue, si->kind,
4746 si->stmt_info, si->node, si->vectype,
4747 si->misalign, vect_prologue);
4748 }
4749
4750 /* Add costs associated with peel_iters_epilogue. */
4751 if (peel_iters_epilogue)
4752 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4753 {
4754 (void) add_stmt_cost (target_cost_data,
4755 si->count * peel_iters_epilogue, si->kind,
4756 si->stmt_info, si->node, si->vectype,
4757 si->misalign, vect_epilogue);
4758 }
4759
4760 /* Add possible cond_branch_taken/cond_branch_not_taken cost. */
4761
4762 if (prologue_need_br_taken_cost)
4763 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4764 vect_prologue);
4765
4766 if (prologue_need_br_not_taken_cost)
4767 (void) add_stmt_cost (target_cost_data, 1,
4768 cond_branch_not_taken, vect_prologue);
4769
4770 if (epilogue_need_br_taken_cost)
4771 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4772 vect_epilogue);
4773
4774 if (epilogue_need_br_not_taken_cost)
4775 (void) add_stmt_cost (target_cost_data, 1,
4776 cond_branch_not_taken, vect_epilogue);
4777
4778 /* Take care of special costs for rgroup controls of partial vectors. */
4779 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
4780 && (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
4781 == vect_partial_vectors_avx512))
4782 {
4783 /* Calculate how many masks we need to generate. */
4784 unsigned int num_masks = 0;
4785 bool need_saturation = false;
4786 for (auto rgm : LOOP_VINFO_MASKS (loop_vinfo).rgc_vec)
4787 if (rgm.type)
4788 {
4789 unsigned nvectors = rgm.factor;
4790 num_masks += nvectors;
4791 if (TYPE_PRECISION (TREE_TYPE (rgm.compare_type))
4792 < TYPE_PRECISION (LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo)))
4793 need_saturation = true;
4794 }
4795
4796 /* ??? The target isn't able to identify the costs below as
4797 producing masks, so it cannot penalize cases where we'd run
4798 out of mask registers for example. */
4799
4800 /* ??? We are also failing to account for smaller vector masks
4801 we generate by splitting larger masks in vect_get_loop_mask. */
4802
4803 /* In the worst case, we need to generate each mask in the prologue
4804 and in the loop body. We need one splat per group and one
4805 compare per mask.
4806
4807 Sometimes the prologue mask will fold to a constant,
4808 so the actual prologue cost might be smaller. However, it's
4809 simpler and safer to use the worst-case cost; if this ends up
4810 being the tie-breaker between vectorizing or not, then it's
4811 probably better not to vectorize. */
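/* Assumed example: with two mask rgroups needing 2 and 1 masks
   respectively, num_masks is 3 and rgc_vec.length () is 2, so
   3 + 2 == 5 vector stmts are costed for the prologue and another 5
   for the loop body below.  */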
4812 (void) add_stmt_cost (target_cost_data,
4813 num_masks
4814 + LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.length (),
4815 vector_stmt, NULL, NULL, NULL_TREE, 0,
4816 vect_prologue);
4817 (void) add_stmt_cost (target_cost_data,
4818 num_masks
4819 + LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.length (),
4820 vector_stmt, NULL, NULL, NULL_TREE, 0, vect_body);
4821
4822 /* When we need saturation we need it both in the prologue and
4823 the epilogue. */
4824 if (need_saturation)
4825 {
4826 (void) add_stmt_cost (target_cost_data, 1, scalar_stmt,
4827 NULL, NULL, NULL_TREE, 0, vect_prologue);
4828 (void) add_stmt_cost (target_cost_data, 1, scalar_stmt,
4829 NULL, NULL, NULL_TREE, 0, vect_body);
4830 }
4831 }
4832 else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
4833 && (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
4834 == vect_partial_vectors_while_ult))
4835 {
4836 /* Calculate how many masks we need to generate. */
4837 unsigned int num_masks = 0;
4838 rgroup_controls *rgm;
4839 unsigned int num_vectors_m1;
4840 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec,
4841 num_vectors_m1, rgm)
4842 if (rgm->type)
4843 num_masks += num_vectors_m1 + 1;
4844 gcc_assert (num_masks > 0);
4845
4846 /* In the worst case, we need to generate each mask in the prologue
4847 and in the loop body. One of the loop body mask instructions
4848 replaces the comparison in the scalar loop, and since we don't
4849 count the scalar comparison against the scalar body, we shouldn't
4850 count that vector instruction against the vector body either.
4851
4852 Sometimes we can use unpacks instead of generating prologue
4853 masks and sometimes the prologue mask will fold to a constant,
4854 so the actual prologue cost might be smaller. However, it's
4855 simpler and safer to use the worst-case cost; if this ends up
4856 being the tie-breaker between vectorizing or not, then it's
4857 probably better not to vectorize. */
4858 (void) add_stmt_cost (target_cost_data, num_masks,
4859 vector_stmt, NULL, NULL, NULL_TREE, 0,
4860 vect_prologue);
4861 (void) add_stmt_cost (target_cost_data, num_masks - 1,
4862 vector_stmt, NULL, NULL, NULL_TREE, 0,
4863 vect_body);
4864 }
4865 else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
4866 {
4867 /* Referring to the functions vect_set_loop_condition_partial_vectors
4868 and vect_set_loop_controls_directly, we need to generate each
4869 length in the prologue and in the loop body if required. Although
4870 there are some possible optimizations, we consider the worst case
4871 here. */
4872
4873 bool niters_known_p = LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo);
4874 signed char partial_load_store_bias
4875 = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
4876 bool need_iterate_p
4877 = (!LOOP_VINFO_EPILOGUE_P (loop_vinfo)
4878 && !vect_known_niters_smaller_than_vf (loop_vinfo));
4879
4880 /* Calculate how many statements need to be added. */
4881 unsigned int prologue_stmts = 0;
4882 unsigned int body_stmts = 0;
4883
4884 rgroup_controls *rgc;
4885 unsigned int num_vectors_m1;
4886 FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), num_vectors_m1, rgc)
4887 if (rgc->type)
4888 {
4889 /* May need one SHIFT for nitems_total computation. */
4890 unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
4891 if (nitems != 1 && !niters_known_p)
4892 prologue_stmts += 1;
4893
4894 /* May need one MAX and one MINUS for wrap around. */
4895 if (vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc))
4896 prologue_stmts += 2;
4897
4898 /* Need one MAX and one MINUS for each batch limit except for
4899 the first one. */
4900 prologue_stmts += num_vectors_m1 * 2;
4901
4902 unsigned int num_vectors = num_vectors_m1 + 1;
4903
4904 /* Need to set up lengths in prologue, only one MIN required
4905 for each since start index is zero. */
4906 prologue_stmts += num_vectors;
4907
4908 /* If we have a non-zero partial load bias, we need one PLUS
4909 to adjust the load length. */
4910 if (partial_load_store_bias != 0)
4911 body_stmts += 1;
4912
4913 unsigned int length_update_cost = 0;
4914 if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo))
4915 /* For the decrement IV style, each length only needs a single
4916 SELECT_VL or MIN to compute the number of elements to be
4917 processed in the current iteration. */
4918 length_update_cost = 1;
4919 else
4920 /* For the increment IV style, each length may need two MINs and one
4921 MINUS to update the lengths in the body for the next iteration. */
4922 length_update_cost = 3;
4923
4924 if (need_iterate_p)
4925 body_stmts += length_update_cost * num_vectors;
4926 }
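/* A worked example with made-up numbers: a single rgroup with
   nitems == 4, unknown niters, no possible IV wrap, a single vector
   (num_vectors_m1 == 0), a zero load/store bias and a decrementing IV
   that still needs to iterate gives prologue_stmts == 2 (one SHIFT plus
   one MIN) and body_stmts == 1 (the per-iteration SELECT_VL or MIN).  */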
4927
4928 (void) add_stmt_cost (target_cost_data, prologue_stmts,
4929 scalar_stmt, vect_prologue);
4930 (void) add_stmt_cost (target_cost_data, body_stmts,
4931 scalar_stmt, vect_body);
4932 }
4933
4934 /* FORNOW: The scalar outside cost is incremented in one of the
4935 following ways:
4936
4937 1. The vectorizer checks for alignment and aliasing and generates
4938 a condition that allows dynamic vectorization. A cost model
4939 check is ANDed with the versioning condition. Hence the scalar code
4940 path now has the added cost of the versioning check.
4941
4942 if (cost > th & versioning_check)
4943 jmp to vector code
4944
4945 Hence the run-time scalar cost is incremented by the not-taken branch cost.
4946
4947 2. The vectorizer then checks if a prologue is required. If the
4948 cost model check was not done before during versioning, it has to
4949 be done before the prologue check.
4950
4951 if (cost <= th)
4952 prologue = scalar_iters
4953 if (prologue == 0)
4954 jmp to vector code
4955 else
4956 execute prologue
4957 if (prologue == num_iters)
4958 go to exit
4959
4960 Hence the run-time scalar cost is incremented by a taken branch,
4961 plus a not-taken branch, plus a taken branch cost.
4962
4963 3. The vectorizer then checks if an epilogue is required. If the
4964 cost model check was not done before during prologue check, it
4965 has to be done with the epilogue check.
4966
4967 if (prologue == 0)
4968 jmp to vector code
4969 else
4970 execute prologue
4971 if (prologue == num_iters)
4972 go to exit
4973 vector code:
4974 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
4975 jmp to epilogue
4976
4977 Hence the run-time scalar cost should be incremented by 2 taken
4978 branches.
4979
4980 TODO: The back end may reorder the BBs differently and reverse
4981 conditions/branch directions. Change the estimates below to
4982 something more reasonable. */
4983
4984 /* If the number of iterations is known and we do not do versioning, we can
4985 decide whether to vectorize at compile time. Hence the scalar version
4986 does not carry cost model guard costs. */
4987 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
4988 || LOOP_REQUIRES_VERSIONING (loop_vinfo))
4989 {
4990 /* Cost model check occurs at versioning. */
4991 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
4992 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
4993 else
4994 {
4995 /* Cost model check occurs at prologue generation. */
4996 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
4997 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
4998 + vect_get_stmt_cost (cond_branch_not_taken);
4999 /* Cost model check occurs at epilogue generation. */
5000 else
5001 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
5002 }
5003 }
5004
5005 /* Complete the target-specific cost calculations. */
5006 finish_cost (loop_vinfo->vector_costs, loop_vinfo->scalar_costs,
5007 &vec_prologue_cost, &vec_inside_cost, &vec_epilogue_cost,
5008 suggested_unroll_factor);
5009
5010 if (suggested_unroll_factor && *suggested_unroll_factor > 1
5011 && LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) != MAX_VECTORIZATION_FACTOR
5012 && !known_le (LOOP_VINFO_VECT_FACTOR (loop_vinfo) *
5013 *suggested_unroll_factor,
5014 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo)))
5015 {
5016 if (dump_enabled_p ())
5017 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5018 "can't unroll as unrolled vectorization factor larger"
5019 " than maximum vectorization factor: "
5020 HOST_WIDE_INT_PRINT_UNSIGNED "\n",
5021 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo));
5022 *suggested_unroll_factor = 1;
5023 }
5024
5025 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
5026
5027 if (dump_enabled_p ())
5028 {
5029 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
5030 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
5031 vec_inside_cost);
5032 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
5033 vec_prologue_cost);
5034 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
5035 vec_epilogue_cost);
5036 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
5037 scalar_single_iter_cost);
5038 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
5039 scalar_outside_cost);
5040 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
5041 vec_outside_cost);
5042 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
5043 peel_iters_prologue);
5044 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
5045 peel_iters_epilogue);
5046 }
5047
5048 /* Calculate number of iterations required to make the vector version
5049 profitable, relative to the loop bodies only. The following condition
5050 must hold true:
5051 SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC
5052 where
5053 SIC = scalar iteration cost, VIC = vector iteration cost,
5054 VOC = vector outside cost, VF = vectorization factor,
5055 NPEEL = prologue iterations + epilogue iterations,
5056 SOC = scalar outside cost for run time cost model check. */
5057
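/* A hedged, purely illustrative example (numbers not from any target):
   with SIC == 4, VIC == 8, VF == assumed_vf == 4, NPEEL == 0, VOC == 20
   and SOC == 0, the per-vector-iteration saving computed below is
   4 * 4 - 8 == 8, and without partial vectors the condition
   4 * niters > 8 * (niters / 4) + 20 first holds at niters == 11,
   which is what the non-partial-vectors arm below arrives at.  */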
5058 int saving_per_viter = (scalar_single_iter_cost * assumed_vf
5059 - vec_inside_cost);
5060 if (saving_per_viter <= 0)
5061 {
5062 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
5063 warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
5064 "vectorization did not happen for a simd loop");
5065
5066 if (dump_enabled_p ())
5067 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5068 "cost model: the vector iteration cost = %d "
5069 "divided by the scalar iteration cost = %d "
5070 "is greater or equal to the vectorization factor = %d"
5071 ".\n",
5072 vec_inside_cost, scalar_single_iter_cost, assumed_vf);
5073 *ret_min_profitable_niters = -1;
5074 *ret_min_profitable_estimate = -1;
5075 return;
5076 }
5077
5078 /* ??? The "if" arm is written to handle all cases; see below for what
5079 we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
5080 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
5081 {
5082 /* Rewriting the condition above in terms of the number of
5083 vector iterations (vniters) rather than the number of
5084 scalar iterations (niters) gives:
5085
5086 SIC * (vniters * VF + NPEEL) + SOC > VIC * vniters + VOC
5087
5088 <==> vniters * (SIC * VF - VIC) > VOC - SIC * NPEEL - SOC
5089
5090 For integer N, X and Y when X > 0:
5091
5092 N * X > Y <==> N >= (Y /[floor] X) + 1. */
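/* E.g. with X == 3 and Y == 7: N * 3 > 7 <==> N >= 7 / 3 + 1 == 3.  */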
5093 int outside_overhead = (vec_outside_cost
5094 - scalar_single_iter_cost * peel_iters_prologue
5095 - scalar_single_iter_cost * peel_iters_epilogue
5096 - scalar_outside_cost);
5097 /* We're only interested in cases that require at least one
5098 vector iteration. */
5099 int min_vec_niters = 1;
5100 if (outside_overhead > 0)
5101 min_vec_niters = outside_overhead / saving_per_viter + 1;
5102
5103 if (dump_enabled_p ())
5104 dump_printf (MSG_NOTE, " Minimum number of vector iterations: %d\n",
5105 min_vec_niters);
5106
5107 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
5108 {
5109 /* Now that we know the minimum number of vector iterations,
5110 find the minimum niters for which the scalar cost is larger:
5111
5112 SIC * niters > VIC * vniters + VOC - SOC
5113
5114 We know that the minimum niters is no more than
5115 vniters * VF + NPEEL, but it might be (and often is) less
5116 than that if a partial vector iteration is cheaper than the
5117 equivalent scalar code. */
5118 int threshold = (vec_inside_cost * min_vec_niters
5119 + vec_outside_cost
5120 - scalar_outside_cost);
5121 if (threshold <= 0)
5122 min_profitable_iters = 1;
5123 else
5124 min_profitable_iters = threshold / scalar_single_iter_cost + 1;
5125 }
5126 else
5127 /* Convert the number of vector iterations into a number of
5128 scalar iterations. */
5129 min_profitable_iters = (min_vec_niters * assumed_vf
5130 + peel_iters_prologue
5131 + peel_iters_epilogue);
5132 }
5133 else
5134 {
5135 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
5136 * assumed_vf
5137 - vec_inside_cost * peel_iters_prologue
5138 - vec_inside_cost * peel_iters_epilogue);
5139 if (min_profitable_iters <= 0)
5140 min_profitable_iters = 0;
5141 else
5142 {
5143 min_profitable_iters /= saving_per_viter;
5144
5145 if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
5146 <= (((int) vec_inside_cost * min_profitable_iters)
5147 + (((int) vec_outside_cost - scalar_outside_cost)
5148 * assumed_vf)))
5149 min_profitable_iters++;
5150 }
5151 }
5152
5153 if (dump_enabled_p ())
5154 dump_printf (MSG_NOTE,
5155 " Calculated minimum iters for profitability: %d\n",
5156 min_profitable_iters);
5157
5158 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
5159 && min_profitable_iters < (assumed_vf + peel_iters_prologue))
5160 /* We want the vectorized loop to execute at least once. */
5161 min_profitable_iters = assumed_vf + peel_iters_prologue;
5162 else if (min_profitable_iters < peel_iters_prologue)
5163 /* For LOOP_VINFO_USING_PARTIAL_VECTORS_P, we need to ensure the
5164 vectorized loop executes at least once. */
5165 min_profitable_iters = peel_iters_prologue;
5166
5167 if (dump_enabled_p ())
5168 dump_printf_loc (MSG_NOTE, vect_location,
5169 " Runtime profitability threshold = %d\n",
5170 min_profitable_iters);
5171
5172 *ret_min_profitable_niters = min_profitable_iters;
5173
5174 /* Calculate number of iterations required to make the vector version
5175 profitable, relative to the loop bodies only.
5176
5177 The non-vectorized variant costs SIC * niters and must win over the vector
5178 variant at the expected loop trip count. The following condition must hold true:
5179 SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC */
5180
5181 if (vec_outside_cost <= 0)
5182 min_profitable_estimate = 0;
5183 /* ??? This "else if" arm is written to handle all cases; see below for
5184 what we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
5185 else if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
5186 {
5187 /* This is a repeat of the code above, but with + SOC rather
5188 than - SOC. */
5189 int outside_overhead = (vec_outside_cost
5190 - scalar_single_iter_cost * peel_iters_prologue
5191 - scalar_single_iter_cost * peel_iters_epilogue
5192 + scalar_outside_cost);
5193 int min_vec_niters = 1;
5194 if (outside_overhead > 0)
5195 min_vec_niters = outside_overhead / saving_per_viter + 1;
5196
5197 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
5198 {
5199 int threshold = (vec_inside_cost * min_vec_niters
5200 + vec_outside_cost
5201 + scalar_outside_cost);
5202 min_profitable_estimate = threshold / scalar_single_iter_cost + 1;
5203 }
5204 else
5205 min_profitable_estimate = (min_vec_niters * assumed_vf
5206 + peel_iters_prologue
5207 + peel_iters_epilogue);
5208 }
5209 else
5210 {
5211 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
5212 * assumed_vf
5213 - vec_inside_cost * peel_iters_prologue
5214 - vec_inside_cost * peel_iters_epilogue)
5215 / ((scalar_single_iter_cost * assumed_vf)
5216 - vec_inside_cost);
5217 }
5218 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
5219 if (dump_enabled_p ())
5220 dump_printf_loc (MSG_NOTE, vect_location,
5221 " Static estimate profitability threshold = %d\n",
5222 min_profitable_estimate);
5223
5224 *ret_min_profitable_estimate = min_profitable_estimate;
5225 }
5226
5227 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
5228 vector elements (not bits) for a vector with NELT elements. */
5229 static void
5230 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
5231 vec_perm_builder *sel)
5232 {
5233 /* The encoding is a single stepped pattern. Any wrap-around is handled
5234 by vec_perm_indices. */
5235 sel->new_vector (nelt, 1, 3);
5236 for (unsigned int i = 0; i < 3; i++)
5237 sel->quick_push (i + offset);
5238 }
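/* For instance, OFFSET == 2 and NELT == 8 push {2, 3, 4}, which encodes
   the stepped series {2, 3, 4, 5, 6, 7, 8, 9}; interpreted as a
   two-input permute, indices 8 and 9 select from the second operand
   (typically a zero vector), which is how a vec_shr by two elements can
   be expressed.  */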
5239
5240 /* Checks whether the target supports whole-vector shifts for vectors of mode
5241 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
5242 it supports vec_perm_const with masks for all necessary shift amounts. */
5243 static bool
5244 have_whole_vector_shift (machine_mode mode)
5245 {
5246 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
5247 return true;
5248
5249 /* Variable-length vectors should be handled via the optab. */
5250 unsigned int nelt;
5251 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5252 return false;
5253
5254 vec_perm_builder sel;
5255 vec_perm_indices indices;
5256 for (unsigned int i = nelt / 2; i >= 1; i /= 2)
5257 {
5258 calc_vec_perm_mask_for_shift (i, nelt, &sel);
5259 indices.new_vector (sel, 2, nelt);
5260 if (!can_vec_perm_const_p (mode, mode, indices, false))
5261 return false;
5262 }
5263 return true;
5264 }
5265
5266 /* Return true if (a) STMT_INFO is a DOT_PROD_EXPR reduction whose
5267 multiplication operands have differing signs and (b) we intend
5268 to emulate the operation using a series of signed DOT_PROD_EXPRs.
5269 See vect_emulate_mixed_dot_prod for the actual sequence used. */
5270
5271 static bool
5272 vect_is_emulated_mixed_dot_prod (loop_vec_info loop_vinfo,
5273 stmt_vec_info stmt_info)
5274 {
5275 gassign *assign = dyn_cast<gassign *> (stmt_info->stmt);
5276 if (!assign || gimple_assign_rhs_code (assign) != DOT_PROD_EXPR)
5277 return false;
5278
5279 tree rhs1 = gimple_assign_rhs1 (assign);
5280 tree rhs2 = gimple_assign_rhs2 (assign);
5281 if (TYPE_SIGN (TREE_TYPE (rhs1)) == TYPE_SIGN (TREE_TYPE (rhs2)))
5282 return false;
5283
5284 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
5285 gcc_assert (reduc_info->is_reduc_info);
5286 return !directly_supported_p (DOT_PROD_EXPR,
5287 STMT_VINFO_REDUC_VECTYPE_IN (reduc_info),
5288 optab_vector_mixed_sign);
5289 }
5290
5291 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
5292 functions. Design better to avoid maintenance issues. */
5293
5294 /* Function vect_model_reduction_cost.
5295
5296 Models cost for a reduction operation, including the vector ops
5297 generated within the strip-mine loop in some cases, the initial
5298 definition before the loop, and the epilogue code that must be generated. */
5299
5300 static void
5301 vect_model_reduction_cost (loop_vec_info loop_vinfo,
5302 stmt_vec_info stmt_info, internal_fn reduc_fn,
5303 vect_reduction_type reduction_type,
5304 int ncopies, stmt_vector_for_cost *cost_vec)
5305 {
5306 int prologue_cost = 0, epilogue_cost = 0, inside_cost = 0;
5307 tree vectype;
5308 machine_mode mode;
5309 class loop *loop = NULL;
5310
5311 if (loop_vinfo)
5312 loop = LOOP_VINFO_LOOP (loop_vinfo);
5313
5314 /* Condition reductions generate two reductions in the loop. */
5315 if (reduction_type == COND_REDUCTION)
5316 ncopies *= 2;
5317
5318 vectype = STMT_VINFO_VECTYPE (stmt_info);
5319 mode = TYPE_MODE (vectype);
5320 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
5321
5322 gimple_match_op op;
5323 if (!gimple_extract_op (orig_stmt_info->stmt, &op))
5324 gcc_unreachable ();
5325
5326 bool emulated_mixed_dot_prod
5327 = vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info);
5328 if (reduction_type == EXTRACT_LAST_REDUCTION)
5329 /* No extra instructions are needed in the prologue. The loop body
5330 operations are costed in vectorizable_condition. */
5331 inside_cost = 0;
5332 else if (reduction_type == FOLD_LEFT_REDUCTION)
5333 {
5334 /* No extra instructions needed in the prologue. */
5335 prologue_cost = 0;
5336
5337 if (reduc_fn != IFN_LAST)
5338 /* Count one reduction-like operation per vector. */
5339 inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
5340 stmt_info, 0, vect_body);
5341 else
5342 {
5343 /* Use NELEMENTS extracts and NELEMENTS scalar ops. */
5344 unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
5345 inside_cost = record_stmt_cost (cost_vec, nelements,
5346 vec_to_scalar, stmt_info, 0,
5347 vect_body);
5348 inside_cost += record_stmt_cost (cost_vec, nelements,
5349 scalar_stmt, stmt_info, 0,
5350 vect_body);
5351 }
5352 }
5353 else
5354 {
5355 /* Add in the cost of the initial definitions. */
5356 int prologue_stmts;
5357 if (reduction_type == COND_REDUCTION)
5358 /* For cond reductions we have four vectors: initial index, step,
5359 initial result of the data reduction, initial value of the index
5360 reduction. */
5361 prologue_stmts = 4;
5362 else if (emulated_mixed_dot_prod)
5363 /* We need the initial reduction value and two invariants:
5364 one that contains the minimum signed value and one that
5365 contains half of its negative. */
5366 prologue_stmts = 3;
5367 else
5368 prologue_stmts = 1;
5369 prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
5370 scalar_to_vec, stmt_info, 0,
5371 vect_prologue);
5372 }
5373
5374 /* Determine cost of epilogue code.
5375
5376 We have a reduction operator that will reduce the vector in one statement.
5377 Also requires scalar extract. */
5378
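/* For instance (illustrative): outside a nested loop, with reduc_fn
   available and not a COND_REDUCTION, this amounts to one vector_stmt
   for the reduction plus one vec_to_scalar for the extract, both costed
   against vect_epilogue.  */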
5379 if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
5380 {
5381 if (reduc_fn != IFN_LAST)
5382 {
5383 if (reduction_type == COND_REDUCTION)
5384 {
5385 /* An EQ stmt and a COND_EXPR stmt. */
5386 epilogue_cost += record_stmt_cost (cost_vec, 2,
5387 vector_stmt, stmt_info, 0,
5388 vect_epilogue);
5389 /* Reduction of the max index and a reduction of the found
5390 values. */
5391 epilogue_cost += record_stmt_cost (cost_vec, 2,
5392 vec_to_scalar, stmt_info, 0,
5393 vect_epilogue);
5394 /* A broadcast of the max value. */
5395 epilogue_cost += record_stmt_cost (cost_vec, 1,
5396 scalar_to_vec, stmt_info, 0,
5397 vect_epilogue);
5398 }
5399 else
5400 {
5401 epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
5402 stmt_info, 0, vect_epilogue);
5403 epilogue_cost += record_stmt_cost (cost_vec, 1,
5404 vec_to_scalar, stmt_info, 0,
5405 vect_epilogue);
5406 }
5407 }
5408 else if (reduction_type == COND_REDUCTION)
5409 {
5410 unsigned estimated_nunits = vect_nunits_for_cost (vectype);
5411 /* Extraction of scalar elements. */
5412 epilogue_cost += record_stmt_cost (cost_vec,
5413 2 * estimated_nunits,
5414 vec_to_scalar, stmt_info, 0,
5415 vect_epilogue);
5416 /* Scalar max reductions via COND_EXPR / MAX_EXPR. */
5417 epilogue_cost += record_stmt_cost (cost_vec,
5418 2 * estimated_nunits - 3,
5419 scalar_stmt, stmt_info, 0,
5420 vect_epilogue);
5421 }
5422 else if (reduction_type == EXTRACT_LAST_REDUCTION
5423 || reduction_type == FOLD_LEFT_REDUCTION)
5424 /* No extra instructions needed in the epilogue. */
5425 ;
5426 else
5427 {
5428 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
5429 tree bitsize = TYPE_SIZE (op.type);
5430 int element_bitsize = tree_to_uhwi (bitsize);
5431 int nelements = vec_size_in_bits / element_bitsize;
5432
5433 if (op.code == COND_EXPR)
5434 op.code = MAX_EXPR;
5435
5436 /* We have a whole vector shift available. */
5437 if (VECTOR_MODE_P (mode)
5438 && directly_supported_p (op.code, vectype)
5439 && have_whole_vector_shift (mode))
5440 {
5441 /* Final reduction via vector shifts and the reduction operator.
5442 Also requires scalar extract. */
5443 epilogue_cost += record_stmt_cost (cost_vec,
5444 exact_log2 (nelements) * 2,
5445 vector_stmt, stmt_info, 0,
5446 vect_epilogue);
5447 epilogue_cost += record_stmt_cost (cost_vec, 1,
5448 vec_to_scalar, stmt_info, 0,
5449 vect_epilogue);
5450 }
5451 else
5452 /* Use extracts and reduction op for final reduction. For N
5453 elements, we have N extracts and N-1 reduction ops. */
5454 epilogue_cost += record_stmt_cost (cost_vec,
5455 nelements + nelements - 1,
5456 vector_stmt, stmt_info, 0,
5457 vect_epilogue);
5458 }
5459 }
5460
5461 if (dump_enabled_p ())
5462 dump_printf (MSG_NOTE,
5463 "vect_model_reduction_cost: inside_cost = %d, "
5464 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
5465 prologue_cost, epilogue_cost);
5466 }
5467
5468 /* SEQ is a sequence of instructions that initialize the reduction
5469 described by REDUC_INFO. Emit them in the appropriate place. */
5470
5471 static void
5472 vect_emit_reduction_init_stmts (loop_vec_info loop_vinfo,
5473 stmt_vec_info reduc_info, gimple *seq)
5474 {
5475 if (reduc_info->reused_accumulator)
5476 {
5477 /* When reusing an accumulator from the main loop, we only need
5478 initialization instructions if the main loop can be skipped.
5479 In that case, emit the initialization instructions at the end
5480 of the guard block that does the skip. */
5481 edge skip_edge = loop_vinfo->skip_main_loop_edge;
5482 gcc_assert (skip_edge);
5483 gimple_stmt_iterator gsi = gsi_last_bb (skip_edge->src);
5484 gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
5485 }
5486 else
5487 {
5488 /* The normal case: emit the initialization instructions on the
5489 preheader edge. */
5490 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5491 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), seq);
5492 }
5493 }
5494
5495 /* Function get_initial_def_for_reduction
5496
5497 Input:
5498 REDUC_INFO - the info_for_reduction
5499 INIT_VAL - the initial value of the reduction variable
5500 NEUTRAL_OP - a value that has no effect on the reduction, as per
5501 neutral_op_for_reduction
5502
5503 Output:
5504 Return a vector variable, initialized according to the operation that
5505 STMT_VINFO performs. This vector will be used as the initial value
5506 of the vector of partial results.
5507
5508 The value we need is a vector in which element 0 has value INIT_VAL
5509 and every other element has value NEUTRAL_OP. */
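/* For example (illustrative): a PLUS reduction with INIT_VAL s0 and
   NEUTRAL_OP 0 on a four-element vector yields { s0, 0, 0, 0 }; for a
   MIN or MAX reduction the neutral value passed in is normally INIT_VAL
   itself, which is the splat case handled by the operand_equal_p test
   below.  */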
5510
5511 static tree
5512 get_initial_def_for_reduction (loop_vec_info loop_vinfo,
5513 stmt_vec_info reduc_info,
5514 tree init_val, tree neutral_op)
5515 {
5516 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5517 tree scalar_type = TREE_TYPE (init_val);
5518 tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
5519 tree init_def;
5520 gimple_seq stmts = NULL;
5521
5522 gcc_assert (vectype);
5523
5524 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
5525 || SCALAR_FLOAT_TYPE_P (scalar_type));
5526
5527 gcc_assert (nested_in_vect_loop_p (loop, reduc_info)
5528 || loop == (gimple_bb (reduc_info->stmt))->loop_father);
5529
5530 if (operand_equal_p (init_val, neutral_op))
5531 {
5532 /* If both elements are equal then the vector described above is
5533 just a splat. */
5534 neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op);
5535 init_def = gimple_build_vector_from_val (&stmts, vectype, neutral_op);
5536 }
5537 else
5538 {
5539 neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op);
5540 init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
5541 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
5542 {
5543 /* Construct a splat of NEUTRAL_OP and insert INIT_VAL into
5544 element 0. */
5545 init_def = gimple_build_vector_from_val (&stmts, vectype,
5546 neutral_op);
5547 init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
5548 vectype, init_def, init_val);
5549 }
5550 else
5551 {
5552 /* Build {INIT_VAL, NEUTRAL_OP, NEUTRAL_OP, ...}. */
5553 tree_vector_builder elts (vectype, 1, 2);
5554 elts.quick_push (init_val);
5555 elts.quick_push (neutral_op);
5556 init_def = gimple_build_vector (&stmts, &elts);
5557 }
5558 }
5559
5560 if (stmts)
5561 vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, stmts);
5562 return init_def;
5563 }
5564
5565 /* Get at the initial defs for the reduction PHIs for REDUC_INFO,
5566 which performs a reduction involving GROUP_SIZE scalar statements.
5567 NUMBER_OF_VECTORS is the number of vector defs to create. If NEUTRAL_OP
5568 is nonnull, introducing extra elements of that value will not change the
5569 result. */
5570
5571 static void
5572 get_initial_defs_for_reduction (loop_vec_info loop_vinfo,
5573 stmt_vec_info reduc_info,
5574 vec<tree> *vec_oprnds,
5575 unsigned int number_of_vectors,
5576 unsigned int group_size, tree neutral_op)
5577 {
5578 vec<tree> &initial_values = reduc_info->reduc_initial_values;
5579 unsigned HOST_WIDE_INT nunits;
5580 unsigned j, number_of_places_left_in_vector;
5581 tree vector_type = STMT_VINFO_VECTYPE (reduc_info);
5582 unsigned int i;
5583
5584 gcc_assert (group_size == initial_values.length () || neutral_op);
5585
5586 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
5587 created vectors. It is greater than 1 if unrolling is performed.
5588
5589 For example, we have two scalar operands, s1 and s2 (e.g., group of
5590 strided accesses of size two), while NUNITS is four (i.e., four scalars
5591 of this type can be packed in a vector). The output vector will contain
5592 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
5593 will be 2).
5594
5595 If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
5596 vectors containing the operands.
5597
5598 For example, NUNITS is four as before, and the group size is 8
5599 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
5600 {s5, s6, s7, s8}. */
5601
5602 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
5603 nunits = group_size;
5604
5605 number_of_places_left_in_vector = nunits;
5606 bool constant_p = true;
5607 tree_vector_builder elts (vector_type, nunits, 1);
5608 elts.quick_grow (nunits);
5609 gimple_seq ctor_seq = NULL;
5610 for (j = 0; j < nunits * number_of_vectors; ++j)
5611 {
5612 tree op;
5613 i = j % group_size;
5614
5615 /* Get the def before the loop. In a reduction chain we have only
5616 one initial value, otherwise as many as there are PHIs in the group. */
5617 if (i >= initial_values.length () || (j > i && neutral_op))
5618 op = neutral_op;
5619 else
5620 op = initial_values[i];
5621
5622 /* Create 'vect_ = {op0,op1,...,opn}'. */
5623 number_of_places_left_in_vector--;
5624 elts[nunits - number_of_places_left_in_vector - 1] = op;
5625 if (!CONSTANT_CLASS_P (op))
5626 constant_p = false;
5627
5628 if (number_of_places_left_in_vector == 0)
5629 {
5630 tree init;
5631 if (constant_p && !neutral_op
5632 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
5633 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
5634 /* Build the vector directly from ELTS. */
5635 init = gimple_build_vector (&ctor_seq, &elts);
5636 else if (neutral_op)
5637 {
5638 /* Build a vector of the neutral value and shift the
5639 other elements into place. */
5640 init = gimple_build_vector_from_val (&ctor_seq, vector_type,
5641 neutral_op);
5642 int k = nunits;
5643 while (k > 0 && elts[k - 1] == neutral_op)
5644 k -= 1;
5645 while (k > 0)
5646 {
5647 k -= 1;
5648 init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
5649 vector_type, init, elts[k]);
5650 }
5651 }
5652 else
5653 {
5654 /* First time round, duplicate ELTS to fill the
5655 required number of vectors. */
5656 duplicate_and_interleave (loop_vinfo, &ctor_seq, vector_type,
5657 elts, number_of_vectors, *vec_oprnds);
5658 break;
5659 }
5660 vec_oprnds->quick_push (init);
5661
5662 number_of_places_left_in_vector = nunits;
5663 elts.new_vector (vector_type, nunits, 1);
5664 elts.quick_grow (nunits);
5665 constant_p = true;
5666 }
5667 }
5668 if (ctor_seq != NULL)
5669 vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, ctor_seq);
5670 }
5671
5672 /* For a statement STMT_INFO taking part in a reduction operation return
5673 the stmt_vec_info the meta information is stored on. */
5674
5675 stmt_vec_info
5676 info_for_reduction (vec_info *vinfo, stmt_vec_info stmt_info)
5677 {
5678 stmt_info = vect_orig_stmt (stmt_info);
5679 gcc_assert (STMT_VINFO_REDUC_DEF (stmt_info));
5680 if (!is_a <gphi *> (stmt_info->stmt)
5681 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
5682 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
5683 gphi *phi = as_a <gphi *> (stmt_info->stmt);
5684 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
5685 {
5686 if (gimple_phi_num_args (phi) == 1)
5687 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
5688 }
5689 else if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
5690 {
5691 stmt_vec_info info = vinfo->lookup_def (vect_phi_initial_value (phi));
5692 if (info && STMT_VINFO_DEF_TYPE (info) == vect_double_reduction_def)
5693 stmt_info = info;
5694 }
5695 return stmt_info;
5696 }
5697
5698 /* See if LOOP_VINFO is an epilogue loop whose main loop had a reduction that
5699 REDUC_INFO can build on. Adjust REDUC_INFO and return true if so, otherwise
5700 return false. */
5701
5702 static bool
5703 vect_find_reusable_accumulator (loop_vec_info loop_vinfo,
5704 stmt_vec_info reduc_info)
5705 {
5706 loop_vec_info main_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
5707 if (!main_loop_vinfo)
5708 return false;
5709
5710 if (STMT_VINFO_REDUC_TYPE (reduc_info) != TREE_CODE_REDUCTION)
5711 return false;
5712
5713 unsigned int num_phis = reduc_info->reduc_initial_values.length ();
5714 auto_vec<tree, 16> main_loop_results (num_phis);
5715 auto_vec<tree, 16> initial_values (num_phis);
5716 if (edge main_loop_edge = loop_vinfo->main_loop_edge)
5717 {
5718 /* The epilogue loop can be entered either from the main loop or
5719 from an earlier guard block. */
5720 edge skip_edge = loop_vinfo->skip_main_loop_edge;
5721 for (tree incoming_value : reduc_info->reduc_initial_values)
5722 {
5723 /* Look for:
5724
5725 INCOMING_VALUE = phi<MAIN_LOOP_RESULT(main loop),
5726 INITIAL_VALUE(guard block)>. */
5727 gcc_assert (TREE_CODE (incoming_value) == SSA_NAME);
5728
5729 gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (incoming_value));
5730 gcc_assert (gimple_bb (phi) == main_loop_edge->dest);
5731
5732 tree from_main_loop = PHI_ARG_DEF_FROM_EDGE (phi, main_loop_edge);
5733 tree from_skip = PHI_ARG_DEF_FROM_EDGE (phi, skip_edge);
5734
5735 main_loop_results.quick_push (from_main_loop);
5736 initial_values.quick_push (from_skip);
5737 }
5738 }
5739 else
5740 /* The main loop dominates the epilogue loop. */
5741 main_loop_results.splice (reduc_info->reduc_initial_values);
5742
5743 /* See if the main loop has the kind of accumulator we need. */
5744 vect_reusable_accumulator *accumulator
5745 = main_loop_vinfo->reusable_accumulators.get (main_loop_results[0]);
5746 if (!accumulator
5747 || num_phis != accumulator->reduc_info->reduc_scalar_results.length ()
5748 || !std::equal (main_loop_results.begin (), main_loop_results.end (),
5749 accumulator->reduc_info->reduc_scalar_results.begin ()))
5750 return false;
5751
5752 /* Handle the case where we can reduce wider vectors to narrower ones. */
5753 tree vectype = STMT_VINFO_VECTYPE (reduc_info);
5754 tree old_vectype = TREE_TYPE (accumulator->reduc_input);
5755 unsigned HOST_WIDE_INT m;
5756 if (!constant_multiple_p (TYPE_VECTOR_SUBPARTS (old_vectype),
5757 TYPE_VECTOR_SUBPARTS (vectype), &m))
5758 return false;
5759 /* Check the intermediate vector types and operations are available. */
5760 tree prev_vectype = old_vectype;
5761 poly_uint64 intermediate_nunits = TYPE_VECTOR_SUBPARTS (old_vectype);
5762 while (known_gt (intermediate_nunits, TYPE_VECTOR_SUBPARTS (vectype)))
5763 {
5764 intermediate_nunits = exact_div (intermediate_nunits, 2);
5765 tree intermediate_vectype = get_related_vectype_for_scalar_type
5766 (TYPE_MODE (vectype), TREE_TYPE (vectype), intermediate_nunits);
5767 if (!intermediate_vectype
5768 || !directly_supported_p (STMT_VINFO_REDUC_CODE (reduc_info),
5769 intermediate_vectype)
5770 || !can_vec_extract (TYPE_MODE (prev_vectype),
5771 TYPE_MODE (intermediate_vectype)))
5772 return false;
5773 prev_vectype = intermediate_vectype;
5774 }
5775
5776 /* Non-SLP reductions might apply an adjustment after the reduction
5777 operation, in order to simplify the initialization of the accumulator.
5778 If the epilogue loop carries on from where the main loop left off,
5779 it should apply the same adjustment to the final reduction result.
5780
5781 If the epilogue loop can also be entered directly (rather than via
5782 the main loop), we need to be able to handle that case in the same way,
5783 with the same adjustment. (In principle we could add a PHI node
5784 to select the correct adjustment, but in practice that shouldn't be
5785 necessary.) */
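/* As a hedged illustration: if the main loop vectorized a sum whose
   scalar initial value was 10 by starting from an all-zero accumulator
   and recording 10 as the epilogue adjustment, an epilogue loop that can
   be entered directly must likewise start from the neutral value 0 and
   apply the same adjustment, which is what the code below arranges when
   the incoming initial value matches the recorded adjustment.  */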
5786 tree main_adjustment
5787 = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (accumulator->reduc_info);
5788 if (loop_vinfo->main_loop_edge && main_adjustment)
5789 {
5790 gcc_assert (num_phis == 1);
5791 tree initial_value = initial_values[0];
5792 /* Check that we can use INITIAL_VALUE as the adjustment and
5793 initialize the accumulator with a neutral value instead. */
5794 if (!operand_equal_p (initial_value, main_adjustment))
5795 return false;
5796 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
5797 initial_values[0] = neutral_op_for_reduction (TREE_TYPE (initial_value),
5798 code, initial_value);
5799 }
5800 STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info) = main_adjustment;
5801 reduc_info->reduc_initial_values.truncate (0);
5802 reduc_info->reduc_initial_values.splice (initial_values);
5803 reduc_info->reused_accumulator = accumulator;
5804 return true;
5805 }
5806
5807 /* Reduce the vector VEC_DEF down to VECTYPE with reduction operation
5808 CODE, emitting the stmts to SEQ. Returns a vector def of VECTYPE. */
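/* For example (sketch only): reducing a V8SI def to V4SI with a PLUS
   code takes one halving step that extracts the low and high V4SI
   halves (via vec_extract, or by punning through an integer-element
   vector) and adds them; a larger ratio simply repeats the halving.  */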
5809
5810 static tree
5811 vect_create_partial_epilog (tree vec_def, tree vectype, code_helper code,
5812 gimple_seq *seq)
5813 {
5814 unsigned nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (vec_def)).to_constant ();
5815 unsigned nunits1 = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5816 tree stype = TREE_TYPE (vectype);
5817 tree new_temp = vec_def;
5818 while (nunits > nunits1)
5819 {
5820 nunits /= 2;
5821 tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5822 stype, nunits);
5823 unsigned int bitsize = tree_to_uhwi (TYPE_SIZE (vectype1));
5824
5825 /* The target has to make sure we support lowpart/highpart
5826 extraction, either via direct vector extract or through
5827 integer mode punning. */
5828 tree dst1, dst2;
5829 gimple *epilog_stmt;
5830 if (convert_optab_handler (vec_extract_optab,
5831 TYPE_MODE (TREE_TYPE (new_temp)),
5832 TYPE_MODE (vectype1))
5833 != CODE_FOR_nothing)
5834 {
5835 /* Extract sub-vectors directly once vec_extract becomes
5836 a conversion optab. */
5837 dst1 = make_ssa_name (vectype1);
5838 epilog_stmt
5839 = gimple_build_assign (dst1, BIT_FIELD_REF,
5840 build3 (BIT_FIELD_REF, vectype1,
5841 new_temp, TYPE_SIZE (vectype1),
5842 bitsize_int (0)));
5843 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5844 dst2 = make_ssa_name (vectype1);
5845 epilog_stmt
5846 = gimple_build_assign (dst2, BIT_FIELD_REF,
5847 build3 (BIT_FIELD_REF, vectype1,
5848 new_temp, TYPE_SIZE (vectype1),
5849 bitsize_int (bitsize)));
5850 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5851 }
5852 else
5853 {
5854 /* Extract via punning to appropriately sized integer mode
5855 vector. */
5856 tree eltype = build_nonstandard_integer_type (bitsize, 1);
5857 tree etype = build_vector_type (eltype, 2);
5858 gcc_assert (convert_optab_handler (vec_extract_optab,
5859 TYPE_MODE (etype),
5860 TYPE_MODE (eltype))
5861 != CODE_FOR_nothing);
5862 tree tem = make_ssa_name (etype);
5863 epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5864 build1 (VIEW_CONVERT_EXPR,
5865 etype, new_temp));
5866 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5867 new_temp = tem;
5868 tem = make_ssa_name (eltype);
5869 epilog_stmt
5870 = gimple_build_assign (tem, BIT_FIELD_REF,
5871 build3 (BIT_FIELD_REF, eltype,
5872 new_temp, TYPE_SIZE (eltype),
5873 bitsize_int (0)));
5874 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5875 dst1 = make_ssa_name (vectype1);
5876 epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5877 build1 (VIEW_CONVERT_EXPR,
5878 vectype1, tem));
5879 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5880 tem = make_ssa_name (eltype);
5881 epilog_stmt
5882 = gimple_build_assign (tem, BIT_FIELD_REF,
5883 build3 (BIT_FIELD_REF, eltype,
5884 new_temp, TYPE_SIZE (eltype),
5885 bitsize_int (bitsize)));
5886 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5887 dst2 = make_ssa_name (vectype1);
5888 epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5889 build1 (VIEW_CONVERT_EXPR,
5890 vectype1, tem));
5891 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5892 }
5893
5894 new_temp = gimple_build (seq, code, vectype1, dst1, dst2);
5895 }
5896
5897 return new_temp;
5898 }
5899
5900 /* Function vect_create_epilog_for_reduction
5901
5902 Create code at the loop-epilog to finalize the result of a reduction
5903 computation.
5904
5905 STMT_INFO is the scalar reduction stmt that is being vectorized.
5906 SLP_NODE is an SLP node containing a group of reduction statements. The
5907 first one in this group is STMT_INFO.
5908 SLP_NODE_INSTANCE is the SLP node instance containing SLP_NODE
5909 REDUC_INDEX says which rhs operand of the STMT_INFO is the reduction phi
5910 (counting from 0)
5911 LOOP_EXIT is the edge to update in the merge block. In the case of a single
5912 exit this edge is always the main loop exit.
5913
5914 This function:
5915 1. Completes the reduction def-use cycles.
5916 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
5917 by calling the function specified by REDUC_FN if available, or by
5918 other means (whole-vector shifts or a scalar loop).
5919 The function also creates a new phi node at the loop exit to preserve
5920 loop-closed form, as illustrated below.
5921
5922 The flow at the entry to this function:
5923
5924 loop:
5925 vec_def = phi <vec_init, null> # REDUCTION_PHI
5926 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
5927 s_loop = scalar_stmt # (scalar) STMT_INFO
5928 loop_exit:
5929 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5930 use <s_out0>
5931 use <s_out0>
5932
5933 The above is transformed by this function into:
5934
5935 loop:
5936 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
5937 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
5938 s_loop = scalar_stmt # (scalar) STMT_INFO
5939 loop_exit:
5940 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5941 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5942 v_out2 = reduce <v_out1>
5943 s_out3 = extract_field <v_out2, 0>
5944 s_out4 = adjust_result <s_out3>
5945 use <s_out4>
5946 use <s_out4>
5947 */
5948
5949 static void
5950 vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
5951 stmt_vec_info stmt_info,
5952 slp_tree slp_node,
5953 slp_instance slp_node_instance,
5954 edge loop_exit)
5955 {
5956 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
5957 gcc_assert (reduc_info->is_reduc_info);
5958 /* For double reductions we need to get at the inner loop reduction
5959 stmt which has the meta info attached. Our stmt_info is that of the
5960 loop-closed PHI of the inner loop which we remember as
5961 def for the reduction PHI generation. */
5962 bool double_reduc = false;
5963 stmt_vec_info rdef_info = stmt_info;
5964 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
5965 {
5966 gcc_assert (!slp_node);
5967 double_reduc = true;
5968 stmt_info = loop_vinfo->lookup_def (gimple_phi_arg_def
5969 (stmt_info->stmt, 0));
5970 stmt_info = vect_stmt_to_vectorize (stmt_info);
5971 }
5972 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
5973 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
5974 tree vectype;
5975 machine_mode mode;
5976 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
5977 basic_block exit_bb;
5978 tree scalar_dest;
5979 tree scalar_type;
5980 gimple *new_phi = NULL, *phi = NULL;
5981 gimple_stmt_iterator exit_gsi;
5982 tree new_temp = NULL_TREE, new_name, new_scalar_dest;
5983 gimple *epilog_stmt = NULL;
5984 gimple *exit_phi;
5985 tree bitsize;
5986 tree def;
5987 tree orig_name, scalar_result;
5988 imm_use_iterator imm_iter, phi_imm_iter;
5989 use_operand_p use_p, phi_use_p;
5990 gimple *use_stmt;
5991 auto_vec<tree> reduc_inputs;
5992 int j, i;
5993 vec<tree> &scalar_results = reduc_info->reduc_scalar_results;
5994 unsigned int group_size = 1, k;
5995 /* SLP reduction without reduction chain, e.g.,
5996 # a1 = phi <a2, a0>
5997 # b1 = phi <b2, b0>
5998 a2 = operation (a1)
5999 b2 = operation (b1) */
6000 bool slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info));
6001 bool direct_slp_reduc;
6002 tree induction_index = NULL_TREE;
6003
6004 if (slp_node)
6005 group_size = SLP_TREE_LANES (slp_node);
6006
6007 if (nested_in_vect_loop_p (loop, stmt_info))
6008 {
6009 outer_loop = loop;
6010 loop = loop->inner;
6011 gcc_assert (!slp_node && double_reduc);
6012 }
6013
6014 vectype = STMT_VINFO_REDUC_VECTYPE (reduc_info);
6015 gcc_assert (vectype);
6016 mode = TYPE_MODE (vectype);
6017
6018 tree induc_val = NULL_TREE;
6019 tree adjustment_def = NULL;
6020 if (slp_node)
6021 ;
6022 else
6023 {
6024 /* Optimize: for induction condition reduction, if we can't use zero
6025 for induc_val, use initial_def. */
6026 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
6027 induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
6028 else if (double_reduc)
6029 ;
6030 else
6031 adjustment_def = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info);
6032 }
6033
6034 stmt_vec_info single_live_out_stmt[] = { stmt_info };
6035 array_slice<const stmt_vec_info> live_out_stmts = single_live_out_stmt;
6036 if (slp_reduc)
6037 /* All statements produce live-out values. */
6038 live_out_stmts = SLP_TREE_SCALAR_STMTS (slp_node);
6039
6040 unsigned vec_num;
6041 int ncopies;
6042 if (slp_node)
6043 {
6044 vec_num = SLP_TREE_VEC_DEFS (slp_node_instance->reduc_phis).length ();
6045 ncopies = 1;
6046 }
6047 else
6048 {
6049 vec_num = 1;
6050 ncopies = STMT_VINFO_VEC_STMTS (reduc_info).length ();
6051 }
6052
6053 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
6054 which is updated with the current index of the loop for every match of
6055 the original loop's cond_expr (VEC_STMT). This results in a vector
6056 containing the last time the condition passed for that vector lane.
6057 The first match will be a 1 to allow 0 to be used for non-matching
6058 indexes. If there are no matches at all then the vector will be all
6059 zeroes.
6060
6061 PR92772: This algorithm is broken for architectures that support
6062 masked vectors, but do not provide fold_extract_last. */
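/* Illustrative example for a four-lane vector: the IV starts at
   { 1, 2, 3, 4 } and steps by 4.  If the condition matches in lanes 1
   and 3 of the first iteration, the index vector becomes { 0, 2, 0, 4 };
   a match only in lane 2 of the second iteration (IV { 5, 6, 7, 8 })
   updates it to { 0, 2, 7, 4 }, i.e. each lane records the index of its
   last match, or 0 if it never matched.  */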
6063 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
6064 {
6065 auto_vec<std::pair<tree, bool>, 2> ccompares;
6066 stmt_vec_info cond_info = STMT_VINFO_REDUC_DEF (reduc_info);
6067 cond_info = vect_stmt_to_vectorize (cond_info);
6068 while (cond_info != reduc_info)
6069 {
6070 if (gimple_assign_rhs_code (cond_info->stmt) == COND_EXPR)
6071 {
6072 gimple *vec_stmt = STMT_VINFO_VEC_STMTS (cond_info)[0];
6073 gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
6074 ccompares.safe_push
6075 (std::make_pair (unshare_expr (gimple_assign_rhs1 (vec_stmt)),
6076 STMT_VINFO_REDUC_IDX (cond_info) == 2));
6077 }
6078 cond_info
6079 = loop_vinfo->lookup_def (gimple_op (cond_info->stmt,
6080 1 + STMT_VINFO_REDUC_IDX
6081 (cond_info)));
6082 cond_info = vect_stmt_to_vectorize (cond_info);
6083 }
6084 gcc_assert (ccompares.length () != 0);
6085
6086 tree indx_before_incr, indx_after_incr;
6087 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
6088 int scalar_precision
6089 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
6090 tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
6091 tree cr_index_vector_type = get_related_vectype_for_scalar_type
6092 (TYPE_MODE (vectype), cr_index_scalar_type,
6093 TYPE_VECTOR_SUBPARTS (vectype));
6094
6095 /* First we create a simple vector induction variable which starts
6096 with the values {1,2,3,...} (SERIES_VECT) and increments by the
6097 vector size (STEP). */
6098
6099 /* Create a {1,2,3,...} vector. */
6100 tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
6101
6102 /* Create a vector of the step value. */
6103 tree step = build_int_cst (cr_index_scalar_type, nunits_out);
6104 tree vec_step = build_vector_from_val (cr_index_vector_type, step);
6105
6106 /* Create an induction variable. */
6107 gimple_stmt_iterator incr_gsi;
6108 bool insert_after;
6109 vect_iv_increment_position (loop_exit, &incr_gsi, &insert_after);
6110 create_iv (series_vect, PLUS_EXPR, vec_step, NULL_TREE, loop, &incr_gsi,
6111 insert_after, &indx_before_incr, &indx_after_incr);
6112
6113 /* Next create a new phi node vector (NEW_PHI_TREE) which starts
6114 filled with zeros (VEC_ZERO). */
6115
6116 /* Create a vector of 0s. */
6117 tree zero = build_zero_cst (cr_index_scalar_type);
6118 tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
6119
6120 /* Create a vector phi node. */
6121 tree new_phi_tree = make_ssa_name (cr_index_vector_type);
6122 new_phi = create_phi_node (new_phi_tree, loop->header);
6123 add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
6124 loop_preheader_edge (loop), UNKNOWN_LOCATION);
6125
6126 /* Now take the condition from the loop's original cond_exprs
6127 and produce a new cond_exprs (INDEX_COND_EXPR) which for
6128 every match uses values from the induction variable
6129 (INDEX_BEFORE_INCR) otherwise uses values from the phi node
6130 (NEW_PHI_TREE).
6131 Finally, we update the phi (NEW_PHI_TREE) to take the value of
6132 the new cond_expr (INDEX_COND_EXPR). */
6133 gimple_seq stmts = NULL;
6134 for (int i = ccompares.length () - 1; i != -1; --i)
6135 {
6136 tree ccompare = ccompares[i].first;
6137 if (ccompares[i].second)
6138 new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
6139 cr_index_vector_type,
6140 ccompare,
6141 indx_before_incr, new_phi_tree);
6142 else
6143 new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
6144 cr_index_vector_type,
6145 ccompare,
6146 new_phi_tree, indx_before_incr);
6147 }
6148 gsi_insert_seq_before (&incr_gsi, stmts, GSI_SAME_STMT);
6149
6150 /* Update the phi with the vec cond. */
6151 induction_index = new_phi_tree;
6152 add_phi_arg (as_a <gphi *> (new_phi), induction_index,
6153 loop_latch_edge (loop), UNKNOWN_LOCATION);
6154 }
6155
6156 /* 2. Create epilog code.
6157 The reduction epilog code operates across the elements of the vector
6158 of partial results computed by the vectorized loop.
6159 The reduction epilog code consists of:
6160
6161 step 1: compute the scalar result in a vector (v_out2)
6162 step 2: extract the scalar result (s_out3) from the vector (v_out2)
6163 step 3: adjust the scalar result (s_out3) if needed.
6164
6165 Step 1 can be accomplished using one of the following three schemes:
6166 (scheme 1) using reduc_fn, if available.
6167 (scheme 2) using whole-vector shifts, if available.
6168 (scheme 3) using a scalar loop. In this case steps 1+2 above are
6169 combined.
6170
6171 The overall epilog code looks like this:
6172
6173 s_out0 = phi <s_loop> # original EXIT_PHI
6174 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
6175 v_out2 = reduce <v_out1> # step 1
6176 s_out3 = extract_field <v_out2, 0> # step 2
6177 s_out4 = adjust_result <s_out3> # step 3
6178
6179 (step 3 is optional, and steps 1 and 2 may be combined).
6180 Lastly, the uses of s_out0 are replaced by s_out4. */
6181
6182
6183 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
6184 v_out1 = phi <VECT_DEF>
6185 Store the converted defs in REDUC_INPUTS. */
6186 if (double_reduc)
6187 loop = outer_loop;
6188 /* We need to reduce values in all exits. */
6189 exit_bb = loop_exit->dest;
6190 exit_gsi = gsi_after_labels (exit_bb);
6191 reduc_inputs.create (slp_node ? vec_num : ncopies);
6192 for (unsigned i = 0; i < vec_num; i++)
6193 {
6194 gimple_seq stmts = NULL;
6195 if (slp_node)
6196 def = vect_get_slp_vect_def (slp_node, i);
6197 else
6198 def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[0]);
6199 for (j = 0; j < ncopies; j++)
6200 {
6201 tree new_def = copy_ssa_name (def);
6202 phi = create_phi_node (new_def, exit_bb);
6203 if (j)
6204 def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[j]);
6205 if (LOOP_VINFO_IV_EXIT (loop_vinfo) == loop_exit)
6206 SET_PHI_ARG_DEF (phi, loop_exit->dest_idx, def);
6207 else
6208 {
6209 for (unsigned k = 0; k < gimple_phi_num_args (phi); k++)
6210 SET_PHI_ARG_DEF (phi, k, def);
6211 }
6212 new_def = gimple_convert (&stmts, vectype, new_def);
6213 reduc_inputs.quick_push (new_def);
6214 }
6215 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6216 }
6217
6218 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
6219 (i.e. when reduc_fn is not available) and in the final adjustment
6220 code (if needed). Also get the original scalar reduction variable as
6221 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
6222 represents a reduction pattern), the tree-code and scalar-def are
6223 taken from the original stmt that the pattern-stmt (STMT) replaces.
6224 Otherwise (it is a regular reduction) - the tree-code and scalar-def
6225 are taken from STMT. */
6226
6227 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
6228 if (orig_stmt_info != stmt_info)
6229 {
6230 /* Reduction pattern */
6231 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6232 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
6233 }
6234
6235 scalar_dest = gimple_get_lhs (orig_stmt_info->stmt);
6236 scalar_type = TREE_TYPE (scalar_dest);
6237 scalar_results.truncate (0);
6238 scalar_results.reserve_exact (group_size);
6239 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
6240 bitsize = TYPE_SIZE (scalar_type);
6241
6242 /* True if we should implement SLP_REDUC using native reduction operations
6243 instead of scalar operations. */
6244 direct_slp_reduc = (reduc_fn != IFN_LAST
6245 && slp_reduc
6246 && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
6247
6248 /* In case of reduction chain, e.g.,
6249 # a1 = phi <a3, a0>
6250 a2 = operation (a1)
6251 a3 = operation (a2),
6252
6253 we may end up with more than one vector result. Here we reduce them
6254 to one vector.
6255
6256 The same is true for a SLP reduction, e.g.,
6257 # a1 = phi <a2, a0>
6258 # b1 = phi <b2, b0>
6259 a2 = operation (a1)
6260 b2 = operation (b1),
6261
6262 where we can end up with more than one vector as well. We can
6263 easily accumulate vectors when the number of vector elements is
6264 a multiple of the SLP group size.
6265
6266 The same is true if we couldn't use a single def-use cycle.
6267 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info)
6268 || direct_slp_reduc
6269 || (slp_reduc
6270 && constant_multiple_p (TYPE_VECTOR_SUBPARTS (vectype), group_size))
6271 || ncopies > 1)
6272 {
6273 gimple_seq stmts = NULL;
6274 tree single_input = reduc_inputs[0];
6275 for (k = 1; k < reduc_inputs.length (); k++)
6276 single_input = gimple_build (&stmts, code, vectype,
6277 single_input, reduc_inputs[k]);
6278 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6279
6280 reduc_inputs.truncate (0);
6281 reduc_inputs.safe_push (single_input);
6282 }
6283
6284 tree orig_reduc_input = reduc_inputs[0];
6285
6286 /* If this loop is an epilogue loop that can be skipped after the
6287 main loop, we can only share a reduction operation between the
6288 main loop and the epilogue if we put it at the target of the
6289 skip edge.
6290
6291 We can still reuse accumulators if this check fails. Doing so has
6292 the minor(?) benefit of making the epilogue loop's scalar result
6293 independent of the main loop's scalar result. */
6294 bool unify_with_main_loop_p = false;
6295 if (reduc_info->reused_accumulator
6296 && loop_vinfo->skip_this_loop_edge
6297 && single_succ_p (exit_bb)
6298 && single_succ (exit_bb) == loop_vinfo->skip_this_loop_edge->dest)
6299 {
6300 unify_with_main_loop_p = true;
6301
6302 basic_block reduc_block = loop_vinfo->skip_this_loop_edge->dest;
6303 reduc_inputs[0] = make_ssa_name (vectype);
6304 gphi *new_phi = create_phi_node (reduc_inputs[0], reduc_block);
6305 add_phi_arg (new_phi, orig_reduc_input, single_succ_edge (exit_bb),
6306 UNKNOWN_LOCATION);
6307 add_phi_arg (new_phi, reduc_info->reused_accumulator->reduc_input,
6308 loop_vinfo->skip_this_loop_edge, UNKNOWN_LOCATION);
6309 exit_gsi = gsi_after_labels (reduc_block);
6310 }
6311
6312 /* Shouldn't be used beyond this point. */
6313 exit_bb = nullptr;
6314
6315 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
6316 && reduc_fn != IFN_LAST)
6317 {
6318 /* For condition reductions, we have a vector (REDUC_INPUTS 0) containing
6319 various data values where the condition matched and another vector
6320 (INDUCTION_INDEX) containing all the indexes of those matches. We
6321 need to extract the last matching index (which will be the index with
6322 highest value) and use this to index into the data vector.
6323 For the case where there were no matches, the data vector will contain
6324 all default values and the index vector will be all zeros. */
6325
6326 /* Get various versions of the type of the vector of indexes. */
6327 tree index_vec_type = TREE_TYPE (induction_index);
6328 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
6329 tree index_scalar_type = TREE_TYPE (index_vec_type);
6330 tree index_vec_cmp_type = truth_type_for (index_vec_type);
6331
6332 /* Get an unsigned integer version of the type of the data vector. */
6333 int scalar_precision
6334 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
6335 tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
6336 tree vectype_unsigned = get_same_sized_vectype (scalar_type_unsigned,
6337 vectype);
6338
6339 /* First we need to create a vector (ZERO_VEC) of zeros and another
6340 vector (MAX_INDEX_VEC) filled with the last matching index, which we
6341 can create using a MAX reduction and then expanding.
6342 In the case where the loop never made any matches, the max index will
6343 be zero. */
6344
6345 /* Vector of {0, 0, 0,...}. */
6346 tree zero_vec = build_zero_cst (vectype);
6347
6348 /* Find maximum value from the vector of found indexes. */
6349 tree max_index = make_ssa_name (index_scalar_type);
6350 gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
6351 1, induction_index);
6352 gimple_call_set_lhs (max_index_stmt, max_index);
6353 gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
6354
6355 /* Vector of {max_index, max_index, max_index,...}. */
6356 tree max_index_vec = make_ssa_name (index_vec_type);
6357 tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
6358 max_index);
6359 gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
6360 max_index_vec_rhs);
6361 gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
6362
6363 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
6364 with the vector (INDUCTION_INDEX) of found indexes, choosing values
6365 from the data vector (REDUC_INPUTS 0) for matches, 0 (ZERO_VEC)
6366 otherwise. Only one value should match, resulting in a vector
6367 (VEC_COND) with one data value and the rest zeros.
6368 In the case where the loop never made any matches, every index will
6369 match, resulting in a vector with all data values (which will all be
6370 the default value). */
6371
6372 /* Compare the max index vector to the vector of found indexes to find
6373 the position of the max value. */
6374 tree vec_compare = make_ssa_name (index_vec_cmp_type);
6375 gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
6376 induction_index,
6377 max_index_vec);
6378 gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
6379
6380 /* Use the compare to choose either values from the data vector or
6381 zero. */
6382 tree vec_cond = make_ssa_name (vectype);
6383 gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
6384 vec_compare,
6385 reduc_inputs[0],
6386 zero_vec);
6387 gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
6388
6389 /* Finally we need to extract the data value from the vector (VEC_COND)
6390 into a scalar (MATCHED_DATA_REDUC). Logically we want to do an OR
6391 reduction, but because this doesn't exist, we can use a MAX reduction
6392 instead. The data value might be signed or a float so we need to cast
6393 it first.
6394 In the case where the loop never made any matches, the data values are
6395 all identical, and so will reduce down correctly. */
6396
6397 /* Make the matched data values unsigned. */
6398 tree vec_cond_cast = make_ssa_name (vectype_unsigned);
6399 tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
6400 vec_cond);
6401 gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
6402 VIEW_CONVERT_EXPR,
6403 vec_cond_cast_rhs);
6404 gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
6405
6406 /* Reduce down to a scalar value. */
6407 tree data_reduc = make_ssa_name (scalar_type_unsigned);
6408 gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
6409 1, vec_cond_cast);
6410 gimple_call_set_lhs (data_reduc_stmt, data_reduc);
6411 gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
6412
6413 /* Convert the reduced value back to the result type and set as the
6414 result. */
6415 gimple_seq stmts = NULL;
6416 new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
6417 data_reduc);
6418 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6419 scalar_results.safe_push (new_temp);
6420 }
6421 else if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
6422 && reduc_fn == IFN_LAST)
6423 {
6424 /* Condition reduction without supported IFN_REDUC_MAX. Generate
6425 idx = 0;
6426 idx_val = induction_index[0];
6427 val = data_reduc[0];
6428 for (idx = 0, val = init, i = 0; i < nelts; ++i)
6429 if (induction_index[i] > idx_val)
6430 val = data_reduc[i], idx_val = induction_index[i];
6431 return val; */
6432
6433 tree data_eltype = TREE_TYPE (vectype);
6434 tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
6435 unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
6436 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
6437 /* Enforced by vectorizable_reduction, which ensures we have target
6438 support before allowing a conditional reduction on variable-length
6439 vectors. */
6440 unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
6441 tree idx_val = NULL_TREE, val = NULL_TREE;
6442 for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
6443 {
6444 tree old_idx_val = idx_val;
6445 tree old_val = val;
6446 idx_val = make_ssa_name (idx_eltype);
6447 epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
6448 build3 (BIT_FIELD_REF, idx_eltype,
6449 induction_index,
6450 bitsize_int (el_size),
6451 bitsize_int (off)));
6452 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6453 val = make_ssa_name (data_eltype);
6454 epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
6455 build3 (BIT_FIELD_REF,
6456 data_eltype,
6457 reduc_inputs[0],
6458 bitsize_int (el_size),
6459 bitsize_int (off)));
6460 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6461 if (off != 0)
6462 {
6463 tree new_idx_val = idx_val;
6464 if (off != v_size - el_size)
6465 {
6466 new_idx_val = make_ssa_name (idx_eltype);
6467 epilog_stmt = gimple_build_assign (new_idx_val,
6468 MAX_EXPR, idx_val,
6469 old_idx_val);
6470 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6471 }
6472 tree cond = make_ssa_name (boolean_type_node);
6473 epilog_stmt = gimple_build_assign (cond, GT_EXPR,
6474 idx_val, old_idx_val);
6475 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6476 tree new_val = make_ssa_name (data_eltype);
6477 epilog_stmt = gimple_build_assign (new_val, COND_EXPR,
6478 cond, val, old_val);
6479 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6480 idx_val = new_idx_val;
6481 val = new_val;
6482 }
6483 }
6484 /* Convert the reduced value back to the result type and set as the
6485 result. */
6486 gimple_seq stmts = NULL;
6487 val = gimple_convert (&stmts, scalar_type, val);
6488 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6489 scalar_results.safe_push (val);
6490 }
6491
6492 /* 2.3 Create the reduction code, using one of the three schemes described
6493 above. In SLP we simply need to extract all the elements from the
6494 vector (without reducing them), so we use scalar shifts. */
6495 else if (reduc_fn != IFN_LAST && !slp_reduc)
6496 {
6497 tree tmp;
6498 tree vec_elem_type;
6499
6500 /* Case 1: Create:
6501 v_out2 = reduc_expr <v_out1> */
6502
6503 if (dump_enabled_p ())
6504 dump_printf_loc (MSG_NOTE, vect_location,
6505 "Reduce using direct vector reduction.\n");
6506
6507 gimple_seq stmts = NULL;
6508 vec_elem_type = TREE_TYPE (vectype);
6509 new_temp = gimple_build (&stmts, as_combined_fn (reduc_fn),
6510 vec_elem_type, reduc_inputs[0]);
6511 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6512 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6513
6514 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
6515 && induc_val)
6516 {
6517 /* Earlier we set the initial value to be a vector of induc_val
6518 values. Check the result and if it is induc_val then replace
6519 it with the original initial value, unless induc_val is
6520 the same as initial_def already. */
6521 tree zcompare = make_ssa_name (boolean_type_node);
6522 epilog_stmt = gimple_build_assign (zcompare, EQ_EXPR,
6523 new_temp, induc_val);
6524 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6525 tree initial_def = reduc_info->reduc_initial_values[0];
6526 tmp = make_ssa_name (new_scalar_dest);
6527 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
6528 initial_def, new_temp);
6529 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6530 new_temp = tmp;
6531 }
6532
6533 scalar_results.safe_push (new_temp);
6534 }
6535 else if (direct_slp_reduc)
6536 {
6537 /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
6538 with the elements for other SLP statements replaced with the
6539 neutral value. We can then do a normal reduction on each vector. */
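/* For illustration only: with REDUC_GROUP_SIZE 2 and a vector
{ a0, b0, a1, b1 } holding two interleaved SLP reductions, the
code below builds { a0, neutral, a1, neutral } for the first
result and { neutral, b0, neutral, b1 } for the second, and
reduces each with a full-vector reduction. */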
6540
6541 /* Enforced by vectorizable_reduction. */
6542 gcc_assert (reduc_inputs.length () == 1);
6543 gcc_assert (pow2p_hwi (group_size));
6544
6545 gimple_seq seq = NULL;
6546
6547 /* Build a vector {0, 1, 2, ...}, with the same number of elements
6548 and the same element size as VECTYPE. */
6549 tree index = build_index_vector (vectype, 0, 1);
6550 tree index_type = TREE_TYPE (index);
6551 tree index_elt_type = TREE_TYPE (index_type);
6552 tree mask_type = truth_type_for (index_type);
6553
6554 /* Create a vector that, for each element, identifies which of
6555 the REDUC_GROUP_SIZE results should use it. */
6556 tree index_mask = build_int_cst (index_elt_type, group_size - 1);
6557 index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
6558 build_vector_from_val (index_type, index_mask));
6559
6560 /* Get a neutral vector value. This is simply a splat of the neutral
6561 scalar value if we have one, otherwise the initial scalar value
6562 is itself a neutral value. */
6563 tree vector_identity = NULL_TREE;
6564 tree neutral_op = NULL_TREE;
6565 if (slp_node)
6566 {
6567 tree initial_value = NULL_TREE;
6568 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6569 initial_value = reduc_info->reduc_initial_values[0];
6570 neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype), code,
6571 initial_value, false);
6572 }
6573 if (neutral_op)
6574 vector_identity = gimple_build_vector_from_val (&seq, vectype,
6575 neutral_op);
6576 for (unsigned int i = 0; i < group_size; ++i)
6577 {
6578 /* If there's no universal neutral value, we can use the
6579 initial scalar value from the original PHI. This is used
6580 for MIN and MAX reduction, for example. */
6581 if (!neutral_op)
6582 {
6583 tree scalar_value = reduc_info->reduc_initial_values[i];
6584 scalar_value = gimple_convert (&seq, TREE_TYPE (vectype),
6585 scalar_value);
6586 vector_identity = gimple_build_vector_from_val (&seq, vectype,
6587 scalar_value);
6588 }
6589
6590 /* Calculate the equivalent of:
6591
6592 sel[j] = (index[j] == i);
6593
6594 which selects the elements of REDUC_INPUTS[0] that should
6595 be included in the result. */
6596 tree compare_val = build_int_cst (index_elt_type, i);
6597 compare_val = build_vector_from_val (index_type, compare_val);
6598 tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
6599 index, compare_val);
6600
6601 /* Calculate the equivalent of:
6602
6603 vec = sel ? reduc_inputs[0] : vector_identity;
6604
6605 VEC is now suitable for a full vector reduction. */
6606 tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
6607 sel, reduc_inputs[0], vector_identity);
6608
6609 /* Do the reduction and convert it to the appropriate type. */
6610 tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
6611 TREE_TYPE (vectype), vec);
6612 scalar = gimple_convert (&seq, scalar_type, scalar);
6613 scalar_results.safe_push (scalar);
6614 }
6615 gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
6616 }
6617 else
6618 {
6619 bool reduce_with_shift;
6620 tree vec_temp;
6621
6622 gcc_assert (slp_reduc || reduc_inputs.length () == 1);
6623
6624 /* See if the target wants to do the final (shift) reduction
6625 in a vector mode of smaller size and first reduce upper/lower
6626 halves against each other. */
6627 enum machine_mode mode1 = mode;
6628 tree stype = TREE_TYPE (vectype);
6629 unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
6630 unsigned nunits1 = nunits;
6631 if ((mode1 = targetm.vectorize.split_reduction (mode)) != mode
6632 && reduc_inputs.length () == 1)
6633 {
6634 nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
6635 /* For SLP reductions we have to make sure lanes match up, but
6636 since we're doing individual-element final reduction, reducing
6637 the vector width here is even more important.
6638 ??? We could also separate lanes with permutes; for the common
6639 case of a power-of-two group size, odd/even extracts would work. */
6640 if (slp_reduc && nunits != nunits1)
6641 {
6642 nunits1 = least_common_multiple (nunits1, group_size);
6643 gcc_assert (exact_log2 (nunits1) != -1 && nunits1 <= nunits);
6644 }
6645 }
6646 if (!slp_reduc
6647 && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
6648 nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
6649
6650 tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
6651 stype, nunits1);
6652 reduce_with_shift = have_whole_vector_shift (mode1);
6653 if (!VECTOR_MODE_P (mode1)
6654 || !directly_supported_p (code, vectype1))
6655 reduce_with_shift = false;
6656
6657 /* First reduce the vector to the desired vector size we should
6658 do shift reduction on by combining upper and lower halves. */
6659 gimple_seq stmts = NULL;
6660 new_temp = vect_create_partial_epilog (reduc_inputs[0], vectype1,
6661 code, &stmts);
6662 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6663 reduc_inputs[0] = new_temp;
6664
6665 if (reduce_with_shift && !slp_reduc)
6666 {
6667 int element_bitsize = tree_to_uhwi (bitsize);
6668 /* Enforced by vectorizable_reduction, which disallows SLP reductions
6669 for variable-length vectors and also requires direct target support
6670 for loop reductions. */
6671 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
6672 int nelements = vec_size_in_bits / element_bitsize;
6673 vec_perm_builder sel;
6674 vec_perm_indices indices;
6675
6676 int elt_offset;
6677
6678 tree zero_vec = build_zero_cst (vectype1);
6679 /* Case 2: Create:
6680 for (offset = nelements/2; offset >= 1; offset/=2)
6681 {
6682 Create: va' = vec_shift <va, offset>
6683 Create: va = vop <va, va'>
6684 } */
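/* E.g. for a PLUS reduction of a four-element vector { a, b, c, d }
(purely illustrative): the first step adds the vector shifted by
two elements, { c, d, 0, 0 }, giving { a+c, b+d, ... }; the second
step shifts that intermediate by one element and adds it, leaving
a+b+c+d in lane 0, which is extracted below. */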
6685
6686 tree rhs;
6687
6688 if (dump_enabled_p ())
6689 dump_printf_loc (MSG_NOTE, vect_location,
6690 "Reduce using vector shifts\n");
6691
6692 gimple_seq stmts = NULL;
6693 new_temp = gimple_convert (&stmts, vectype1, new_temp);
6694 for (elt_offset = nelements / 2;
6695 elt_offset >= 1;
6696 elt_offset /= 2)
6697 {
6698 calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
6699 indices.new_vector (sel, 2, nelements);
6700 tree mask = vect_gen_perm_mask_any (vectype1, indices);
6701 new_name = gimple_build (&stmts, VEC_PERM_EXPR, vectype1,
6702 new_temp, zero_vec, mask);
6703 new_temp = gimple_build (&stmts, code,
6704 vectype1, new_name, new_temp);
6705 }
6706 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6707
6708 /* 2.4 Extract the final scalar result. Create:
6709 s_out3 = extract_field <v_out2, bitpos> */
6710
6711 if (dump_enabled_p ())
6712 dump_printf_loc (MSG_NOTE, vect_location,
6713 "extract scalar result\n");
6714
6715 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
6716 bitsize, bitsize_zero_node);
6717 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
6718 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
6719 gimple_assign_set_lhs (epilog_stmt, new_temp);
6720 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6721 scalar_results.safe_push (new_temp);
6722 }
6723 else
6724 {
6725 /* Case 3: Create:
6726 s = extract_field <v_out2, 0>
6727 for (offset = element_size;
6728 offset < vector_size;
6729 offset += element_size;)
6730 {
6731 Create: s' = extract_field <v_out2, offset>
6732 Create: s = op <s, s'> // For non SLP cases
6733 } */
6734
6735 if (dump_enabled_p ())
6736 dump_printf_loc (MSG_NOTE, vect_location,
6737 "Reduce using scalar code.\n");
6738
6739 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
6740 int element_bitsize = tree_to_uhwi (bitsize);
6741 tree compute_type = TREE_TYPE (vectype);
6742 gimple_seq stmts = NULL;
6743 FOR_EACH_VEC_ELT (reduc_inputs, i, vec_temp)
6744 {
6745 int bit_offset;
6746 new_temp = gimple_build (&stmts, BIT_FIELD_REF, compute_type,
6747 vec_temp, bitsize, bitsize_zero_node);
6748
6749 /* In SLP we don't need to apply the reduction operation, so we
6750 just collect the s' values in SCALAR_RESULTS. */
6751 if (slp_reduc)
6752 scalar_results.safe_push (new_temp);
6753
6754 for (bit_offset = element_bitsize;
6755 bit_offset < vec_size_in_bits;
6756 bit_offset += element_bitsize)
6757 {
6758 tree bitpos = bitsize_int (bit_offset);
6759 new_name = gimple_build (&stmts, BIT_FIELD_REF,
6760 compute_type, vec_temp,
6761 bitsize, bitpos);
6762 if (slp_reduc)
6763 {
6764 /* In SLP we don't need to apply the reduction operation,
6765 so we just collect the s' values in SCALAR_RESULTS. */
6766 new_temp = new_name;
6767 scalar_results.safe_push (new_name);
6768 }
6769 else
6770 new_temp = gimple_build (&stmts, code, compute_type,
6771 new_name, new_temp);
6772 }
6773 }
6774
6775 /* The only case where we need to reduce scalar results in SLP is
6776 unrolling. If the size of SCALAR_RESULTS is greater than
6777 REDUC_GROUP_SIZE, we reduce them by combining elements modulo
6778 REDUC_GROUP_SIZE. */
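/* For example (illustrative), with REDUC_GROUP_SIZE 2 and collected
scalars s0, s1, s2, s3 the loop below produces s0 op s2 and
s1 op s3 as the two group results. */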
6779 if (slp_reduc)
6780 {
6781 tree res, first_res, new_res;
6782
6783 /* Reduce multiple scalar results in case of SLP unrolling. */
6784 for (j = group_size; scalar_results.iterate (j, &res);
6785 j++)
6786 {
6787 first_res = scalar_results[j % group_size];
6788 new_res = gimple_build (&stmts, code, compute_type,
6789 first_res, res);
6790 scalar_results[j % group_size] = new_res;
6791 }
6792 scalar_results.truncate (group_size);
6793 for (k = 0; k < group_size; k++)
6794 scalar_results[k] = gimple_convert (&stmts, scalar_type,
6795 scalar_results[k]);
6796 }
6797 else
6798 {
6799 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
6800 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6801 scalar_results.safe_push (new_temp);
6802 }
6803
6804 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6805 }
6806
6807 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
6808 && induc_val)
6809 {
6810 /* Earlier we set the initial value to be a vector of induc_val
6811 values. Check the result and if it is induc_val then replace
6812 it with the original initial value, unless induc_val is
6813 the same as initial_def already. */
6814 tree zcompare = make_ssa_name (boolean_type_node);
6815 epilog_stmt = gimple_build_assign (zcompare, EQ_EXPR, new_temp,
6816 induc_val);
6817 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6818 tree initial_def = reduc_info->reduc_initial_values[0];
6819 tree tmp = make_ssa_name (new_scalar_dest);
6820 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
6821 initial_def, new_temp);
6822 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6823 scalar_results[0] = tmp;
6824 }
6825 }
6826
6827 /* 2.5 Adjust the final result by the initial value of the reduction
6828 variable. (When such adjustment is not needed, then
6829 'adjustment_def' is zero). For example, if code is PLUS we create:
6830 new_temp = loop_exit_def + adjustment_def */
6831
6832 if (adjustment_def)
6833 {
6834 gcc_assert (!slp_reduc);
6835 gimple_seq stmts = NULL;
6836 if (double_reduc)
6837 {
6838 gcc_assert (VECTOR_TYPE_P (TREE_TYPE (adjustment_def)));
6839 adjustment_def = gimple_convert (&stmts, vectype, adjustment_def);
6840 new_temp = gimple_build (&stmts, code, vectype,
6841 reduc_inputs[0], adjustment_def);
6842 }
6843 else
6844 {
6845 new_temp = scalar_results[0];
6846 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
6847 adjustment_def = gimple_convert (&stmts, TREE_TYPE (vectype),
6848 adjustment_def);
6849 new_temp = gimple_convert (&stmts, TREE_TYPE (vectype), new_temp);
6850 new_temp = gimple_build (&stmts, code, TREE_TYPE (vectype),
6851 new_temp, adjustment_def);
6852 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6853 }
6854
6855 epilog_stmt = gimple_seq_last_stmt (stmts);
6856 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6857 scalar_results[0] = new_temp;
6858 }
6859
6860 /* Record this operation if it could be reused by the epilogue loop. */
6861 if (STMT_VINFO_REDUC_TYPE (reduc_info) == TREE_CODE_REDUCTION
6862 && reduc_inputs.length () == 1)
6863 loop_vinfo->reusable_accumulators.put (scalar_results[0],
6864 { orig_reduc_input, reduc_info });
6865
6866 if (double_reduc)
6867 loop = outer_loop;
6868
6869 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
6870 phis with new adjusted scalar results, i.e., replace use <s_out0>
6871 with use <s_out4>.
6872
6873 Transform:
6874 loop_exit:
6875 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
6876 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
6877 v_out2 = reduce <v_out1>
6878 s_out3 = extract_field <v_out2, 0>
6879 s_out4 = adjust_result <s_out3>
6880 use <s_out0>
6881 use <s_out0>
6882
6883 into:
6884
6885 loop_exit:
6886 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
6887 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
6888 v_out2 = reduce <v_out1>
6889 s_out3 = extract_field <v_out2, 0>
6890 s_out4 = adjust_result <s_out3>
6891 use <s_out4>
6892 use <s_out4> */
6893
6894 gcc_assert (live_out_stmts.size () == scalar_results.length ());
6895 auto_vec<gimple *> phis;
6896 for (k = 0; k < live_out_stmts.size (); k++)
6897 {
6898 stmt_vec_info scalar_stmt_info = vect_orig_stmt (live_out_stmts[k]);
6899 scalar_dest = gimple_get_lhs (scalar_stmt_info->stmt);
6900
6901 /* Find the loop-closed-use at the loop exit of the original scalar
6902 result. (The reduction result is expected to have two immediate uses,
6903 one at the latch block, and one at the loop exit). For double
6904 reductions we are looking for exit phis of the outer loop. */
6905 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
6906 {
6907 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
6908 {
6909 if (!is_gimple_debug (USE_STMT (use_p))
6910 && gimple_bb (USE_STMT (use_p)) == loop_exit->dest)
6911 phis.safe_push (USE_STMT (use_p));
6912 }
6913 else
6914 {
6915 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
6916 {
6917 tree phi_res = PHI_RESULT (USE_STMT (use_p));
6918
6919 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
6920 {
6921 if (!flow_bb_inside_loop_p (loop,
6922 gimple_bb (USE_STMT (phi_use_p)))
6923 && !is_gimple_debug (USE_STMT (phi_use_p)))
6924 phis.safe_push (USE_STMT (phi_use_p));
6925 }
6926 }
6927 }
6928 }
6929
6930 FOR_EACH_VEC_ELT (phis, i, exit_phi)
6931 {
6932 /* Replace the uses: */
6933 orig_name = PHI_RESULT (exit_phi);
6934
6935 /* Look for a single use at the target of the skip edge. */
6936 if (unify_with_main_loop_p)
6937 {
6938 use_operand_p use_p;
6939 gimple *user;
6940 if (!single_imm_use (orig_name, &use_p, &user))
6941 gcc_unreachable ();
6942 orig_name = gimple_get_lhs (user);
6943 }
6944
6945 scalar_result = scalar_results[k];
6946 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
6947 {
6948 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
6949 SET_USE (use_p, scalar_result);
6950 update_stmt (use_stmt);
6951 }
6952 }
6953
6954 phis.truncate (0);
6955 }
6956 }
6957
6958 /* Return a vector of type VECTYPE that is equal to the vector select
6959 operation "MASK ? VEC : IDENTITY". Insert the select statements
6960 before GSI. */
6961
6962 static tree
6963 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
6964 tree vec, tree identity)
6965 {
6966 tree cond = make_temp_ssa_name (vectype, NULL, "cond");
6967 gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
6968 mask, vec, identity);
6969 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6970 return cond;
6971 }
6972
6973 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
6974 order, starting with LHS. Insert the extraction statements before GSI and
6975 associate the new scalar SSA names with variable SCALAR_DEST.
6976 If MASK is nonzero, mask the input and then operate on it unconditionally.
6977 Return the SSA name for the result. */
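/* Roughly speaking, for a vector { v0, v1, ..., vn } this expands to
lhs = (...((lhs CODE v0) CODE v1) ... CODE vn), preserving the
scalar evaluation order. */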
6978
6979 static tree
6980 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
6981 tree_code code, tree lhs, tree vector_rhs,
6982 tree mask)
6983 {
6984 tree vectype = TREE_TYPE (vector_rhs);
6985 tree scalar_type = TREE_TYPE (vectype);
6986 tree bitsize = TYPE_SIZE (scalar_type);
6987 unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
6988 unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
6989
6990 /* Re-create a VEC_COND_EXPR to mask the input here in order to be able
6991 to perform an unconditional element-wise reduction of it. */
6992 if (mask)
6993 {
6994 tree masked_vector_rhs = make_temp_ssa_name (vectype, NULL,
6995 "masked_vector_rhs");
6996 tree neutral_op = neutral_op_for_reduction (scalar_type, code, NULL_TREE,
6997 false);
6998 tree vector_identity = build_vector_from_val (vectype, neutral_op);
6999 gassign *select = gimple_build_assign (masked_vector_rhs, VEC_COND_EXPR,
7000 mask, vector_rhs, vector_identity);
7001 gsi_insert_before (gsi, select, GSI_SAME_STMT);
7002 vector_rhs = masked_vector_rhs;
7003 }
7004
7005 for (unsigned HOST_WIDE_INT bit_offset = 0;
7006 bit_offset < vec_size_in_bits;
7007 bit_offset += element_bitsize)
7008 {
7009 tree bitpos = bitsize_int (bit_offset);
7010 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
7011 bitsize, bitpos);
7012
7013 gassign *stmt = gimple_build_assign (scalar_dest, rhs);
7014 rhs = make_ssa_name (scalar_dest, stmt);
7015 gimple_assign_set_lhs (stmt, rhs);
7016 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
7017
7018 stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
7019 tree new_name = make_ssa_name (scalar_dest, stmt);
7020 gimple_assign_set_lhs (stmt, new_name);
7021 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
7022 lhs = new_name;
7023 }
7024 return lhs;
7025 }
7026
7027 /* Get a masked internal function equivalent to REDUC_FN. VECTYPE_IN is the
7028 type of the vector input. */
7029
7030 static internal_fn
7031 get_masked_reduction_fn (internal_fn reduc_fn, tree vectype_in)
7032 {
7033 internal_fn mask_reduc_fn;
7034 internal_fn mask_len_reduc_fn;
7035
7036 switch (reduc_fn)
7037 {
7038 case IFN_FOLD_LEFT_PLUS:
7039 mask_reduc_fn = IFN_MASK_FOLD_LEFT_PLUS;
7040 mask_len_reduc_fn = IFN_MASK_LEN_FOLD_LEFT_PLUS;
7041 break;
7042
7043 default:
7044 return IFN_LAST;
7045 }
7046
7047 if (direct_internal_fn_supported_p (mask_reduc_fn, vectype_in,
7048 OPTIMIZE_FOR_SPEED))
7049 return mask_reduc_fn;
7050 if (direct_internal_fn_supported_p (mask_len_reduc_fn, vectype_in,
7051 OPTIMIZE_FOR_SPEED))
7052 return mask_len_reduc_fn;
7053 return IFN_LAST;
7054 }
7055
7056 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT_INFO is the
7057 statement that sets the live-out value. REDUC_DEF_STMT is the phi
7058 statement. CODE is the operation performed by STMT_INFO and OPS are
7059 its scalar operands. REDUC_INDEX is the index of the operand in
7060 OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that
7061 implements in-order reduction, or IFN_LAST if we should open-code it.
7062 VECTYPE_IN is the type of the vector input. MASKS specifies the masks
7063 that should be used to control the operation in a fully-masked loop. */
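/* As a sketch, for an in-order float sum with IFN_FOLD_LEFT_PLUS the
generated loop body is essentially
acc_1 = .FOLD_LEFT_PLUS (acc_0, vec_chunk);
which adds the vector elements to the accumulator in element order. */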
7064
7065 static bool
7066 vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
7067 stmt_vec_info stmt_info,
7068 gimple_stmt_iterator *gsi,
7069 gimple **vec_stmt, slp_tree slp_node,
7070 gimple *reduc_def_stmt,
7071 code_helper code, internal_fn reduc_fn,
7072 tree *ops, int num_ops, tree vectype_in,
7073 int reduc_index, vec_loop_masks *masks,
7074 vec_loop_lens *lens)
7075 {
7076 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7077 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7078 internal_fn mask_reduc_fn = get_masked_reduction_fn (reduc_fn, vectype_in);
7079
7080 int ncopies;
7081 if (slp_node)
7082 ncopies = 1;
7083 else
7084 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7085
7086 gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
7087 gcc_assert (ncopies == 1);
7088
7089 bool is_cond_op = false;
7090 if (!code.is_tree_code ())
7091 {
7092 code = conditional_internal_fn_code (internal_fn (code));
7093 gcc_assert (code != ERROR_MARK);
7094 is_cond_op = true;
7095 }
7096
7097 gcc_assert (TREE_CODE_LENGTH (tree_code (code)) == binary_op);
7098
7099 if (slp_node)
7100 {
7101 if (is_cond_op)
7102 {
7103 if (dump_enabled_p ())
7104 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7105 "fold-left reduction on SLP not supported.\n");
7106 return false;
7107 }
7108
7109 gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
7110 TYPE_VECTOR_SUBPARTS (vectype_in)));
7111 }
7112
7113 /* The operands either come from a binary operation or an IFN_COND operation.
7114 The former is a gimple assign with binary rhs and the latter is a
7115 gimple call with four arguments. */
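/* E.g. (illustrative) an if-converted conditional sum appears as
_r2 = .COND_ADD (mask, _r1, a_i, _r1); */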
7116 gcc_assert (num_ops == 2 || num_ops == 4);
7117 tree op0, opmask;
7118 if (!is_cond_op)
7119 op0 = ops[1 - reduc_index];
7120 else
7121 {
7122 op0 = ops[2 + (1 - reduc_index)];
7123 opmask = ops[0];
7124 gcc_assert (!slp_node);
7125 }
7126
7127 int group_size = 1;
7128 stmt_vec_info scalar_dest_def_info;
7129 auto_vec<tree> vec_oprnds0, vec_opmask;
7130 if (slp_node)
7131 {
7132 auto_vec<vec<tree> > vec_defs (2);
7133 vect_get_slp_defs (loop_vinfo, slp_node, &vec_defs);
7134 vec_oprnds0.safe_splice (vec_defs[1 - reduc_index]);
7135 vec_defs[0].release ();
7136 vec_defs[1].release ();
7137 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7138 scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
7139 }
7140 else
7141 {
7142 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
7143 op0, &vec_oprnds0);
7144 scalar_dest_def_info = stmt_info;
7145
7146 /* For an IFN_COND_OP we also need the vector mask operand. */
7147 if (is_cond_op)
7148 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
7149 opmask, &vec_opmask);
7150 }
7151
7152 gimple *sdef = vect_orig_stmt (scalar_dest_def_info)->stmt;
7153 tree scalar_dest = gimple_get_lhs (sdef);
7154 tree scalar_type = TREE_TYPE (scalar_dest);
7155 tree reduc_var = gimple_phi_result (reduc_def_stmt);
7156
7157 int vec_num = vec_oprnds0.length ();
7158 gcc_assert (vec_num == 1 || slp_node);
7159 tree vec_elem_type = TREE_TYPE (vectype_out);
7160 gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
7161
7162 tree vector_identity = NULL_TREE;
7163 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
7164 {
7165 vector_identity = build_zero_cst (vectype_out);
7166 if (!HONOR_SIGNED_ZEROS (vectype_out))
7167 ;
7168 else
7169 {
7170 gcc_assert (!HONOR_SIGN_DEPENDENT_ROUNDING (vectype_out));
7171 vector_identity = const_unop (NEGATE_EXPR, vectype_out,
7172 vector_identity);
7173 }
7174 }
7175
7176 tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
7177 int i;
7178 tree def0;
7179 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
7180 {
7181 gimple *new_stmt;
7182 tree mask = NULL_TREE;
7183 tree len = NULL_TREE;
7184 tree bias = NULL_TREE;
7185 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
7186 mask = vect_get_loop_mask (loop_vinfo, gsi, masks, vec_num, vectype_in, i);
7187 else if (is_cond_op)
7188 mask = vec_opmask[0];
7189 if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
7190 {
7191 len = vect_get_loop_len (loop_vinfo, gsi, lens, vec_num, vectype_in,
7192 i, 1);
7193 signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
7194 bias = build_int_cst (intQI_type_node, biasval);
7195 if (!is_cond_op)
7196 mask = build_minus_one_cst (truth_type_for (vectype_in));
7197 }
7198
7199 /* Handle MINUS by adding the negative. */
7200 if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
7201 {
7202 tree negated = make_ssa_name (vectype_out);
7203 new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
7204 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
7205 def0 = negated;
7206 }
7207
7208 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
7209 && mask && mask_reduc_fn == IFN_LAST)
7210 def0 = merge_with_identity (gsi, mask, vectype_out, def0,
7211 vector_identity);
7212
7213 /* On the first iteration the input is simply the scalar phi
7214 result, and for subsequent iterations it is the output of
7215 the preceding operation. */
7216 if (reduc_fn != IFN_LAST || (mask && mask_reduc_fn != IFN_LAST))
7217 {
7218 if (mask && len && mask_reduc_fn == IFN_MASK_LEN_FOLD_LEFT_PLUS)
7219 new_stmt = gimple_build_call_internal (mask_reduc_fn, 5, reduc_var,
7220 def0, mask, len, bias);
7221 else if (mask && mask_reduc_fn == IFN_MASK_FOLD_LEFT_PLUS)
7222 new_stmt = gimple_build_call_internal (mask_reduc_fn, 3, reduc_var,
7223 def0, mask);
7224 else
7225 new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var,
7226 def0);
7227 /* For chained SLP reductions the output of the previous reduction
7228 operation serves as the input of the next. For the final statement
7229 the output cannot be a temporary - we reuse the original
7230 scalar destination of the last statement. */
7231 if (i != vec_num - 1)
7232 {
7233 gimple_set_lhs (new_stmt, scalar_dest_var);
7234 reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
7235 gimple_set_lhs (new_stmt, reduc_var);
7236 }
7237 }
7238 else
7239 {
7240 reduc_var = vect_expand_fold_left (gsi, scalar_dest_var,
7241 tree_code (code), reduc_var, def0,
7242 mask);
7243 new_stmt = SSA_NAME_DEF_STMT (reduc_var);
7244 /* Remove the statement, so that we can use the same code paths
7245 as for statements that we've just created. */
7246 gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
7247 gsi_remove (&tmp_gsi, true);
7248 }
7249
7250 if (i == vec_num - 1)
7251 {
7252 gimple_set_lhs (new_stmt, scalar_dest);
7253 vect_finish_replace_stmt (loop_vinfo,
7254 scalar_dest_def_info,
7255 new_stmt);
7256 }
7257 else
7258 vect_finish_stmt_generation (loop_vinfo,
7259 scalar_dest_def_info,
7260 new_stmt, gsi);
7261
7262 if (slp_node)
7263 slp_node->push_vec_def (new_stmt);
7264 else
7265 {
7266 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
7267 *vec_stmt = new_stmt;
7268 }
7269 }
7270
7271 return true;
7272 }
7273
7274 /* Function is_nonwrapping_integer_induction.
7275
7276 Check if STMT_VINFO (which is part of loop LOOP) both increments and
7277 does not cause overflow. */
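/* E.g. (illustrative) for an unsigned char IV with base 200, step 10
and at most 10 iterations, the value 300 would not fit in 8 bits,
so the precision check below fails; for types with undefined
overflow we can assume non-wrapping directly. */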
7278
7279 static bool
7280 is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, class loop *loop)
7281 {
7282 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
7283 tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
7284 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
7285 tree lhs_type = TREE_TYPE (gimple_phi_result (phi));
7286 widest_int ni, max_loop_value, lhs_max;
7287 wi::overflow_type overflow = wi::OVF_NONE;
7288
7289 /* Make sure the loop is integer based. */
7290 if (TREE_CODE (base) != INTEGER_CST
7291 || TREE_CODE (step) != INTEGER_CST)
7292 return false;
7293
7294 /* Check that the max size of the loop will not wrap. */
7295
7296 if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
7297 return true;
7298
7299 if (! max_stmt_executions (loop, &ni))
7300 return false;
7301
7302 max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
7303 &overflow);
7304 if (overflow)
7305 return false;
7306
7307 max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
7308 TYPE_SIGN (lhs_type), &overflow);
7309 if (overflow)
7310 return false;
7311
7312 return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
7313 <= TYPE_PRECISION (lhs_type));
7314 }
7315
7316 /* Check if masking can be supported by inserting a conditional expression.
7317 CODE is the code for the operation. COND_FN is the conditional internal
7318 function, if it exists. VECTYPE_IN is the type of the vector input. */
7319 static bool
7320 use_mask_by_cond_expr_p (code_helper code, internal_fn cond_fn,
7321 tree vectype_in)
7322 {
7323 if (cond_fn != IFN_LAST
7324 && direct_internal_fn_supported_p (cond_fn, vectype_in,
7325 OPTIMIZE_FOR_SPEED))
7326 return false;
7327
7328 if (code.is_tree_code ())
7329 switch (tree_code (code))
7330 {
7331 case DOT_PROD_EXPR:
7332 case SAD_EXPR:
7333 return true;
7334
7335 default:
7336 break;
7337 }
7338 return false;
7339 }
7340
7341 /* Insert a conditional expression to enable masked vectorization. CODE is the
7342 code for the operation. VOP is the array of operands. MASK is the loop
7343 mask. GSI is a statement iterator used to place the new conditional
7344 expression. */
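/* As a sketch, for DOT_PROD_EXPR the inactive lanes of the second
operand are replaced with zero, roughly
masked_op1 = mask ? op1 : 0;
so they contribute nothing to the accumulated dot product; for
SAD_EXPR the inactive lanes are made equal to the first operand so
their absolute difference is zero. */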
7345 static void
7346 build_vect_cond_expr (code_helper code, tree vop[3], tree mask,
7347 gimple_stmt_iterator *gsi)
7348 {
7349 switch (tree_code (code))
7350 {
7351 case DOT_PROD_EXPR:
7352 {
7353 tree vectype = TREE_TYPE (vop[1]);
7354 tree zero = build_zero_cst (vectype);
7355 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
7356 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
7357 mask, vop[1], zero);
7358 gsi_insert_before (gsi, select, GSI_SAME_STMT);
7359 vop[1] = masked_op1;
7360 break;
7361 }
7362
7363 case SAD_EXPR:
7364 {
7365 tree vectype = TREE_TYPE (vop[1]);
7366 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
7367 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
7368 mask, vop[1], vop[0]);
7369 gsi_insert_before (gsi, select, GSI_SAME_STMT);
7370 vop[1] = masked_op1;
7371 break;
7372 }
7373
7374 default:
7375 gcc_unreachable ();
7376 }
7377 }
7378
7379 /* Function vectorizable_reduction.
7380
7381 Check if STMT_INFO performs a reduction operation that can be vectorized.
7382 If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
7383 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
7384 Return true if STMT_INFO is vectorizable in this way.
7385
7386 This function also handles reduction idioms (patterns) that have been
7387 recognized in advance during vect_pattern_recog. In this case, STMT_INFO
7388 may be of this form:
7389 X = pattern_expr (arg0, arg1, ..., X)
7390 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
7391 sequence that had been detected and replaced by the pattern-stmt
7392 (STMT_INFO).
7393
7394 This function also handles reduction of condition expressions, for example:
7395 for (int i = 0; i < N; i++)
7396 if (a[i] < value)
7397 last = a[i];
7398 This is handled by vectorising the loop and creating an additional vector
7399 containing the loop indexes for which "a[i] < value" was true. In the
7400 function epilogue this is reduced to a single max value and then used to
7401 index into the vector of results.
7402
7403 In some cases of reduction patterns, the type of the reduction variable X is
7404 different than the type of the other arguments of STMT_INFO.
7405 In such cases, the vectype that is used when transforming STMT_INFO into
7406 a vector stmt is different than the vectype that is used to determine the
7407 vectorization factor, because it consists of a different number of elements
7408 than the actual number of elements that are being operated upon in parallel.
7409
7410 For example, consider an accumulation of shorts into an int accumulator.
7411 On some targets it's possible to vectorize this pattern operating on 8
7412 shorts at a time (hence, the vectype for purposes of determining the
7413 vectorization factor should be V8HI); on the other hand, the vectype that
7414 is used to create the vector form is actually V4SI (the type of the result).
7415
7416 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
7417 indicates what is the actual level of parallelism (V8HI in the example), so
7418 that the right vectorization factor would be derived. This vectype
7419 corresponds to the type of arguments to the reduction stmt, and should *NOT*
7420 be used to create the vectorized stmt. The right vectype for the vectorized
7421 stmt is obtained from the type of the result X:
7422 get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
7423
7424 This means that, contrary to "regular" reductions (or "regular" stmts in
7425 general), the following equation:
7426 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
7427 does *NOT* necessarily hold for reduction patterns. */
7428
7429 bool
7430 vectorizable_reduction (loop_vec_info loop_vinfo,
7431 stmt_vec_info stmt_info, slp_tree slp_node,
7432 slp_instance slp_node_instance,
7433 stmt_vector_for_cost *cost_vec)
7434 {
7435 tree vectype_in = NULL_TREE;
7436 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7437 enum vect_def_type cond_reduc_dt = vect_unknown_def_type;
7438 stmt_vec_info cond_stmt_vinfo = NULL;
7439 int i;
7440 int ncopies;
7441 bool single_defuse_cycle = false;
7442 bool nested_cycle = false;
7443 bool double_reduc = false;
7444 int vec_num;
7445 tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
7446 tree cond_reduc_val = NULL_TREE;
7447
7448 /* Make sure it was already recognized as a reduction computation. */
7449 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
7450 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
7451 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
7452 return false;
7453
7454 /* The stmt we store reduction analysis meta on. */
7455 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
7456 reduc_info->is_reduc_info = true;
7457
7458 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
7459 {
7460 if (is_a <gphi *> (stmt_info->stmt))
7461 {
7462 if (slp_node)
7463 {
7464 /* We eventually need to set a vector type on invariant
7465 arguments. */
7466 unsigned j;
7467 slp_tree child;
7468 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
7469 if (!vect_maybe_update_slp_op_vectype
7470 (child, SLP_TREE_VECTYPE (slp_node)))
7471 {
7472 if (dump_enabled_p ())
7473 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7474 "incompatible vector types for "
7475 "invariants\n");
7476 return false;
7477 }
7478 }
7479 /* Analysis for double-reduction is done on the outer
7480 loop PHI, nested cycles have no further restrictions. */
7481 STMT_VINFO_TYPE (stmt_info) = cycle_phi_info_type;
7482 }
7483 else
7484 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
7485 return true;
7486 }
7487
7488 stmt_vec_info orig_stmt_of_analysis = stmt_info;
7489 stmt_vec_info phi_info = stmt_info;
7490 if (!is_a <gphi *> (stmt_info->stmt))
7491 {
7492 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
7493 return true;
7494 }
7495 if (slp_node)
7496 {
7497 slp_node_instance->reduc_phis = slp_node;
7498 /* ??? We're leaving slp_node to point to the PHIs; we only
7499 need it to get at the number of vector stmts, which wasn't
7500 yet initialized for the instance root. */
7501 }
7502 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
7503 {
7504 use_operand_p use_p;
7505 gimple *use_stmt;
7506 bool res = single_imm_use (gimple_phi_result (stmt_info->stmt),
7507 &use_p, &use_stmt);
7508 gcc_assert (res);
7509 phi_info = loop_vinfo->lookup_stmt (use_stmt);
7510 }
7511
7512 /* PHIs should not participate in patterns. */
7513 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
7514 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
7515
7516 /* Verify that following REDUC_IDX from the latch def leads us back to the
7517 PHI and compute the reduction chain length. Discover the real
7518 reduction operation stmt on the way (stmt_info and slp_for_stmt_info). */
7519 tree reduc_def
7520 = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
7521 loop_latch_edge
7522 (gimple_bb (reduc_def_phi)->loop_father));
7523 unsigned reduc_chain_length = 0;
7524 bool only_slp_reduc_chain = true;
7525 stmt_info = NULL;
7526 slp_tree slp_for_stmt_info = slp_node ? slp_node_instance->root : NULL;
7527 while (reduc_def != PHI_RESULT (reduc_def_phi))
7528 {
7529 stmt_vec_info def = loop_vinfo->lookup_def (reduc_def);
7530 stmt_vec_info vdef = vect_stmt_to_vectorize (def);
7531 if (STMT_VINFO_REDUC_IDX (vdef) == -1)
7532 {
7533 if (dump_enabled_p ())
7534 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7535 "reduction chain broken by patterns.\n");
7536 return false;
7537 }
7538 if (!REDUC_GROUP_FIRST_ELEMENT (vdef))
7539 only_slp_reduc_chain = false;
7540 /* For epilogue generation, live members of the chain need
7541 to point back to the PHI via their original stmt for
7542 info_for_reduction to work. For SLP we need to look at
7543 all lanes here - even though we will only vectorize from
7544 the SLP node with live lane zero, the other live lanes also
7545 need to be identified as part of a reduction so that code
7546 generation for them can be skipped. */
7547 if (slp_for_stmt_info)
7548 {
7549 for (auto s : SLP_TREE_SCALAR_STMTS (slp_for_stmt_info))
7550 if (STMT_VINFO_LIVE_P (s))
7551 STMT_VINFO_REDUC_DEF (vect_orig_stmt (s)) = phi_info;
7552 }
7553 else if (STMT_VINFO_LIVE_P (vdef))
7554 STMT_VINFO_REDUC_DEF (def) = phi_info;
7555 gimple_match_op op;
7556 if (!gimple_extract_op (vdef->stmt, &op))
7557 {
7558 if (dump_enabled_p ())
7559 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7560 "reduction chain includes unsupported"
7561 " statement type.\n");
7562 return false;
7563 }
7564 if (CONVERT_EXPR_CODE_P (op.code))
7565 {
7566 if (!tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0])))
7567 {
7568 if (dump_enabled_p ())
7569 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7570 "conversion in the reduction chain.\n");
7571 return false;
7572 }
7573 }
7574 else if (!stmt_info)
7575 /* First non-conversion stmt. */
7576 stmt_info = vdef;
7577 reduc_def = op.ops[STMT_VINFO_REDUC_IDX (vdef)];
7578 reduc_chain_length++;
7579 if (!stmt_info && slp_node)
7580 slp_for_stmt_info = SLP_TREE_CHILDREN (slp_for_stmt_info)[0];
7581 }
7582 /* PHIs should not participate in patterns. */
7583 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
7584
7585 if (nested_in_vect_loop_p (loop, stmt_info))
7586 {
7587 loop = loop->inner;
7588 nested_cycle = true;
7589 }
7590
7591 /* STMT_VINFO_REDUC_DEF doesn't point to the first but the last
7592 element. */
7593 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7594 {
7595 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (stmt_info));
7596 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
7597 }
7598 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7599 gcc_assert (slp_node
7600 && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info);
7601
7602 /* 1. Is vectorizable reduction? */
7603 /* Not supportable if the reduction variable is used in the loop, unless
7604 it's a reduction chain. */
7605 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
7606 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7607 return false;
7608
7609 /* Reductions that are not used even in an enclosing outer-loop,
7610 are expected to be "live" (used out of the loop). */
7611 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
7612 && !STMT_VINFO_LIVE_P (stmt_info))
7613 return false;
7614
7615 /* 2. Has this been recognized as a reduction pattern?
7616
7617 Check if STMT represents a pattern that has been recognized
7618 in earlier analysis stages. For stmts that represent a pattern,
7619 the STMT_VINFO_RELATED_STMT field records the last stmt in
7620 the original sequence that constitutes the pattern. */
7621
7622 stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
7623 if (orig_stmt_info)
7624 {
7625 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
7626 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
7627 }
7628
7629 /* 3. Check the operands of the operation. The first operands are defined
7630 inside the loop body. The last operand is the reduction variable,
7631 which is defined by the loop-header-phi. */
7632
7633 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7634 STMT_VINFO_REDUC_VECTYPE (reduc_info) = vectype_out;
7635 gimple_match_op op;
7636 if (!gimple_extract_op (stmt_info->stmt, &op))
7637 gcc_unreachable ();
7638 bool lane_reduc_code_p = (op.code == DOT_PROD_EXPR
7639 || op.code == WIDEN_SUM_EXPR
7640 || op.code == SAD_EXPR);
7641
7642 if (!POINTER_TYPE_P (op.type) && !INTEGRAL_TYPE_P (op.type)
7643 && !SCALAR_FLOAT_TYPE_P (op.type))
7644 return false;
7645
7646 /* Do not try to vectorize bit-precision reductions. */
7647 if (!type_has_mode_precision_p (op.type))
7648 return false;
7649
7650 /* For lane-reducing ops we're reducing the number of reduction PHIs,
7651 which means the only use of such a PHI may be in the lane-reducing operation. */
7652 if (lane_reduc_code_p
7653 && reduc_chain_length != 1
7654 && !only_slp_reduc_chain)
7655 {
7656 if (dump_enabled_p ())
7657 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7658 "lane-reducing reduction with extra stmts.\n");
7659 return false;
7660 }
7661
7662 /* All uses but the last are expected to be defined in the loop.
7663 The last use is the reduction variable. In case of nested cycle this
7664 assumption is not true: we use reduc_index to record the index of the
7665 reduction variable. */
7666 slp_tree *slp_op = XALLOCAVEC (slp_tree, op.num_ops);
7667 tree *vectype_op = XALLOCAVEC (tree, op.num_ops);
7668 /* We need to skip an extra operand for COND_EXPRs with embedded
7669 comparison. */
7670 unsigned opno_adjust = 0;
7671 if (op.code == COND_EXPR && COMPARISON_CLASS_P (op.ops[0]))
7672 opno_adjust = 1;
7673 for (i = 0; i < (int) op.num_ops; i++)
7674 {
7675 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
7676 if (i == 0 && op.code == COND_EXPR)
7677 continue;
7678
7679 stmt_vec_info def_stmt_info;
7680 enum vect_def_type dt;
7681 if (!vect_is_simple_use (loop_vinfo, stmt_info, slp_for_stmt_info,
7682 i + opno_adjust, &op.ops[i], &slp_op[i], &dt,
7683 &vectype_op[i], &def_stmt_info))
7684 {
7685 if (dump_enabled_p ())
7686 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7687 "use not simple.\n");
7688 return false;
7689 }
7690 if (i == STMT_VINFO_REDUC_IDX (stmt_info))
7691 continue;
7692
7693 /* For an IFN_COND_OP we might hit the reduction definition operand
7694 twice (once as definition, once as else). */
7695 if (op.ops[i] == op.ops[STMT_VINFO_REDUC_IDX (stmt_info)])
7696 continue;
7697
7698 /* There should be only one cycle def in the stmt, the one
7699 leading to reduc_def. */
7700 if (VECTORIZABLE_CYCLE_DEF (dt))
7701 return false;
7702
7703 if (!vectype_op[i])
7704 vectype_op[i]
7705 = get_vectype_for_scalar_type (loop_vinfo,
7706 TREE_TYPE (op.ops[i]), slp_op[i]);
7707
7708 /* To properly compute ncopies we are interested in the widest
7709 non-reduction input type in case we're looking at a widening
7710 accumulation that we later handle in vect_transform_reduction. */
7711 if (lane_reduc_code_p
7712 && vectype_op[i]
7713 && (!vectype_in
7714 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
7715 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_op[i]))))))
7716 vectype_in = vectype_op[i];
7717
7718 /* Record how the non-reduction-def value of COND_EXPR is defined.
7719 ??? For a chain of multiple CONDs we'd have to match them up all. */
7720 if (op.code == COND_EXPR && reduc_chain_length == 1)
7721 {
7722 if (dt == vect_constant_def)
7723 {
7724 cond_reduc_dt = dt;
7725 cond_reduc_val = op.ops[i];
7726 }
7727 else if (dt == vect_induction_def
7728 && def_stmt_info
7729 && is_nonwrapping_integer_induction (def_stmt_info, loop))
7730 {
7731 cond_reduc_dt = dt;
7732 cond_stmt_vinfo = def_stmt_info;
7733 }
7734 }
7735 }
7736 if (!vectype_in)
7737 vectype_in = STMT_VINFO_VECTYPE (phi_info);
7738 STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in;
7739
7740 enum vect_reduction_type v_reduc_type = STMT_VINFO_REDUC_TYPE (phi_info);
7741 STMT_VINFO_REDUC_TYPE (reduc_info) = v_reduc_type;
7742 /* If we have a condition reduction, see if we can simplify it further. */
7743 if (v_reduc_type == COND_REDUCTION)
7744 {
7745 if (slp_node)
7746 return false;
7747
7748 /* When the condition uses the reduction value in the condition, fail. */
7749 if (STMT_VINFO_REDUC_IDX (stmt_info) == 0)
7750 {
7751 if (dump_enabled_p ())
7752 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7753 "condition depends on previous iteration\n");
7754 return false;
7755 }
7756
7757 if (reduc_chain_length == 1
7758 && (direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST, vectype_in,
7759 OPTIMIZE_FOR_SPEED)
7760 || direct_internal_fn_supported_p (IFN_LEN_FOLD_EXTRACT_LAST,
7761 vectype_in,
7762 OPTIMIZE_FOR_SPEED)))
7763 {
7764 if (dump_enabled_p ())
7765 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7766 "optimizing condition reduction with"
7767 " FOLD_EXTRACT_LAST.\n");
7768 STMT_VINFO_REDUC_TYPE (reduc_info) = EXTRACT_LAST_REDUCTION;
7769 }
7770 else if (cond_reduc_dt == vect_induction_def)
7771 {
7772 tree base
7773 = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
7774 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
7775
7776 gcc_assert (TREE_CODE (base) == INTEGER_CST
7777 && TREE_CODE (step) == INTEGER_CST);
7778 cond_reduc_val = NULL_TREE;
7779 enum tree_code cond_reduc_op_code = ERROR_MARK;
7780 tree res = PHI_RESULT (STMT_VINFO_STMT (cond_stmt_vinfo));
7781 if (!types_compatible_p (TREE_TYPE (res), TREE_TYPE (base)))
7782 ;
7783 /* Find a suitable value: below base for MAX_EXPR, above base for
7784 MIN_EXPR; for now punt if base is the minimum value of the type
7785 for MAX_EXPR or the maximum value of the type for MIN_EXPR. */
7786 else if (tree_int_cst_sgn (step) == -1)
7787 {
7788 cond_reduc_op_code = MIN_EXPR;
7789 if (tree_int_cst_sgn (base) == -1)
7790 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
7791 else if (tree_int_cst_lt (base,
7792 TYPE_MAX_VALUE (TREE_TYPE (base))))
7793 cond_reduc_val
7794 = int_const_binop (PLUS_EXPR, base, integer_one_node);
7795 }
7796 else
7797 {
7798 cond_reduc_op_code = MAX_EXPR;
7799 if (tree_int_cst_sgn (base) == 1)
7800 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
7801 else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
7802 base))
7803 cond_reduc_val
7804 = int_const_binop (MINUS_EXPR, base, integer_one_node);
7805 }
7806 if (cond_reduc_val)
7807 {
7808 if (dump_enabled_p ())
7809 dump_printf_loc (MSG_NOTE, vect_location,
7810 "condition expression based on "
7811 "integer induction.\n");
7812 STMT_VINFO_REDUC_CODE (reduc_info) = cond_reduc_op_code;
7813 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info)
7814 = cond_reduc_val;
7815 STMT_VINFO_REDUC_TYPE (reduc_info) = INTEGER_INDUC_COND_REDUCTION;
7816 }
7817 }
7818 else if (cond_reduc_dt == vect_constant_def)
7819 {
7820 enum vect_def_type cond_initial_dt;
7821 tree cond_initial_val = vect_phi_initial_value (reduc_def_phi);
7822 vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
7823 if (cond_initial_dt == vect_constant_def
7824 && types_compatible_p (TREE_TYPE (cond_initial_val),
7825 TREE_TYPE (cond_reduc_val)))
7826 {
7827 tree e = fold_binary (LE_EXPR, boolean_type_node,
7828 cond_initial_val, cond_reduc_val);
7829 if (e && (integer_onep (e) || integer_zerop (e)))
7830 {
7831 if (dump_enabled_p ())
7832 dump_printf_loc (MSG_NOTE, vect_location,
7833 "condition expression based on "
7834 "compile time constant.\n");
7835 /* Record reduction code at analysis stage. */
7836 STMT_VINFO_REDUC_CODE (reduc_info)
7837 = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
7838 STMT_VINFO_REDUC_TYPE (reduc_info) = CONST_COND_REDUCTION;
7839 }
7840 }
7841 }
7842 }
7843
7844 if (STMT_VINFO_LIVE_P (phi_info))
7845 return false;
7846
7847 if (slp_node)
7848 ncopies = 1;
7849 else
7850 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7851
7852 gcc_assert (ncopies >= 1);
7853
7854 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
7855
7856 if (nested_cycle)
7857 {
7858 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info)
7859 == vect_double_reduction_def);
7860 double_reduc = true;
7861 }
7862
7863 /* 4.2. Check support for the epilog operation.
7864
7865 If STMT represents a reduction pattern, then the type of the
7866 reduction variable may be different than the type of the rest
7867 of the arguments. For example, consider the case of accumulation
7868 of shorts into an int accumulator; The original code:
7869 S1: int_a = (int) short_a;
7870 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
7871
7872 was replaced with:
7873 STMT: int_acc = widen_sum <short_a, int_acc>
7874
7875 This means that:
7876 1. The tree-code that is used to create the vector operation in the
7877 epilog code (that reduces the partial results) is not the
7878 tree-code of STMT, but is rather the tree-code of the original
7879 stmt from the pattern that STMT is replacing. I.e, in the example
7880 above we want to use 'widen_sum' in the loop, but 'plus' in the
7881 epilog.
7882 2. The type (mode) we use to check available target support
7883 for the vector operation to be created in the *epilog*, is
7884 determined by the type of the reduction variable (in the example
7885 above we'd check this: optab_handler (plus_optab, vect_int_mode])).
7886 However the type (mode) we use to check available target support
7887 for the vector operation to be created *inside the loop*, is
7888 determined by the type of the other arguments to STMT (in the
7889 example we'd check this: optab_handler (widen_sum_optab,
7890 vect_short_mode)).
7891
7892 This is contrary to "regular" reductions, in which the types of all
7893 the arguments are the same as the type of the reduction variable.
7894 For "regular" reductions we can therefore use the same vector type
7895 (and also the same tree-code) when generating the epilog code and
7896 when generating the code inside the loop. */
7897
7898 code_helper orig_code = STMT_VINFO_REDUC_CODE (phi_info);
7899
7900 /* If-conversion might already have created a conditional operation like
7901 IFN_COND_ADD. If so, use the underlying tree code for the following checks. */
7902 if (orig_code.is_internal_fn ())
7903 {
7904 tree_code new_code = conditional_internal_fn_code (internal_fn (orig_code));
7905 orig_code = new_code != ERROR_MARK ? new_code : orig_code;
7906 }
7907
7908 STMT_VINFO_REDUC_CODE (reduc_info) = orig_code;
7909
7910 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
7911 if (reduction_type == TREE_CODE_REDUCTION)
7912 {
7913 /* Check whether it's ok to change the order of the computation.
7914 Generally, when vectorizing a reduction we change the order of the
7915 computation. This may change the behavior of the program in some
7916 cases, so we need to check that this is ok. One exception is when
7917 vectorizing an outer-loop: the inner-loop is executed sequentially,
7918 and therefore vectorizing reductions in the inner-loop during
7919 outer-loop vectorization is safe. Likewise when we are vectorizing
7920 a series of reductions using SLP and the VF is one the reductions
7921 are performed in scalar order. */
7922 if (slp_node
7923 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
7924 && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1u))
7925 ;
7926 else if (needs_fold_left_reduction_p (op.type, orig_code))
7927 {
7928 /* When vectorizing a reduction chain w/o SLP the reduction PHI
7929 is not directly used in stmt. */
7930 if (!only_slp_reduc_chain
7931 && reduc_chain_length != 1)
7932 {
7933 if (dump_enabled_p ())
7934 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7935 "in-order reduction chain without SLP.\n");
7936 return false;
7937 }
7938 STMT_VINFO_REDUC_TYPE (reduc_info)
7939 = reduction_type = FOLD_LEFT_REDUCTION;
7940 }
7941 else if (!commutative_binary_op_p (orig_code, op.type)
7942 || !associative_binary_op_p (orig_code, op.type))
7943 {
7944 if (dump_enabled_p ())
7945 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7946 "reduction: not commutative/associative\n");
7947 return false;
7948 }
7949 }
7950
7951 if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
7952 && ncopies > 1)
7953 {
7954 if (dump_enabled_p ())
7955 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7956 "multiple types in double reduction or condition "
7957 "reduction or fold-left reduction.\n");
7958 return false;
7959 }
7960
7961 internal_fn reduc_fn = IFN_LAST;
7962 if (reduction_type == TREE_CODE_REDUCTION
7963 || reduction_type == FOLD_LEFT_REDUCTION
7964 || reduction_type == INTEGER_INDUC_COND_REDUCTION
7965 || reduction_type == CONST_COND_REDUCTION)
7966 {
7967 if (reduction_type == FOLD_LEFT_REDUCTION
7968 ? fold_left_reduction_fn (orig_code, &reduc_fn)
7969 : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
7970 {
7971 if (reduc_fn != IFN_LAST
7972 && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
7973 OPTIMIZE_FOR_SPEED))
7974 {
7975 if (dump_enabled_p ())
7976 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7977 "reduc op not supported by target.\n");
7978
7979 reduc_fn = IFN_LAST;
7980 }
7981 }
7982 else
7983 {
7984 if (!nested_cycle || double_reduc)
7985 {
7986 if (dump_enabled_p ())
7987 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7988 "no reduc code for scalar code.\n");
7989
7990 return false;
7991 }
7992 }
7993 }
7994 else if (reduction_type == COND_REDUCTION)
7995 {
7996 int scalar_precision
7997 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (op.type));
7998 cr_index_scalar_type = make_unsigned_type (scalar_precision);
7999 cr_index_vector_type = get_same_sized_vectype (cr_index_scalar_type,
8000 vectype_out);
8001
8002 if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
8003 OPTIMIZE_FOR_SPEED))
8004 reduc_fn = IFN_REDUC_MAX;
8005 }
8006 STMT_VINFO_REDUC_FN (reduc_info) = reduc_fn;
8007
8008 if (reduction_type != EXTRACT_LAST_REDUCTION
8009 && (!nested_cycle || double_reduc)
8010 && reduc_fn == IFN_LAST
8011 && !nunits_out.is_constant ())
8012 {
8013 if (dump_enabled_p ())
8014 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8015 "missing target support for reduction on"
8016 " variable-length vectors.\n");
8017 return false;
8018 }
8019
8020 /* For SLP reductions, see if there is a neutral value we can use. */
8021 tree neutral_op = NULL_TREE;
8022 if (slp_node)
8023 {
8024 tree initial_value = NULL_TREE;
8025 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL)
8026 initial_value = vect_phi_initial_value (reduc_def_phi);
8027 neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype_out),
8028 orig_code, initial_value);
8029 }
8030
8031 if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
8032 {
8033 /* We can't support in-order reductions of code such as this:
8034
8035 for (int i = 0; i < n1; ++i)
8036 for (int j = 0; j < n2; ++j)
8037 l += a[j];
8038
8039 since GCC effectively transforms the loop when vectorizing:
8040
8041 for (int i = 0; i < n1 / VF; ++i)
8042 for (int j = 0; j < n2; ++j)
8043 for (int k = 0; k < VF; ++k)
8044 l += a[j];
8045
8046 which is a reassociation of the original operation. */
8047 if (dump_enabled_p ())
8048 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8049 "in-order double reduction not supported.\n");
8050
8051 return false;
8052 }
8053
8054 if (reduction_type == FOLD_LEFT_REDUCTION
8055 && slp_node
8056 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
8057 {
8058 /* We cannot use in-order reductions in this case because there is
8059 an implicit reassociation of the operations involved. */
8060 if (dump_enabled_p ())
8061 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8062 "in-order unchained SLP reductions not supported.\n");
8063 return false;
8064 }
8065
8066 /* For double reductions, and for SLP reductions with a neutral value,
8067 we construct a variable-length initial vector by loading a vector
8068 full of the neutral value and then shift-and-inserting the start
8069 values into the low-numbered elements. */
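/* As an illustration (values chosen for the example, not taken from the
   code): a plus reduction with a single scalar start value S starts from
   { 0, 0, ..., 0 } and shift-and-inserts S, giving { S, 0, ..., 0 }.  */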
8070 if ((double_reduc || neutral_op)
8071 && !nunits_out.is_constant ()
8072 && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
8073 vectype_out, OPTIMIZE_FOR_SPEED))
8074 {
8075 if (dump_enabled_p ())
8076 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8077 "reduction on variable-length vectors requires"
8078 " target support for a vector-shift-and-insert"
8079 " operation.\n");
8080 return false;
8081 }
8082
8083 /* Check extra constraints for variable-length unchained SLP reductions. */
8084 if (slp_node
8085 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
8086 && !nunits_out.is_constant ())
8087 {
8088 /* We checked above that we could build the initial vector when
8089 there's a neutral element value. Check here for the case in
8090 which each SLP statement has its own initial value and in which
8091 that value needs to be repeated for every instance of the
8092 statement within the initial vector. */
8093 unsigned int group_size = SLP_TREE_LANES (slp_node);
8094 if (!neutral_op
8095 && !can_duplicate_and_interleave_p (loop_vinfo, group_size,
8096 TREE_TYPE (vectype_out)))
8097 {
8098 if (dump_enabled_p ())
8099 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8100 "unsupported form of SLP reduction for"
8101 " variable-length vectors: cannot build"
8102 " initial vector.\n");
8103 return false;
8104 }
8105 /* The epilogue code relies on the number of elements being a multiple
8106 of the group size. The duplicate-and-interleave approach to setting
8107 up the initial vector does too. */
8108 if (!multiple_p (nunits_out, group_size))
8109 {
8110 if (dump_enabled_p ())
8111 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8112 "unsupported form of SLP reduction for"
8113 " variable-length vectors: the vector size"
8114 " is not a multiple of the number of results.\n");
8115 return false;
8116 }
8117 }
8118
8119 if (reduction_type == COND_REDUCTION)
8120 {
8121 widest_int ni;
8122
8123 if (! max_loop_iterations (loop, &ni))
8124 {
8125 if (dump_enabled_p ())
8126 dump_printf_loc (MSG_NOTE, vect_location,
8127 "loop count not known, cannot create cond "
8128 "reduction.\n");
8129 return false;
8130 }
8131 /* Convert backedges to iterations. */
8132 ni += 1;
8133
8134 /* The additional index will be the same type as the condition. Check
8135 that the loop iteration count fits into this type less one (because we'll use up the
8136 zero slot for when there are no matches). */
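/* For illustration only: with an 8-bit unsigned index type at most 254
   iterations can be handled, since indices 1..254 mark matches and
   index 0 is reserved for "no match".  */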
8137 tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
8138 if (wi::geu_p (ni, wi::to_widest (max_index)))
8139 {
8140 if (dump_enabled_p ())
8141 dump_printf_loc (MSG_NOTE, vect_location,
8142 "loop size is greater than data size.\n");
8143 return false;
8144 }
8145 }
8146
8147 /* In case the vectorization factor (VF) is bigger than the number
8148 of elements that we can fit in a vectype (nunits), we have to generate
8149 more than one vector stmt - i.e. - we need to "unroll" the
8150 vector stmt by a factor VF/nunits. For more details see documentation
8151 in vectorizable_operation. */
8152
8153 /* If the reduction is used in an outer loop we need to generate
8154 VF intermediate results, like so (e.g. for ncopies=2):
8155 r0 = phi (init, r0)
8156 r1 = phi (init, r1)
8157 r0 = x0 + r0;
8158 r1 = x1 + r1;
8159 (i.e. we generate VF results in 2 registers).
8160 In this case we have a separate def-use cycle for each copy, and therefore
8161 for each copy we get the vector def for the reduction variable from the
8162 respective phi node created for this copy.
8163
8164 Otherwise (the reduction is unused in the loop nest), we can combine
8165 together intermediate results, like so (e.g. for ncopies=2):
8166 r = phi (init, r)
8167 r = x0 + r;
8168 r = x1 + r;
8169 (i.e. we generate VF/2 results in a single register).
8170 In this case for each copy we get the vector def for the reduction variable
8171 from the vectorized reduction operation generated in the previous iteration.
8172
8173 This only works when we see both the reduction PHI and its only consumer
8174 in vectorizable_reduction and there are no intermediate stmts
8175 participating. When unrolling we want each unrolled iteration to have its
8176 own reduction accumulator since one of the main goals of unrolling a
8177 reduction is to reduce the aggregate loop-carried latency. */
8178 if (ncopies > 1
8179 && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
8180 && reduc_chain_length == 1
8181 && loop_vinfo->suggested_unroll_factor == 1)
8182 single_defuse_cycle = true;
8183
8184 if (single_defuse_cycle || lane_reduc_code_p)
8185 {
8186 gcc_assert (op.code != COND_EXPR);
8187
8188 /* 4. Supportable by target? */
8189 bool ok = true;
8190
8191 /* 4.1. check support for the operation in the loop
8192
8193 This isn't necessary for the lane reduction codes, since they
8194 can only be produced by pattern matching, and it's up to the
8195 pattern matcher to test for support. The main reason for
8196 specifically skipping this step is to avoid rechecking whether
8197 mixed-sign dot-products can be implemented using signed
8198 dot-products. */
8199 machine_mode vec_mode = TYPE_MODE (vectype_in);
8200 if (!lane_reduc_code_p
8201 && !directly_supported_p (op.code, vectype_in, optab_vector))
8202 {
8203 if (dump_enabled_p ())
8204 dump_printf (MSG_NOTE, "op not supported by target.\n");
8205 if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
8206 || !vect_can_vectorize_without_simd_p (op.code))
8207 ok = false;
8208 else
8209 if (dump_enabled_p ())
8210 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
8211 }
8212
8213 if (vect_emulated_vector_p (vectype_in)
8214 && !vect_can_vectorize_without_simd_p (op.code))
8215 {
8216 if (dump_enabled_p ())
8217 dump_printf (MSG_NOTE, "using word mode not possible.\n");
8218 return false;
8219 }
8220
8221 /* lane-reducing operations have to go through vect_transform_reduction.
8222 For the other cases try without the single cycle optimization. */
8223 if (!ok)
8224 {
8225 if (lane_reduc_code_p)
8226 return false;
8227 else
8228 single_defuse_cycle = false;
8229 }
8230 }
8231 STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info) = single_defuse_cycle;
8232
8233 /* If the reduction stmt is one of the patterns that have lane
8234 reduction embedded we cannot handle the case of ! single_defuse_cycle. */
8235 if ((ncopies > 1 && ! single_defuse_cycle)
8236 && lane_reduc_code_p)
8237 {
8238 if (dump_enabled_p ())
8239 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8240 "multi def-use cycle not possible for lane-reducing "
8241 "reduction operation\n");
8242 return false;
8243 }
8244
8245 if (slp_node
8246 && !(!single_defuse_cycle
8247 && !lane_reduc_code_p
8248 && reduction_type != FOLD_LEFT_REDUCTION))
8249 for (i = 0; i < (int) op.num_ops; i++)
8250 if (!vect_maybe_update_slp_op_vectype (slp_op[i], vectype_op[i]))
8251 {
8252 if (dump_enabled_p ())
8253 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8254 "incompatible vector types for invariants\n");
8255 return false;
8256 }
8257
8258 if (slp_node)
8259 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8260 else
8261 vec_num = 1;
8262
8263 vect_model_reduction_cost (loop_vinfo, stmt_info, reduc_fn,
8264 reduction_type, ncopies, cost_vec);
8265 /* Cost the reduction op inside the loop if transformed via
8266 vect_transform_reduction. Otherwise this is costed by the
8267 separate vectorizable_* routines. */
8268 if (single_defuse_cycle || lane_reduc_code_p)
8269 {
8270 int factor = 1;
8271 if (vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info))
8272 /* Three dot-products and a subtraction. */
8273 factor = 4;
8274 record_stmt_cost (cost_vec, ncopies * factor, vector_stmt,
8275 stmt_info, 0, vect_body);
8276 }
8277
8278 if (dump_enabled_p ()
8279 && reduction_type == FOLD_LEFT_REDUCTION)
8280 dump_printf_loc (MSG_NOTE, vect_location,
8281 "using an in-order (fold-left) reduction.\n");
8282 STMT_VINFO_TYPE (orig_stmt_of_analysis) = cycle_phi_info_type;
8283 /* All but single defuse-cycle optimized, lane-reducing and fold-left
8284 reductions go through their own vectorizable_* routines. */
8285 if (!single_defuse_cycle
8286 && !lane_reduc_code_p
8287 && reduction_type != FOLD_LEFT_REDUCTION)
8288 {
8289 stmt_vec_info tem
8290 = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
8291 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (tem))
8292 {
8293 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (tem));
8294 tem = REDUC_GROUP_FIRST_ELEMENT (tem);
8295 }
8296 STMT_VINFO_DEF_TYPE (vect_orig_stmt (tem)) = vect_internal_def;
8297 STMT_VINFO_DEF_TYPE (tem) = vect_internal_def;
8298 }
8299 else if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
8300 {
8301 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
8302 vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
8303 internal_fn cond_fn = get_conditional_internal_fn (op.code, op.type);
8304
8305 if (reduction_type != FOLD_LEFT_REDUCTION
8306 && !use_mask_by_cond_expr_p (op.code, cond_fn, vectype_in)
8307 && (cond_fn == IFN_LAST
8308 || !direct_internal_fn_supported_p (cond_fn, vectype_in,
8309 OPTIMIZE_FOR_SPEED)))
8310 {
8311 if (dump_enabled_p ())
8312 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8313 "can't operate on partial vectors because"
8314 " no conditional operation is available.\n");
8315 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8316 }
8317 else if (reduction_type == FOLD_LEFT_REDUCTION
8318 && reduc_fn == IFN_LAST
8319 && !expand_vec_cond_expr_p (vectype_in,
8320 truth_type_for (vectype_in),
8321 SSA_NAME))
8322 {
8323 if (dump_enabled_p ())
8324 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8325 "can't operate on partial vectors because"
8326 " no conditional operation is available.\n");
8327 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8328 }
8329 else if (reduction_type == FOLD_LEFT_REDUCTION
8330 && internal_fn_mask_index (reduc_fn) == -1
8331 && FLOAT_TYPE_P (vectype_in)
8332 && HONOR_SIGN_DEPENDENT_ROUNDING (vectype_in))
8333 {
8334 if (dump_enabled_p ())
8335 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8336 "can't operate on partial vectors because"
8337 " signed zeros cannot be preserved.\n");
8338 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8339 }
8340 else
8341 {
8342 internal_fn mask_reduc_fn
8343 = get_masked_reduction_fn (reduc_fn, vectype_in);
8344
8345 if (mask_reduc_fn == IFN_MASK_LEN_FOLD_LEFT_PLUS)
8346 vect_record_loop_len (loop_vinfo, lens, ncopies * vec_num,
8347 vectype_in, 1);
8348 else
8349 vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
8350 vectype_in, NULL);
8351 }
8352 }
8353 return true;
8354 }
8355
8356 /* STMT_INFO is a dot-product reduction whose multiplication operands
8357 have different signs. Emit a sequence to emulate the operation
8358 using a series of signed DOT_PROD_EXPRs and return the last
8359 statement generated. VEC_DEST is the result of the vector operation
8360 and VOP lists its inputs. */
8361
8362 static gassign *
8363 vect_emulate_mixed_dot_prod (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
8364 gimple_stmt_iterator *gsi, tree vec_dest,
8365 tree vop[3])
8366 {
8367 tree wide_vectype = signed_type_for (TREE_TYPE (vec_dest));
8368 tree narrow_vectype = signed_type_for (TREE_TYPE (vop[0]));
8369 tree narrow_elttype = TREE_TYPE (narrow_vectype);
8370 gimple *new_stmt;
8371
8372 /* Make VOP[0] the unsigned operand and VOP[1] the signed operand. */
8373 if (!TYPE_UNSIGNED (TREE_TYPE (vop[0])))
8374 std::swap (vop[0], vop[1]);
8375
8376 /* Convert all inputs to signed types. */
8377 for (int i = 0; i < 3; ++i)
8378 if (TYPE_UNSIGNED (TREE_TYPE (vop[i])))
8379 {
8380 tree tmp = make_ssa_name (signed_type_for (TREE_TYPE (vop[i])));
8381 new_stmt = gimple_build_assign (tmp, NOP_EXPR, vop[i]);
8382 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8383 vop[i] = tmp;
8384 }
8385
8386 /* In the comments below we assume 8-bit inputs for simplicity,
8387 but the approach works for any full integer type. */
8388
8389 /* Create a vector of -128. */
8390 tree min_narrow_elttype = TYPE_MIN_VALUE (narrow_elttype);
8391 tree min_narrow = build_vector_from_val (narrow_vectype,
8392 min_narrow_elttype);
8393
8394 /* Create a vector of 64. */
8395 auto half_wi = wi::lrshift (wi::to_wide (min_narrow_elttype), 1);
8396 tree half_narrow = wide_int_to_tree (narrow_elttype, half_wi);
8397 half_narrow = build_vector_from_val (narrow_vectype, half_narrow);
8398
8399 /* Emit: SUB_RES = VOP[0] - 128. */
8400 tree sub_res = make_ssa_name (narrow_vectype);
8401 new_stmt = gimple_build_assign (sub_res, PLUS_EXPR, vop[0], min_narrow);
8402 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8403
8404 /* Emit:
8405
8406 STAGE1 = DOT_PROD_EXPR <VOP[1], 64, VOP[2]>;
8407 STAGE2 = DOT_PROD_EXPR <VOP[1], 64, STAGE1>;
8408 STAGE3 = DOT_PROD_EXPR <SUB_RES, VOP[1], STAGE2>;
8409
8410 on the basis that x * y == (x - 128) * y + 64 * y + 64 * y.
8411 Doing the two 64 * y steps first allows more time to compute x. */
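/* A worked illustration with assumed values: x = 200 (unsigned), y = -3
   (signed) gives (200 - 128) * -3 + 64 * -3 + 64 * -3
   = -216 - 192 - 192 = -600 = 200 * -3, and all narrow operands
   (72, 64, -3) fit in the signed 8-bit type.  */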
8412 tree stage1 = make_ssa_name (wide_vectype);
8413 new_stmt = gimple_build_assign (stage1, DOT_PROD_EXPR,
8414 vop[1], half_narrow, vop[2]);
8415 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8416
8417 tree stage2 = make_ssa_name (wide_vectype);
8418 new_stmt = gimple_build_assign (stage2, DOT_PROD_EXPR,
8419 vop[1], half_narrow, stage1);
8420 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8421
8422 tree stage3 = make_ssa_name (wide_vectype);
8423 new_stmt = gimple_build_assign (stage3, DOT_PROD_EXPR,
8424 sub_res, vop[1], stage2);
8425 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8426
8427 /* Convert STAGE3 to the reduction type. */
8428 return gimple_build_assign (vec_dest, CONVERT_EXPR, stage3);
8429 }
8430
8431 /* Transform the definition stmt STMT_INFO of a reduction PHI backedge
8432 value. */
8433
8434 bool
8435 vect_transform_reduction (loop_vec_info loop_vinfo,
8436 stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
8437 gimple **vec_stmt, slp_tree slp_node)
8438 {
8439 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
8440 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8441 int i;
8442 int ncopies;
8443 int vec_num;
8444
8445 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
8446 gcc_assert (reduc_info->is_reduc_info);
8447
8448 if (nested_in_vect_loop_p (loop, stmt_info))
8449 {
8450 loop = loop->inner;
8451 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info) == vect_double_reduction_def);
8452 }
8453
8454 gimple_match_op op;
8455 if (!gimple_extract_op (stmt_info->stmt, &op))
8456 gcc_unreachable ();
8457
8458 /* All uses but the last are expected to be defined in the loop.
8459 The last use is the reduction variable. In case of nested cycle this
8460 assumption is not true: we use reduc_index to record the index of the
8461 reduction variable. */
8462 stmt_vec_info phi_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info));
8463 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
8464 int reduc_index = STMT_VINFO_REDUC_IDX (stmt_info);
8465 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
8466
8467 if (slp_node)
8468 {
8469 ncopies = 1;
8470 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8471 }
8472 else
8473 {
8474 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
8475 vec_num = 1;
8476 }
8477
8478 code_helper code = canonicalize_code (op.code, op.type);
8479 internal_fn cond_fn = get_conditional_internal_fn (code, op.type);
8480
8481 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
8482 vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
8483 bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
8484
8485 /* Transform. */
8486 tree new_temp = NULL_TREE;
8487 auto_vec<tree> vec_oprnds0;
8488 auto_vec<tree> vec_oprnds1;
8489 auto_vec<tree> vec_oprnds2;
8490 tree def0;
8491
8492 if (dump_enabled_p ())
8493 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
8494
8495 /* FORNOW: Multiple types are not supported for condition. */
8496 if (code == COND_EXPR)
8497 gcc_assert (ncopies == 1);
8498
8499 /* A binary COND_OP reduction must have the same definition and else
8500 value. */
8501 bool cond_fn_p = code.is_internal_fn ()
8502 && conditional_internal_fn_code (internal_fn (code)) != ERROR_MARK;
8503 if (cond_fn_p)
8504 {
8505 gcc_assert (code == IFN_COND_ADD || code == IFN_COND_SUB
8506 || code == IFN_COND_MUL || code == IFN_COND_AND
8507 || code == IFN_COND_IOR || code == IFN_COND_XOR);
8508 gcc_assert (op.num_ops == 4
8509 && (op.ops[reduc_index]
8510 == op.ops[internal_fn_else_index ((internal_fn) code)]));
8511 }
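/* For example (illustrative only): r = .COND_ADD (mask, r, x, r) leaves
   the accumulator r unchanged in inactive lanes because the else value
   is the reduction operand itself.  */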
8512
8513 bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
8514
8515 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
8516 if (reduction_type == FOLD_LEFT_REDUCTION)
8517 {
8518 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
8519 gcc_assert (code.is_tree_code () || cond_fn_p);
8520 return vectorize_fold_left_reduction
8521 (loop_vinfo, stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi,
8522 code, reduc_fn, op.ops, op.num_ops, vectype_in,
8523 reduc_index, masks, lens);
8524 }
8525
8526 bool single_defuse_cycle = STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info);
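/* Besides fold-left reductions, which returned above, only single-cycle
   reductions and the lane-reducing codes (DOT_PROD_EXPR, WIDEN_SUM_EXPR,
   SAD_EXPR) are transformed here; everything else goes through the
   generic vectorizable_* routines.  */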
8527 gcc_assert (single_defuse_cycle
8528 || code == DOT_PROD_EXPR
8529 || code == WIDEN_SUM_EXPR
8530 || code == SAD_EXPR);
8531
8532 /* Create the destination vector */
8533 tree scalar_dest = gimple_get_lhs (stmt_info->stmt);
8534 tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
8535
8536 /* Get NCOPIES vector definitions for all operands except the reduction
8537 definition. */
8538 if (!cond_fn_p)
8539 {
8540 vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
8541 single_defuse_cycle && reduc_index == 0
8542 ? NULL_TREE : op.ops[0], &vec_oprnds0,
8543 single_defuse_cycle && reduc_index == 1
8544 ? NULL_TREE : op.ops[1], &vec_oprnds1,
8545 op.num_ops == 3
8546 && !(single_defuse_cycle && reduc_index == 2)
8547 ? op.ops[2] : NULL_TREE, &vec_oprnds2);
8548 }
8549 else
8550 {
8551 /* For a conditional operation pass the truth type as mask
8552 vectype. */
8553 gcc_assert (single_defuse_cycle
8554 && (reduc_index == 1 || reduc_index == 2));
8555 vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
8556 op.ops[0], truth_type_for (vectype_in), &vec_oprnds0,
8557 reduc_index == 1 ? NULL_TREE : op.ops[1],
8558 NULL_TREE, &vec_oprnds1,
8559 reduc_index == 2 ? NULL_TREE : op.ops[2],
8560 NULL_TREE, &vec_oprnds2);
8561 }
8562
8563 /* For single def-use cycles get one copy of the vectorized reduction
8564 definition. */
8565 if (single_defuse_cycle)
8566 {
8567 gcc_assert (!slp_node);
8568 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
8569 op.ops[reduc_index],
8570 reduc_index == 0 ? &vec_oprnds0
8571 : (reduc_index == 1 ? &vec_oprnds1
8572 : &vec_oprnds2));
8573 }
8574
8575 bool emulated_mixed_dot_prod
8576 = vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info);
8577 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
8578 {
8579 gimple *new_stmt;
8580 tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
8581 if (masked_loop_p && !mask_by_cond_expr)
8582 {
8583 /* No conditional ifns have been defined for dot-product yet. */
8584 gcc_assert (code != DOT_PROD_EXPR);
8585
8586 /* Make sure that the reduction accumulator is vop[0]. */
8587 if (reduc_index == 1)
8588 {
8589 gcc_assert (commutative_binary_op_p (code, op.type));
8590 std::swap (vop[0], vop[1]);
8591 }
8592 tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
8593 vec_num * ncopies, vectype_in, i);
8594 gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
8595 vop[0], vop[1], vop[0]);
8596 new_temp = make_ssa_name (vec_dest, call);
8597 gimple_call_set_lhs (call, new_temp);
8598 gimple_call_set_nothrow (call, true);
8599 vect_finish_stmt_generation (loop_vinfo, stmt_info, call, gsi);
8600 new_stmt = call;
8601 }
8602 else
8603 {
8604 if (op.num_ops >= 3)
8605 vop[2] = vec_oprnds2[i];
8606
8607 if (masked_loop_p && mask_by_cond_expr)
8608 {
8609 tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
8610 vec_num * ncopies, vectype_in, i);
8611 build_vect_cond_expr (code, vop, mask, gsi);
8612 }
8613
8614 if (emulated_mixed_dot_prod)
8615 new_stmt = vect_emulate_mixed_dot_prod (loop_vinfo, stmt_info, gsi,
8616 vec_dest, vop);
8617
8618 else if (code.is_internal_fn () && !cond_fn_p)
8619 new_stmt = gimple_build_call_internal (internal_fn (code),
8620 op.num_ops,
8621 vop[0], vop[1], vop[2]);
8622 else if (code.is_internal_fn () && cond_fn_p)
8623 new_stmt = gimple_build_call_internal (internal_fn (code),
8624 op.num_ops,
8625 vop[0], vop[1], vop[2],
8626 vop[1]);
8627 else
8628 new_stmt = gimple_build_assign (vec_dest, tree_code (op.code),
8629 vop[0], vop[1], vop[2]);
8630 new_temp = make_ssa_name (vec_dest, new_stmt);
8631 gimple_set_lhs (new_stmt, new_temp);
8632 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8633 }
8634
8635 if (slp_node)
8636 slp_node->push_vec_def (new_stmt);
8637 else if (single_defuse_cycle
8638 && i < ncopies - 1)
8639 {
8640 if (reduc_index == 0)
8641 vec_oprnds0.safe_push (gimple_get_lhs (new_stmt));
8642 else if (reduc_index == 1)
8643 vec_oprnds1.safe_push (gimple_get_lhs (new_stmt));
8644 else if (reduc_index == 2)
8645 vec_oprnds2.safe_push (gimple_get_lhs (new_stmt));
8646 }
8647 else
8648 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
8649 }
8650
8651 if (!slp_node)
8652 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
8653
8654 return true;
8655 }
8656
8657 /* Transform phase of a cycle PHI. */
8658
8659 bool
8660 vect_transform_cycle_phi (loop_vec_info loop_vinfo,
8661 stmt_vec_info stmt_info, gimple **vec_stmt,
8662 slp_tree slp_node, slp_instance slp_node_instance)
8663 {
8664 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
8665 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8666 int i;
8667 int ncopies;
8668 int j;
8669 bool nested_cycle = false;
8670 int vec_num;
8671
8672 if (nested_in_vect_loop_p (loop, stmt_info))
8673 {
8674 loop = loop->inner;
8675 nested_cycle = true;
8676 }
8677
8678 stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
8679 reduc_stmt_info = vect_stmt_to_vectorize (reduc_stmt_info);
8680 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
8681 gcc_assert (reduc_info->is_reduc_info);
8682
8683 if (STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION
8684 || STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION)
8685 /* Leave the scalar phi in place. */
8686 return true;
8687
8688 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
8689 /* For a nested cycle we do not fill the above. */
8690 if (!vectype_in)
8691 vectype_in = STMT_VINFO_VECTYPE (stmt_info);
8692 gcc_assert (vectype_in);
8693
8694 if (slp_node)
8695 {
8696 /* The size vect_schedule_slp_instance computes is off for us. */
8697 vec_num = vect_get_num_vectors (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
8698 * SLP_TREE_LANES (slp_node), vectype_in);
8699 ncopies = 1;
8700 }
8701 else
8702 {
8703 vec_num = 1;
8704 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
8705 }
8706
8707 /* Check whether we should use a single PHI node and accumulate
8708 vectors to one before the backedge. */
8709 if (STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info))
8710 ncopies = 1;
8711
8712 /* Create the destination vector */
8713 gphi *phi = as_a <gphi *> (stmt_info->stmt);
8714 tree vec_dest = vect_create_destination_var (gimple_phi_result (phi),
8715 vectype_out);
8716
8717 /* Get the loop-entry arguments. */
8718 tree vec_initial_def = NULL_TREE;
8719 auto_vec<tree> vec_initial_defs;
8720 if (slp_node)
8721 {
8722 vec_initial_defs.reserve (vec_num);
8723 if (nested_cycle)
8724 {
8725 unsigned phi_idx = loop_preheader_edge (loop)->dest_idx;
8726 vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[phi_idx],
8727 &vec_initial_defs);
8728 }
8729 else
8730 {
8731 gcc_assert (slp_node == slp_node_instance->reduc_phis);
8732 vec<tree> &initial_values = reduc_info->reduc_initial_values;
8733 vec<stmt_vec_info> &stmts = SLP_TREE_SCALAR_STMTS (slp_node);
8734
8735 unsigned int num_phis = stmts.length ();
8736 if (REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info))
8737 num_phis = 1;
8738 initial_values.reserve (num_phis);
8739 for (unsigned int i = 0; i < num_phis; ++i)
8740 {
8741 gphi *this_phi = as_a<gphi *> (stmts[i]->stmt);
8742 initial_values.quick_push (vect_phi_initial_value (this_phi));
8743 }
8744 if (vec_num == 1)
8745 vect_find_reusable_accumulator (loop_vinfo, reduc_info);
8746 if (!initial_values.is_empty ())
8747 {
8748 tree initial_value
8749 = (num_phis == 1 ? initial_values[0] : NULL_TREE);
8750 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
8751 tree neutral_op
8752 = neutral_op_for_reduction (TREE_TYPE (vectype_out),
8753 code, initial_value);
8754 get_initial_defs_for_reduction (loop_vinfo, reduc_info,
8755 &vec_initial_defs, vec_num,
8756 stmts.length (), neutral_op);
8757 }
8758 }
8759 }
8760 else
8761 {
8762 /* Get at the scalar def before the loop, that defines the initial
8763 value of the reduction variable. */
8764 tree initial_def = vect_phi_initial_value (phi);
8765 reduc_info->reduc_initial_values.safe_push (initial_def);
8766 /* Optimize: if initial_def is for REDUC_MAX smaller than the base
8767 and we can't use zero for induc_val, use initial_def. Similarly
8768 for REDUC_MIN and initial_def larger than the base. */
8769 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
8770 {
8771 tree induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
8772 if (TREE_CODE (initial_def) == INTEGER_CST
8773 && !integer_zerop (induc_val)
8774 && ((STMT_VINFO_REDUC_CODE (reduc_info) == MAX_EXPR
8775 && tree_int_cst_lt (initial_def, induc_val))
8776 || (STMT_VINFO_REDUC_CODE (reduc_info) == MIN_EXPR
8777 && tree_int_cst_lt (induc_val, initial_def))))
8778 {
8779 induc_val = initial_def;
8780 /* Communicate to epilogue generation that we used
8781 the initial_def. */
8782 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info) = NULL_TREE;
8783 }
8784 vec_initial_def = build_vector_from_val (vectype_out, induc_val);
8785 }
8786 else if (nested_cycle)
8787 {
8788 /* Do not use an adjustment def as that case is not supported
8789 correctly if ncopies is not one. */
8790 vect_get_vec_defs_for_operand (loop_vinfo, reduc_stmt_info,
8791 ncopies, initial_def,
8792 &vec_initial_defs);
8793 }
8794 else if (STMT_VINFO_REDUC_TYPE (reduc_info) == CONST_COND_REDUCTION
8795 || STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
8796 /* Fill the initial vector with the initial scalar value. */
8797 vec_initial_def
8798 = get_initial_def_for_reduction (loop_vinfo, reduc_stmt_info,
8799 initial_def, initial_def);
8800 else
8801 {
8802 if (ncopies == 1)
8803 vect_find_reusable_accumulator (loop_vinfo, reduc_info);
8804 if (!reduc_info->reduc_initial_values.is_empty ())
8805 {
8806 initial_def = reduc_info->reduc_initial_values[0];
8807 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
8808 tree neutral_op
8809 = neutral_op_for_reduction (TREE_TYPE (initial_def),
8810 code, initial_def);
8811 gcc_assert (neutral_op);
8812 /* Try to simplify the vector initialization by applying an
8813 adjustment after the reduction has been performed. */
8814 if (!reduc_info->reused_accumulator
8815 && STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
8816 && !operand_equal_p (neutral_op, initial_def))
8817 {
8818 STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info)
8819 = initial_def;
8820 initial_def = neutral_op;
8821 }
8822 vec_initial_def
8823 = get_initial_def_for_reduction (loop_vinfo, reduc_info,
8824 initial_def, neutral_op);
8825 }
8826 }
8827 }
8828
8829 if (vec_initial_def)
8830 {
8831 vec_initial_defs.create (ncopies);
8832 for (i = 0; i < ncopies; ++i)
8833 vec_initial_defs.quick_push (vec_initial_def);
8834 }
8835
8836 if (auto *accumulator = reduc_info->reused_accumulator)
8837 {
8838 tree def = accumulator->reduc_input;
8839 if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
8840 {
8841 unsigned int nreduc;
8842 bool res = constant_multiple_p (TYPE_VECTOR_SUBPARTS
8843 (TREE_TYPE (def)),
8844 TYPE_VECTOR_SUBPARTS (vectype_out),
8845 &nreduc);
8846 gcc_assert (res);
8847 gimple_seq stmts = NULL;
8848 /* Reduce the single vector to a smaller one. */
8849 if (nreduc != 1)
8850 {
8851 /* Perform the reduction in the appropriate type. */
8852 tree rvectype = vectype_out;
8853 if (!useless_type_conversion_p (TREE_TYPE (vectype_out),
8854 TREE_TYPE (TREE_TYPE (def))))
8855 rvectype = build_vector_type (TREE_TYPE (TREE_TYPE (def)),
8856 TYPE_VECTOR_SUBPARTS
8857 (vectype_out));
8858 def = vect_create_partial_epilog (def, rvectype,
8859 STMT_VINFO_REDUC_CODE
8860 (reduc_info),
8861 &stmts);
8862 }
8863 /* The epilogue loop might use a different vector mode, like
8864 VNx2DI vs. V2DI. */
8865 if (TYPE_MODE (vectype_out) != TYPE_MODE (TREE_TYPE (def)))
8866 {
8867 tree reduc_type = build_vector_type_for_mode
8868 (TREE_TYPE (TREE_TYPE (def)), TYPE_MODE (vectype_out));
8869 def = gimple_convert (&stmts, reduc_type, def);
8870 }
8871 /* Adjust the input so we pick up the partially reduced value
8872 for the skip edge in vect_create_epilog_for_reduction. */
8873 accumulator->reduc_input = def;
8874 /* And the reduction could be carried out using a different sign. */
8875 if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
8876 def = gimple_convert (&stmts, vectype_out, def);
8877 if (loop_vinfo->main_loop_edge)
8878 {
8879 /* While we'd like to insert on the edge this will split
8880 blocks and disturb bookkeeping, we also will eventually
8881 need this on the skip edge. Rely on sinking to
8882 fix up optimal placement and insert in the pred. */
8883 gimple_stmt_iterator gsi
8884 = gsi_last_bb (loop_vinfo->main_loop_edge->src);
8885 /* Insert before a cond that eventually skips the
8886 epilogue. */
8887 if (!gsi_end_p (gsi) && stmt_ends_bb_p (gsi_stmt (gsi)))
8888 gsi_prev (&gsi);
8889 gsi_insert_seq_after (&gsi, stmts, GSI_CONTINUE_LINKING);
8890 }
8891 else
8892 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop),
8893 stmts);
8894 }
8895 if (loop_vinfo->main_loop_edge)
8896 vec_initial_defs[0]
8897 = vect_get_main_loop_result (loop_vinfo, def,
8898 vec_initial_defs[0]);
8899 else
8900 vec_initial_defs.safe_push (def);
8901 }
8902
8903 /* Generate the reduction PHIs upfront. */
8904 for (i = 0; i < vec_num; i++)
8905 {
8906 tree vec_init_def = vec_initial_defs[i];
8907 for (j = 0; j < ncopies; j++)
8908 {
8909 /* Create the reduction-phi that defines the reduction
8910 operand. */
8911 gphi *new_phi = create_phi_node (vec_dest, loop->header);
8912
8913 /* Set the loop-entry arg of the reduction-phi. */
8914 if (j != 0 && nested_cycle)
8915 vec_init_def = vec_initial_defs[j];
8916 add_phi_arg (new_phi, vec_init_def, loop_preheader_edge (loop),
8917 UNKNOWN_LOCATION);
8918
8919 /* The loop-latch arg is set in epilogue processing. */
8920
8921 if (slp_node)
8922 slp_node->push_vec_def (new_phi);
8923 else
8924 {
8925 if (j == 0)
8926 *vec_stmt = new_phi;
8927 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
8928 }
8929 }
8930 }
8931
8932 return true;
8933 }
8934
8935 /* Vectorizes LC PHIs. */
8936
8937 bool
8938 vectorizable_lc_phi (loop_vec_info loop_vinfo,
8939 stmt_vec_info stmt_info, gimple **vec_stmt,
8940 slp_tree slp_node)
8941 {
8942 if (!loop_vinfo
8943 || !is_a <gphi *> (stmt_info->stmt)
8944 || gimple_phi_num_args (stmt_info->stmt) != 1)
8945 return false;
8946
8947 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def
8948 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
8949 return false;
8950
8951 if (!vec_stmt) /* transformation not required. */
8952 {
8953 /* Deal with copies from externs or constants that disguise as
8954 loop-closed PHI nodes (PR97886). */
8955 if (slp_node
8956 && !vect_maybe_update_slp_op_vectype (SLP_TREE_CHILDREN (slp_node)[0],
8957 SLP_TREE_VECTYPE (slp_node)))
8958 {
8959 if (dump_enabled_p ())
8960 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8961 "incompatible vector types for invariants\n");
8962 return false;
8963 }
8964 STMT_VINFO_TYPE (stmt_info) = lc_phi_info_type;
8965 return true;
8966 }
8967
8968 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
8969 tree scalar_dest = gimple_phi_result (stmt_info->stmt);
8970 basic_block bb = gimple_bb (stmt_info->stmt);
8971 edge e = single_pred_edge (bb);
8972 tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
8973 auto_vec<tree> vec_oprnds;
8974 vect_get_vec_defs (loop_vinfo, stmt_info, slp_node,
8975 !slp_node ? vect_get_num_copies (loop_vinfo, vectype) : 1,
8976 gimple_phi_arg_def (stmt_info->stmt, 0), &vec_oprnds);
8977 for (unsigned i = 0; i < vec_oprnds.length (); i++)
8978 {
8979 /* Create the vectorized LC PHI node. */
8980 gphi *new_phi = create_phi_node (vec_dest, bb);
8981 add_phi_arg (new_phi, vec_oprnds[i], e, UNKNOWN_LOCATION);
8982 if (slp_node)
8983 slp_node->push_vec_def (new_phi);
8984 else
8985 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
8986 }
8987 if (!slp_node)
8988 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
8989
8990 return true;
8991 }
8992
8993 /* Vectorizes PHIs. */
8994
8995 bool
8996 vectorizable_phi (vec_info *,
8997 stmt_vec_info stmt_info, gimple **vec_stmt,
8998 slp_tree slp_node, stmt_vector_for_cost *cost_vec)
8999 {
9000 if (!is_a <gphi *> (stmt_info->stmt) || !slp_node)
9001 return false;
9002
9003 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def)
9004 return false;
9005
9006 tree vectype = SLP_TREE_VECTYPE (slp_node);
9007
9008 if (!vec_stmt) /* transformation not required. */
9009 {
9010 slp_tree child;
9011 unsigned i;
9012 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), i, child)
9013 if (!child)
9014 {
9015 if (dump_enabled_p ())
9016 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9017 "PHI node with unvectorized backedge def\n");
9018 return false;
9019 }
9020 else if (!vect_maybe_update_slp_op_vectype (child, vectype))
9021 {
9022 if (dump_enabled_p ())
9023 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9024 "incompatible vector types for invariants\n");
9025 return false;
9026 }
9027 else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
9028 && !useless_type_conversion_p (vectype,
9029 SLP_TREE_VECTYPE (child)))
9030 {
9031 /* With bools we can have mask and non-mask precision vectors
9032 or different non-mask precisions. While pattern recog is
9033 supposed to guarantee consistency here, bugs in it can cause
9034 mismatches (PR103489 and PR103800 for example).
9035 Deal with them here instead of ICEing later. */
9036 if (dump_enabled_p ())
9037 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9038 "incompatible vector type setup from "
9039 "bool pattern detection\n");
9040 return false;
9041 }
9042
9043 /* For single-argument PHIs assume coalescing which means zero cost
9044 for the scalar and the vector PHIs. This avoids artificially
9045 favoring the vector path (but may pessimize it in some cases). */
9046 if (gimple_phi_num_args (as_a <gphi *> (stmt_info->stmt)) > 1)
9047 record_stmt_cost (cost_vec, SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
9048 vector_stmt, stmt_info, vectype, 0, vect_body);
9049 STMT_VINFO_TYPE (stmt_info) = phi_info_type;
9050 return true;
9051 }
9052
9053 tree scalar_dest = gimple_phi_result (stmt_info->stmt);
9054 basic_block bb = gimple_bb (stmt_info->stmt);
9055 tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
9056 auto_vec<gphi *> new_phis;
9057 for (unsigned i = 0; i < gimple_phi_num_args (stmt_info->stmt); ++i)
9058 {
9059 slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
9060
9061 /* Skip not yet vectorized defs. */
9062 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
9063 && SLP_TREE_VEC_DEFS (child).is_empty ())
9064 continue;
9065
9066 auto_vec<tree> vec_oprnds;
9067 vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[i], &vec_oprnds);
9068 if (!new_phis.exists ())
9069 {
9070 new_phis.create (vec_oprnds.length ());
9071 for (unsigned j = 0; j < vec_oprnds.length (); j++)
9072 {
9073 /* Create the vectorized PHI node. */
9074 new_phis.quick_push (create_phi_node (vec_dest, bb));
9075 slp_node->push_vec_def (new_phis[j]);
9076 }
9077 }
9078 edge e = gimple_phi_arg_edge (as_a <gphi *> (stmt_info->stmt), i);
9079 for (unsigned j = 0; j < vec_oprnds.length (); j++)
9080 add_phi_arg (new_phis[j], vec_oprnds[j], e, UNKNOWN_LOCATION);
9081 }
9082 /* We should have at least one already vectorized child. */
9083 gcc_assert (new_phis.exists ());
9084
9085 return true;
9086 }
9087
9088 /* Vectorizes first order recurrences. An overview of the transformation
9089 is described below. Suppose we have the following loop.
9090
9091 int t = 0;
9092 for (int i = 0; i < n; ++i)
9093 {
9094 b[i] = a[i] - t;
9095 t = a[i];
9096 }
9097
9098 There is a first-order recurrence on 't'. For this loop, the scalar IR
9099 looks (simplified) like:
9100
9101 scalar.preheader:
9102 init = 0;
9103
9104 scalar.body:
9105 i = PHI <0(scalar.preheader), i+1(scalar.body)>
9106 _2 = PHI <init(scalar.preheader), _1(scalar.body)>
9107 _1 = a[i]
9108 b[i] = _1 - _2
9109 if (i < n) goto scalar.body
9110
9111 In this example, _2 is a recurrence because its value depends on the
9112 previous iteration. We vectorize this as (VF = 4)
9113
9114 vector.preheader:
9115 vect_init = vect_cst(..., ..., ..., 0)
9116
9117 vector.body
9118 i = PHI <0(vector.preheader), i+4(vector.body)>
9119 vect_1 = PHI <vect_init(vector.preheader), v2(vector.body)>
9120 vect_2 = a[i, i+1, i+2, i+3];
9121 vect_3 = vec_perm (vect_1, vect_2, { 3, 4, 5, 6 })
9122 b[i, i+1, i+2, i+3] = vect_2 - vect_3
9123 if (..) goto vector.body
9124
9125 In this function, vectorizable_recurr, we code generate both the
9126 vector PHI node and the permute since those together compute the
9127 vectorized value of the scalar PHI. We do not yet have the
9128 backedge value to fill in there nor into the vec_perm. Those
9129 are filled in maybe_set_vectorized_backedge_value and
9130 vect_schedule_scc.
9131
9132 TODO: Since the scalar loop does not have a use of the recurrence
9133 outside of the loop the natural way to implement peeling via
9134 vectorizing the live value doesn't work. For now peeling of loops
9135 with a recurrence is not implemented. For SLP the supported cases
9136 are restricted to those requiring a single vector recurrence PHI. */
9137
9138 bool
9139 vectorizable_recurr (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
9140 gimple **vec_stmt, slp_tree slp_node,
9141 stmt_vector_for_cost *cost_vec)
9142 {
9143 if (!loop_vinfo || !is_a<gphi *> (stmt_info->stmt))
9144 return false;
9145
9146 gphi *phi = as_a<gphi *> (stmt_info->stmt);
9147
9148 /* So far we only support first-order recurrence auto-vectorization. */
9149 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_first_order_recurrence)
9150 return false;
9151
9152 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
9153 unsigned ncopies;
9154 if (slp_node)
9155 ncopies = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
9156 else
9157 ncopies = vect_get_num_copies (loop_vinfo, vectype);
9158 poly_int64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
9159 unsigned dist = slp_node ? SLP_TREE_LANES (slp_node) : 1;
9160 /* We need to be able to make progress with a single vector. */
9161 if (maybe_gt (dist * 2, nunits))
9162 {
9163 if (dump_enabled_p ())
9164 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9165 "first order recurrence exceeds half of "
9166 "a vector\n");
9167 return false;
9168 }
9169
9170 /* First-order recurrence autovectorization needs to handle permutation
9171 with indices = [nunits-1, nunits, nunits+1, ...]. */
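/* E.g. (illustration) for nunits = 4 and a single lane (dist = 1) the
   series is { 3, 4, 5, 6 }: the last element of the previous vector
   followed by the first three elements of the current one.  */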
9172 vec_perm_builder sel (nunits, 1, 3);
9173 for (int i = 0; i < 3; ++i)
9174 sel.quick_push (nunits - dist + i);
9175 vec_perm_indices indices (sel, 2, nunits);
9176
9177 if (!vec_stmt) /* transformation not required. */
9178 {
9179 if (!can_vec_perm_const_p (TYPE_MODE (vectype), TYPE_MODE (vectype),
9180 indices))
9181 return false;
9182
9183 if (slp_node)
9184 {
9185 /* We eventually need to set a vector type on invariant
9186 arguments. */
9187 unsigned j;
9188 slp_tree child;
9189 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
9190 if (!vect_maybe_update_slp_op_vectype
9191 (child, SLP_TREE_VECTYPE (slp_node)))
9192 {
9193 if (dump_enabled_p ())
9194 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9195 "incompatible vector types for "
9196 "invariants\n");
9197 return false;
9198 }
9199 }
9200 /* The recurrence costs the initialization vector and one permute
9201 for each copy. */
9202 unsigned prologue_cost = record_stmt_cost (cost_vec, 1, scalar_to_vec,
9203 stmt_info, 0, vect_prologue);
9204 unsigned inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
9205 stmt_info, 0, vect_body);
9206 if (dump_enabled_p ())
9207 dump_printf_loc (MSG_NOTE, vect_location,
9208 "vectorizable_recurr: inside_cost = %d, "
9209 "prologue_cost = %d .\n", inside_cost,
9210 prologue_cost);
9211
9212 STMT_VINFO_TYPE (stmt_info) = recurr_info_type;
9213 return true;
9214 }
9215
9216 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
9217 basic_block bb = gimple_bb (phi);
9218 tree preheader = PHI_ARG_DEF_FROM_EDGE (phi, pe);
9219 if (!useless_type_conversion_p (TREE_TYPE (vectype), TREE_TYPE (preheader)))
9220 {
9221 gimple_seq stmts = NULL;
9222 preheader = gimple_convert (&stmts, TREE_TYPE (vectype), preheader);
9223 gsi_insert_seq_on_edge_immediate (pe, stmts);
9224 }
9225 tree vec_init = build_vector_from_val (vectype, preheader);
9226 vec_init = vect_init_vector (loop_vinfo, stmt_info, vec_init, vectype, NULL);
9227
9228 /* Create the vectorized first-order PHI node. */
9229 tree vec_dest = vect_get_new_vect_var (vectype,
9230 vect_simple_var, "vec_recur_");
9231 gphi *new_phi = create_phi_node (vec_dest, bb);
9232 add_phi_arg (new_phi, vec_init, pe, UNKNOWN_LOCATION);
9233
9234 /* Insert shuffles for the first-order recurrence autovectorization:
9235 result = VEC_PERM <vec_recur, vect_1, index[nunits-1, nunits, ...]>. */
9236 tree perm = vect_gen_perm_mask_checked (vectype, indices);
9237
9238 /* Insert the required permute after the latch definition. The
9239 second and later operands are tentative and will be updated when we have
9240 vectorized the latch definition. */
9241 edge le = loop_latch_edge (LOOP_VINFO_LOOP (loop_vinfo));
9242 gimple *latch_def = SSA_NAME_DEF_STMT (PHI_ARG_DEF_FROM_EDGE (phi, le));
9243 gimple_stmt_iterator gsi2 = gsi_for_stmt (latch_def);
9244 gsi_next (&gsi2);
9245
9246 for (unsigned i = 0; i < ncopies; ++i)
9247 {
9248 vec_dest = make_ssa_name (vectype);
9249 gassign *vperm
9250 = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
9251 i == 0 ? gimple_phi_result (new_phi) : NULL,
9252 NULL, perm);
9253 vect_finish_stmt_generation (loop_vinfo, stmt_info, vperm, &gsi2);
9254
9255 if (slp_node)
9256 slp_node->push_vec_def (vperm);
9257 else
9258 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (vperm);
9259 }
9260
9261 if (!slp_node)
9262 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
9263 return true;
9264 }
9265
9266 /* Return true if VECTYPE represents a vector that requires lowering
9267 by the vector lowering pass. */
9268
9269 bool
9270 vect_emulated_vector_p (tree vectype)
9271 {
9272 return (!VECTOR_MODE_P (TYPE_MODE (vectype))
9273 && (!VECTOR_BOOLEAN_TYPE_P (vectype)
9274 || TYPE_PRECISION (TREE_TYPE (vectype)) != 1));
9275 }
9276
9277 /* Return true if we can emulate CODE on an integer mode representation
9278 of a vector. */
9279
9280 bool
9281 vect_can_vectorize_without_simd_p (tree_code code)
9282 {
9283 switch (code)
9284 {
9285 case PLUS_EXPR:
9286 case MINUS_EXPR:
9287 case NEGATE_EXPR:
9288 case BIT_AND_EXPR:
9289 case BIT_IOR_EXPR:
9290 case BIT_XOR_EXPR:
9291 case BIT_NOT_EXPR:
9292 return true;
9293
9294 default:
9295 return false;
9296 }
9297 }
9298
9299 /* Likewise, but taking a code_helper. */
9300
9301 bool
9302 vect_can_vectorize_without_simd_p (code_helper code)
9303 {
9304 return (code.is_tree_code ()
9305 && vect_can_vectorize_without_simd_p (tree_code (code)));
9306 }
9307
9308 /* Create vector init for vectorized iv. */
9309 static tree
9310 vect_create_nonlinear_iv_init (gimple_seq* stmts, tree init_expr,
9311 tree step_expr, poly_uint64 nunits,
9312 tree vectype,
9313 enum vect_induction_op_type induction_type)
9314 {
9315 unsigned HOST_WIDE_INT const_nunits;
9316 tree vec_shift, vec_init, new_name;
9317 unsigned i;
9318 tree itype = TREE_TYPE (vectype);
9319
9320 /* iv_loop is the loop to be vectorized. Create the initial vector for the
9321 nonlinear IV, e.g. [X, X*S, X*S^2, X*S^3] for a mult IV (S = step_expr, X = init_expr). */
9322 new_name = gimple_convert (stmts, itype, init_expr);
9323 switch (induction_type)
9324 {
9325 case vect_step_op_shr:
9326 case vect_step_op_shl:
9327 /* Build the initial value: splat the init value and shift it by the series {0, S, 2*S, ...}. */
9328 vec_init = gimple_build_vector_from_val (stmts,
9329 vectype,
9330 new_name);
9331 vec_shift = gimple_build (stmts, VEC_SERIES_EXPR, vectype,
9332 build_zero_cst (itype), step_expr);
9333 vec_init = gimple_build (stmts,
9334 (induction_type == vect_step_op_shr
9335 ? RSHIFT_EXPR : LSHIFT_EXPR),
9336 vectype, vec_init, vec_shift);
9337 break;
9338
9339 case vect_step_op_neg:
9340 {
9341 vec_init = gimple_build_vector_from_val (stmts,
9342 vectype,
9343 new_name);
9344 tree vec_neg = gimple_build (stmts, NEGATE_EXPR,
9345 vectype, vec_init);
9346 /* The encoding has 2 interleaved stepped patterns. */
9347 vec_perm_builder sel (nunits, 2, 3);
9348 sel.quick_grow (6);
9349 for (i = 0; i < 3; i++)
9350 {
9351 sel[2 * i] = i;
9352 sel[2 * i + 1] = i + nunits;
9353 }
9354 vec_perm_indices indices (sel, 2, nunits);
9355 /* Don't use vect_gen_perm_mask_checked since can_vec_perm_const_p may
9356 fail when vec_init is a const vector. In that situation vec_perm is not
9357 really needed. */
9358 tree perm_mask_even
9359 = vect_gen_perm_mask_any (vectype, indices);
9360 vec_init = gimple_build (stmts, VEC_PERM_EXPR,
9361 vectype,
9362 vec_init, vec_neg,
9363 perm_mask_even);
9364 }
9365 break;
9366
9367 case vect_step_op_mul:
9368 {
9369 /* Use unsigned mult to avoid undefined signed integer overflow. */
9370 gcc_assert (nunits.is_constant (&const_nunits));
9371 tree utype = unsigned_type_for (itype);
9372 tree uvectype = build_vector_type (utype,
9373 TYPE_VECTOR_SUBPARTS (vectype));
9374 new_name = gimple_convert (stmts, utype, new_name);
9375 vec_init = gimple_build_vector_from_val (stmts,
9376 uvectype,
9377 new_name);
9378 tree_vector_builder elts (uvectype, const_nunits, 1);
9379 tree elt_step = build_one_cst (utype);
9380
9381 elts.quick_push (elt_step);
9382 for (i = 1; i < const_nunits; i++)
9383 {
9384 /* Create: elt_step = elt_step * step_expr, giving step_expr**i. */
9385 elt_step = gimple_build (stmts, MULT_EXPR,
9386 utype, elt_step, step_expr);
9387 elts.quick_push (elt_step);
9388 }
9389 /* Create a vector from [elt_step_0, elt_step_1, ...,
9390 elt_step_nunits-1], i.e. the powers of step_expr. */
9391 tree vec_mul = gimple_build_vector (stmts, &elts);
9392 vec_init = gimple_build (stmts, MULT_EXPR, uvectype,
9393 vec_init, vec_mul);
9394 vec_init = gimple_convert (stmts, vectype, vec_init);
9395 }
9396 break;
9397
9398 default:
9399 gcc_unreachable ();
9400 }
9401
9402 return vec_init;
9403 }
9404
9405 /* Peel init_expr by skip_niters for induction_type. */
9406 tree
9407 vect_peel_nonlinear_iv_init (gimple_seq* stmts, tree init_expr,
9408 tree skip_niters, tree step_expr,
9409 enum vect_induction_op_type induction_type)
9410 {
9411 gcc_assert (TREE_CODE (skip_niters) == INTEGER_CST);
9412 tree type = TREE_TYPE (init_expr);
9413 unsigned prec = TYPE_PRECISION (type);
9414 switch (induction_type)
9415 {
9416 case vect_step_op_neg:
9417 if (TREE_INT_CST_LOW (skip_niters) % 2)
9418 init_expr = gimple_build (stmts, NEGATE_EXPR, type, init_expr);
9419 /* else no change. */
9420 break;
9421
9422 case vect_step_op_shr:
9423 case vect_step_op_shl:
9424 skip_niters = gimple_convert (stmts, type, skip_niters);
9425 step_expr = gimple_build (stmts, MULT_EXPR, type, step_expr, skip_niters);
9426 /* When the shift amount >= precision, we need to avoid undefined behavior.
9427 In the original loop there is no UB, and according to the semantics
9428 init_expr should be 0 for lshr and ashl, and >>= (prec - 1) for ashr. */
9429 if (!tree_fits_uhwi_p (step_expr)
9430 || tree_to_uhwi (step_expr) >= prec)
9431 {
9432 if (induction_type == vect_step_op_shl
9433 || TYPE_UNSIGNED (type))
9434 init_expr = build_zero_cst (type);
9435 else
9436 init_expr = gimple_build (stmts, RSHIFT_EXPR, type,
9437 init_expr,
9438 wide_int_to_tree (type, prec - 1));
9439 }
9440 else
9441 init_expr = gimple_build (stmts, (induction_type == vect_step_op_shr
9442 ? RSHIFT_EXPR : LSHIFT_EXPR),
9443 type, init_expr, step_expr);
9444 break;
9445
9446 case vect_step_op_mul:
9447 {
9448 tree utype = unsigned_type_for (type);
9449 init_expr = gimple_convert (stmts, utype, init_expr);
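/* Compute step_expr ** skip_niters modulo 2 ** prec; peeling SKIP_NITERS
   iterations of X *= STEP scales the initial value by exactly that power.
   E.g. (illustration) step 3, four peeled iterations, 8-bit type:
   3**4 % 256 = 81.  */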
9450 wide_int skipn = wi::to_wide (skip_niters);
9451 wide_int begin = wi::to_wide (step_expr);
9452 auto_mpz base, exp, mod, res;
9453 wi::to_mpz (begin, base, TYPE_SIGN (type));
9454 wi::to_mpz (skipn, exp, UNSIGNED);
9455 mpz_ui_pow_ui (mod, 2, TYPE_PRECISION (type));
9456 mpz_powm (res, base, exp, mod);
9457 begin = wi::from_mpz (type, res, TYPE_SIGN (type));
9458 tree mult_expr = wide_int_to_tree (utype, begin);
9459 init_expr = gimple_build (stmts, MULT_EXPR, utype,
9460 init_expr, mult_expr);
9461 init_expr = gimple_convert (stmts, type, init_expr);
9462 }
9463 break;
9464
9465 default:
9466 gcc_unreachable ();
9467 }
9468
9469 return init_expr;
9470 }
9471
9472 /* Create vector step for vectorized iv. */
9473 static tree
9474 vect_create_nonlinear_iv_step (gimple_seq* stmts, tree step_expr,
9475 poly_uint64 vf,
9476 enum vect_induction_op_type induction_type)
9477 {
9478 tree expr = build_int_cst (TREE_TYPE (step_expr), vf);
9479 tree new_name = NULL;
9480 /* Step should be pow (step, vf) for mult induction. */
9481 if (induction_type == vect_step_op_mul)
9482 {
9483 gcc_assert (vf.is_constant ());
9484 wide_int begin = wi::to_wide (step_expr);
9485
9486 for (unsigned i = 0; i != vf.to_constant () - 1; i++)
9487 begin = wi::mul (begin, wi::to_wide (step_expr));
9488
9489 new_name = wide_int_to_tree (TREE_TYPE (step_expr), begin);
9490 }
9491 else if (induction_type == vect_step_op_neg)
9492 /* Do nothing. */
9493 ;
9494 else
9495 new_name = gimple_build (stmts, MULT_EXPR, TREE_TYPE (step_expr),
9496 expr, step_expr);
9497 return new_name;
9498 }
9499
9500 static tree
9501 vect_create_nonlinear_iv_vec_step (loop_vec_info loop_vinfo,
9502 stmt_vec_info stmt_info,
9503 tree new_name, tree vectype,
9504 enum vect_induction_op_type induction_type)
9505 {
9506 /* No step is needed for neg induction. */
9507 if (induction_type == vect_step_op_neg)
9508 return NULL;
9509
9510 tree t = unshare_expr (new_name);
9511 gcc_assert (CONSTANT_CLASS_P (new_name)
9512 || TREE_CODE (new_name) == SSA_NAME);
9513 tree new_vec = build_vector_from_val (vectype, t);
9514 tree vec_step = vect_init_vector (loop_vinfo, stmt_info,
9515 new_vec, vectype, NULL);
9516 return vec_step;
9517 }
9518
9519 /* Update vectorized iv with vect_step, induc_def is init. */
9520 static tree
9521 vect_update_nonlinear_iv (gimple_seq* stmts, tree vectype,
9522 tree induc_def, tree vec_step,
9523 enum vect_induction_op_type induction_type)
9524 {
9525 tree vec_def = induc_def;
9526 switch (induction_type)
9527 {
9528 case vect_step_op_mul:
9529 {
9530 /* Use unsigned mult to avoid undefined signed integer overflow. */
9531 tree uvectype
9532 = build_vector_type (unsigned_type_for (TREE_TYPE (vectype)),
9533 TYPE_VECTOR_SUBPARTS (vectype));
9534 vec_def = gimple_convert (stmts, uvectype, vec_def);
9535 vec_step = gimple_convert (stmts, uvectype, vec_step);
9536 vec_def = gimple_build (stmts, MULT_EXPR, uvectype,
9537 vec_def, vec_step);
9538 vec_def = gimple_convert (stmts, vectype, vec_def);
9539 }
9540 break;
9541
9542 case vect_step_op_shr:
9543 vec_def = gimple_build (stmts, RSHIFT_EXPR, vectype,
9544 vec_def, vec_step);
9545 break;
9546
9547 case vect_step_op_shl:
9548 vec_def = gimple_build (stmts, LSHIFT_EXPR, vectype,
9549 vec_def, vec_step);
9550 break;
9551 case vect_step_op_neg:
9552 vec_def = induc_def;
9553 /* Do nothing. */
9554 break;
9555 default:
9556 gcc_unreachable ();
9557 }
9558
9559 return vec_def;
9560
9561 }
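/* Editorial sketch of the generated update (assumed VF = 4, scalar step 1):
   for a vect_step_op_shr IV, vec_step is { 4, 4, 4, 4 } and the in-loop
   update emitted above is simply

     vect_x_1 = vect_x_0 >> { 4, 4, 4, 4 };

   while vect_step_op_mul performs the multiplication in the corresponding
   unsigned vector type to avoid signed overflow.  */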
9562
9563 /* Function vectorizable_nonlinear_induction
9564
9565 Check if STMT_INFO performs a nonlinear induction computation that can be
9566 vectorized. If VEC_STMT is also passed, vectorize the induction PHI: create
9567 a vectorized phi to replace it, put it in VEC_STMT, and add it to the same
9568 basic block.
9569 Return true if STMT_INFO is vectorizable in this way. */
9570
9571 static bool
9572 vectorizable_nonlinear_induction (loop_vec_info loop_vinfo,
9573 stmt_vec_info stmt_info,
9574 gimple **vec_stmt, slp_tree slp_node,
9575 stmt_vector_for_cost *cost_vec)
9576 {
9577 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9578 unsigned ncopies;
9579 bool nested_in_vect_loop = false;
9580 class loop *iv_loop;
9581 tree vec_def;
9582 edge pe = loop_preheader_edge (loop);
9583 basic_block new_bb;
9584 tree vec_init, vec_step;
9585 tree new_name;
9586 gimple *new_stmt;
9587 gphi *induction_phi;
9588 tree induc_def, vec_dest;
9589 tree init_expr, step_expr;
9590 tree niters_skip;
9591 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9592 unsigned i;
9593 gimple_stmt_iterator si;
9594
9595 gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
9596
9597 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
9598 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
9599 enum vect_induction_op_type induction_type
9600 = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info);
9601
9602 gcc_assert (induction_type > vect_step_op_add);
9603
9604 if (slp_node)
9605 ncopies = 1;
9606 else
9607 ncopies = vect_get_num_copies (loop_vinfo, vectype);
9608 gcc_assert (ncopies >= 1);
9609
9610 /* FORNOW. Only handle nonlinear induction in the same loop. */
9611 if (nested_in_vect_loop_p (loop, stmt_info))
9612 {
9613 if (dump_enabled_p ())
9614 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9615 "nonlinear induction in nested loop.\n");
9616 return false;
9617 }
9618
9619 iv_loop = loop;
9620 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
9621
9622 /* TODO: Support slp for nonlinear iv. There should be separate vector iv
9623 update for each IV and a permutation to generate the wanted vector IV. */
9624 if (slp_node)
9625 {
9626 if (dump_enabled_p ())
9627 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9628 "SLP induction not supported for nonlinear"
9629 " induction.\n");
9630 return false;
9631 }
9632
9633 if (!INTEGRAL_TYPE_P (TREE_TYPE (vectype)))
9634 {
9635 if (dump_enabled_p ())
9636 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9637 "floating point nonlinear induction vectorization"
9638 " not supported.\n");
9639 return false;
9640 }
9641
9642 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
9643 init_expr = vect_phi_initial_value (phi);
9644 gcc_assert (step_expr != NULL_TREE && init_expr != NULL
9645 && TREE_CODE (step_expr) == INTEGER_CST);
9646 /* STEP_EXPR should have the same type as INIT_EXPR,
9647 e.g. for uint64 a >> 1 the step is int, but a vector<uint64> shift is used. */
9648 step_expr = fold_convert (TREE_TYPE (vectype), step_expr);
9649
9650 if (TREE_CODE (init_expr) == INTEGER_CST)
9651 init_expr = fold_convert (TREE_TYPE (vectype), init_expr);
9652 else if (!tree_nop_conversion_p (TREE_TYPE (vectype), TREE_TYPE (init_expr)))
9653 {
9654 /* INIT_EXPR could be a bit_field, bail out for such case. */
9655 if (dump_enabled_p ())
9656 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9657 "nonlinear induction vectorization failed:"
9658 " component type of vectype is not a nop conversion"
9659 " from type of init_expr.\n");
9660 return false;
9661 }
9662
9663 switch (induction_type)
9664 {
9665 case vect_step_op_neg:
9666 if (TREE_CODE (init_expr) != INTEGER_CST
9667 && TREE_CODE (init_expr) != REAL_CST)
9668 {
9669 /* Check for backend support of NEGATE_EXPR and vec_perm. */
9670 if (!directly_supported_p (NEGATE_EXPR, vectype))
9671 return false;
9672
9673 /* The encoding has 2 interleaved stepped patterns. */
9674 vec_perm_builder sel (nunits, 2, 3);
9675 machine_mode mode = TYPE_MODE (vectype);
9676 sel.quick_grow (6);
9677 for (i = 0; i < 3; i++)
9678 {
9679 sel[i * 2] = i;
9680 sel[i * 2 + 1] = i + nunits;
9681 }
9682 vec_perm_indices indices (sel, 2, nunits);
9683 if (!can_vec_perm_const_p (mode, mode, indices))
9684 return false;
9685 }
9686 break;
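/* Editorial illustration (assumed nunits = 4): the selector checked above
   is { 0, 4, 1, 5, 2, 6, ... }, i.e. an interleave of the IV vector with
   its negation, which is how a  x = -x  induction keeps alternating signs
   across the lanes of one vector iteration.  */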
9687
9688 case vect_step_op_mul:
9689 {
9690 /* Check for backend support of MULT_EXPR. */
9691 if (!directly_supported_p (MULT_EXPR, vectype))
9692 return false;
9693
9694 /* ??? How to construct the vector step for variable-length vectors,
9695 i.e. [ 1, step, pow (step, 2), pow (step, 3), ... ].  */
9696 if (!vf.is_constant ())
9697 return false;
9698 }
9699 break;
9700
9701 case vect_step_op_shr:
9702 /* Check for backend support of RSHIFT_EXPR. */
9703 if (!directly_supported_p (RSHIFT_EXPR, vectype, optab_vector))
9704 return false;
9705
9706 /* Don't shift more than the type precision to avoid undefined behavior. */
9707 if (!tree_fits_uhwi_p (step_expr)
9708 || maybe_ge (nunits * tree_to_uhwi (step_expr),
9709 TYPE_PRECISION (TREE_TYPE (init_expr))))
9710 return false;
9711 break;
9712
9713 case vect_step_op_shl:
9714 /* Check for backend support of LSHIFT_EXPR. */
9715 if (!directly_supported_p (LSHIFT_EXPR, vectype, optab_vector))
9716 return false;
9717
9718 /* Don't shift more than the type precision to avoid undefined behavior. */
9719 if (!tree_fits_uhwi_p (step_expr)
9720 || maybe_ge (nunits * tree_to_uhwi (step_expr),
9721 TYPE_PRECISION (TREE_TYPE (init_expr))))
9722 return false;
9723
9724 break;
9725
9726 default:
9727 gcc_unreachable ();
9728 }
9729
9730 if (!vec_stmt) /* transformation not required. */
9731 {
9732 unsigned inside_cost = 0, prologue_cost = 0;
9733 /* loop cost for vec_loop. */
9735 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
9736 stmt_info, 0, vect_body);
9737
9738 /* loop cost for vec_loop. Neg induction doesn't have any
9739 inside_cost. */
9740 if (induction_type == vect_step_op_neg)
9741 inside_cost = 0;
9742
9743 /* prologue cost for vec_init and vec_step. */
9744 prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
9745 stmt_info, 0, vect_prologue);
9746
9747 if (dump_enabled_p ())
9748 dump_printf_loc (MSG_NOTE, vect_location,
9749 "vect_model_induction_cost: inside_cost = %d, "
9750 "prologue_cost = %d. \n", inside_cost,
9751 prologue_cost);
9752
9753 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
9754 DUMP_VECT_SCOPE ("vectorizable_nonlinear_induction");
9755 return true;
9756 }
9757
9758 /* Transform. */
9759
9760 /* Compute a vector variable, initialized with the first VF values of
9761 the induction variable. E.g., for an iv with IV_PHI='X' and
9762 evolution S, for a vector of 4 units, we want to compute:
9763 [X, X + S, X + 2*S, X + 3*S]. */
9764
9765 if (dump_enabled_p ())
9766 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
9767
9768 pe = loop_preheader_edge (iv_loop);
9769 /* Find the first insertion point in the BB. */
9770 basic_block bb = gimple_bb (phi);
9771 si = gsi_after_labels (bb);
9772
9773 gimple_seq stmts = NULL;
9774
9775 niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
9776 /* If we are using the loop mask to "peel" for alignment then we need
9777 to adjust the start value here. */
9778 if (niters_skip != NULL_TREE)
9779 init_expr = vect_peel_nonlinear_iv_init (&stmts, init_expr, niters_skip,
9780 step_expr, induction_type);
9781
9782 vec_init = vect_create_nonlinear_iv_init (&stmts, init_expr,
9783 step_expr, nunits, vectype,
9784 induction_type);
9785 if (stmts)
9786 {
9787 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
9788 gcc_assert (!new_bb);
9789 }
9790
9791 stmts = NULL;
9792 new_name = vect_create_nonlinear_iv_step (&stmts, step_expr,
9793 vf, induction_type);
9794 if (stmts)
9795 {
9796 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
9797 gcc_assert (!new_bb);
9798 }
9799
9800 vec_step = vect_create_nonlinear_iv_vec_step (loop_vinfo, stmt_info,
9801 new_name, vectype,
9802 induction_type);
9803 /* Create the following def-use cycle:
9804 loop prolog:
9805 vec_init = ...
9806 vec_step = ...
9807 loop:
9808 vec_iv = PHI <vec_init, vec_loop>
9809 ...
9810 STMT
9811 ...
9812 vec_loop = vec_iv + vec_step; */
9813
9814 /* Create the induction-phi that defines the induction-operand. */
9815 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
9816 induction_phi = create_phi_node (vec_dest, iv_loop->header);
9817 induc_def = PHI_RESULT (induction_phi);
9818
9819 /* Create the iv update inside the loop. */
9820 stmts = NULL;
9821 vec_def = vect_update_nonlinear_iv (&stmts, vectype,
9822 induc_def, vec_step,
9823 induction_type);
9824
9825 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9826 new_stmt = SSA_NAME_DEF_STMT (vec_def);
9827
9828 /* Set the arguments of the phi node: */
9829 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
9830 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
9831 UNKNOWN_LOCATION);
9832
9833 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi);
9834 *vec_stmt = induction_phi;
9835
9836 /* In case that vectorization factor (VF) is bigger than the number
9837 of elements that we can fit in a vectype (nunits), we have to generate
9838 more than one vector stmt - i.e - we need to "unroll" the
9839 vector stmt by a factor VF/nunits. For more details see documentation
9840 in vectorizable_operation. */
9841
9842 if (ncopies > 1)
9843 {
9844 stmts = NULL;
9845 /* FORNOW. This restriction should be relaxed. */
9846 gcc_assert (!nested_in_vect_loop);
9847
9848 new_name = vect_create_nonlinear_iv_step (&stmts, step_expr,
9849 nunits, induction_type);
9850
9851 vec_step = vect_create_nonlinear_iv_vec_step (loop_vinfo, stmt_info,
9852 new_name, vectype,
9853 induction_type);
9854 vec_def = induc_def;
9855 for (i = 1; i < ncopies; i++)
9856 {
9857 /* vec_i = vec_prev + vec_step. */
9858 stmts = NULL;
9859 vec_def = vect_update_nonlinear_iv (&stmts, vectype,
9860 vec_def, vec_step,
9861 induction_type);
9862 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9863 new_stmt = SSA_NAME_DEF_STMT (vec_def);
9864 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
9865 }
9866 }
9867
9868 if (dump_enabled_p ())
9869 dump_printf_loc (MSG_NOTE, vect_location,
9870 "transform induction: created def-use cycle: %G%G",
9871 (gimple *) induction_phi, SSA_NAME_DEF_STMT (vec_def));
9872
9873 return true;
9874 }
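/* End-to-end editorial sketch (not from the original sources): for

     for (i = 0; i < n; i++) { a[i] = x; x >>= 1; }

   and VF = 4, the code above emits in the preheader roughly

     vec_init = { x, x >> 1, x >> 2, x >> 3 };
     vec_step = { 4, 4, 4, 4 };

   and inside the loop

     vec_iv   = PHI <vec_init (preheader), vec_next (latch)>;
     vec_next = vec_iv >> vec_step;  */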
9875
9876 /* Function vectorizable_induction
9877
9878 Check if STMT_INFO performs an induction computation that can be vectorized.
9879 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
9880 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
9881 Return true if STMT_INFO is vectorizable in this way. */
9882
9883 bool
9884 vectorizable_induction (loop_vec_info loop_vinfo,
9885 stmt_vec_info stmt_info,
9886 gimple **vec_stmt, slp_tree slp_node,
9887 stmt_vector_for_cost *cost_vec)
9888 {
9889 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9890 unsigned ncopies;
9891 bool nested_in_vect_loop = false;
9892 class loop *iv_loop;
9893 tree vec_def;
9894 edge pe = loop_preheader_edge (loop);
9895 basic_block new_bb;
9896 tree new_vec, vec_init, vec_step, t;
9897 tree new_name;
9898 gimple *new_stmt;
9899 gphi *induction_phi;
9900 tree induc_def, vec_dest;
9901 tree init_expr, step_expr;
9902 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9903 unsigned i;
9904 tree expr;
9905 gimple_stmt_iterator si;
9906 enum vect_induction_op_type induction_type
9907 = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info);
9908
9909 gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
9910 if (!phi)
9911 return false;
9912
9913 if (!STMT_VINFO_RELEVANT_P (stmt_info))
9914 return false;
9915
9916 /* Make sure it was recognized as induction computation. */
9917 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
9918 return false;
9919
9920 /* Handle nonlinear induction in a separate place. */
9921 if (induction_type != vect_step_op_add)
9922 return vectorizable_nonlinear_induction (loop_vinfo, stmt_info,
9923 vec_stmt, slp_node, cost_vec);
9924
9925 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
9926 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
9927
9928 if (slp_node)
9929 ncopies = 1;
9930 else
9931 ncopies = vect_get_num_copies (loop_vinfo, vectype);
9932 gcc_assert (ncopies >= 1);
9933
9934 /* FORNOW. These restrictions should be relaxed. */
9935 if (nested_in_vect_loop_p (loop, stmt_info))
9936 {
9937 imm_use_iterator imm_iter;
9938 use_operand_p use_p;
9939 gimple *exit_phi;
9940 edge latch_e;
9941 tree loop_arg;
9942
9943 if (ncopies > 1)
9944 {
9945 if (dump_enabled_p ())
9946 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9947 "multiple types in nested loop.\n");
9948 return false;
9949 }
9950
9951 exit_phi = NULL;
9952 latch_e = loop_latch_edge (loop->inner);
9953 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
9954 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
9955 {
9956 gimple *use_stmt = USE_STMT (use_p);
9957 if (is_gimple_debug (use_stmt))
9958 continue;
9959
9960 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
9961 {
9962 exit_phi = use_stmt;
9963 break;
9964 }
9965 }
9966 if (exit_phi)
9967 {
9968 stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
9969 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
9970 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
9971 {
9972 if (dump_enabled_p ())
9973 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9974 "inner-loop induction only used outside "
9975 "of the outer vectorized loop.\n");
9976 return false;
9977 }
9978 }
9979
9980 nested_in_vect_loop = true;
9981 iv_loop = loop->inner;
9982 }
9983 else
9984 iv_loop = loop;
9985 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
9986
9987 if (slp_node && !nunits.is_constant ())
9988 {
9989 /* The current SLP code creates the step value element-by-element. */
9990 if (dump_enabled_p ())
9991 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9992 "SLP induction not supported for variable-length"
9993 " vectors.\n");
9994 return false;
9995 }
9996
9997 if (FLOAT_TYPE_P (vectype) && !param_vect_induction_float)
9998 {
9999 if (dump_enabled_p ())
10000 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10001 "floating point induction vectorization disabled\n");
10002 return false;
10003 }
10004
10005 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
10006 gcc_assert (step_expr != NULL_TREE);
10007 if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
10008 && !type_has_mode_precision_p (TREE_TYPE (step_expr)))
10009 {
10010 if (dump_enabled_p ())
10011 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10012 "bit-precision induction vectorization not "
10013 "supported.\n");
10014 return false;
10015 }
10016 tree step_vectype = get_same_sized_vectype (TREE_TYPE (step_expr), vectype);
10017
10018 /* Check for backend support of PLUS/MINUS_EXPR. */
10019 if (!directly_supported_p (PLUS_EXPR, step_vectype)
10020 || !directly_supported_p (MINUS_EXPR, step_vectype))
10021 return false;
10022
10023 if (!vec_stmt) /* transformation not required. */
10024 {
10025 unsigned inside_cost = 0, prologue_cost = 0;
10026 if (slp_node)
10027 {
10028 /* We eventually need to set a vector type on invariant
10029 arguments. */
10030 unsigned j;
10031 slp_tree child;
10032 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
10033 if (!vect_maybe_update_slp_op_vectype
10034 (child, SLP_TREE_VECTYPE (slp_node)))
10035 {
10036 if (dump_enabled_p ())
10037 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10038 "incompatible vector types for "
10039 "invariants\n");
10040 return false;
10041 }
10042 /* loop cost for vec_loop. */
10043 inside_cost
10044 = record_stmt_cost (cost_vec,
10045 SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
10046 vector_stmt, stmt_info, 0, vect_body);
10047 /* prologue cost for vec_init (if not nested) and step. */
10048 prologue_cost = record_stmt_cost (cost_vec, 1 + !nested_in_vect_loop,
10049 scalar_to_vec,
10050 stmt_info, 0, vect_prologue);
10051 }
10052 else /* if (!slp_node) */
10053 {
10054 /* loop cost for vec_loop. */
10055 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
10056 stmt_info, 0, vect_body);
10057 /* prologue cost for vec_init and vec_step. */
10058 prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
10059 stmt_info, 0, vect_prologue);
10060 }
10061 if (dump_enabled_p ())
10062 dump_printf_loc (MSG_NOTE, vect_location,
10063 "vect_model_induction_cost: inside_cost = %d, "
10064 "prologue_cost = %d .\n", inside_cost,
10065 prologue_cost);
10066
10067 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
10068 DUMP_VECT_SCOPE ("vectorizable_induction");
10069 return true;
10070 }
10071
10072 /* Transform. */
10073
10074 /* Compute a vector variable, initialized with the first VF values of
10075 the induction variable. E.g., for an iv with IV_PHI='X' and
10076 evolution S, for a vector of 4 units, we want to compute:
10077 [X, X + S, X + 2*S, X + 3*S]. */
10078
10079 if (dump_enabled_p ())
10080 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
10081
10082 pe = loop_preheader_edge (iv_loop);
10083 /* Find the first insertion point in the BB. */
10084 basic_block bb = gimple_bb (phi);
10085 si = gsi_after_labels (bb);
10086
10087 /* For SLP induction we have to generate several IVs as for example
10088 with group size 3 we need
10089 [i0, i1, i2, i0 + S0] [i1 + S1, i2 + S2, i0 + 2*S0, i1 + 2*S1]
10090 [i2 + 2*S2, i0 + 3*S0, i1 + 3*S1, i2 + 3*S2]. */
10091 if (slp_node)
10092 {
10093 /* Enforced above. */
10094 unsigned int const_nunits = nunits.to_constant ();
10095
10096 /* The initial values are vectorized, but any lanes > group_size
10097 need adjustment. */
10098 slp_tree init_node
10099 = SLP_TREE_CHILDREN (slp_node)[pe->dest_idx];
10100
10101 /* Gather steps. Since we do not vectorize inductions as
10102 cycles we have to reconstruct the step from SCEV data. */
10103 unsigned group_size = SLP_TREE_LANES (slp_node);
10104 tree *steps = XALLOCAVEC (tree, group_size);
10105 tree *inits = XALLOCAVEC (tree, group_size);
10106 stmt_vec_info phi_info;
10107 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (slp_node), i, phi_info)
10108 {
10109 steps[i] = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
10110 if (!init_node)
10111 inits[i] = gimple_phi_arg_def (as_a<gphi *> (phi_info->stmt),
10112 pe->dest_idx);
10113 }
10114
10115 /* Now generate the IVs. */
10116 unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
10117 gcc_assert ((const_nunits * nvects) % group_size == 0);
10118 unsigned nivs;
10119 if (nested_in_vect_loop)
10120 nivs = nvects;
10121 else
10122 {
10123 /* Compute the number of distinct IVs we need. First reduce
10124 group_size if it is a multiple of const_nunits so we get
10125 one IV for a group_size of 4 but const_nunits 2. */
10126 unsigned group_sizep = group_size;
10127 if (group_sizep % const_nunits == 0)
10128 group_sizep = group_sizep / const_nunits;
10129 nivs = least_common_multiple (group_sizep,
10130 const_nunits) / const_nunits;
10131 }
10132 tree stept = TREE_TYPE (step_vectype);
10133 tree lupdate_mul = NULL_TREE;
10134 if (!nested_in_vect_loop)
10135 {
10136 /* The number of iterations covered in one vector iteration. */
10137 unsigned lup_mul = (nvects * const_nunits) / group_size;
10138 lupdate_mul
10139 = build_vector_from_val (step_vectype,
10140 SCALAR_FLOAT_TYPE_P (stept)
10141 ? build_real_from_wide (stept, lup_mul,
10142 UNSIGNED)
10143 : build_int_cstu (stept, lup_mul));
10144 }
10145 tree peel_mul = NULL_TREE;
10146 gimple_seq init_stmts = NULL;
10147 if (LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo))
10148 {
10149 if (SCALAR_FLOAT_TYPE_P (stept))
10150 peel_mul = gimple_build (&init_stmts, FLOAT_EXPR, stept,
10151 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
10152 else
10153 peel_mul = gimple_convert (&init_stmts, stept,
10154 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
10155 peel_mul = gimple_build_vector_from_val (&init_stmts,
10156 step_vectype, peel_mul);
10157 }
10158 unsigned ivn;
10159 auto_vec<tree> vec_steps;
10160 for (ivn = 0; ivn < nivs; ++ivn)
10161 {
10162 tree_vector_builder step_elts (step_vectype, const_nunits, 1);
10163 tree_vector_builder init_elts (vectype, const_nunits, 1);
10164 tree_vector_builder mul_elts (step_vectype, const_nunits, 1);
10165 for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
10166 {
10167 /* The scalar steps of the IVs. */
10168 tree elt = steps[(ivn*const_nunits + eltn) % group_size];
10169 elt = gimple_convert (&init_stmts, TREE_TYPE (step_vectype), elt);
10170 step_elts.quick_push (elt);
10171 if (!init_node)
10172 {
10173 /* The scalar inits of the IVs if not vectorized. */
10174 elt = inits[(ivn*const_nunits + eltn) % group_size];
10175 if (!useless_type_conversion_p (TREE_TYPE (vectype),
10176 TREE_TYPE (elt)))
10177 elt = gimple_build (&init_stmts, VIEW_CONVERT_EXPR,
10178 TREE_TYPE (vectype), elt);
10179 init_elts.quick_push (elt);
10180 }
10181 /* The number of steps to add to the initial values. */
10182 unsigned mul_elt = (ivn*const_nunits + eltn) / group_size;
10183 mul_elts.quick_push (SCALAR_FLOAT_TYPE_P (stept)
10184 ? build_real_from_wide (stept,
10185 mul_elt, UNSIGNED)
10186 : build_int_cstu (stept, mul_elt));
10187 }
10188 vec_step = gimple_build_vector (&init_stmts, &step_elts);
10189 vec_steps.safe_push (vec_step);
10190 tree step_mul = gimple_build_vector (&init_stmts, &mul_elts);
10191 if (peel_mul)
10192 step_mul = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
10193 step_mul, peel_mul);
10194 if (!init_node)
10195 vec_init = gimple_build_vector (&init_stmts, &init_elts);
10196
10197 /* Create the induction-phi that defines the induction-operand. */
10198 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var,
10199 "vec_iv_");
10200 induction_phi = create_phi_node (vec_dest, iv_loop->header);
10201 induc_def = PHI_RESULT (induction_phi);
10202
10203 /* Create the iv update inside the loop */
10204 tree up = vec_step;
10205 if (lupdate_mul)
10206 up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
10207 vec_step, lupdate_mul);
10208 gimple_seq stmts = NULL;
10209 vec_def = gimple_convert (&stmts, step_vectype, induc_def);
10210 vec_def = gimple_build (&stmts,
10211 PLUS_EXPR, step_vectype, vec_def, up);
10212 vec_def = gimple_convert (&stmts, vectype, vec_def);
10213 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10214 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
10215 UNKNOWN_LOCATION);
10216
10217 if (init_node)
10218 vec_init = vect_get_slp_vect_def (init_node, ivn);
10219 if (!nested_in_vect_loop
10220 && !integer_zerop (step_mul))
10221 {
10222 vec_def = gimple_convert (&init_stmts, step_vectype, vec_init);
10223 up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
10224 vec_step, step_mul);
10225 vec_def = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
10226 vec_def, up);
10227 vec_init = gimple_convert (&init_stmts, vectype, vec_def);
10228 }
10229
10230 /* Set the arguments of the phi node: */
10231 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
10232
10233 slp_node->push_vec_def (induction_phi);
10234 }
10235 if (!nested_in_vect_loop)
10236 {
10237 /* Fill up to the number of vectors we need for the whole group. */
10238 nivs = least_common_multiple (group_size,
10239 const_nunits) / const_nunits;
10240 vec_steps.reserve (nivs-ivn);
10241 for (; ivn < nivs; ++ivn)
10242 {
10243 slp_node->push_vec_def (SLP_TREE_VEC_DEFS (slp_node)[0]);
10244 vec_steps.quick_push (vec_steps[0]);
10245 }
10246 }
10247
10248 /* Re-use IVs when we can. We are generating further vector
10249 stmts by adding VF' * stride to the IVs generated above. */
10250 if (ivn < nvects)
10251 {
10252 unsigned vfp
10253 = least_common_multiple (group_size, const_nunits) / group_size;
10254 tree lupdate_mul
10255 = build_vector_from_val (step_vectype,
10256 SCALAR_FLOAT_TYPE_P (stept)
10257 ? build_real_from_wide (stept,
10258 vfp, UNSIGNED)
10259 : build_int_cstu (stept, vfp));
10260 for (; ivn < nvects; ++ivn)
10261 {
10262 gimple *iv
10263 = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (slp_node)[ivn - nivs]);
10264 tree def = gimple_get_lhs (iv);
10265 if (ivn < 2*nivs)
10266 vec_steps[ivn - nivs]
10267 = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
10268 vec_steps[ivn - nivs], lupdate_mul);
10269 gimple_seq stmts = NULL;
10270 def = gimple_convert (&stmts, step_vectype, def);
10271 def = gimple_build (&stmts, PLUS_EXPR, step_vectype,
10272 def, vec_steps[ivn % nivs]);
10273 def = gimple_convert (&stmts, vectype, def);
10274 if (gimple_code (iv) == GIMPLE_PHI)
10275 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10276 else
10277 {
10278 gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
10279 gsi_insert_seq_after (&tgsi, stmts, GSI_CONTINUE_LINKING);
10280 }
10281 slp_node->push_vec_def (def);
10282 }
10283 }
10284
10285 new_bb = gsi_insert_seq_on_edge_immediate (pe, init_stmts);
10286 gcc_assert (!new_bb);
10287
10288 return true;
10289 }
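/* Editorial illustration of the SLP path above (assumed values): with
   group_size = 3, const_nunits = 4 and nvects = 3 we need
   nivs = lcm (3, 4) / 4 = 3 distinct IVs.  The first IV uses the step
   vector { S0, S1, S2, S0 } with step multipliers { 0, 0, 0, 1 },
   matching the [i0, i1, i2, i0 + S0] layout shown in the comment at the
   start of the SLP block.  */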
10290
10291 init_expr = vect_phi_initial_value (phi);
10292
10293 gimple_seq stmts = NULL;
10294 if (!nested_in_vect_loop)
10295 {
10296 /* Convert the initial value to the IV update type. */
10297 tree new_type = TREE_TYPE (step_expr);
10298 init_expr = gimple_convert (&stmts, new_type, init_expr);
10299
10300 /* If we are using the loop mask to "peel" for alignment then we need
10301 to adjust the start value here. */
10302 tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
10303 if (skip_niters != NULL_TREE)
10304 {
10305 if (FLOAT_TYPE_P (vectype))
10306 skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
10307 skip_niters);
10308 else
10309 skip_niters = gimple_convert (&stmts, new_type, skip_niters);
10310 tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
10311 skip_niters, step_expr);
10312 init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
10313 init_expr, skip_step);
10314 }
10315 }
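/* Editorial note (assumed numbers): if masking skips the first
   SKIP_NITERS = 3 scalar iterations of  i = 0, 1, 2, ...  with step 1,
   the adjusted start value becomes  0 - 3 * 1 = -3,  so lane 3 of the
   first (partially masked) vector iteration again holds the original
   initial value 0.  */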
10316
10317 if (stmts)
10318 {
10319 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
10320 gcc_assert (!new_bb);
10321 }
10322
10323 /* Create the vector that holds the initial_value of the induction. */
10324 if (nested_in_vect_loop)
10325 {
10326 /* iv_loop is nested in the loop to be vectorized. init_expr had already
10327 been created during vectorization of previous stmts. We obtain it
10328 from the STMT_VINFO_VEC_STMT of the defining stmt. */
10329 auto_vec<tree> vec_inits;
10330 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
10331 init_expr, &vec_inits);
10332 vec_init = vec_inits[0];
10333 /* If the initial value is not of proper type, convert it. */
10334 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
10335 {
10336 new_stmt
10337 = gimple_build_assign (vect_get_new_ssa_name (vectype,
10338 vect_simple_var,
10339 "vec_iv_"),
10340 VIEW_CONVERT_EXPR,
10341 build1 (VIEW_CONVERT_EXPR, vectype,
10342 vec_init));
10343 vec_init = gimple_assign_lhs (new_stmt);
10344 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
10345 new_stmt);
10346 gcc_assert (!new_bb);
10347 }
10348 }
10349 else
10350 {
10351 /* iv_loop is the loop to be vectorized. Create:
10352 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
10353 stmts = NULL;
10354 new_name = gimple_convert (&stmts, TREE_TYPE (step_expr), init_expr);
10355
10356 unsigned HOST_WIDE_INT const_nunits;
10357 if (nunits.is_constant (&const_nunits))
10358 {
10359 tree_vector_builder elts (step_vectype, const_nunits, 1);
10360 elts.quick_push (new_name);
10361 for (i = 1; i < const_nunits; i++)
10362 {
10363 /* Create: new_name_i = new_name + step_expr */
10364 new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
10365 new_name, step_expr);
10366 elts.quick_push (new_name);
10367 }
10368 /* Create a vector from [new_name_0, new_name_1, ...,
10369 new_name_nunits-1] */
10370 vec_init = gimple_build_vector (&stmts, &elts);
10371 }
10372 else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
10373 /* Build the initial value directly from a VEC_SERIES_EXPR. */
10374 vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, step_vectype,
10375 new_name, step_expr);
10376 else
10377 {
10378 /* Build:
10379 [base, base, base, ...]
10380 + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */
10381 gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
10382 gcc_assert (flag_associative_math);
10383 tree index = build_index_vector (step_vectype, 0, 1);
10384 tree base_vec = gimple_build_vector_from_val (&stmts, step_vectype,
10385 new_name);
10386 tree step_vec = gimple_build_vector_from_val (&stmts, step_vectype,
10387 step_expr);
10388 vec_init = gimple_build (&stmts, FLOAT_EXPR, step_vectype, index);
10389 vec_init = gimple_build (&stmts, MULT_EXPR, step_vectype,
10390 vec_init, step_vec);
10391 vec_init = gimple_build (&stmts, PLUS_EXPR, step_vectype,
10392 vec_init, base_vec);
10393 }
10394 vec_init = gimple_convert (&stmts, vectype, vec_init);
10395
10396 if (stmts)
10397 {
10398 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
10399 gcc_assert (!new_bb);
10400 }
10401 }
10402
10403
10404 /* Create the vector that holds the step of the induction. */
10405 gimple_stmt_iterator *step_iv_si = NULL;
10406 if (nested_in_vect_loop)
10407 /* iv_loop is nested in the loop to be vectorized. Generate:
10408 vec_step = [S, S, S, S] */
10409 new_name = step_expr;
10410 else if (LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo))
10411 {
10412 /* When we're using the loop_len produced by SELECT_VL, the non-final
10413 iterations do not always process VF elements. So instead of updating
10414 the vectorized induction variable as
10415
10416 _21 = vect_vec_iv_.6_22 + { VF, ... };
10417
10418 We should generate:
10419
10420 _35 = .SELECT_VL (ivtmp_33, VF);
10421 vect_cst__22 = [vec_duplicate_expr] _35;
10422 _21 = vect_vec_iv_.6_22 + vect_cst__22; */
10423 gcc_assert (!slp_node);
10424 gimple_seq seq = NULL;
10425 vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
10426 tree len = vect_get_loop_len (loop_vinfo, NULL, lens, 1, vectype, 0, 0);
10427 expr = force_gimple_operand (fold_convert (TREE_TYPE (step_expr),
10428 unshare_expr (len)),
10429 &seq, true, NULL_TREE);
10430 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr), expr,
10431 step_expr);
10432 gsi_insert_seq_before (&si, seq, GSI_SAME_STMT);
10433 step_iv_si = &si;
10434 }
10435 else
10436 {
10437 /* iv_loop is the loop to be vectorized. Generate:
10438 vec_step = [VF*S, VF*S, VF*S, VF*S] */
10439 gimple_seq seq = NULL;
10440 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
10441 {
10442 expr = build_int_cst (integer_type_node, vf);
10443 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
10444 }
10445 else
10446 expr = build_int_cst (TREE_TYPE (step_expr), vf);
10447 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
10448 expr, step_expr);
10449 if (seq)
10450 {
10451 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
10452 gcc_assert (!new_bb);
10453 }
10454 }
10455
10456 t = unshare_expr (new_name);
10457 gcc_assert (CONSTANT_CLASS_P (new_name)
10458 || TREE_CODE (new_name) == SSA_NAME);
10459 new_vec = build_vector_from_val (step_vectype, t);
10460 vec_step = vect_init_vector (loop_vinfo, stmt_info,
10461 new_vec, step_vectype, step_iv_si);
10462
10463
10464 /* Create the following def-use cycle:
10465 loop prolog:
10466 vec_init = ...
10467 vec_step = ...
10468 loop:
10469 vec_iv = PHI <vec_init, vec_loop>
10470 ...
10471 STMT
10472 ...
10473 vec_loop = vec_iv + vec_step; */
10474
10475 /* Create the induction-phi that defines the induction-operand. */
10476 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
10477 induction_phi = create_phi_node (vec_dest, iv_loop->header);
10478 induc_def = PHI_RESULT (induction_phi);
10479
10480 /* Create the iv update inside the loop */
10481 stmts = NULL;
10482 vec_def = gimple_convert (&stmts, step_vectype, induc_def);
10483 vec_def = gimple_build (&stmts, PLUS_EXPR, step_vectype, vec_def, vec_step);
10484 vec_def = gimple_convert (&stmts, vectype, vec_def);
10485 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10486 new_stmt = SSA_NAME_DEF_STMT (vec_def);
10487
10488 /* Set the arguments of the phi node: */
10489 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
10490 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
10491 UNKNOWN_LOCATION);
10492
10493 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi);
10494 *vec_stmt = induction_phi;
10495
10496 /* In case that vectorization factor (VF) is bigger than the number
10497 of elements that we can fit in a vectype (nunits), we have to generate
10498 more than one vector stmt - i.e - we need to "unroll" the
10499 vector stmt by a factor VF/nunits. For more details see documentation
10500 in vectorizable_operation. */
10501
10502 if (ncopies > 1)
10503 {
10504 gimple_seq seq = NULL;
10505 /* FORNOW. This restriction should be relaxed. */
10506 gcc_assert (!nested_in_vect_loop);
10507 /* We expect LOOP_VINFO_USING_SELECT_VL_P to be false if ncopies > 1. */
10508 gcc_assert (!LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo));
10509
10510 /* Create the vector that holds the step of the induction. */
10511 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
10512 {
10513 expr = build_int_cst (integer_type_node, nunits);
10514 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
10515 }
10516 else
10517 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
10518 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
10519 expr, step_expr);
10520 if (seq)
10521 {
10522 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
10523 gcc_assert (!new_bb);
10524 }
10525
10526 t = unshare_expr (new_name);
10527 gcc_assert (CONSTANT_CLASS_P (new_name)
10528 || TREE_CODE (new_name) == SSA_NAME);
10529 new_vec = build_vector_from_val (step_vectype, t);
10530 vec_step = vect_init_vector (loop_vinfo, stmt_info,
10531 new_vec, step_vectype, NULL);
10532
10533 vec_def = induc_def;
10534 for (i = 1; i < ncopies + 1; i++)
10535 {
10536 /* vec_i = vec_prev + vec_step */
10537 gimple_seq stmts = NULL;
10538 vec_def = gimple_convert (&stmts, step_vectype, vec_def);
10539 vec_def = gimple_build (&stmts,
10540 PLUS_EXPR, step_vectype, vec_def, vec_step);
10541 vec_def = gimple_convert (&stmts, vectype, vec_def);
10542
10543 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10544 if (i < ncopies)
10545 {
10546 new_stmt = SSA_NAME_DEF_STMT (vec_def);
10547 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
10548 }
10549 else
10550 {
10551 /* vec_1 = vec_iv + (VF/n * S)
10552 vec_2 = vec_1 + (VF/n * S)
10553 ...
10554 vec_n = vec_prev + (VF/n * S) = vec_iv + VF * S = vec_loop
10555
10556 vec_n is used as vec_loop to save the large step register and
10557 related operations. */
10558 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
10559 UNKNOWN_LOCATION);
10560 }
10561 }
10562 }
10563
10564 if (dump_enabled_p ())
10565 dump_printf_loc (MSG_NOTE, vect_location,
10566 "transform induction: created def-use cycle: %G%G",
10567 (gimple *) induction_phi, SSA_NAME_DEF_STMT (vec_def));
10568
10569 return true;
10570 }
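/* End-to-end editorial sketch for the plain linear case: for

     for (i = 0; i < n; i++) a[i] = i;

   and a V4SI vector type the code above materializes in the preheader

     vec_init = { 0, 1, 2, 3 };
     vec_step = { 4, 4, 4, 4 };

   and in the loop body

     vec_iv   = PHI <vec_init (preheader), vec_next (latch)>;
     vec_next = vec_iv + vec_step;  */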
10571
10572 /* Function vectorizable_live_operation_1.
10573
10574 Helper function for vectorizable_live_operation. */
10575
10576 static tree
10577 vectorizable_live_operation_1 (loop_vec_info loop_vinfo,
10578 stmt_vec_info stmt_info, basic_block exit_bb,
10579 tree vectype, int ncopies, slp_tree slp_node,
10580 tree bitsize, tree bitstart, tree vec_lhs,
10581 tree lhs_type, gimple_stmt_iterator *exit_gsi)
10582 {
10583 gcc_assert (single_pred_p (exit_bb) || LOOP_VINFO_EARLY_BREAKS (loop_vinfo));
10584
10585 tree vec_lhs_phi = copy_ssa_name (vec_lhs);
10586 gimple *phi = create_phi_node (vec_lhs_phi, exit_bb);
10587 for (unsigned i = 0; i < gimple_phi_num_args (phi); i++)
10588 SET_PHI_ARG_DEF (phi, i, vec_lhs);
10589
10590 gimple_seq stmts = NULL;
10591 tree new_tree;
10592
10593 /* If bitstart is 0 then we can use a BIT_FIELD_REF.  */
10594 if (integer_zerop (bitstart))
10595 {
10596 tree scalar_res = gimple_build (&stmts, BIT_FIELD_REF, TREE_TYPE (vectype),
10597 vec_lhs_phi, bitsize, bitstart);
10598
10599 /* Convert the extracted vector element to the scalar type. */
10600 new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
10601 }
10602 else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
10603 {
10604 /* Emit:
10605
10606 SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>
10607
10608 where VEC_LHS is the vectorized live-out result and LEN is
10609 the loop length for the final iteration. */
10610 gcc_assert (ncopies == 1 && !slp_node);
10611 gimple_seq tem = NULL;
10612 gimple_stmt_iterator gsi = gsi_last (tem);
10613 tree len = vect_get_loop_len (loop_vinfo, &gsi,
10614 &LOOP_VINFO_LENS (loop_vinfo),
10615 1, vectype, 0, 0);
10616
10617 /* BIAS - 1. */
10618 signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
10619 tree bias_minus_one
10620 = int_const_binop (MINUS_EXPR,
10621 build_int_cst (TREE_TYPE (len), biasval),
10622 build_one_cst (TREE_TYPE (len)));
10623
10624 /* LAST_INDEX = LEN + (BIAS - 1). */
10625 tree last_index = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (len),
10626 len, bias_minus_one);
10627
10628 /* SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>. */
10629 tree scalar_res
10630 = gimple_build (&stmts, CFN_VEC_EXTRACT, TREE_TYPE (vectype),
10631 vec_lhs_phi, last_index);
10632
10633 /* Convert the extracted vector element to the scalar type. */
10634 new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
10635 }
10636 else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
10637 {
10638 /* Emit:
10639
10640 SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
10641
10642 where VEC_LHS is the vectorized live-out result and MASK is
10643 the loop mask for the final iteration. */
10644 gcc_assert (!slp_node);
10645 tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
10646 gimple_seq tem = NULL;
10647 gimple_stmt_iterator gsi = gsi_last (tem);
10648 tree mask = vect_get_loop_mask (loop_vinfo, &gsi,
10649 &LOOP_VINFO_MASKS (loop_vinfo),
10650 1, vectype, 0);
10651 tree scalar_res;
10652 gimple_seq_add_seq (&stmts, tem);
10653
10654 scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
10655 mask, vec_lhs_phi);
10656
10657 /* Convert the extracted vector element to the scalar type. */
10658 new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
10659 }
10660 else
10661 {
10662 tree bftype = TREE_TYPE (vectype);
10663 if (VECTOR_BOOLEAN_TYPE_P (vectype))
10664 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
10665 new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs_phi, bitsize, bitstart);
10666 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
10667 &stmts, true, NULL_TREE);
10668 }
10669
10670 *exit_gsi = gsi_after_labels (exit_bb);
10671 if (stmts)
10672 gsi_insert_seq_before (exit_gsi, stmts, GSI_SAME_STMT);
10673
10674 return new_tree;
10675 }
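/* Editorial illustration (assumed V4SI, no loop masks or lengths):
   extracting the live value of the last lane uses a BIT_FIELD_REF such as

     scalar_res = BIT_FIELD_REF <vec_lhs_phi, 32, 96>;

   i.e. bitsize = 32 and bitstart = 3 * 32, whereas fully-masked or
   length-controlled loops go through EXTRACT_LAST or VEC_EXTRACT with a
   runtime last index instead.  */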
10676
10677 /* Function vectorizable_live_operation.
10678
10679 STMT_INFO computes a value that is used outside the loop. Check if
10680 it can be supported. */
10681
10682 bool
10683 vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
10684 slp_tree slp_node, slp_instance slp_node_instance,
10685 int slp_index, bool vec_stmt_p,
10686 stmt_vector_for_cost *cost_vec)
10687 {
10688 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
10689 imm_use_iterator imm_iter;
10690 tree lhs, lhs_type, bitsize;
10691 tree vectype = (slp_node
10692 ? SLP_TREE_VECTYPE (slp_node)
10693 : STMT_VINFO_VECTYPE (stmt_info));
10694 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
10695 int ncopies;
10696 gimple *use_stmt;
10697 use_operand_p use_p;
10698 auto_vec<tree> vec_oprnds;
10699 int vec_entry = 0;
10700 poly_uint64 vec_index = 0;
10701
10702 gcc_assert (STMT_VINFO_LIVE_P (stmt_info)
10703 || LOOP_VINFO_EARLY_BREAKS (loop_vinfo));
10704
10705 /* If a stmt of a reduction is live, vectorize it via
10706 vect_create_epilog_for_reduction. vectorizable_reduction assessed
10707 validity so just trigger the transform here. */
10708 if (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info)))
10709 {
10710 if (!vec_stmt_p)
10711 return true;
10712 /* For SLP reductions we vectorize the epilogue for all involved stmts
10713 together. */
10714 if (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info) && slp_index != 0)
10715 return true;
10716 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
10717 gcc_assert (reduc_info->is_reduc_info);
10718 if (STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION
10719 || STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION)
10720 return true;
10721
10722 if (!LOOP_VINFO_EARLY_BREAKS (loop_vinfo)
10723 || !LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo))
10724 vect_create_epilog_for_reduction (loop_vinfo, stmt_info, slp_node,
10725 slp_node_instance,
10726 LOOP_VINFO_IV_EXIT (loop_vinfo));
10727
10728 /* If early break we only have to materialize the reduction on the merge
10729 block, but we have to find an alternate exit first. */
10730 if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
10731 {
10732 slp_tree phis_node = slp_node ? slp_node_instance->reduc_phis : NULL;
10733 for (auto exit : get_loop_exit_edges (LOOP_VINFO_LOOP (loop_vinfo)))
10734 if (exit != LOOP_VINFO_IV_EXIT (loop_vinfo))
10735 {
10736 vect_create_epilog_for_reduction (loop_vinfo, reduc_info,
10737 phis_node, slp_node_instance,
10738 exit);
10739 break;
10740 }
10741 if (LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo))
10742 vect_create_epilog_for_reduction (loop_vinfo, reduc_info,
10743 phis_node, slp_node_instance,
10744 LOOP_VINFO_IV_EXIT (loop_vinfo));
10745 }
10746
10747 return true;
10748 }
10749
10750 /* If STMT is not relevant and it is a simple assignment and its inputs are
10751 invariant then it can remain in place, unvectorized. The original last
10752 scalar value that it computes will be used. */
10753 if (!STMT_VINFO_RELEVANT_P (stmt_info))
10754 {
10755 gcc_assert (is_simple_and_all_uses_invariant (stmt_info, loop_vinfo));
10756 if (dump_enabled_p ())
10757 dump_printf_loc (MSG_NOTE, vect_location,
10758 "statement is simple and uses invariant. Leaving in "
10759 "place.\n");
10760 return true;
10761 }
10762
10763 if (slp_node)
10764 ncopies = 1;
10765 else
10766 ncopies = vect_get_num_copies (loop_vinfo, vectype);
10767
10768 if (slp_node)
10769 {
10770 gcc_assert (slp_index >= 0);
10771
10772 /* Get the last occurrence of the scalar index from the concatenation of
10773 all the slp vectors. Calculate which slp vector it is and the index
10774 within. */
10775 int num_scalar = SLP_TREE_LANES (slp_node);
10776 int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
10777 poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
10778
10779 /* Calculate which vector contains the result, and which lane of
10780 that vector we need. */
10781 if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
10782 {
10783 if (dump_enabled_p ())
10784 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10785 "Cannot determine which vector holds the"
10786 " final result.\n");
10787 return false;
10788 }
10789 }
10790
10791 if (!vec_stmt_p)
10792 {
10793 /* No transformation required. */
10794 if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
10795 {
10796 if (slp_node)
10797 {
10798 if (dump_enabled_p ())
10799 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10800 "can't operate on partial vectors "
10801 "because an SLP statement is live after "
10802 "the loop.\n");
10803 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10804 }
10805 else if (ncopies > 1)
10806 {
10807 if (dump_enabled_p ())
10808 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10809 "can't operate on partial vectors "
10810 "because ncopies is greater than 1.\n");
10811 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10812 }
10813 else
10814 {
10815 gcc_assert (ncopies == 1 && !slp_node);
10816 if (direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
10817 OPTIMIZE_FOR_SPEED))
10818 vect_record_loop_mask (loop_vinfo,
10819 &LOOP_VINFO_MASKS (loop_vinfo),
10820 1, vectype, NULL);
10821 else if (can_vec_extract_var_idx_p (
10822 TYPE_MODE (vectype), TYPE_MODE (TREE_TYPE (vectype))))
10823 vect_record_loop_len (loop_vinfo,
10824 &LOOP_VINFO_LENS (loop_vinfo),
10825 1, vectype, 1);
10826 else
10827 {
10828 if (dump_enabled_p ())
10829 dump_printf_loc (
10830 MSG_MISSED_OPTIMIZATION, vect_location,
10831 "can't operate on partial vectors "
10832 "because the target doesn't support extract "
10833 "last reduction.\n");
10834 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10835 }
10836 }
10837 }
10838 /* ??? Enable for loop costing as well. */
10839 if (!loop_vinfo)
10840 record_stmt_cost (cost_vec, 1, vec_to_scalar, stmt_info, NULL_TREE,
10841 0, vect_epilogue);
10842 return true;
10843 }
10844
10845 /* Use the lhs of the original scalar statement. */
10846 gimple *stmt = vect_orig_stmt (stmt_info)->stmt;
10847 if (dump_enabled_p ())
10848 dump_printf_loc (MSG_NOTE, vect_location, "extracting lane for live "
10849 "stmt %G", stmt);
10850
10851 lhs = gimple_get_lhs (stmt);
10852 lhs_type = TREE_TYPE (lhs);
10853
10854 bitsize = vector_element_bits_tree (vectype);
10855
10856 /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
10857 tree vec_lhs, vec_lhs0, bitstart;
10858 gimple *vec_stmt, *vec_stmt0;
10859 if (slp_node)
10860 {
10861 gcc_assert (!loop_vinfo
10862 || (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
10863 && !LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo)));
10864
10865 /* Get the correct slp vectorized stmt. */
10866 vec_lhs = SLP_TREE_VEC_DEFS (slp_node)[vec_entry];
10867 vec_stmt = SSA_NAME_DEF_STMT (vec_lhs);
10868
10869 /* In case we need to vectorize an early break, also get the first stmt. */
10870 vec_lhs0 = SLP_TREE_VEC_DEFS (slp_node)[0];
10871 vec_stmt0 = SSA_NAME_DEF_STMT (vec_lhs0);
10872
10873 /* Get entry to use. */
10874 bitstart = bitsize_int (vec_index);
10875 bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
10876 }
10877 else
10878 {
10879 /* For multiple copies, get the last copy. */
10880 vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info).last ();
10881 vec_lhs = gimple_get_lhs (vec_stmt);
10882
10883 /* In case we need to vectorize an early break, also get the first stmt. */
10884 vec_stmt0 = STMT_VINFO_VEC_STMTS (stmt_info)[0];
10885 vec_lhs0 = gimple_get_lhs (vec_stmt0);
10886
10887 /* Get the last lane in the vector. */
10888 bitstart = int_const_binop (MULT_EXPR, bitsize, bitsize_int (nunits - 1));
10889 }
10890
10891 if (loop_vinfo)
10892 {
10893 /* Ensure the VEC_LHS for lane extraction stmts satisfies the loop-closed
10894 PHI requirement by inserting one PHI node for it. It looks like:
10895 loop;
10896 BB:
10897 # lhs' = PHI <lhs>
10898 ==>
10899 loop;
10900 BB:
10901 # vec_lhs' = PHI <vec_lhs>
10902 new_tree = lane_extract <vec_lhs', ...>;
10903 lhs' = new_tree; */
10904
10905 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
10906 /* Check if we have a loop where the chosen exit is not the main exit;
10907 in these cases an early break restarts the iteration the vector code
10908 was executing. For the live values we want the value at the start of
10909 the iteration rather than at the end. */
10910 edge main_e = LOOP_VINFO_IV_EXIT (loop_vinfo);
10911 bool all_exits_as_early_p = LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo);
10912 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
10913 if (!is_gimple_debug (use_stmt)
10914 && !flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
10915 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
10916 {
10917 edge e = gimple_phi_arg_edge (as_a <gphi *> (use_stmt),
10918 phi_arg_index_from_use (use_p));
10919 gcc_assert (loop_exit_edge_p (loop, e));
10920 bool main_exit_edge = e == main_e;
10921 tree tmp_vec_lhs = vec_lhs;
10922 tree tmp_bitstart = bitstart;
10923
10924 /* For an early exit where the exit is not in the BB that leads
10925 to the latch, we're restarting the iteration in the scalar
10926 loop, so get the first live value. */
10927 if ((all_exits_as_early_p || !main_exit_edge)
10928 && STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
10929 {
10930 tmp_vec_lhs = vec_lhs0;
10931 tmp_bitstart = build_zero_cst (TREE_TYPE (bitstart));
10932 }
10933
10934 gimple_stmt_iterator exit_gsi;
10935 tree new_tree
10936 = vectorizable_live_operation_1 (loop_vinfo, stmt_info,
10937 e->dest, vectype, ncopies,
10938 slp_node, bitsize,
10939 tmp_bitstart, tmp_vec_lhs,
10940 lhs_type, &exit_gsi);
10941
10942 auto gsi = gsi_for_stmt (use_stmt);
10943 remove_phi_node (&gsi, false);
10944 tree lhs_phi = gimple_phi_result (use_stmt);
10945 gimple *copy = gimple_build_assign (lhs_phi, new_tree);
10946 gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT);
10947 break;
10948 }
10949
10950 /* There are no further out-of-loop uses of LHS by LC-SSA construction. */
10951 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
10952 gcc_assert (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)));
10953 }
10954 else
10955 {
10956 /* For basic-block vectorization simply insert the lane-extraction. */
10957 tree bftype = TREE_TYPE (vectype);
10958 if (VECTOR_BOOLEAN_TYPE_P (vectype))
10959 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
10960 tree new_tree = build3 (BIT_FIELD_REF, bftype,
10961 vec_lhs, bitsize, bitstart);
10962 gimple_seq stmts = NULL;
10963 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
10964 &stmts, true, NULL_TREE);
10965 if (TREE_CODE (new_tree) == SSA_NAME
10966 && SSA_NAME_OCCURS_IN_ABNORMAL_PHI (lhs))
10967 SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_tree) = 1;
10968 if (is_a <gphi *> (vec_stmt))
10969 {
10970 gimple_stmt_iterator si = gsi_after_labels (gimple_bb (vec_stmt));
10971 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10972 }
10973 else
10974 {
10975 gimple_stmt_iterator si = gsi_for_stmt (vec_stmt);
10976 gsi_insert_seq_after (&si, stmts, GSI_SAME_STMT);
10977 }
10978
10979 /* Replace use of lhs with newly computed result. If the use stmt is a
10980 single arg PHI, just replace all uses of PHI result. It's necessary
10981 because lcssa PHI defining lhs may be before newly inserted stmt. */
10982 use_operand_p use_p;
10983 stmt_vec_info use_stmt_info;
10984 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
10985 if (!is_gimple_debug (use_stmt)
10986 && (!(use_stmt_info = vinfo->lookup_stmt (use_stmt))
10987 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info))))
10988 {
10989 /* ??? This can happen when the live lane ends up being
10990 rooted in a vector construction code-generated by an
10991 external SLP node (and code-generation for that already
10992 happened). See gcc.dg/vect/bb-slp-47.c.
10993 Doing this is what would happen if that vector CTOR
10994 were not code-generated yet so it is not too bad.
10995 ??? In fact we'd likely want to avoid this situation
10996 in the first place. */
10997 if (TREE_CODE (new_tree) == SSA_NAME
10998 && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
10999 && gimple_code (use_stmt) != GIMPLE_PHI
11000 && !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (new_tree),
11001 use_stmt))
11002 {
11003 if (dump_enabled_p ())
11004 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
11005 "Using original scalar computation for "
11006 "live lane because use preceeds vector "
11007 "def\n");
11008 continue;
11009 }
11010 /* ??? It can also happen that we end up pulling a def into
11011 a loop where replacing out-of-loop uses would require
11012 a new LC SSA PHI node. Retain the original scalar in
11013 those cases as well. PR98064. */
11014 if (TREE_CODE (new_tree) == SSA_NAME
11015 && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
11016 && (gimple_bb (use_stmt)->loop_father
11017 != gimple_bb (vec_stmt)->loop_father)
11018 && !flow_loop_nested_p (gimple_bb (vec_stmt)->loop_father,
11019 gimple_bb (use_stmt)->loop_father))
11020 {
11021 if (dump_enabled_p ())
11022 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
11023 "Using original scalar computation for "
11024 "live lane because there is an out-of-loop "
11025 "definition for it\n");
11026 continue;
11027 }
11028 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
11029 SET_USE (use_p, new_tree);
11030 update_stmt (use_stmt);
11031 }
11032 }
11033
11034 return true;
11035 }
11036
11037 /* Kill any debug uses outside LOOP of SSA names defined in STMT_INFO. */
11038
11039 static void
11040 vect_loop_kill_debug_uses (class loop *loop, stmt_vec_info stmt_info)
11041 {
11042 ssa_op_iter op_iter;
11043 imm_use_iterator imm_iter;
11044 def_operand_p def_p;
11045 gimple *ustmt;
11046
11047 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt_info->stmt, op_iter, SSA_OP_DEF)
11048 {
11049 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
11050 {
11051 basic_block bb;
11052
11053 if (!is_gimple_debug (ustmt))
11054 continue;
11055
11056 bb = gimple_bb (ustmt);
11057
11058 if (!flow_bb_inside_loop_p (loop, bb))
11059 {
11060 if (gimple_debug_bind_p (ustmt))
11061 {
11062 if (dump_enabled_p ())
11063 dump_printf_loc (MSG_NOTE, vect_location,
11064 "killing debug use\n");
11065
11066 gimple_debug_bind_reset_value (ustmt);
11067 update_stmt (ustmt);
11068 }
11069 else
11070 gcc_unreachable ();
11071 }
11072 }
11073 }
11074 }
11075
11076 /* Given the loop represented by LOOP_VINFO, return true if the computation of
11077 LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
11078 otherwise. */
11079
11080 static bool
11081 loop_niters_no_overflow (loop_vec_info loop_vinfo)
11082 {
11083 /* Constant case. */
11084 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
11085 {
11086 tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
11087 tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
11088
11089 gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
11090 gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
11091 if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
11092 return true;
11093 }
11094
11095 widest_int max;
11096 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
11097 /* Check the upper bound of loop niters. */
11098 if (get_max_loop_iterations (loop, &max))
11099 {
11100 tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
11101 signop sgn = TYPE_SIGN (type);
11102 widest_int type_max = widest_int::from (wi::max_value (type), sgn);
11103 if (max < type_max)
11104 return true;
11105 }
11106 return false;
11107 }
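/* Editorial example: with a 32-bit unsigned niters type and
   NITERSM1 == 0xffffffff, NITERS = NITERSM1 + 1 wraps to 0, so the
   function above returns false unless get_max_loop_iterations proves
   that this extreme bound cannot be reached.  */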
11108
11109 /* Return a mask type with half the number of elements as OLD_TYPE,
11110 given that it should have mode NEW_MODE. */
11111
11112 tree
11113 vect_halve_mask_nunits (tree old_type, machine_mode new_mode)
11114 {
11115 poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (old_type), 2);
11116 return build_truth_vector_type_for_mode (nunits, new_mode);
11117 }
11118
11119 /* Return a mask type with twice as many elements as OLD_TYPE,
11120 given that it should have mode NEW_MODE. */
11121
11122 tree
11123 vect_double_mask_nunits (tree old_type, machine_mode new_mode)
11124 {
11125 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (old_type) * 2;
11126 return build_truth_vector_type_for_mode (nunits, new_mode);
11127 }
11128
11129 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
11130 contain a sequence of NVECTORS masks that each control a vector of type
11131 VECTYPE. If SCALAR_MASK is nonnull, the fully-masked loop would AND
11132 these vector masks with the vector version of SCALAR_MASK. */
11133
11134 void
11135 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
11136 unsigned int nvectors, tree vectype, tree scalar_mask)
11137 {
11138 gcc_assert (nvectors != 0);
11139
11140 if (scalar_mask)
11141 {
11142 scalar_cond_masked_key cond (scalar_mask, nvectors);
11143 loop_vinfo->scalar_cond_masked_set.add (cond);
11144 }
11145
11146 masks->mask_set.add (std::make_pair (vectype, nvectors));
11147 }
11148
11149 /* Given a complete set of masks MASKS, extract mask number INDEX
11150 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
11151 where 0 <= INDEX < NVECTORS. Insert any set-up statements before GSI.
11152
11153 See the comment above vec_loop_masks for more details about the mask
11154 arrangement. */
11155
11156 tree
11157 vect_get_loop_mask (loop_vec_info loop_vinfo,
11158 gimple_stmt_iterator *gsi, vec_loop_masks *masks,
11159 unsigned int nvectors, tree vectype, unsigned int index)
11160 {
11161 if (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
11162 == vect_partial_vectors_while_ult)
11163 {
11164 rgroup_controls *rgm = &(masks->rgc_vec)[nvectors - 1];
11165 tree mask_type = rgm->type;
11166
11167 /* Populate the rgroup's mask array, if this is the first time we've
11168 used it. */
11169 if (rgm->controls.is_empty ())
11170 {
11171 rgm->controls.safe_grow_cleared (nvectors, true);
11172 for (unsigned int i = 0; i < nvectors; ++i)
11173 {
11174 tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
11175 /* Provide a dummy definition until the real one is available. */
11176 SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
11177 rgm->controls[i] = mask;
11178 }
11179 }
11180
11181 tree mask = rgm->controls[index];
11182 if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
11183 TYPE_VECTOR_SUBPARTS (vectype)))
11184 {
11185 /* A loop mask for data type X can be reused for data type Y
11186 if X has N times more elements than Y and if Y's elements
11187 are N times bigger than X's. In this case each sequence
11188 of N elements in the loop mask will be all-zero or all-one.
11189 We can then view-convert the mask so that each sequence of
11190 N elements is replaced by a single element. */
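/* For example (assuming an SVE-like layout in which the mask element width
   tracks the data element width): a mask built for 8 x HI data has 8
   elements; reusing it for 4 x SI data gives N == 2, each adjacent pair of
   mask elements is known to be identical, and the VIEW_CONVERT_EXPR below
   reinterprets every such pair as one element of the 4-element mask type.  */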
11191 gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
11192 TYPE_VECTOR_SUBPARTS (vectype)));
11193 gimple_seq seq = NULL;
11194 mask_type = truth_type_for (vectype);
11195 mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
11196 if (seq)
11197 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
11198 }
11199 return mask;
11200 }
11201 else if (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
11202 == vect_partial_vectors_avx512)
11203 {
11204 /* The number of scalars per iteration and the number of vectors are
11205 both compile-time constants. */
11206 unsigned int nscalars_per_iter
11207 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
11208 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
11209
11210 rgroup_controls *rgm = &masks->rgc_vec[nscalars_per_iter - 1];
11211
11212 /* The stored nV is dependent on the mask type produced. */
11213 gcc_assert (exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
11214 TYPE_VECTOR_SUBPARTS (rgm->type)).to_constant ()
11215 == rgm->factor);
11216 nvectors = rgm->factor;
11217
11218 /* Populate the rgroup's mask array, if this is the first time we've
11219 used it. */
11220 if (rgm->controls.is_empty ())
11221 {
11222 rgm->controls.safe_grow_cleared (nvectors, true);
11223 for (unsigned int i = 0; i < nvectors; ++i)
11224 {
11225 tree mask = make_temp_ssa_name (rgm->type, NULL, "loop_mask");
11226 /* Provide a dummy definition until the real one is available. */
11227 SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
11228 rgm->controls[i] = mask;
11229 }
11230 }
11231 if (known_eq (TYPE_VECTOR_SUBPARTS (rgm->type),
11232 TYPE_VECTOR_SUBPARTS (vectype)))
11233 return rgm->controls[index];
11234
11235 /* Split the vector if needed. Since we are dealing with integer-mode
11236 masks with AVX512 we can operate on the integer representation,
11237 performing the split with a whole-vector shift. */
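/* For example, if RGM->type is a 16-element mask held in HImode and
   VECTYPE has 8 elements, FACTOR is 2; INDEX 3 then selects bits 8..15
   of the second stored mask (vi == 1, vpart == 1), which the code below
   shifts down by 8 bits and view-converts to the QImode 8-element mask
   type.  */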
11238 unsigned HOST_WIDE_INT factor;
11239 bool ok = constant_multiple_p (TYPE_VECTOR_SUBPARTS (rgm->type),
11240 TYPE_VECTOR_SUBPARTS (vectype), &factor);
11241 gcc_assert (ok);
11242 gcc_assert (GET_MODE_CLASS (TYPE_MODE (rgm->type)) == MODE_INT);
11243 tree mask_type = truth_type_for (vectype);
11244 gcc_assert (GET_MODE_CLASS (TYPE_MODE (mask_type)) == MODE_INT);
11245 unsigned vi = index / factor;
11246 unsigned vpart = index % factor;
11247 tree vec = rgm->controls[vi];
11248 gimple_seq seq = NULL;
11249 vec = gimple_build (&seq, VIEW_CONVERT_EXPR,
11250 lang_hooks.types.type_for_mode
11251 (TYPE_MODE (rgm->type), 1), vec);
11252 /* For integer mode masks simply shift the right bits into position. */
11253 if (vpart != 0)
11254 vec = gimple_build (&seq, RSHIFT_EXPR, TREE_TYPE (vec), vec,
11255 build_int_cst (integer_type_node,
11256 (TYPE_VECTOR_SUBPARTS (vectype)
11257 * vpart)));
11258 vec = gimple_convert (&seq, lang_hooks.types.type_for_mode
11259 (TYPE_MODE (mask_type), 1), vec);
11260 vec = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, vec);
11261 if (seq)
11262 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
11263 return vec;
11264 }
11265 else
11266 gcc_unreachable ();
11267 }
11268
11269 /* Record that LOOP_VINFO would need LENS to contain a sequence of NVECTORS
11270 lengths for controlling an operation on VECTYPE. The operation splits
11271 each element of VECTYPE into FACTOR separate subelements, measuring the
11272 length as a number of these subelements. */
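/* For instance (a sketch of the VnQI fallback case): a load of V4SI
   elements whose length has to be measured in bytes would be recorded
   with FACTOR == 4, so the resulting length counts 4 subelements (bytes)
   per SI element.  */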
11273
11274 void
11275 vect_record_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
11276 unsigned int nvectors, tree vectype, unsigned int factor)
11277 {
11278 gcc_assert (nvectors != 0);
11279 if (lens->length () < nvectors)
11280 lens->safe_grow_cleared (nvectors, true);
11281 rgroup_controls *rgl = &(*lens)[nvectors - 1];
11282
11283 /* The number of scalars per iteration, the number of bytes each scalar
11284 occupies and the number of vectors are all compile-time constants. */
11285 unsigned int nscalars_per_iter
11286 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
11287 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
11288
11289 if (rgl->max_nscalars_per_iter < nscalars_per_iter)
11290 {
11291 /* For now, we only support cases in which all loads and stores fall back
11292 to VnQI or none do. */
11293 gcc_assert (!rgl->max_nscalars_per_iter
11294 || (rgl->factor == 1 && factor == 1)
11295 || (rgl->max_nscalars_per_iter * rgl->factor
11296 == nscalars_per_iter * factor));
11297 rgl->max_nscalars_per_iter = nscalars_per_iter;
11298 rgl->type = vectype;
11299 rgl->factor = factor;
11300 }
11301 }
11302
11303 /* Given a complete set of lengths LENS, extract length number INDEX
11304 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
11305 where 0 <= INDEX < NVECTORS. Return a value that contains FACTOR
11306 multiplied by the number of elements that should be processed.
11307 Insert any set-up statements before GSI. */
11308
11309 tree
11310 vect_get_loop_len (loop_vec_info loop_vinfo, gimple_stmt_iterator *gsi,
11311 vec_loop_lens *lens, unsigned int nvectors, tree vectype,
11312 unsigned int index, unsigned int factor)
11313 {
11314 rgroup_controls *rgl = &(*lens)[nvectors - 1];
11315 bool use_bias_adjusted_len =
11316 LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) != 0;
11317
11318 /* Populate the rgroup's len array, if this is the first time we've
11319 used it. */
11320 if (rgl->controls.is_empty ())
11321 {
11322 rgl->controls.safe_grow_cleared (nvectors, true);
11323 for (unsigned int i = 0; i < nvectors; ++i)
11324 {
11325 tree len_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
11326 gcc_assert (len_type != NULL_TREE);
11327
11328 tree len = make_temp_ssa_name (len_type, NULL, "loop_len");
11329
11330 /* Provide a dummy definition until the real one is available. */
11331 SSA_NAME_DEF_STMT (len) = gimple_build_nop ();
11332 rgl->controls[i] = len;
11333
11334 if (use_bias_adjusted_len)
11335 {
11336 gcc_assert (i == 0);
11337 tree adjusted_len =
11338 make_temp_ssa_name (len_type, NULL, "adjusted_loop_len");
11339 SSA_NAME_DEF_STMT (adjusted_len) = gimple_build_nop ();
11340 rgl->bias_adjusted_ctrl = adjusted_len;
11341 }
11342 }
11343 }
11344
11345 if (use_bias_adjusted_len)
11346 return rgl->bias_adjusted_ctrl;
11347
11348 tree loop_len = rgl->controls[index];
11349 if (rgl->factor == 1 && factor == 1)
11350 {
11351 poly_int64 nunits1 = TYPE_VECTOR_SUBPARTS (rgl->type);
11352 poly_int64 nunits2 = TYPE_VECTOR_SUBPARTS (vectype);
11353 if (maybe_ne (nunits1, nunits2))
11354 {
11355 /* A loop len for data type X can be reused for data type Y
11356 if X has N times more elements than Y and if Y's elements
11357 are N times bigger than X's. */
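/* E.g. a length recorded for an 8-element vector type but needed for a
   4-element type covering the same bytes: FACTOR becomes 2, so a length
   of 6 active elements is divided down to 3.  */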
11358 gcc_assert (multiple_p (nunits1, nunits2));
11359 factor = exact_div (nunits1, nunits2).to_constant ();
11360 tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
11361 gimple_seq seq = NULL;
11362 loop_len = gimple_build (&seq, RDIV_EXPR, iv_type, loop_len,
11363 build_int_cst (iv_type, factor));
11364 if (seq)
11365 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
11366 }
11367 }
11368 return loop_len;
11369 }
11370
11371 /* Scale profiling counters by estimation for LOOP which is vectorized
11372 by factor VF.
11373 If FLAT is true, the loop we started with had unrealistically flat
11374 profile. */
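/* As a rough example: with a reliable profile, a preheader count of 10,
   a header count of 1000 (about 100 iterations per entry) and VF == 4,
   the vectorized body is expected to run about 250 times per entry, so
   the exit probability is set to roughly 10/250 == 4% and the body
   counts are scaled down by 1/4.  */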
11375
11376 static void
11377 scale_profile_for_vect_loop (class loop *loop, edge exit_e, unsigned vf, bool flat)
11378 {
11379 /* For flat profiles do not scale down proportionally by VF; only
11380 cap by the known iteration count bounds. */
11381 if (flat)
11382 {
11383 if (dump_file && (dump_flags & TDF_DETAILS))
11384 fprintf (dump_file,
11385 "Vectorized loop profile seems flat; not scaling iteration "
11386 "count down by the vectorization factor %i\n", vf);
11387 scale_loop_profile (loop, profile_probability::always (),
11388 get_likely_max_loop_iterations_int (loop));
11389 return;
11390 }
11391 /* The loop body executes VF times fewer iterations and the exit probability increases VF times. */
11392 profile_count entry_count = loop_preheader_edge (loop)->count ();
11393
11394 /* If we have an unreliable loop profile, avoid dropping the entry
11395 count below the header count. This can happen when the loop
11396 has an unrealistically low trip count. */
11397 while (vf > 1
11398 && loop->header->count > entry_count
11399 && loop->header->count < entry_count * vf)
11400 {
11401 if (dump_file && (dump_flags & TDF_DETAILS))
11402 fprintf (dump_file,
11403 "Vectorization factor %i seems too large for profile "
11404 "previously believed to be consistent; reducing.\n", vf);
11405 vf /= 2;
11406 }
11407
11408 if (entry_count.nonzero_p ())
11409 set_edge_probability_and_rescale_others
11410 (exit_e,
11411 entry_count.probability_in (loop->header->count / vf));
11412 /* Avoid producing a very large exit probability when we do not have a
11413 sensible profile. */
11414 else if (exit_e->probability < profile_probability::always () / (vf * 2))
11415 set_edge_probability_and_rescale_others (exit_e, exit_e->probability * vf);
11416 loop->latch->count = single_pred_edge (loop->latch)->count ();
11417
11418 scale_loop_profile (loop, profile_probability::always () / vf,
11419 get_likely_max_loop_iterations_int (loop));
11420 }
11421
11422 /* For a vectorized stmt DEF_STMT_INFO adjust all vectorized PHI
11423 latch edge values originally defined by it. */
11424
11425 static void
11426 maybe_set_vectorized_backedge_value (loop_vec_info loop_vinfo,
11427 stmt_vec_info def_stmt_info)
11428 {
11429 tree def = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
11430 if (!def || TREE_CODE (def) != SSA_NAME)
11431 return;
11432 stmt_vec_info phi_info;
11433 imm_use_iterator iter;
11434 use_operand_p use_p;
11435 FOR_EACH_IMM_USE_FAST (use_p, iter, def)
11436 {
11437 gphi *phi = dyn_cast <gphi *> (USE_STMT (use_p));
11438 if (!phi)
11439 continue;
11440 if (!(gimple_bb (phi)->loop_father->header == gimple_bb (phi)
11441 && (phi_info = loop_vinfo->lookup_stmt (phi))
11442 && STMT_VINFO_RELEVANT_P (phi_info)))
11443 continue;
11444 loop_p loop = gimple_bb (phi)->loop_father;
11445 edge e = loop_latch_edge (loop);
11446 if (PHI_ARG_DEF_FROM_EDGE (phi, e) != def)
11447 continue;
11448
11449 if (VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (phi_info))
11450 && STMT_VINFO_REDUC_TYPE (phi_info) != FOLD_LEFT_REDUCTION
11451 && STMT_VINFO_REDUC_TYPE (phi_info) != EXTRACT_LAST_REDUCTION)
11452 {
11453 vec<gimple *> &phi_defs = STMT_VINFO_VEC_STMTS (phi_info);
11454 vec<gimple *> &latch_defs = STMT_VINFO_VEC_STMTS (def_stmt_info);
11455 gcc_assert (phi_defs.length () == latch_defs.length ());
11456 for (unsigned i = 0; i < phi_defs.length (); ++i)
11457 add_phi_arg (as_a <gphi *> (phi_defs[i]),
11458 gimple_get_lhs (latch_defs[i]), e,
11459 gimple_phi_arg_location (phi, e->dest_idx));
11460 }
11461 else if (STMT_VINFO_DEF_TYPE (phi_info) == vect_first_order_recurrence)
11462 {
11463 /* For first order recurrences we have to update both uses of
11464 the latch definition, the one in the PHI node and the one
11465 in the generated VEC_PERM_EXPR. */
11466 vec<gimple *> &phi_defs = STMT_VINFO_VEC_STMTS (phi_info);
11467 vec<gimple *> &latch_defs = STMT_VINFO_VEC_STMTS (def_stmt_info);
11468 gcc_assert (phi_defs.length () == latch_defs.length ());
11469 tree phidef = gimple_assign_rhs1 (phi_defs[0]);
11470 gphi *vphi = as_a <gphi *> (SSA_NAME_DEF_STMT (phidef));
11471 for (unsigned i = 0; i < phi_defs.length (); ++i)
11472 {
11473 gassign *perm = as_a <gassign *> (phi_defs[i]);
11474 if (i > 0)
11475 gimple_assign_set_rhs1 (perm, gimple_get_lhs (latch_defs[i-1]));
11476 gimple_assign_set_rhs2 (perm, gimple_get_lhs (latch_defs[i]));
11477 update_stmt (perm);
11478 }
11479 add_phi_arg (vphi, gimple_get_lhs (latch_defs.last ()), e,
11480 gimple_phi_arg_location (phi, e->dest_idx));
11481 }
11482 }
11483 }
11484
11485 /* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI.
11486 When vectorizing STMT_INFO as a store, set *SEEN_STORE to its
11487 stmt_vec_info. */
11488
11489 static bool
11490 vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
11491 gimple_stmt_iterator *gsi, stmt_vec_info *seen_store)
11492 {
11493 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
11494 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
11495
11496 if (dump_enabled_p ())
11497 dump_printf_loc (MSG_NOTE, vect_location,
11498 "------>vectorizing statement: %G", stmt_info->stmt);
11499
11500 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
11501 vect_loop_kill_debug_uses (loop, stmt_info);
11502
11503 if (!STMT_VINFO_RELEVANT_P (stmt_info)
11504 && !STMT_VINFO_LIVE_P (stmt_info))
11505 {
11506 if (is_gimple_call (stmt_info->stmt)
11507 && gimple_call_internal_p (stmt_info->stmt, IFN_MASK_CALL))
11508 {
11509 gcc_assert (!gimple_call_lhs (stmt_info->stmt));
11510 *seen_store = stmt_info;
11511 return false;
11512 }
11513 return false;
11514 }
11515
11516 if (STMT_VINFO_VECTYPE (stmt_info))
11517 {
11518 poly_uint64 nunits
11519 = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
11520 if (!STMT_SLP_TYPE (stmt_info)
11521 && maybe_ne (nunits, vf)
11522 && dump_enabled_p ())
11523 /* For SLP, VF is set according to the unrolling factor, not
11524 the vector size, hence for SLP this diagnostic is not valid. */
11525 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
11526 }
11527
11528 /* Pure SLP statements have already been vectorized. We still need
11529 to apply loop vectorization to hybrid SLP statements. */
11530 if (PURE_SLP_STMT (stmt_info))
11531 return false;
11532
11533 if (dump_enabled_p ())
11534 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
11535
11536 if (vect_transform_stmt (loop_vinfo, stmt_info, gsi, NULL, NULL))
11537 *seen_store = stmt_info;
11538
11539 return true;
11540 }
11541
11542 /* Helper function to pass to simplify_replace_tree to enable replacing trees
11543 in the hash_map with their corresponding values. */
11544
11545 static tree
11546 find_in_mapping (tree t, void *context)
11547 {
11548 hash_map<tree,tree>* mapping = (hash_map<tree, tree>*) context;
11549
11550 tree *value = mapping->get (t);
11551 return value ? *value : t;
11552 }
11553
11554 /* Update EPILOGUE's loop_vec_info. EPILOGUE was constructed as a copy of the
11555 original loop that has now been vectorized.
11556
11557 The inits of the data_references need to be advanced with the number of
11558 iterations of the main loop. This has been computed in vect_do_peeling and
11559 is stored in parameter ADVANCE. We first restore the data_references
11560 initial offset with the values recorded in ORIG_DRS_INIT.
11561
11562 Since the loop_vec_info of this EPILOGUE was constructed for the original
11563 loop, its stmt_vec_infos all point to the original statements. These need
11564 to be updated to point to their corresponding copies as well as the SSA_NAMES
11565 in their PATTERN_DEF_SEQs and RELATED_STMTs.
11566
11567 The data_reference's connections also need to be updated. Their
11568 corresponding dr_vec_infos need to be reconnected to the EPILOGUE's
11569 stmt_vec_infos, their statements need to point to their corresponding copy;
11570 if they are gather loads or scatter stores then their reference needs to be
11571 updated to point to its corresponding copy and finally we set
11572 'base_misaligned' to false as we have already peeled for alignment in the
11573 prologue of the main loop. */
11574
11575 static void
11576 update_epilogue_loop_vinfo (class loop *epilogue, tree advance)
11577 {
11578 loop_vec_info epilogue_vinfo = loop_vec_info_for_loop (epilogue);
11579 auto_vec<gimple *> stmt_worklist;
11580 hash_map<tree,tree> mapping;
11581 gimple *orig_stmt, *new_stmt;
11582 gimple_stmt_iterator epilogue_gsi;
11583 gphi_iterator epilogue_phi_gsi;
11584 stmt_vec_info stmt_vinfo = NULL, related_vinfo;
11585 basic_block *epilogue_bbs = get_loop_body (epilogue);
11586 unsigned i;
11587
11588 free (LOOP_VINFO_BBS (epilogue_vinfo));
11589 LOOP_VINFO_BBS (epilogue_vinfo) = epilogue_bbs;
11590
11591 /* Advance data_reference's with the number of iterations of the previous
11592 loop and its prologue. */
11593 vect_update_inits_of_drs (epilogue_vinfo, advance, PLUS_EXPR);
11594
11595
11596 /* The EPILOGUE loop is a copy of the original loop so they share the same
11597 gimple UIDs. In this loop we update the loop_vec_info of the EPILOGUE to
11598 point to the copied statements. We also create a mapping of all LHS' in
11599 the original loop and all the LHS' in the EPILOGUE and create worklists to
11600 update the STMT_VINFO_PATTERN_DEF_SEQs and STMT_VINFO_RELATED_STMTs. */
11601 for (unsigned i = 0; i < epilogue->num_nodes; ++i)
11602 {
11603 for (epilogue_phi_gsi = gsi_start_phis (epilogue_bbs[i]);
11604 !gsi_end_p (epilogue_phi_gsi); gsi_next (&epilogue_phi_gsi))
11605 {
11606 new_stmt = epilogue_phi_gsi.phi ();
11607
11608 gcc_assert (gimple_uid (new_stmt) > 0);
11609 stmt_vinfo
11610 = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
11611
11612 orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
11613 STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
11614
11615 mapping.put (gimple_phi_result (orig_stmt),
11616 gimple_phi_result (new_stmt));
11617 /* PHI nodes can not have patterns or related statements. */
11618 gcc_assert (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo) == NULL
11619 && STMT_VINFO_RELATED_STMT (stmt_vinfo) == NULL);
11620 }
11621
11622 for (epilogue_gsi = gsi_start_bb (epilogue_bbs[i]);
11623 !gsi_end_p (epilogue_gsi); gsi_next (&epilogue_gsi))
11624 {
11625 new_stmt = gsi_stmt (epilogue_gsi);
11626 if (is_gimple_debug (new_stmt))
11627 continue;
11628
11629 gcc_assert (gimple_uid (new_stmt) > 0);
11630 stmt_vinfo
11631 = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
11632
11633 orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
11634 STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
11635
11636 if (tree old_lhs = gimple_get_lhs (orig_stmt))
11637 mapping.put (old_lhs, gimple_get_lhs (new_stmt));
11638
11639 if (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo))
11640 {
11641 gimple_seq seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo);
11642 for (gimple_stmt_iterator gsi = gsi_start (seq);
11643 !gsi_end_p (gsi); gsi_next (&gsi))
11644 stmt_worklist.safe_push (gsi_stmt (gsi));
11645 }
11646
11647 related_vinfo = STMT_VINFO_RELATED_STMT (stmt_vinfo);
11648 if (related_vinfo != NULL && related_vinfo != stmt_vinfo)
11649 {
11650 gimple *stmt = STMT_VINFO_STMT (related_vinfo);
11651 stmt_worklist.safe_push (stmt);
11652 /* Set BB such that the assert in
11653 'get_initial_def_for_reduction' is able to determine that
11654 the BB of the related stmt is inside this loop. */
11655 gimple_set_bb (stmt,
11656 gimple_bb (new_stmt));
11657 related_vinfo = STMT_VINFO_RELATED_STMT (related_vinfo);
11658 gcc_assert (related_vinfo == NULL
11659 || related_vinfo == stmt_vinfo);
11660 }
11661 }
11662 }
11663
11664 /* The PATTERN_DEF_SEQs and RELATED_STMTs in the epilogue were constructed
11665 using the original main loop and thus need to be updated to refer to the
11666 cloned variables used in the epilogue. */
11667 for (unsigned i = 0; i < stmt_worklist.length (); ++i)
11668 {
11669 gimple *stmt = stmt_worklist[i];
11670 tree *new_op;
11671
11672 for (unsigned j = 1; j < gimple_num_ops (stmt); ++j)
11673 {
11674 tree op = gimple_op (stmt, j);
11675 if ((new_op = mapping.get(op)))
11676 gimple_set_op (stmt, j, *new_op);
11677 else
11678 {
11679 /* PR92429: The last argument of simplify_replace_tree disables
11680 folding when replacing arguments. This is required as
11681 otherwise you might end up with different statements than the
11682 ones analyzed in vect_loop_analyze, leading to different
11683 vectorization. */
11684 op = simplify_replace_tree (op, NULL_TREE, NULL_TREE,
11685 &find_in_mapping, &mapping, false);
11686 gimple_set_op (stmt, j, op);
11687 }
11688 }
11689 }
11690
11691 struct data_reference *dr;
11692 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (epilogue_vinfo);
11693 FOR_EACH_VEC_ELT (datarefs, i, dr)
11694 {
11695 orig_stmt = DR_STMT (dr);
11696 gcc_assert (gimple_uid (orig_stmt) > 0);
11697 stmt_vinfo = epilogue_vinfo->stmt_vec_infos[gimple_uid (orig_stmt) - 1];
11698 /* Data references for gather loads and scatter stores do not use the
11699 updated offset we set using ADVANCE. Instead we have to make sure the
11700 reference in the data references points to the corresponding copy of
11701 the original in the epilogue. Make sure to update both
11702 gather/scatters recognized by dataref analysis and also other
11703 refs that get_load_store_type classified as VMAT_GATHER_SCATTER. */
11704 auto vstmt_vinfo = vect_stmt_to_vectorize (stmt_vinfo);
11705 if (STMT_VINFO_MEMORY_ACCESS_TYPE (vstmt_vinfo) == VMAT_GATHER_SCATTER
11706 || STMT_VINFO_GATHER_SCATTER_P (vstmt_vinfo))
11707 {
11708 DR_REF (dr)
11709 = simplify_replace_tree (DR_REF (dr), NULL_TREE, NULL_TREE,
11710 &find_in_mapping, &mapping);
11711 DR_BASE_ADDRESS (dr)
11712 = simplify_replace_tree (DR_BASE_ADDRESS (dr), NULL_TREE, NULL_TREE,
11713 &find_in_mapping, &mapping);
11714 }
11715 DR_STMT (dr) = STMT_VINFO_STMT (stmt_vinfo);
11716 stmt_vinfo->dr_aux.stmt = stmt_vinfo;
11717 /* The vector size of the epilogue is smaller than that of the main loop
11718 so the alignment is either the same or lower. This means the dr will
11719 by definition be aligned. */
11720 STMT_VINFO_DR_INFO (stmt_vinfo)->base_misaligned = false;
11721 }
11722
11723 epilogue_vinfo->shared->datarefs_copy.release ();
11724 epilogue_vinfo->shared->save_datarefs ();
11725 }
11726
11727 /* When vectorizing an early break, statements that happen before the
11728 early break in the current BB need to be moved to after the early
11729 break. This function deals with that and assumes that any validity
11730 checks have already been performed.
11731
11732 While moving the instructions, if it encounters a VUSE or VDEF it
11733 corrects the VUSEs as it moves the statements along. The early-break
11734 destination BB is the location in which to insert the moved statements. */
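/* A rough sketch of the intended effect: a store "a[i] = x" that sits
   before the early break is sunk to the start of the destination BB
   after the break, and any load recorded in EARLY_BRK_VUSES that now
   executes before that store has its VUSE rewritten to the memory state
   the store itself reads (the last VUSE seen while moving).  */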
11735
11736 static void
11737 move_early_exit_stmts (loop_vec_info loop_vinfo)
11738 {
11739 DUMP_VECT_SCOPE ("move_early_exit_stmts");
11740
11741 if (LOOP_VINFO_EARLY_BRK_STORES (loop_vinfo).is_empty ())
11742 return;
11743
11744 /* Move all stmts that need moving. */
11745 basic_block dest_bb = LOOP_VINFO_EARLY_BRK_DEST_BB (loop_vinfo);
11746 gimple_stmt_iterator dest_gsi = gsi_after_labels (dest_bb);
11747
11748 tree last_seen_vuse = NULL_TREE;
11749 for (gimple *stmt : LOOP_VINFO_EARLY_BRK_STORES (loop_vinfo))
11750 {
11751 /* We have to update crossed degenerate virtual PHIs. Simply
11752 elide them. */
11753 if (gphi *vphi = dyn_cast <gphi *> (stmt))
11754 {
11755 tree vdef = gimple_phi_result (vphi);
11756 tree vuse = gimple_phi_arg_def (vphi, 0);
11757 imm_use_iterator iter;
11758 use_operand_p use_p;
11759 gimple *use_stmt;
11760 FOR_EACH_IMM_USE_STMT (use_stmt, iter, vdef)
11761 {
11762 FOR_EACH_IMM_USE_ON_STMT (use_p, iter)
11763 SET_USE (use_p, vuse);
11764 }
11765 auto gsi = gsi_for_stmt (stmt);
11766 remove_phi_node (&gsi, true);
11767 last_seen_vuse = vuse;
11768 continue;
11769 }
11770
11771 /* Check to see if statement is still required for vect or has been
11772 elided. */
11773 auto stmt_info = loop_vinfo->lookup_stmt (stmt);
11774 if (!stmt_info)
11775 continue;
11776
11777 if (dump_enabled_p ())
11778 dump_printf_loc (MSG_NOTE, vect_location, "moving stmt %G", stmt);
11779
11780 gimple_stmt_iterator stmt_gsi = gsi_for_stmt (stmt);
11781 gsi_move_before (&stmt_gsi, &dest_gsi, GSI_NEW_STMT);
11782 last_seen_vuse = gimple_vuse (stmt);
11783 }
11784
11785 /* Update all the stmts with their new reaching VUSES. */
11786 for (auto p : LOOP_VINFO_EARLY_BRK_VUSES (loop_vinfo))
11787 {
11788 if (dump_enabled_p ())
11789 dump_printf_loc (MSG_NOTE, vect_location,
11790 "updating vuse to %T for load %G",
11791 last_seen_vuse, p);
11792 gimple_set_vuse (p, last_seen_vuse);
11793 update_stmt (p);
11794 }
11795
11796 /* And update the LC PHIs on exits. */
11797 for (edge e : get_loop_exit_edges (LOOP_VINFO_LOOP (loop_vinfo)))
11798 if (!dominated_by_p (CDI_DOMINATORS, e->src, dest_bb))
11799 if (gphi *phi = get_virtual_phi (e->dest))
11800 SET_PHI_ARG_DEF_ON_EDGE (phi, e, last_seen_vuse);
11801 }
11802
11803 /* Function vect_transform_loop.
11804
11805 The analysis phase has determined that the loop is vectorizable.
11806 Vectorize the loop - create vectorized stmts to replace the scalar
11807 stmts in the loop, and update the loop exit condition.
11808 Returns the scalar epilogue loop, if any. */
11809
11810 class loop *
11811 vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
11812 {
11813 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
11814 class loop *epilogue = NULL;
11815 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
11816 int nbbs = loop->num_nodes;
11817 int i;
11818 tree niters_vector = NULL_TREE;
11819 tree step_vector = NULL_TREE;
11820 tree niters_vector_mult_vf = NULL_TREE;
11821 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
11822 unsigned int lowest_vf = constant_lower_bound (vf);
11823 gimple *stmt;
11824 bool check_profitability = false;
11825 unsigned int th;
11826 bool flat = maybe_flat_loop_profile (loop);
11827
11828 DUMP_VECT_SCOPE ("vec_transform_loop");
11829
11830 loop_vinfo->shared->check_datarefs ();
11831
11832 /* Use the more conservative vectorization threshold. If the number
11833 of iterations is constant assume the cost check has been performed
11834 by our caller. If the threshold makes all loops profitable that
11835 run at least the (estimated) vectorization factor number of times
11836 checking is pointless, too. */
11837 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
11838 if (vect_apply_runtime_profitability_check_p (loop_vinfo))
11839 {
11840 if (dump_enabled_p ())
11841 dump_printf_loc (MSG_NOTE, vect_location,
11842 "Profitability threshold is %d loop iterations.\n",
11843 th);
11844 check_profitability = true;
11845 }
11846
11847 /* Make sure there exists a single-predecessor exit bb. Do this before
11848 versioning. */
11849 edge e = LOOP_VINFO_IV_EXIT (loop_vinfo);
11850 if (! single_pred_p (e->dest) && !LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
11851 {
11852 split_loop_exit_edge (e, true);
11853 if (dump_enabled_p ())
11854 dump_printf (MSG_NOTE, "split exit edge\n");
11855 }
11856
11857 /* Version the loop first, if required, so the profitability check
11858 comes first. */
11859
11860 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
11861 {
11862 class loop *sloop
11863 = vect_loop_versioning (loop_vinfo, loop_vectorized_call);
11864 sloop->force_vectorize = false;
11865 check_profitability = false;
11866 }
11867
11868 /* Make sure there exists a single-predecessor exit bb also on the
11869 scalar loop copy. Do this after versioning but before peeling
11870 so CFG structure is fine for both scalar and if-converted loop
11871 to make slpeel_duplicate_current_defs_from_edges face matched
11872 loop closed PHI nodes on the exit. */
11873 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
11874 {
11875 e = LOOP_VINFO_SCALAR_IV_EXIT (loop_vinfo);
11876 if (! single_pred_p (e->dest))
11877 {
11878 split_loop_exit_edge (e, true);
11879 if (dump_enabled_p ())
11880 dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
11881 }
11882 }
11883
11884 tree niters = vect_build_loop_niters (loop_vinfo);
11885 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
11886 tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
11887 bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
11888 tree advance;
11889 drs_init_vec orig_drs_init;
11890
11891 epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
11892 &step_vector, &niters_vector_mult_vf, th,
11893 check_profitability, niters_no_overflow,
11894 &advance);
11895 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)
11896 && LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo).initialized_p ())
11897 {
11898 /* Ifcvt duplicates the loop preheader and loop body and produces a basic
11899 block after the loop exit. We need to scale all of that. */
11900 basic_block preheader
11901 = loop_preheader_edge (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))->src;
11902 preheader->count
11903 = preheader->count.apply_probability
11904 (LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
11905 scale_loop_frequencies (LOOP_VINFO_SCALAR_LOOP (loop_vinfo),
11906 LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
11907 LOOP_VINFO_SCALAR_IV_EXIT (loop_vinfo)->dest->count = preheader->count;
11908 }
11909
11910 if (niters_vector == NULL_TREE)
11911 {
11912 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
11913 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
11914 && known_eq (lowest_vf, vf))
11915 {
11916 niters_vector
11917 = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
11918 LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
11919 step_vector = build_one_cst (TREE_TYPE (niters));
11920 }
11921 else if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
11922 vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
11923 &step_vector, niters_no_overflow);
11924 else
11925 /* vect_do_peeling subtracted the number of peeled prologue
11926 iterations from LOOP_VINFO_NITERS. */
11927 vect_gen_vector_loop_niters (loop_vinfo, LOOP_VINFO_NITERS (loop_vinfo),
11928 &niters_vector, &step_vector,
11929 niters_no_overflow);
11930 }
11931
11932 /* 1) Make sure the loop header has exactly two entries
11933 2) Make sure we have a preheader basic block. */
11934
11935 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
11936
11937 split_edge (loop_preheader_edge (loop));
11938
11939 if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
11940 /* This will deal with any possible peeling. */
11941 vect_prepare_for_masked_peels (loop_vinfo);
11942
11943 /* Handle any code motion that we need to for early-break vectorization after
11944 we've done peeling but just before we start vectorizing. */
11945 if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
11946 move_early_exit_stmts (loop_vinfo);
11947
11948 /* Schedule the SLP instances first, then handle loop vectorization
11949 below. */
11950 if (!loop_vinfo->slp_instances.is_empty ())
11951 {
11952 DUMP_VECT_SCOPE ("scheduling SLP instances");
11953 vect_schedule_slp (loop_vinfo, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
11954 }
11955
11956 /* FORNOW: the vectorizer supports only loops whose body consists
11957 of one basic block (header + empty latch). When the vectorizer
11958 supports more involved loop forms, the order in which the BBs are
11959 traversed needs to be reconsidered. */
11960
11961 for (i = 0; i < nbbs; i++)
11962 {
11963 basic_block bb = bbs[i];
11964 stmt_vec_info stmt_info;
11965
11966 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
11967 gsi_next (&si))
11968 {
11969 gphi *phi = si.phi ();
11970 if (dump_enabled_p ())
11971 dump_printf_loc (MSG_NOTE, vect_location,
11972 "------>vectorizing phi: %G", (gimple *) phi);
11973 stmt_info = loop_vinfo->lookup_stmt (phi);
11974 if (!stmt_info)
11975 continue;
11976
11977 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
11978 vect_loop_kill_debug_uses (loop, stmt_info);
11979
11980 if (!STMT_VINFO_RELEVANT_P (stmt_info)
11981 && !STMT_VINFO_LIVE_P (stmt_info))
11982 continue;
11983
11984 if (STMT_VINFO_VECTYPE (stmt_info)
11985 && (maybe_ne
11986 (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
11987 && dump_enabled_p ())
11988 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
11989
11990 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
11991 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
11992 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
11993 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
11994 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_first_order_recurrence
11995 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
11996 && ! PURE_SLP_STMT (stmt_info))
11997 {
11998 if (dump_enabled_p ())
11999 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
12000 vect_transform_stmt (loop_vinfo, stmt_info, NULL, NULL, NULL);
12001 }
12002 }
12003
12004 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
12005 gsi_next (&si))
12006 {
12007 gphi *phi = si.phi ();
12008 stmt_info = loop_vinfo->lookup_stmt (phi);
12009 if (!stmt_info)
12010 continue;
12011
12012 if (!STMT_VINFO_RELEVANT_P (stmt_info)
12013 && !STMT_VINFO_LIVE_P (stmt_info))
12014 continue;
12015
12016 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
12017 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
12018 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
12019 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
12020 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
12021 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_first_order_recurrence)
12022 && ! PURE_SLP_STMT (stmt_info))
12023 maybe_set_vectorized_backedge_value (loop_vinfo, stmt_info);
12024 }
12025
12026 for (gimple_stmt_iterator si = gsi_start_bb (bb);
12027 !gsi_end_p (si);)
12028 {
12029 stmt = gsi_stmt (si);
12030 /* During vectorization remove existing clobber stmts. */
12031 if (gimple_clobber_p (stmt))
12032 {
12033 unlink_stmt_vdef (stmt);
12034 gsi_remove (&si, true);
12035 release_defs (stmt);
12036 }
12037 else
12038 {
12039 /* Ignore vector stmts created in the outer loop. */
12040 stmt_info = loop_vinfo->lookup_stmt (stmt);
12041
12042 /* vector stmts created in the outer-loop during vectorization of
12043 stmts in an inner-loop may not have a stmt_info, and do not
12044 need to be vectorized. */
12045 stmt_vec_info seen_store = NULL;
12046 if (stmt_info)
12047 {
12048 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
12049 {
12050 gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
12051 for (gimple_stmt_iterator subsi = gsi_start (def_seq);
12052 !gsi_end_p (subsi); gsi_next (&subsi))
12053 {
12054 stmt_vec_info pat_stmt_info
12055 = loop_vinfo->lookup_stmt (gsi_stmt (subsi));
12056 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
12057 &si, &seen_store);
12058 }
12059 stmt_vec_info pat_stmt_info
12060 = STMT_VINFO_RELATED_STMT (stmt_info);
12061 if (vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
12062 &si, &seen_store))
12063 maybe_set_vectorized_backedge_value (loop_vinfo,
12064 pat_stmt_info);
12065 }
12066 else
12067 {
12068 if (vect_transform_loop_stmt (loop_vinfo, stmt_info, &si,
12069 &seen_store))
12070 maybe_set_vectorized_backedge_value (loop_vinfo,
12071 stmt_info);
12072 }
12073 }
12074 gsi_next (&si);
12075 if (seen_store)
12076 {
12077 if (STMT_VINFO_GROUPED_ACCESS (seen_store))
12078 /* Interleaving. If the seen store is part of a grouped access, the
12079 vectorization of the interleaving chain was
12080 completed - free all the stores in the chain. */
12081 vect_remove_stores (loop_vinfo,
12082 DR_GROUP_FIRST_ELEMENT (seen_store));
12083 else
12084 /* Free the attached stmt_vec_info and remove the stmt. */
12085 loop_vinfo->remove_stmt (stmt_info);
12086 }
12087 }
12088 }
12089
12090 /* Stub out scalar statements that must not survive vectorization.
12091 Doing this here helps with grouped statements, or statements that
12092 are involved in patterns. */
12093 for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
12094 !gsi_end_p (gsi); gsi_next (&gsi))
12095 {
12096 gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
12097 if (!call || !gimple_call_internal_p (call))
12098 continue;
12099 internal_fn ifn = gimple_call_internal_fn (call);
12100 if (ifn == IFN_MASK_LOAD)
12101 {
12102 tree lhs = gimple_get_lhs (call);
12103 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
12104 {
12105 tree zero = build_zero_cst (TREE_TYPE (lhs));
12106 gimple *new_stmt = gimple_build_assign (lhs, zero);
12107 gsi_replace (&gsi, new_stmt, true);
12108 }
12109 }
12110 else if (conditional_internal_fn_code (ifn) != ERROR_MARK)
12111 {
12112 tree lhs = gimple_get_lhs (call);
12113 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
12114 {
12115 tree else_arg
12116 = gimple_call_arg (call, gimple_call_num_args (call) - 1);
12117 gimple *new_stmt = gimple_build_assign (lhs, else_arg);
12118 gsi_replace (&gsi, new_stmt, true);
12119 }
12120 }
12121 }
12122 } /* BBs in loop */
12123
12124 /* The vectorization factor is always > 1, so if we use an IV increment of 1,
12125 a zero NITERS becomes a nonzero NITERS_VECTOR. */
12126 if (integer_onep (step_vector))
12127 niters_no_overflow = true;
12128 vect_set_loop_condition (loop, LOOP_VINFO_IV_EXIT (loop_vinfo), loop_vinfo,
12129 niters_vector, step_vector, niters_vector_mult_vf,
12130 !niters_no_overflow);
12131
12132 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
12133
12134 /* True if the final iteration might not handle a full vector's
12135 worth of scalar iterations. */
12136 bool final_iter_may_be_partial
12137 = LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
12138 || LOOP_VINFO_EARLY_BREAKS (loop_vinfo);
12139 /* The minimum number of iterations performed by the epilogue. This
12140 is 1 when peeling for gaps because we always need a final scalar
12141 iteration. */
12142 int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
12143 /* +1 to convert latch counts to loop iteration counts,
12144 -min_epilogue_iters to remove iterations that cannot be performed
12145 by the vector code. */
12146 int bias_for_lowest = 1 - min_epilogue_iters;
12147 int bias_for_assumed = bias_for_lowest;
12148 int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
12149 if (alignment_npeels && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
12150 {
12151 /* When the amount of peeling is known at compile time, the first
12152 iteration will have exactly alignment_npeels active elements.
12153 In the worst case it will have at least one. */
12154 int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
12155 bias_for_lowest += lowest_vf - min_first_active;
12156 bias_for_assumed += assumed_vf - min_first_active;
12157 }
12158 /* In these calculations the "- 1" converts loop iteration counts
12159 back to latch counts. */
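/* Worked example for the floor case below: a scalar latch bound of 10
   (at most 11 iterations), LOWEST_VF == 4 and no peeling for gaps (nor
   partial-vector alignment peeling) give BIAS_FOR_LOWEST == 1, so the new
   bound is 11 / 4 - 1 == 1: the vector latch runs at most once, covering
   8 scalar iterations, with up to 3 iterations left for the epilogue.  */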
12160 if (loop->any_upper_bound)
12161 {
12162 loop_vec_info main_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
12163 loop->nb_iterations_upper_bound
12164 = (final_iter_may_be_partial
12165 ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
12166 lowest_vf) - 1
12167 : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
12168 lowest_vf) - 1);
12169 if (main_vinfo
12170 /* Both peeling for alignment and peeling for gaps can end up
12171 with the scalar epilogue running for more than VF-1 iterations. */
12172 && !main_vinfo->peeling_for_alignment
12173 && !main_vinfo->peeling_for_gaps)
12174 {
12175 unsigned int bound;
12176 poly_uint64 main_iters
12177 = upper_bound (LOOP_VINFO_VECT_FACTOR (main_vinfo),
12178 LOOP_VINFO_COST_MODEL_THRESHOLD (main_vinfo));
12179 main_iters
12180 = upper_bound (main_iters,
12181 LOOP_VINFO_VERSIONING_THRESHOLD (main_vinfo));
12182 if (can_div_away_from_zero_p (main_iters,
12183 LOOP_VINFO_VECT_FACTOR (loop_vinfo),
12184 &bound))
12185 loop->nb_iterations_upper_bound
12186 = wi::umin ((bound_wide_int) (bound - 1),
12187 loop->nb_iterations_upper_bound);
12188 }
12189 }
12190 if (loop->any_likely_upper_bound)
12191 loop->nb_iterations_likely_upper_bound
12192 = (final_iter_may_be_partial
12193 ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
12194 + bias_for_lowest, lowest_vf) - 1
12195 : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
12196 + bias_for_lowest, lowest_vf) - 1);
12197 if (loop->any_estimate)
12198 loop->nb_iterations_estimate
12199 = (final_iter_may_be_partial
12200 ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
12201 assumed_vf) - 1
12202 : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
12203 assumed_vf) - 1);
12204 scale_profile_for_vect_loop (loop, LOOP_VINFO_IV_EXIT (loop_vinfo),
12205 assumed_vf, flat);
12206
12207 if (dump_enabled_p ())
12208 {
12209 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
12210 {
12211 dump_printf_loc (MSG_NOTE, vect_location,
12212 "LOOP VECTORIZED\n");
12213 if (loop->inner)
12214 dump_printf_loc (MSG_NOTE, vect_location,
12215 "OUTER LOOP VECTORIZED\n");
12216 dump_printf (MSG_NOTE, "\n");
12217 }
12218 else
12219 dump_printf_loc (MSG_NOTE, vect_location,
12220 "LOOP EPILOGUE VECTORIZED (MODE=%s)\n",
12221 GET_MODE_NAME (loop_vinfo->vector_mode));
12222 }
12223
12224 /* Loops vectorized with a variable factor won't benefit from
12225 unrolling/peeling. */
12226 if (!vf.is_constant ())
12227 {
12228 loop->unroll = 1;
12229 if (dump_enabled_p ())
12230 dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to"
12231 " variable-length vectorization factor\n");
12232 }
12233 /* Free SLP instances here because otherwise stmt reference counting
12234 won't work. */
12235 slp_instance instance;
12236 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
12237 vect_free_slp_instance (instance);
12238 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
12239 /* Clear the safelen field since its value is invalid after vectorization,
12240 as the vectorized loop can have loop-carried dependencies. */
12241 loop->safelen = 0;
12242
12243 if (epilogue)
12244 {
12245 update_epilogue_loop_vinfo (epilogue, advance);
12246
12247 epilogue->simduid = loop->simduid;
12248 epilogue->force_vectorize = loop->force_vectorize;
12249 epilogue->dont_vectorize = false;
12250 }
12251
12252 return epilogue;
12253 }
12254
12255 /* The code below tries to perform a simple optimization - revert
12256 if-conversion for masked stores: if the mask of a store is zero, do
12257 not perform the store and, where possible, also skip the producers of the stored values.
12258 For example,
12259 for (i=0; i<n; i++)
12260 if (c[i])
12261 {
12262 p1[i] += 1;
12263 p2[i] = p3[i] +2;
12264 }
12265 this transformation will produce the following semi-hammock:
12266
12267 if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
12268 {
12269 vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
12270 vect__12.22_172 = vect__11.19_170 + vect_cst__171;
12271 MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
12272 vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
12273 vect__19.28_184 = vect__18.25_182 + vect_cst__183;
12274 MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
12275 }
12276 */
12277
12278 void
12279 optimize_mask_stores (class loop *loop)
12280 {
12281 basic_block *bbs = get_loop_body (loop);
12282 unsigned nbbs = loop->num_nodes;
12283 unsigned i;
12284 basic_block bb;
12285 class loop *bb_loop;
12286 gimple_stmt_iterator gsi;
12287 gimple *stmt;
12288 auto_vec<gimple *> worklist;
12289 auto_purge_vect_location sentinel;
12290
12291 vect_location = find_loop_location (loop);
12292 /* Pick up all masked stores in loop if any. */
12293 for (i = 0; i < nbbs; i++)
12294 {
12295 bb = bbs[i];
12296 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
12297 gsi_next (&gsi))
12298 {
12299 stmt = gsi_stmt (gsi);
12300 if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
12301 worklist.safe_push (stmt);
12302 }
12303 }
12304
12305 free (bbs);
12306 if (worklist.is_empty ())
12307 return;
12308
12309 /* Loop has masked stores. */
12310 while (!worklist.is_empty ())
12311 {
12312 gimple *last, *last_store;
12313 edge e, efalse;
12314 tree mask;
12315 basic_block store_bb, join_bb;
12316 gimple_stmt_iterator gsi_to;
12317 tree vdef, new_vdef;
12318 gphi *phi;
12319 tree vectype;
12320 tree zero;
12321
12322 last = worklist.pop ();
12323 mask = gimple_call_arg (last, 2);
12324 bb = gimple_bb (last);
12325 /* Create then_bb and the if-then structure in the CFG; then_bb belongs to
12326 the same loop as if_bb. It could be different from LOOP when a
12327 two-level loop nest is vectorized and the mask_store belongs to the inner
12328 one. */
12329 e = split_block (bb, last);
12330 bb_loop = bb->loop_father;
12331 gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
12332 join_bb = e->dest;
12333 store_bb = create_empty_bb (bb);
12334 add_bb_to_loop (store_bb, bb_loop);
12335 e->flags = EDGE_TRUE_VALUE;
12336 efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
12337 /* Put STORE_BB on the likely path. */
12338 efalse->probability = profile_probability::likely ();
12339 e->probability = efalse->probability.invert ();
12340 store_bb->count = efalse->count ();
12341 make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
12342 if (dom_info_available_p (CDI_DOMINATORS))
12343 set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
12344 if (dump_enabled_p ())
12345 dump_printf_loc (MSG_NOTE, vect_location,
12346 "Create new block %d to sink mask stores.",
12347 store_bb->index);
12348 /* Create vector comparison with boolean result. */
12349 vectype = TREE_TYPE (mask);
12350 zero = build_zero_cst (vectype);
12351 stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
12352 gsi = gsi_last_bb (bb);
12353 gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
12354 /* Create new PHI node for vdef of the last masked store:
12355 .MEM_2 = VDEF <.MEM_1>
12356 will be converted to
12357 .MEM.3 = VDEF <.MEM_1>
12358 and new PHI node will be created in join bb
12359 .MEM_2 = PHI <.MEM_1, .MEM_3>
12360 */
12361 vdef = gimple_vdef (last);
12362 new_vdef = make_ssa_name (gimple_vop (cfun), last);
12363 gimple_set_vdef (last, new_vdef);
12364 phi = create_phi_node (vdef, join_bb);
12365 add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
12366
12367 /* Put all masked stores with the same mask to STORE_BB if possible. */
12368 while (true)
12369 {
12370 gimple_stmt_iterator gsi_from;
12371 gimple *stmt1 = NULL;
12372
12373 /* Move masked store to STORE_BB. */
12374 last_store = last;
12375 gsi = gsi_for_stmt (last);
12376 gsi_from = gsi;
12377 /* Shift GSI to the previous stmt for further traversal. */
12378 gsi_prev (&gsi);
12379 gsi_to = gsi_start_bb (store_bb);
12380 gsi_move_before (&gsi_from, &gsi_to);
12381 /* Setup GSI_TO to the non-empty block start. */
12382 gsi_to = gsi_start_bb (store_bb);
12383 if (dump_enabled_p ())
12384 dump_printf_loc (MSG_NOTE, vect_location,
12385 "Move stmt to created bb\n%G", last);
12386 /* Move all stored value producers if possible. */
12387 while (!gsi_end_p (gsi))
12388 {
12389 tree lhs;
12390 imm_use_iterator imm_iter;
12391 use_operand_p use_p;
12392 bool res;
12393
12394 /* Skip debug statements. */
12395 if (is_gimple_debug (gsi_stmt (gsi)))
12396 {
12397 gsi_prev (&gsi);
12398 continue;
12399 }
12400 stmt1 = gsi_stmt (gsi);
12401 /* Do not consider statements writing to memory or having
12402 volatile operand. */
12403 if (gimple_vdef (stmt1)
12404 || gimple_has_volatile_ops (stmt1))
12405 break;
12406 gsi_from = gsi;
12407 gsi_prev (&gsi);
12408 lhs = gimple_get_lhs (stmt1);
12409 if (!lhs)
12410 break;
12411
12412 /* LHS of vectorized stmt must be SSA_NAME. */
12413 if (TREE_CODE (lhs) != SSA_NAME)
12414 break;
12415
12416 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
12417 {
12418 /* Remove dead scalar statement. */
12419 if (has_zero_uses (lhs))
12420 {
12421 gsi_remove (&gsi_from, true);
12422 continue;
12423 }
12424 }
12425
12426 /* Check that LHS does not have uses outside of STORE_BB. */
12427 res = true;
12428 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
12429 {
12430 gimple *use_stmt;
12431 use_stmt = USE_STMT (use_p);
12432 if (is_gimple_debug (use_stmt))
12433 continue;
12434 if (gimple_bb (use_stmt) != store_bb)
12435 {
12436 res = false;
12437 break;
12438 }
12439 }
12440 if (!res)
12441 break;
12442
12443 if (gimple_vuse (stmt1)
12444 && gimple_vuse (stmt1) != gimple_vuse (last_store))
12445 break;
12446
12447 /* Can move STMT1 to STORE_BB. */
12448 if (dump_enabled_p ())
12449 dump_printf_loc (MSG_NOTE, vect_location,
12450 "Move stmt to created bb\n%G", stmt1);
12451 gsi_move_before (&gsi_from, &gsi_to);
12452 /* Shift GSI_TO for further insertion. */
12453 gsi_prev (&gsi_to);
12454 }
12455 /* Put other masked stores with the same mask to STORE_BB. */
12456 if (worklist.is_empty ()
12457 || gimple_call_arg (worklist.last (), 2) != mask
12458 || worklist.last () != stmt1)
12459 break;
12460 last = worklist.pop ();
12461 }
12462 add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
12463 }
12464 }
12465
12466 /* Decide whether it is possible to use a zero-based induction variable
12467 when vectorizing LOOP_VINFO with partial vectors. If it is, return
12468 the value that the induction variable must be able to hold in order
12469 to ensure that the rgroups eventually have no active vector elements.
12470 Return -1 otherwise. */
12471
12472 widest_int
12473 vect_iv_limit_for_partial_vectors (loop_vec_info loop_vinfo)
12474 {
12475 tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
12476 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
12477 unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
12478
12479 /* Calculate the value that the induction variable must be able
12480 to hit in order to ensure that we end the loop with an all-false mask.
12481 This involves adding the maximum number of inactive trailing scalar
12482 iterations. */
12483 widest_int iv_limit = -1;
12484 if (max_loop_iterations (loop, &iv_limit))
12485 {
12486 if (niters_skip)
12487 {
12488 /* Add the maximum number of skipped iterations to the
12489 maximum iteration count. */
12490 if (TREE_CODE (niters_skip) == INTEGER_CST)
12491 iv_limit += wi::to_widest (niters_skip);
12492 else
12493 iv_limit += max_vf - 1;
12494 }
12495 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
12496 /* Make a conservatively-correct assumption. */
12497 iv_limit += max_vf - 1;
12498
12499 /* IV_LIMIT is the maximum number of latch iterations, which is also
12500 the maximum in-range IV value. Round this value down to the previous
12501 vector alignment boundary and then add an extra full iteration. */
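/* For example, with a constant VF of 8 and a maximum latch count of 17,
   the limit becomes (17 & -8) + 8 == 24.  */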
12502 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
12503 iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
12504 }
12505 return iv_limit;
12506 }
12507
12508 /* For the given rgroup_controls RGC, check whether an induction variable
12509 would ever hit a value that produces a set of all-false masks or zero
12510 lengths before wrapping around. Return true if it's possible to wrap
12511 around before hitting the desirable value, otherwise return false. */
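/* For instance, if the IV limit is 40000 latch iterations, the rgroup
   handles 2 items per scalar iteration and the compare type has 16 bits,
   then 40000 * 2 == 80000 needs 17 bits, which exceeds the 16-bit
   precision, so the IV might wrap and we return true.  */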
12512
12513 bool
12514 vect_rgroup_iv_might_wrap_p (loop_vec_info loop_vinfo, rgroup_controls *rgc)
12515 {
12516 widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
12517
12518 if (iv_limit == -1)
12519 return true;
12520
12521 tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
12522 unsigned int compare_precision = TYPE_PRECISION (compare_type);
12523 unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
12524
12525 if (wi::min_precision (iv_limit * nitems, UNSIGNED) > compare_precision)
12526 return true;
12527
12528 return false;
12529 }