gcc/tree-vect-loop.cc
1 /* Loop Vectorization
2 Copyright (C) 2003-2023 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #define INCLUDE_ALGORITHM
23 #include "config.h"
24 #include "system.h"
25 #include "coretypes.h"
26 #include "backend.h"
27 #include "target.h"
28 #include "rtl.h"
29 #include "tree.h"
30 #include "gimple.h"
31 #include "cfghooks.h"
32 #include "tree-pass.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "diagnostic-core.h"
36 #include "fold-const.h"
37 #include "stor-layout.h"
38 #include "cfganal.h"
39 #include "gimplify.h"
40 #include "gimple-iterator.h"
41 #include "gimplify-me.h"
42 #include "tree-ssa-loop-ivopts.h"
43 #include "tree-ssa-loop-manip.h"
44 #include "tree-ssa-loop-niter.h"
45 #include "tree-ssa-loop.h"
46 #include "cfgloop.h"
47 #include "tree-scalar-evolution.h"
48 #include "tree-vectorizer.h"
49 #include "gimple-fold.h"
50 #include "cgraph.h"
51 #include "tree-cfg.h"
52 #include "tree-if-conv.h"
53 #include "internal-fn.h"
54 #include "tree-vector-builder.h"
55 #include "vec-perm-indices.h"
56 #include "tree-eh.h"
57 #include "case-cfn-macros.h"
58
59 /* Loop Vectorization Pass.
60
61 This pass tries to vectorize loops.
62
63 For example, the vectorizer transforms the following simple loop:
64
65 short a[N]; short b[N]; short c[N]; int i;
66
67 for (i=0; i<N; i++){
68 a[i] = b[i] + c[i];
69 }
70
71 as if it were manually vectorized by rewriting the source code into:
72
73 typedef int __attribute__((mode(V8HI))) v8hi;
74 short a[N]; short b[N]; short c[N]; int i;
75 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
76 v8hi va, vb, vc;
77
78 for (i=0; i<N/8; i++){
79 vb = pb[i];
80 vc = pc[i];
81 va = vb + vc;
82 pa[i] = va;
83 }
84
85 The main entry to this pass is vectorize_loops(), in which
86 the vectorizer applies a set of analyses on a given set of loops,
87 followed by the actual vectorization transformation for the loops that
88 had successfully passed the analysis phase.
89 Throughout this pass we make a distinction between two types of
90 data: scalars (which are represented by SSA_NAMES), and memory references
91 ("data-refs"). These two types of data require different handling both
92 during analysis and transformation. The types of data-refs that the
93 vectorizer currently supports are ARRAY_REFS whose base is an array DECL
94 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
95 accesses are required to have a simple (consecutive) access pattern.
96
97 Analysis phase:
98 ===============
99 The driver for the analysis phase is vect_analyze_loop().
100 It applies a set of analyses, some of which rely on the scalar evolution
101 analyzer (scev) developed by Sebastian Pop.
102
103 During the analysis phase the vectorizer records some information
104 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
105 loop, as well as general information about the loop as a whole, which is
106 recorded in a "loop_vec_info" struct attached to each loop.
107
108 Transformation phase:
109 =====================
110 The loop transformation phase scans all the stmts in the loop, and
111 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
112 the loop that needs to be vectorized. It inserts the vector code sequence
113 just before the scalar stmt S, and records a pointer to the vector code
114 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
115 attached to S). This pointer will be used for the vectorization of following
116 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
117 otherwise, we rely on dead code elimination for removing it.
118
119 For example, say stmt S1 was vectorized into stmt VS1:
120
121 VS1: vb = px[i];
122 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
123 S2: a = b;
124
125 To vectorize stmt S2, the vectorizer first finds the stmt that defines
126 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
127 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
128 resulting sequence would be:
129
130 VS1: vb = px[i];
131 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
132 VS2: va = vb;
133 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
134
135 Operands that are not SSA_NAMEs are data-refs that appear in
136 load/store operations (like 'x[i]' in S1), and are handled differently.
137
138 Target modeling:
139 =================
140 Currently the only target specific information that is used is the
141 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
142 Targets that can support different vector sizes will, for now, need to
143 specify a single value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
144 flexibility will be added in the future.
145
146 Since we only vectorize operations whose vector form can be
147 expressed using existing tree codes, to verify that an operation is
148 supported, the vectorizer checks the relevant optab at the relevant
149 machine_mode (e.g., optab_handler (add_optab, V8HImode)). If
150 the value found is CODE_FOR_nothing, then there's no target support, and
151 we can't vectorize the stmt.
152
153 For additional information on this project see:
154 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
155 */
156
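
/* A minimal sketch of the optab query described above, for illustration
   only (not used by the pass).  It assumes a target that provides
   V8HImode, as in the example in the comment; CODE_FOR_nothing means the
   target has no instruction for the operation in that mode, so the stmt
   cannot be vectorized.  */

static inline bool
example_v8hi_add_supported_p (void)
{
  /* Illustrative only: query the handler for a vector add in V8HImode.  */
  return optab_handler (add_optab, V8HImode) != CODE_FOR_nothing;
}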
157 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *,
158 unsigned *);
159 static stmt_vec_info vect_is_simple_reduction (loop_vec_info, stmt_vec_info,
160 bool *, bool *, bool);
161
162 /* Subroutine of vect_determine_vf_for_stmt that handles only one
163 statement. VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
164 may already be set for general statements (not just data refs). */
165
166 static opt_result
167 vect_determine_vf_for_stmt_1 (vec_info *vinfo, stmt_vec_info stmt_info,
168 bool vectype_maybe_set_p,
169 poly_uint64 *vf)
170 {
171 gimple *stmt = stmt_info->stmt;
172
173 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
174 && !STMT_VINFO_LIVE_P (stmt_info))
175 || gimple_clobber_p (stmt))
176 {
177 if (dump_enabled_p ())
178 dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
179 return opt_result::success ();
180 }
181
182 tree stmt_vectype, nunits_vectype;
183 opt_result res = vect_get_vector_types_for_stmt (vinfo, stmt_info,
184 &stmt_vectype,
185 &nunits_vectype);
186 if (!res)
187 return res;
188
189 if (stmt_vectype)
190 {
191 if (STMT_VINFO_VECTYPE (stmt_info))
192 /* The only case when a vectype had been already set is for stmts
193 that contain a data ref, or for "pattern-stmts" (stmts generated
194 by the vectorizer to represent/replace a certain idiom). */
195 gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
196 || vectype_maybe_set_p)
197 && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
198 else
199 STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
200 }
201
202 if (nunits_vectype)
203 vect_update_max_nunits (vf, nunits_vectype);
204
205 return opt_result::success ();
206 }
207
208 /* Subroutine of vect_determine_vectorization_factor. Set the vector
209 types of STMT_INFO and all attached pattern statements and update
210 the vectorization factor VF accordingly. Return true on success
211 or false if something prevented vectorization. */
212
213 static opt_result
214 vect_determine_vf_for_stmt (vec_info *vinfo,
215 stmt_vec_info stmt_info, poly_uint64 *vf)
216 {
217 if (dump_enabled_p ())
218 dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: %G",
219 stmt_info->stmt);
220 opt_result res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, false, vf);
221 if (!res)
222 return res;
223
224 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
225 && STMT_VINFO_RELATED_STMT (stmt_info))
226 {
227 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
228 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
229
230 /* If a pattern statement has def stmts, analyze them too. */
231 for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
232 !gsi_end_p (si); gsi_next (&si))
233 {
234 stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
235 if (dump_enabled_p ())
236 dump_printf_loc (MSG_NOTE, vect_location,
237 "==> examining pattern def stmt: %G",
238 def_stmt_info->stmt);
239 res = vect_determine_vf_for_stmt_1 (vinfo, def_stmt_info, true, vf);
240 if (!res)
241 return res;
242 }
243
244 if (dump_enabled_p ())
245 dump_printf_loc (MSG_NOTE, vect_location,
246 "==> examining pattern statement: %G",
247 stmt_info->stmt);
248 res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, true, vf);
249 if (!res)
250 return res;
251 }
252
253 return opt_result::success ();
254 }
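
/* An illustrative (hypothetical) source loop showing where pattern
   statements come from: idioms such as the dot product below are replaced
   by a pattern stmt (e.g. one based on DOT_PROD_EXPR), and the pattern's
   def sequence is what the STMT_VINFO_PATTERN_DEF_SEQ walk above
   examines.  */

int
example_dot_product_idiom (short *a, short *b, int n)
{
  int sum = 0;
  for (int i = 0; i < n; i++)
    sum += a[i] * b[i];		/* widening multiply-accumulate idiom */
  return sum;
}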
255
256 /* Function vect_determine_vectorization_factor
257
258 Determine the vectorization factor (VF). VF is the number of data elements
259 that are operated upon in parallel in a single iteration of the vectorized
260 loop. For example, when vectorizing a loop that operates on 4-byte elements,
261 on a target with a 16-byte vector size (VS), the VF is set to 4, since 4
262 elements can fit in a single vector register.
263
264 We currently support vectorization of loops in which all types operated upon
265 are of the same size. Therefore this function currently sets VF according to
266 the size of the types operated upon, and fails if there are multiple sizes
267 in the loop.
268
269 VF is also the factor by which the loop iterations are strip-mined, e.g.:
270 original loop:
271 for (i=0; i<N; i++){
272 a[i] = b[i] + c[i];
273 }
274
275 vectorized loop:
276 for (i=0; i<N; i+=VF){
277 a[i:VF] = b[i:VF] + c[i:VF];
278 }
279 */
280
281 static opt_result
282 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
283 {
284 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
285 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
286 unsigned nbbs = loop->num_nodes;
287 poly_uint64 vectorization_factor = 1;
288 tree scalar_type = NULL_TREE;
289 gphi *phi;
290 tree vectype;
291 stmt_vec_info stmt_info;
292 unsigned i;
293
294 DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");
295
296 for (i = 0; i < nbbs; i++)
297 {
298 basic_block bb = bbs[i];
299
300 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
301 gsi_next (&si))
302 {
303 phi = si.phi ();
304 stmt_info = loop_vinfo->lookup_stmt (phi);
305 if (dump_enabled_p ())
306 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: %G",
307 (gimple *) phi);
308
309 gcc_assert (stmt_info);
310
311 if (STMT_VINFO_RELEVANT_P (stmt_info)
312 || STMT_VINFO_LIVE_P (stmt_info))
313 {
314 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
315 scalar_type = TREE_TYPE (PHI_RESULT (phi));
316
317 if (dump_enabled_p ())
318 dump_printf_loc (MSG_NOTE, vect_location,
319 "get vectype for scalar type: %T\n",
320 scalar_type);
321
322 vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
323 if (!vectype)
324 return opt_result::failure_at (phi,
325 "not vectorized: unsupported "
326 "data-type %T\n",
327 scalar_type);
328 STMT_VINFO_VECTYPE (stmt_info) = vectype;
329
330 if (dump_enabled_p ())
331 dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n",
332 vectype);
333
334 if (dump_enabled_p ())
335 {
336 dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
337 dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
338 dump_printf (MSG_NOTE, "\n");
339 }
340
341 vect_update_max_nunits (&vectorization_factor, vectype);
342 }
343 }
344
345 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
346 gsi_next (&si))
347 {
348 if (is_gimple_debug (gsi_stmt (si)))
349 continue;
350 stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
351 opt_result res
352 = vect_determine_vf_for_stmt (loop_vinfo,
353 stmt_info, &vectorization_factor);
354 if (!res)
355 return res;
356 }
357 }
358
359 /* TODO: Analyze cost. Decide if worth while to vectorize. */
360 if (dump_enabled_p ())
361 {
362 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
363 dump_dec (MSG_NOTE, vectorization_factor);
364 dump_printf (MSG_NOTE, "\n");
365 }
366
367 if (known_le (vectorization_factor, 1U))
368 return opt_result::failure_at (vect_location,
369 "not vectorized: unsupported data-type\n");
370 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
371 return opt_result::success ();
372 }
373
374
375 /* Function vect_is_simple_iv_evolution.
376
377 FORNOW: A simple evolution of an induction variable in the loop is
378 considered a polynomial evolution. */
379
380 static bool
381 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
382 tree * step)
383 {
384 tree init_expr;
385 tree step_expr;
386 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
387 basic_block bb;
388
389 /* When there is no evolution in this loop, the evolution function
390 is not "simple". */
391 if (evolution_part == NULL_TREE)
392 return false;
393
394 /* When the evolution is a polynomial of degree >= 2
395 the evolution function is not "simple". */
396 if (tree_is_chrec (evolution_part))
397 return false;
398
399 step_expr = evolution_part;
400 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
401
402 if (dump_enabled_p ())
403 dump_printf_loc (MSG_NOTE, vect_location, "step: %T, init: %T\n",
404 step_expr, init_expr);
405
406 *init = init_expr;
407 *step = step_expr;
408
409 if (TREE_CODE (step_expr) != INTEGER_CST
410 && (TREE_CODE (step_expr) != SSA_NAME
411 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
412 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
413 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
414 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
415 || !flag_associative_math)))
416 && (TREE_CODE (step_expr) != REAL_CST
417 || !flag_associative_math))
418 {
419 if (dump_enabled_p ())
420 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
421 "step unknown.\n");
422 return false;
423 }
424
425 return true;
426 }
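
/* An illustrative (hypothetical) loop whose IVs have the "simple"
   evolution accepted above: the scalar evolution of i is the affine chrec
   {0, +, 1}, and that of p is {p_0, +, 4} on a typical target with
   4-byte int - a loop-invariant initial value plus a constant step per
   iteration.  */

int
example_simple_iv_loop (int *p, int n)
{
  int sum = 0;
  for (int i = 0; i < n; i++, p++)
    sum += *p;			/* p advances by sizeof (int) each iteration */
  return sum;
}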
427
428 /* Function vect_is_nonlinear_iv_evolution
429
430 Only support nonlinear induction for integer types:
431 1. neg
432 2. mul by constant
433 3. lshift/rshift by constant.
434
435 For neg induction, return a fake step as integer -1. */
436 static bool
437 vect_is_nonlinear_iv_evolution (class loop* loop, stmt_vec_info stmt_info,
438 gphi* loop_phi_node, tree *init, tree *step)
439 {
440 tree init_expr, ev_expr, result, op1, op2;
441 gimple* def;
442
443 if (gimple_phi_num_args (loop_phi_node) != 2)
444 return false;
445
446 init_expr = PHI_ARG_DEF_FROM_EDGE (loop_phi_node, loop_preheader_edge (loop));
447 ev_expr = PHI_ARG_DEF_FROM_EDGE (loop_phi_node, loop_latch_edge (loop));
448
449 /* Support nonlinear induction only for integer types. */
450 if (!INTEGRAL_TYPE_P (TREE_TYPE (init_expr)))
451 return false;
452
453 *init = init_expr;
454 result = PHI_RESULT (loop_phi_node);
455
456 if (TREE_CODE (ev_expr) != SSA_NAME
457 || ((def = SSA_NAME_DEF_STMT (ev_expr)), false)
458 || !is_gimple_assign (def))
459 return false;
460
461 enum tree_code t_code = gimple_assign_rhs_code (def);
462 switch (t_code)
463 {
464 case NEGATE_EXPR:
465 if (gimple_assign_rhs1 (def) != result)
466 return false;
467 *step = build_int_cst (TREE_TYPE (init_expr), -1);
468 STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_neg;
469 break;
470
471 case RSHIFT_EXPR:
472 case LSHIFT_EXPR:
473 case MULT_EXPR:
474 op1 = gimple_assign_rhs1 (def);
475 op2 = gimple_assign_rhs2 (def);
476 if (TREE_CODE (op2) != INTEGER_CST
477 || op1 != result)
478 return false;
479 *step = op2;
480 if (t_code == LSHIFT_EXPR)
481 STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_shl;
482 else if (t_code == RSHIFT_EXPR)
483 STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_shr;
484 /* NEGATE_EXPR and MULT_EXPR are both vect_step_op_mul. */
485 else
486 STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_mul;
487 break;
488
489 default:
490 return false;
491 }
492
493 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_info) = *init;
494 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info) = *step;
495
496 return true;
497 }
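
/* Illustrative (hypothetical) loops containing the three nonlinear
   induction shapes accepted above; in each case the latch value is a
   negate, multiply or shift of the PHI result by a constant, giving the
   vect_step_op_neg/mul/shl/shr classifications.  */

void
example_nonlinear_ivs (int *a, int *b, int *c, int n)
{
  int x = 1, y = 1 << 16, z = 3;
  for (int i = 0; i < n; i++)
    {
      a[i] = x; x = -x;		/* neg: fake step -1 */
      b[i] = y; y >>= 1;	/* rshift by constant: step 1 */
      c[i] = z; z *= 5;		/* mul by constant: step 5 */
    }
}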
498
499 /* Return true if PHI, described by STMT_INFO, is the inner PHI in
500 what we are assuming is a double reduction. For example, given
501 a structure like this:
502
503 outer1:
504 x_1 = PHI <x_4(outer2), ...>;
505 ...
506
507 inner:
508 x_2 = PHI <x_1(outer1), ...>;
509 ...
510 x_3 = ...;
511 ...
512
513 outer2:
514 x_4 = PHI <x_3(inner)>;
515 ...
516
517 outer loop analysis would treat x_1 as a double reduction phi and
518 this function would then return true for x_2. */
519
520 static bool
521 vect_inner_phi_in_double_reduction_p (loop_vec_info loop_vinfo, gphi *phi)
522 {
523 use_operand_p use_p;
524 ssa_op_iter op_iter;
525 FOR_EACH_PHI_ARG (use_p, phi, op_iter, SSA_OP_USE)
526 if (stmt_vec_info def_info = loop_vinfo->lookup_def (USE_FROM_PTR (use_p)))
527 if (STMT_VINFO_DEF_TYPE (def_info) == vect_double_reduction_def)
528 return true;
529 return false;
530 }
531
532 /* Returns true if PHI is a first-order recurrence. A first-order
533 recurrence is a non-reduction recurrence relation in which the value of
534 the recurrence in the current loop iteration equals a value defined in
535 the previous iteration. */
536
537 static bool
538 vect_phi_first_order_recurrence_p (loop_vec_info loop_vinfo, class loop *loop,
539 gphi *phi)
540 {
541 /* A nested cycle isn't vectorizable as a first-order recurrence. */
542 if (LOOP_VINFO_LOOP (loop_vinfo) != loop)
543 return false;
544
545 /* Ensure the loop latch definition is from within the loop. */
546 edge latch = loop_latch_edge (loop);
547 tree ldef = PHI_ARG_DEF_FROM_EDGE (phi, latch);
548 if (TREE_CODE (ldef) != SSA_NAME
549 || SSA_NAME_IS_DEFAULT_DEF (ldef)
550 || is_a <gphi *> (SSA_NAME_DEF_STMT (ldef))
551 || !flow_bb_inside_loop_p (loop, gimple_bb (SSA_NAME_DEF_STMT (ldef))))
552 return false;
553
554 tree def = gimple_phi_result (phi);
555
556 /* Ensure every use_stmt of the phi node is dominated by the latch
557 definition. */
558 imm_use_iterator imm_iter;
559 use_operand_p use_p;
560 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, def)
561 if (!is_gimple_debug (USE_STMT (use_p))
562 && (SSA_NAME_DEF_STMT (ldef) == USE_STMT (use_p)
563 || !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (ldef),
564 USE_STMT (use_p))))
565 return false;
566
567 /* First-order recurrence autovectorization needs vector shuffles, so require a suitable vector type. */
568 tree scalar_type = TREE_TYPE (def);
569 tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
570 if (!vectype)
571 return false;
572
573 return true;
574 }
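
/* An illustrative (hypothetical) first-order recurrence: the value of
   "prev" used in iteration i is the value defined in iteration i-1, and
   it does not feed a reduction, matching the conditions checked above.  */

void
example_first_order_recurrence (int *a, int *b, int n)
{
  int prev = 0;
  for (int i = 0; i < n; i++)
    {
      b[i] = a[i] + prev;	/* uses the previous iteration's a[i] */
      prev = a[i];
    }
}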
575
576 /* Function vect_analyze_scalar_cycles_1.
577
578 Examine the cross iteration def-use cycles of scalar variables
579 in LOOP. LOOP_VINFO represents the loop that is now being
580 considered for vectorization (can be LOOP, or an outer-loop
581 enclosing LOOP). SLP indicates whether there will be subsequent
582 SLP analyses. */
583
584 static void
585 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, class loop *loop,
586 bool slp)
587 {
588 basic_block bb = loop->header;
589 tree init, step;
590 auto_vec<stmt_vec_info, 64> worklist;
591 gphi_iterator gsi;
592 bool double_reduc, reduc_chain;
593
594 DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
595
596 /* First - identify all inductions. Reduction detection assumes that all the
597 inductions have been identified; therefore, this order must not be
598 changed. */
599 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
600 {
601 gphi *phi = gsi.phi ();
602 tree access_fn = NULL;
603 tree def = PHI_RESULT (phi);
604 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);
605
606 if (dump_enabled_p ())
607 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
608 (gimple *) phi);
609
610 /* Skip virtual PHIs. The data dependences that are associated with
611 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
612 if (virtual_operand_p (def))
613 continue;
614
615 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
616
617 /* Analyze the evolution function. */
618 access_fn = analyze_scalar_evolution (loop, def);
619 if (access_fn)
620 {
621 STRIP_NOPS (access_fn);
622 if (dump_enabled_p ())
623 dump_printf_loc (MSG_NOTE, vect_location,
624 "Access function of PHI: %T\n", access_fn);
625 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
626 = initial_condition_in_loop_num (access_fn, loop->num);
627 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
628 = evolution_part_in_loop_num (access_fn, loop->num);
629 }
630
631 if ((!access_fn
632 || vect_inner_phi_in_double_reduction_p (loop_vinfo, phi)
633 || !vect_is_simple_iv_evolution (loop->num, access_fn,
634 &init, &step)
635 || (LOOP_VINFO_LOOP (loop_vinfo) != loop
636 && TREE_CODE (step) != INTEGER_CST))
637 /* Only handle nonlinear IVs for the loop being vectorized itself. */
638 && (LOOP_VINFO_LOOP (loop_vinfo) != loop
639 || !vect_is_nonlinear_iv_evolution (loop, stmt_vinfo,
640 phi, &init, &step)))
641 {
642 worklist.safe_push (stmt_vinfo);
643 continue;
644 }
645
646 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
647 != NULL_TREE);
648 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
649
650 if (dump_enabled_p ())
651 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
652 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
653 }
654
655
656 /* Second - identify all reductions and nested cycles. */
657 while (worklist.length () > 0)
658 {
659 stmt_vec_info stmt_vinfo = worklist.pop ();
660 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
661 tree def = PHI_RESULT (phi);
662
663 if (dump_enabled_p ())
664 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
665 (gimple *) phi);
666
667 gcc_assert (!virtual_operand_p (def)
668 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
669
670 stmt_vec_info reduc_stmt_info
671 = vect_is_simple_reduction (loop_vinfo, stmt_vinfo, &double_reduc,
672 &reduc_chain, slp);
673 if (reduc_stmt_info)
674 {
675 STMT_VINFO_REDUC_DEF (stmt_vinfo) = reduc_stmt_info;
676 STMT_VINFO_REDUC_DEF (reduc_stmt_info) = stmt_vinfo;
677 if (double_reduc)
678 {
679 if (dump_enabled_p ())
680 dump_printf_loc (MSG_NOTE, vect_location,
681 "Detected double reduction.\n");
682
683 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
684 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_double_reduction_def;
685 }
686 else
687 {
688 if (loop != LOOP_VINFO_LOOP (loop_vinfo))
689 {
690 if (dump_enabled_p ())
691 dump_printf_loc (MSG_NOTE, vect_location,
692 "Detected vectorizable nested cycle.\n");
693
694 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
695 }
696 else
697 {
698 if (dump_enabled_p ())
699 dump_printf_loc (MSG_NOTE, vect_location,
700 "Detected reduction.\n");
701
702 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
703 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
704 /* Store the reduction cycles for possible vectorization in
705 loop-aware SLP if it was not detected as reduction
706 chain. */
707 if (! reduc_chain)
708 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push
709 (reduc_stmt_info);
710 }
711 }
712 }
713 else if (vect_phi_first_order_recurrence_p (loop_vinfo, loop, phi))
714 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_first_order_recurrence;
715 else
716 if (dump_enabled_p ())
717 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
718 "Unknown def-use cycle pattern.\n");
719 }
720 }
721
722
723 /* Function vect_analyze_scalar_cycles.
724
725 Examine the cross iteration def-use cycles of scalar variables, by
726 analyzing the loop-header PHIs of scalar variables. Classify each
727 cycle as one of the following: invariant, induction, reduction, unknown.
728 We do that for the loop represented by LOOP_VINFO, and also for its
729 inner loop, if it exists.
730 Examples for scalar cycles:
731
732 Example1: reduction:
733
734 loop1:
735 for (i=0; i<N; i++)
736 sum += a[i];
737
738 Example2: induction:
739
740 loop2:
741 for (i=0; i<N; i++)
742 a[i] = i; */
743
744 static void
745 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo, bool slp)
746 {
747 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
748
749 vect_analyze_scalar_cycles_1 (loop_vinfo, loop, slp);
750
751 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
752 Reductions in such inner-loop therefore have different properties than
753 the reductions in the nest that gets vectorized:
754 1. When vectorized, they are executed in the same order as in the original
755 scalar loop, so we can't change the order of computation when
756 vectorizing them.
757 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
758 current checks are too strict. */
759
760 if (loop->inner)
761 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner, slp);
762 }
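
/* An illustrative (hypothetical) double reduction, i.e. a reduction whose
   def-use cycle spans both loops of the nest.  When the outer loop is
   analyzed, the outer PHI of "sum" is classified as
   vect_double_reduction_def and the inner PHI is the case recognized by
   vect_inner_phi_in_double_reduction_p.  */

int
example_double_reduction (int (*a)[64], int n)
{
  int sum = 0;
  for (int i = 0; i < n; i++)
    for (int j = 0; j < 64; j++)
      sum += a[i][j];
  return sum;
}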
763
764 /* Transfer group and reduction information from STMT_INFO to its
765 pattern stmt. */
766
767 static void
768 vect_fixup_reduc_chain (stmt_vec_info stmt_info)
769 {
770 stmt_vec_info firstp = STMT_VINFO_RELATED_STMT (stmt_info);
771 stmt_vec_info stmtp;
772 gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp)
773 && REDUC_GROUP_FIRST_ELEMENT (stmt_info));
774 REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info);
775 do
776 {
777 stmtp = STMT_VINFO_RELATED_STMT (stmt_info);
778 gcc_checking_assert (STMT_VINFO_DEF_TYPE (stmtp)
779 == STMT_VINFO_DEF_TYPE (stmt_info));
780 REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp;
781 stmt_info = REDUC_GROUP_NEXT_ELEMENT (stmt_info);
782 if (stmt_info)
783 REDUC_GROUP_NEXT_ELEMENT (stmtp)
784 = STMT_VINFO_RELATED_STMT (stmt_info);
785 }
786 while (stmt_info);
787 }
788
789 /* Fixup scalar cycles that now have their stmts detected as patterns. */
790
791 static void
792 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
793 {
794 stmt_vec_info first;
795 unsigned i;
796
797 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
798 {
799 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
800 while (next)
801 {
802 if ((STMT_VINFO_IN_PATTERN_P (next)
803 != STMT_VINFO_IN_PATTERN_P (first))
804 || STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (next)) == -1)
805 break;
806 next = REDUC_GROUP_NEXT_ELEMENT (next);
807 }
808 /* If all reduction chain members are well-formed patterns adjust
809 the group to group the pattern stmts instead. */
810 if (! next
811 && STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (first)) != -1)
812 {
813 if (STMT_VINFO_IN_PATTERN_P (first))
814 {
815 vect_fixup_reduc_chain (first);
816 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
817 = STMT_VINFO_RELATED_STMT (first);
818 }
819 }
820 /* If not all stmt in the chain are patterns or if we failed
821 to update STMT_VINFO_REDUC_IDX dissolve the chain and handle
822 it as regular reduction instead. */
823 else
824 {
825 stmt_vec_info vinfo = first;
826 stmt_vec_info last = NULL;
827 while (vinfo)
828 {
829 next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
830 REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
831 REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
832 last = vinfo;
833 vinfo = next;
834 }
835 STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize (first))
836 = vect_internal_def;
837 loop_vinfo->reductions.safe_push (vect_stmt_to_vectorize (last));
838 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).unordered_remove (i);
839 --i;
840 }
841 }
842 }
843
844 /* Function vect_get_loop_niters.
845
846 Determine the number of iterations the loop executes and place it
847 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
848 in NUMBER_OF_ITERATIONSM1. Place the condition under which the
849 niter information holds in ASSUMPTIONS.
850
851 Return the loop exit condition. */
852
853
854 static gcond *
855 vect_get_loop_niters (class loop *loop, tree *assumptions,
856 tree *number_of_iterations, tree *number_of_iterationsm1)
857 {
858 edge exit = single_exit (loop);
859 class tree_niter_desc niter_desc;
860 tree niter_assumptions, niter, may_be_zero;
861 gcond *cond = get_loop_exit_condition (loop);
862
863 *assumptions = boolean_true_node;
864 *number_of_iterationsm1 = chrec_dont_know;
865 *number_of_iterations = chrec_dont_know;
866 DUMP_VECT_SCOPE ("get_loop_niters");
867
868 if (!exit)
869 return cond;
870
871 may_be_zero = NULL_TREE;
872 if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
873 || chrec_contains_undetermined (niter_desc.niter))
874 return cond;
875
876 niter_assumptions = niter_desc.assumptions;
877 may_be_zero = niter_desc.may_be_zero;
878 niter = niter_desc.niter;
879
880 if (may_be_zero && integer_zerop (may_be_zero))
881 may_be_zero = NULL_TREE;
882
883 if (may_be_zero)
884 {
885 if (COMPARISON_CLASS_P (may_be_zero))
886 {
887 /* Try to combine may_be_zero with assumptions, this can simplify
888 computation of niter expression. */
889 if (niter_assumptions && !integer_nonzerop (niter_assumptions))
890 niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
891 niter_assumptions,
892 fold_build1 (TRUTH_NOT_EXPR,
893 boolean_type_node,
894 may_be_zero));
895 else
896 niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
897 build_int_cst (TREE_TYPE (niter), 0),
898 rewrite_to_non_trapping_overflow (niter));
899
900 may_be_zero = NULL_TREE;
901 }
902 else if (integer_nonzerop (may_be_zero))
903 {
904 *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
905 *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
906 return cond;
907 }
908 else
909 return cond;
910 }
911
912 *assumptions = niter_assumptions;
913 *number_of_iterationsm1 = niter;
914
915 /* We want the number of loop header executions which is the number
916 of latch executions plus one.
917 ??? For UINT_MAX latch executions this number overflows to zero
918 for loops like do { n++; } while (n != 0); */
919 if (niter && !chrec_contains_undetermined (niter))
920 niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
921 build_int_cst (TREE_TYPE (niter), 1));
922 *number_of_iterations = niter;
923
924 return cond;
925 }
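
/* An illustrative (hypothetical) countable loop for the analysis above:
   niter analysis reports n - 1 latch executions (NUMBER_OF_ITERATIONSM1)
   together with an "n might be zero" condition, and the code above adds 1
   to obtain the n header executions stored in NUMBER_OF_ITERATIONS.  */

unsigned
example_countable_loop (unsigned char *a, unsigned n)
{
  unsigned sum = 0;
  for (unsigned i = 0; i < n; i++)
    sum += a[i];		/* loop entered only when n > 0 */
  return sum;
}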
926
927 /* Function bb_in_loop_p
928
929 Used as predicate for dfs order traversal of the loop bbs. */
930
931 static bool
932 bb_in_loop_p (const_basic_block bb, const void *data)
933 {
934 const class loop *const loop = (const class loop *)data;
935 if (flow_bb_inside_loop_p (loop, bb))
936 return true;
937 return false;
938 }
939
940
941 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
942 stmt_vec_info structs for all the stmts in LOOP_IN. */
943
944 _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
945 : vec_info (vec_info::loop, shared),
946 loop (loop_in),
947 bbs (XCNEWVEC (basic_block, loop->num_nodes)),
948 num_itersm1 (NULL_TREE),
949 num_iters (NULL_TREE),
950 num_iters_unchanged (NULL_TREE),
951 num_iters_assumptions (NULL_TREE),
952 vector_costs (nullptr),
953 scalar_costs (nullptr),
954 th (0),
955 versioning_threshold (0),
956 vectorization_factor (0),
957 main_loop_edge (nullptr),
958 skip_main_loop_edge (nullptr),
959 skip_this_loop_edge (nullptr),
960 reusable_accumulators (),
961 suggested_unroll_factor (1),
962 max_vectorization_factor (0),
963 mask_skip_niters (NULL_TREE),
964 rgroup_compare_type (NULL_TREE),
965 simd_if_cond (NULL_TREE),
966 unaligned_dr (NULL),
967 peeling_for_alignment (0),
968 ptr_mask (0),
969 ivexpr_map (NULL),
970 scan_map (NULL),
971 slp_unrolling_factor (1),
972 inner_loop_cost_factor (param_vect_inner_loop_cost_factor),
973 vectorizable (false),
974 can_use_partial_vectors_p (param_vect_partial_vector_usage != 0),
975 using_partial_vectors_p (false),
976 epil_using_partial_vectors_p (false),
977 partial_load_store_bias (0),
978 peeling_for_gaps (false),
979 peeling_for_niter (false),
980 no_data_dependencies (false),
981 has_mask_store (false),
982 scalar_loop_scaling (profile_probability::uninitialized ()),
983 scalar_loop (NULL),
984 orig_loop_info (NULL)
985 {
986 /* CHECKME: We want to visit all BBs before their successors (except for
987 latch blocks, for which this assertion wouldn't hold). In the simple
988 case of the loop forms we allow, a dfs order of the BBs would be the same
989 as reversed postorder traversal, so we are safe. */
990
991 unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
992 bbs, loop->num_nodes, loop);
993 gcc_assert (nbbs == loop->num_nodes);
994
995 for (unsigned int i = 0; i < nbbs; i++)
996 {
997 basic_block bb = bbs[i];
998 gimple_stmt_iterator si;
999
1000 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1001 {
1002 gimple *phi = gsi_stmt (si);
1003 gimple_set_uid (phi, 0);
1004 add_stmt (phi);
1005 }
1006
1007 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1008 {
1009 gimple *stmt = gsi_stmt (si);
1010 gimple_set_uid (stmt, 0);
1011 if (is_gimple_debug (stmt))
1012 continue;
1013 add_stmt (stmt);
1014 /* If the .GOMP_SIMD_LANE call for the current loop has 3 arguments, the
1015 third argument is the #pragma omp simd if (x) condition: when it is 0,
1016 the loop shouldn't be vectorized; when it is a non-zero constant, it
1017 should be vectorized normally; otherwise the loop is versioned, with
1018 the vectorized copy taken when the condition is non-zero at runtime. */
1019 if (loop_in->simduid
1020 && is_gimple_call (stmt)
1021 && gimple_call_internal_p (stmt)
1022 && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
1023 && gimple_call_num_args (stmt) >= 3
1024 && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
1025 && (loop_in->simduid
1026 == SSA_NAME_VAR (gimple_call_arg (stmt, 0))))
1027 {
1028 tree arg = gimple_call_arg (stmt, 2);
1029 if (integer_zerop (arg) || TREE_CODE (arg) == SSA_NAME)
1030 simd_if_cond = arg;
1031 else
1032 gcc_assert (integer_nonzerop (arg));
1033 }
1034 }
1035 }
1036
1037 epilogue_vinfos.create (6);
1038 }
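
/* An illustrative (hypothetical) loop carrying the "simd if" condition
   handled above: for a non-constant C the loop gets versioned and the
   vectorized copy runs only when C is non-zero at run time, while
   "if (0)" disables vectorization entirely.  */

void
example_simd_if (int *a, int n, int c)
{
#pragma omp simd if (c)
  for (int i = 0; i < n; i++)
    a[i] += 1;
}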
1039
1040 /* Free all levels of rgroup CONTROLS. */
1041
1042 void
1043 release_vec_loop_controls (vec<rgroup_controls> *controls)
1044 {
1045 rgroup_controls *rgc;
1046 unsigned int i;
1047 FOR_EACH_VEC_ELT (*controls, i, rgc)
1048 rgc->controls.release ();
1049 controls->release ();
1050 }
1051
1052 /* Free all memory used by the _loop_vec_info, as well as all the
1053 stmt_vec_info structs of all the stmts in the loop. */
1054
1055 _loop_vec_info::~_loop_vec_info ()
1056 {
1057 free (bbs);
1058
1059 release_vec_loop_controls (&masks);
1060 release_vec_loop_controls (&lens);
1061 delete ivexpr_map;
1062 delete scan_map;
1063 epilogue_vinfos.release ();
1064 delete scalar_costs;
1065 delete vector_costs;
1066
1067 /* When we release an epilogue vinfo that we do not intend to use,
1068 avoid clearing AUX of the main loop which should continue to
1069 point to the main loop vinfo since otherwise we'll leak that. */
1070 if (loop->aux == this)
1071 loop->aux = NULL;
1072 }
1073
1074 /* Return an invariant or register for EXPR and emit necessary
1075 computations in the LOOP_VINFO loop preheader. */
1076
1077 tree
1078 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
1079 {
1080 if (is_gimple_reg (expr)
1081 || is_gimple_min_invariant (expr))
1082 return expr;
1083
1084 if (! loop_vinfo->ivexpr_map)
1085 loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
1086 tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
1087 if (! cached)
1088 {
1089 gimple_seq stmts = NULL;
1090 cached = force_gimple_operand (unshare_expr (expr),
1091 &stmts, true, NULL_TREE);
1092 if (stmts)
1093 {
1094 edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1095 gsi_insert_seq_on_edge_immediate (e, stmts);
1096 }
1097 }
1098 return cached;
1099 }
1100
1101 /* Return true if we can use CMP_TYPE as the comparison type to produce
1102 all masks required to mask LOOP_VINFO. */
1103
1104 static bool
1105 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
1106 {
1107 rgroup_controls *rgm;
1108 unsigned int i;
1109 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
1110 if (rgm->type != NULL_TREE
1111 && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
1112 cmp_type, rgm->type,
1113 OPTIMIZE_FOR_SPEED))
1114 return false;
1115 return true;
1116 }
1117
1118 /* Calculate the maximum number of scalars per iteration for every
1119 rgroup in LOOP_VINFO. */
1120
1121 static unsigned int
1122 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
1123 {
1124 unsigned int res = 1;
1125 unsigned int i;
1126 rgroup_controls *rgm;
1127 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
1128 res = MAX (res, rgm->max_nscalars_per_iter);
1129 return res;
1130 }
1131
1132 /* Calculate the minimum precision necessary to represent:
1133
1134 MAX_NITERS * FACTOR
1135
1136 as an unsigned integer, where MAX_NITERS is the maximum number of
1137 loop header iterations for the original scalar form of LOOP_VINFO. */
1138
1139 static unsigned
1140 vect_min_prec_for_max_niters (loop_vec_info loop_vinfo, unsigned int factor)
1141 {
1142 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1143
1144 /* Get the maximum number of iterations that is representable
1145 in the counter type. */
1146 tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
1147 widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
1148
1149 /* Get a more refined estimate for the number of iterations. */
1150 widest_int max_back_edges;
1151 if (max_loop_iterations (loop, &max_back_edges))
1152 max_ni = wi::smin (max_ni, max_back_edges + 1);
1153
1154 /* Work out how many bits we need to represent the limit. */
1155 return wi::min_precision (max_ni * factor, UNSIGNED);
1156 }
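
/* A minimal sketch of the computation above for values that fit in 64
   bits, for illustration only: e.g. a refined bound of 1000 latch
   iterations (1001 header iterations) and FACTOR 4 give a product of
   4004, which needs 12 bits.  */

static inline unsigned
example_min_prec (unsigned long long max_ni, unsigned factor)
{
  unsigned long long limit = max_ni * factor;	/* assumed not to overflow */
  unsigned prec = 0;
  while (limit)
    {
      prec++;			/* count the bits needed for LIMIT */
      limit >>= 1;
    }
  return prec;
}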
1157
1158 /* True if the loop needs peeling or partial vectors when vectorized. */
1159
1160 static bool
1161 vect_need_peeling_or_partial_vectors_p (loop_vec_info loop_vinfo)
1162 {
1163 unsigned HOST_WIDE_INT const_vf;
1164 HOST_WIDE_INT max_niter
1165 = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1166
1167 unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
1168 if (!th && LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
1169 th = LOOP_VINFO_COST_MODEL_THRESHOLD (LOOP_VINFO_ORIG_LOOP_INFO
1170 (loop_vinfo));
1171
1172 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1173 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
1174 {
1175 /* Work out the (constant) number of iterations that need to be
1176 peeled for reasons other than niters. */
1177 unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1178 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
1179 peel_niter += 1;
1180 if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
1181 LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1182 return true;
1183 }
1184 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1185 /* ??? When peeling for gaps but not alignment, we could
1186 try to check whether the (variable) niters is known to be
1187 VF * N + 1. That's something of a niche case though. */
1188 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1189 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
1190 || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
1191 < (unsigned) exact_log2 (const_vf))
1192 /* In case of versioning, check if the maximum number of
1193 iterations is greater than th. If they are identical,
1194 the epilogue is unnecessary. */
1195 && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
1196 || ((unsigned HOST_WIDE_INT) max_niter
1197 > (th / const_vf) * const_vf))))
1198 return true;
1199
1200 return false;
1201 }
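
/* A minimal sketch of the constant-niters case above, for illustration
   only: with no alignment or gap peeling, 100 iterations at VF 8 leave
   100 % 8 == 4 iterations for an epilogue or for partial vectors,
   whereas 96 iterations need neither.  */

static inline bool
example_needs_peeling_p (unsigned long long niters, unsigned vf,
			 unsigned peel_niter)
{
  return (niters - peel_niter) % vf != 0;
}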
1202
1203 /* Each statement in LOOP_VINFO can be masked where necessary. Check
1204 whether we can actually generate the masks required. Return true if so,
1205 storing the type of the scalar IV in LOOP_VINFO_RGROUP_COMPARE_TYPE. */
1206
1207 static bool
1208 vect_verify_full_masking (loop_vec_info loop_vinfo)
1209 {
1210 unsigned int min_ni_width;
1211 unsigned int max_nscalars_per_iter
1212 = vect_get_max_nscalars_per_iter (loop_vinfo);
1213
1214 /* Use a normal loop if there are no statements that need masking.
1215 This only happens in rare degenerate cases: it means that the loop
1216 has no loads, no stores, and no live-out values. */
1217 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1218 return false;
1219
1220 /* Work out how many bits we need to represent the limit. */
1221 min_ni_width
1222 = vect_min_prec_for_max_niters (loop_vinfo, max_nscalars_per_iter);
1223
1224 /* Find a scalar mode for which WHILE_ULT is supported. */
1225 opt_scalar_int_mode cmp_mode_iter;
1226 tree cmp_type = NULL_TREE;
1227 tree iv_type = NULL_TREE;
1228 widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
1229 unsigned int iv_precision = UINT_MAX;
1230
1231 if (iv_limit != -1)
1232 iv_precision = wi::min_precision (iv_limit * max_nscalars_per_iter,
1233 UNSIGNED);
1234
1235 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1236 {
1237 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1238 if (cmp_bits >= min_ni_width
1239 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1240 {
1241 tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1242 if (this_type
1243 && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1244 {
1245 /* Although we could stop as soon as we find a valid mode,
1246 there are at least two reasons why that's not always the
1247 best choice:
1248
1249 - An IV that's Pmode or wider is more likely to be reusable
1250 in address calculations than an IV that's narrower than
1251 Pmode.
1252
1253 - Doing the comparison in IV_PRECISION or wider allows
1254 a natural 0-based IV, whereas using a narrower comparison
1255 type requires mitigations against wrap-around.
1256
1257 Conversely, if the IV limit is variable, doing the comparison
1258 in a wider type than the original type can introduce
1259 unnecessary extensions, so picking the widest valid mode
1260 is not always a good choice either.
1261
1262 Here we prefer the first IV type that's Pmode or wider,
1263 and the first comparison type that's IV_PRECISION or wider.
1264 (The comparison type must be no wider than the IV type,
1265 to avoid extensions in the vector loop.)
1266
1267 ??? We might want to try continuing beyond Pmode for ILP32
1268 targets if CMP_BITS < IV_PRECISION. */
1269 iv_type = this_type;
1270 if (!cmp_type || iv_precision > TYPE_PRECISION (cmp_type))
1271 cmp_type = this_type;
1272 if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1273 break;
1274 }
1275 }
1276 }
1277
1278 if (!cmp_type)
1279 return false;
1280
1281 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = cmp_type;
1282 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1283 return true;
1284 }
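
/* An illustrative (hypothetical) scalar model of a fully-masked loop:
   every vector iteration processes VF lanes under a mask derived from
   the remaining iteration count, which is what the WHILE_ULT-based
   rgroup masks verified above implement.  */

void
example_fully_masked_copy (int *d, int *s, unsigned n)
{
  const unsigned vf = 4;	/* hypothetical vectorization factor */
  for (unsigned i = 0; i < n; i += vf)
    for (unsigned lane = 0; lane < vf; lane++)
      if (i + lane < n)		/* the per-lane mask */
	d[i + lane] = s[i + lane];
}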
1285
1286 /* Check whether we can use vector accesses with length, based on a precision
1287 comparison. So far, to keep it simple, we only allow the case in which the
1288 precision of the target-supported length is larger than the precision
1289 required by the loop niters. */
1290
1291 static bool
1292 vect_verify_loop_lens (loop_vec_info loop_vinfo)
1293 {
1294 if (LOOP_VINFO_LENS (loop_vinfo).is_empty ())
1295 return false;
1296
1297 machine_mode len_load_mode = get_len_load_store_mode
1298 (loop_vinfo->vector_mode, true).require ();
1299 machine_mode len_store_mode = get_len_load_store_mode
1300 (loop_vinfo->vector_mode, false).require ();
1301
1302 signed char partial_load_bias = internal_len_load_store_bias
1303 (IFN_LEN_LOAD, len_load_mode);
1304
1305 signed char partial_store_bias = internal_len_load_store_bias
1306 (IFN_LEN_STORE, len_store_mode);
1307
1308 gcc_assert (partial_load_bias == partial_store_bias);
1309
1310 if (partial_load_bias == VECT_PARTIAL_BIAS_UNSUPPORTED)
1311 return false;
1312
1313 /* If the backend requires a bias of -1 for LEN_LOAD, we must not emit
1314 len_loads with a length of zero. In order to avoid that we prohibit
1315 more than one loop length here. */
1316 if (partial_load_bias == -1
1317 && LOOP_VINFO_LENS (loop_vinfo).length () > 1)
1318 return false;
1319
1320 LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) = partial_load_bias;
1321
1322 unsigned int max_nitems_per_iter = 1;
1323 unsigned int i;
1324 rgroup_controls *rgl;
1325 /* Find the maximum number of items per iteration for every rgroup. */
1326 FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), i, rgl)
1327 {
1328 unsigned nitems_per_iter = rgl->max_nscalars_per_iter * rgl->factor;
1329 max_nitems_per_iter = MAX (max_nitems_per_iter, nitems_per_iter);
1330 }
1331
1332 /* Work out how many bits we need to represent the length limit. */
1333 unsigned int min_ni_prec
1334 = vect_min_prec_for_max_niters (loop_vinfo, max_nitems_per_iter);
1335
1336 /* Now use the maximum of the precisions below for one suitable IV type:
1337 - the IV's natural precision
1338 - the precision needed to hold: the maximum number of scalar
1339 iterations multiplied by the scale factor (min_ni_prec above)
1340 - the Pmode precision
1341
1342 If min_ni_prec is less than the precision of the current niters,
1343 we prefer to still use the niters type. Prefer to use Pmode and
1344 a wider IV to avoid narrow conversions. */
1345
1346 unsigned int ni_prec
1347 = TYPE_PRECISION (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)));
1348 min_ni_prec = MAX (min_ni_prec, ni_prec);
1349 min_ni_prec = MAX (min_ni_prec, GET_MODE_BITSIZE (Pmode));
1350
1351 tree iv_type = NULL_TREE;
1352 opt_scalar_int_mode tmode_iter;
1353 FOR_EACH_MODE_IN_CLASS (tmode_iter, MODE_INT)
1354 {
1355 scalar_mode tmode = tmode_iter.require ();
1356 unsigned int tbits = GET_MODE_BITSIZE (tmode);
1357
1358 /* ??? Do we really want to construct one IV whose precision exceeds
1359 BITS_PER_WORD? */
1360 if (tbits > BITS_PER_WORD)
1361 break;
1362
1363 /* Find the first available standard integral type. */
1364 if (tbits >= min_ni_prec && targetm.scalar_mode_supported_p (tmode))
1365 {
1366 iv_type = build_nonstandard_integer_type (tbits, true);
1367 break;
1368 }
1369 }
1370
1371 if (!iv_type)
1372 {
1373 if (dump_enabled_p ())
1374 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1375 "can't vectorize with length-based partial vectors"
1376 " because there is no suitable iv type.\n");
1377 return false;
1378 }
1379
1380 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = iv_type;
1381 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1382
1383 return true;
1384 }
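
/* An illustrative (hypothetical) scalar model of the length-based
   alternative: each vector iteration operates on MIN (VF, remaining)
   elements, the quantity that the LEN_LOAD/LEN_STORE lengths (possibly
   adjusted by the bias above) describe.  */

void
example_len_based_copy (int *d, int *s, unsigned n)
{
  const unsigned vf = 4;	/* hypothetical vectorization factor */
  for (unsigned i = 0; i < n; i += vf)
    {
      unsigned len = n - i < vf ? n - i : vf;
      for (unsigned lane = 0; lane < len; lane++)
	d[i + lane] = s[i + lane];
    }
}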
1385
1386 /* Calculate the cost of one scalar iteration of the loop. */
1387 static void
1388 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1389 {
1390 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1391 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1392 int nbbs = loop->num_nodes, factor;
1393 int innerloop_iters, i;
1394
1395 DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");
1396
1397 /* Gather costs for statements in the scalar loop. */
1398
1399 /* FORNOW. */
1400 innerloop_iters = 1;
1401 if (loop->inner)
1402 innerloop_iters = LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo);
1403
1404 for (i = 0; i < nbbs; i++)
1405 {
1406 gimple_stmt_iterator si;
1407 basic_block bb = bbs[i];
1408
1409 if (bb->loop_father == loop->inner)
1410 factor = innerloop_iters;
1411 else
1412 factor = 1;
1413
1414 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1415 {
1416 gimple *stmt = gsi_stmt (si);
1417 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
1418
1419 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1420 continue;
1421
1422 /* Skip stmts that are not vectorized inside the loop. */
1423 stmt_vec_info vstmt_info = vect_stmt_to_vectorize (stmt_info);
1424 if (!STMT_VINFO_RELEVANT_P (vstmt_info)
1425 && (!STMT_VINFO_LIVE_P (vstmt_info)
1426 || !VECTORIZABLE_CYCLE_DEF
1427 (STMT_VINFO_DEF_TYPE (vstmt_info))))
1428 continue;
1429
1430 vect_cost_for_stmt kind;
1431 if (STMT_VINFO_DATA_REF (stmt_info))
1432 {
1433 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1434 kind = scalar_load;
1435 else
1436 kind = scalar_store;
1437 }
1438 else if (vect_nop_conversion_p (stmt_info))
1439 continue;
1440 else
1441 kind = scalar_stmt;
1442
1443 /* We are using vect_prologue here to avoid scaling twice
1444 by the inner loop factor. */
1445 record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1446 factor, kind, stmt_info, 0, vect_prologue);
1447 }
1448 }
1449
1450 /* Now accumulate cost. */
1451 loop_vinfo->scalar_costs = init_cost (loop_vinfo, true);
1452 add_stmt_costs (loop_vinfo->scalar_costs,
1453 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo));
1454 loop_vinfo->scalar_costs->finish_cost (nullptr);
1455 }
1456
1457
1458 /* Function vect_analyze_loop_form.
1459
1460 Verify that certain CFG restrictions hold, including:
1461 - the loop has a pre-header
1462 - the loop has a single entry and exit
1463 - the loop exit condition is simple enough
1464 - the number of iterations can be analyzed, i.e., it is a countable loop.
1465 The niter may only be analyzable under some assumptions. */
1466
1467 opt_result
1468 vect_analyze_loop_form (class loop *loop, vect_loop_form_info *info)
1469 {
1470 DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1471
1472 /* Different restrictions apply when we are considering an inner-most loop,
1473 vs. an outer (nested) loop.
1474 (FORNOW. May want to relax some of these restrictions in the future). */
1475
1476 info->inner_loop_cond = NULL;
1477 if (!loop->inner)
1478 {
1479 /* Inner-most loop. We currently require that the number of BBs is
1480 exactly 2 (the header and latch). Vectorizable inner-most loops
1481 look like this:
1482
1483 (pre-header)
1484 |
1485 header <--------+
1486 | | |
1487 | +--> latch --+
1488 |
1489 (exit-bb) */
1490
1491 if (loop->num_nodes != 2)
1492 return opt_result::failure_at (vect_location,
1493 "not vectorized:"
1494 " control flow in loop.\n");
1495
1496 if (empty_block_p (loop->header))
1497 return opt_result::failure_at (vect_location,
1498 "not vectorized: empty loop.\n");
1499 }
1500 else
1501 {
1502 class loop *innerloop = loop->inner;
1503 edge entryedge;
1504
1505 /* Nested loop. We currently require that the loop is doubly-nested,
1506 contains a single inner loop, and the number of BBs is exactly 5.
1507 Vectorizable outer-loops look like this:
1508
1509 (pre-header)
1510 |
1511 header <---+
1512 | |
1513 inner-loop |
1514 | |
1515 tail ------+
1516 |
1517 (exit-bb)
1518
1519 The inner-loop has the properties expected of inner-most loops
1520 as described above. */
1521
1522 if ((loop->inner)->inner || (loop->inner)->next)
1523 return opt_result::failure_at (vect_location,
1524 "not vectorized:"
1525 " multiple nested loops.\n");
1526
1527 if (loop->num_nodes != 5)
1528 return opt_result::failure_at (vect_location,
1529 "not vectorized:"
1530 " control flow in loop.\n");
1531
1532 entryedge = loop_preheader_edge (innerloop);
1533 if (entryedge->src != loop->header
1534 || !single_exit (innerloop)
1535 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1536 return opt_result::failure_at (vect_location,
1537 "not vectorized:"
1538 " unsupported outerloop form.\n");
1539
1540 /* Analyze the inner-loop. */
1541 vect_loop_form_info inner;
1542 opt_result res = vect_analyze_loop_form (loop->inner, &inner);
1543 if (!res)
1544 {
1545 if (dump_enabled_p ())
1546 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1547 "not vectorized: Bad inner loop.\n");
1548 return res;
1549 }
1550
1551 /* We don't support analyzing niter under assumptions for the inner
1552 loop. */
1553 if (!integer_onep (inner.assumptions))
1554 return opt_result::failure_at (vect_location,
1555 "not vectorized: Bad inner loop.\n");
1556
1557 if (!expr_invariant_in_loop_p (loop, inner.number_of_iterations))
1558 return opt_result::failure_at (vect_location,
1559 "not vectorized: inner-loop count not"
1560 " invariant.\n");
1561
1562 if (dump_enabled_p ())
1563 dump_printf_loc (MSG_NOTE, vect_location,
1564 "Considering outer-loop vectorization.\n");
1565 info->inner_loop_cond = inner.loop_cond;
1566 }
1567
1568 if (!single_exit (loop))
1569 return opt_result::failure_at (vect_location,
1570 "not vectorized: multiple exits.\n");
1571 if (EDGE_COUNT (loop->header->preds) != 2)
1572 return opt_result::failure_at (vect_location,
1573 "not vectorized:"
1574 " too many incoming edges.\n");
1575
1576 /* We assume that the loop exit condition is at the end of the loop, i.e.,
1577 that the loop is represented as a do-while (with a proper if-guard
1578 before the loop if needed), where the loop header contains all the
1579 executable statements, and the latch is empty. */
1580 if (!empty_block_p (loop->latch)
1581 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1582 return opt_result::failure_at (vect_location,
1583 "not vectorized: latch block not empty.\n");
1584
1585 /* Make sure the exit is not abnormal. */
1586 edge e = single_exit (loop);
1587 if (e->flags & EDGE_ABNORMAL)
1588 return opt_result::failure_at (vect_location,
1589 "not vectorized:"
1590 " abnormal loop exit edge.\n");
1591
1592 info->loop_cond
1593 = vect_get_loop_niters (loop, &info->assumptions,
1594 &info->number_of_iterations,
1595 &info->number_of_iterationsm1);
1596 if (!info->loop_cond)
1597 return opt_result::failure_at
1598 (vect_location,
1599 "not vectorized: complicated exit condition.\n");
1600
1601 if (integer_zerop (info->assumptions)
1602 || !info->number_of_iterations
1603 || chrec_contains_undetermined (info->number_of_iterations))
1604 return opt_result::failure_at
1605 (info->loop_cond,
1606 "not vectorized: number of iterations cannot be computed.\n");
1607
1608 if (integer_zerop (info->number_of_iterations))
1609 return opt_result::failure_at
1610 (info->loop_cond,
1611 "not vectorized: number of iterations = 0.\n");
1612
1613 if (!(tree_fits_shwi_p (info->number_of_iterations)
1614 && tree_to_shwi (info->number_of_iterations) > 0))
1615 {
1616 if (dump_enabled_p ())
1617 {
1618 dump_printf_loc (MSG_NOTE, vect_location,
1619 "Symbolic number of iterations is ");
1620 dump_generic_expr (MSG_NOTE, TDF_DETAILS, info->number_of_iterations);
1621 dump_printf (MSG_NOTE, "\n");
1622 }
1623 }
1624
1625 return opt_result::success ();
1626 }
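
/* An illustrative (hypothetical) loop rejected by the form checks above:
   the early break adds a second exit (and extra control flow in the
   body), so it fails the single-exit / two-basic-block requirements.  */

int
example_rejected_loop_form (int *a, int n, int key)
{
  int i;
  for (i = 0; i < n; i++)
    if (a[i] == key)
      break;			/* second exit out of the loop */
  return i;
}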
1627
1628 /* Create a loop_vec_info for LOOP with SHARED and the
1629 vect_analyze_loop_form result. */
1630
1631 loop_vec_info
1632 vect_create_loop_vinfo (class loop *loop, vec_info_shared *shared,
1633 const vect_loop_form_info *info,
1634 loop_vec_info main_loop_info)
1635 {
1636 loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
1637 LOOP_VINFO_NITERSM1 (loop_vinfo) = info->number_of_iterationsm1;
1638 LOOP_VINFO_NITERS (loop_vinfo) = info->number_of_iterations;
1639 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = info->number_of_iterations;
1640 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = main_loop_info;
1641 /* Also record the assumptions for versioning. */
1642 if (!integer_onep (info->assumptions) && !main_loop_info)
1643 LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = info->assumptions;
1644
1645 stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (info->loop_cond);
1646 STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type;
1647 if (info->inner_loop_cond)
1648 {
1649 stmt_vec_info inner_loop_cond_info
1650 = loop_vinfo->lookup_stmt (info->inner_loop_cond);
1651 STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type;
1652 /* If we have an estimate on the number of iterations of the inner
1653 loop use that to limit the scale for costing, otherwise use
1654 --param vect-inner-loop-cost-factor literally. */
1655 widest_int nit;
1656 if (estimated_stmt_executions (loop->inner, &nit))
1657 LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo)
1658 = wi::smin (nit, param_vect_inner_loop_cost_factor).to_uhwi ();
1659 }
1660
1661 return loop_vinfo;
1662 }
1663
1664
1665
1666 /* Scan the loop stmts and dependent on whether there are any (non-)SLP
1667 statements update the vectorization factor. */
1668
1669 static void
1670 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1671 {
1672 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1673 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1674 int nbbs = loop->num_nodes;
1675 poly_uint64 vectorization_factor;
1676 int i;
1677
1678 DUMP_VECT_SCOPE ("vect_update_vf_for_slp");
1679
1680 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1681 gcc_assert (known_ne (vectorization_factor, 0U));
1682
1683 /* If all the stmts in the loop can be SLPed, we perform only SLP, and the
1684 vectorization factor of the loop is the unrolling factor required by
1685 the SLP instances. If that unrolling factor is 1, we say that we
1686 perform pure SLP on the loop - cross-iteration parallelism is not
1687 exploited. */
1688 bool only_slp_in_loop = true;
1689 for (i = 0; i < nbbs; i++)
1690 {
1691 basic_block bb = bbs[i];
1692 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1693 gsi_next (&si))
1694 {
1695 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (si.phi ());
1696 if (!stmt_info)
1697 continue;
1698 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1699 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1700 && !PURE_SLP_STMT (stmt_info))
1701 /* STMT needs both SLP and loop-based vectorization. */
1702 only_slp_in_loop = false;
1703 }
1704 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1705 gsi_next (&si))
1706 {
1707 if (is_gimple_debug (gsi_stmt (si)))
1708 continue;
1709 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
1710 stmt_info = vect_stmt_to_vectorize (stmt_info);
1711 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1712 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1713 && !PURE_SLP_STMT (stmt_info))
1714 /* STMT needs both SLP and loop-based vectorization. */
1715 only_slp_in_loop = false;
1716 }
1717 }
1718
1719 if (only_slp_in_loop)
1720 {
1721 if (dump_enabled_p ())
1722 dump_printf_loc (MSG_NOTE, vect_location,
1723 "Loop contains only SLP stmts\n");
1724 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1725 }
1726 else
1727 {
1728 if (dump_enabled_p ())
1729 dump_printf_loc (MSG_NOTE, vect_location,
1730 "Loop contains SLP and non-SLP stmts\n");
1731 /* Both the vectorization factor and unroll factor have the form
1732 GET_MODE_SIZE (loop_vinfo->vector_mode) * X for some rational X,
1733 so they must have a common multiple. */
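/* For instance (numbers purely illustrative): if loop-based analysis
chose a vectorization factor of 4 and the SLP instances require an
unrolling factor of 8, force_common_multiple (4, 8) yields 8, which
satisfies both the non-SLP statements and the SLP instances. */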
1734 vectorization_factor
1735 = force_common_multiple (vectorization_factor,
1736 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1737 }
1738
1739 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1740 if (dump_enabled_p ())
1741 {
1742 dump_printf_loc (MSG_NOTE, vect_location,
1743 "Updating vectorization factor to ");
1744 dump_dec (MSG_NOTE, vectorization_factor);
1745 dump_printf (MSG_NOTE, ".\n");
1746 }
1747 }
1748
1749 /* Return true if STMT_INFO describes a double reduction phi and if
1750 the other phi in the reduction is also relevant for vectorization.
1751 This rejects cases such as:
1752
1753 outer1:
1754 x_1 = PHI <x_3(outer2), ...>;
1755 ...
1756
1757 inner:
1758 x_2 = ...;
1759 ...
1760
1761 outer2:
1762 x_3 = PHI <x_2(inner)>;
1763
1764 if nothing in x_2 or elsewhere makes x_1 relevant. */
1765
1766 static bool
1767 vect_active_double_reduction_p (stmt_vec_info stmt_info)
1768 {
1769 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
1770 return false;
1771
1772 return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info));
1773 }
1774
1775 /* Function vect_analyze_loop_operations.
1776
1777 Scan the loop stmts and make sure they are all vectorizable. */
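/* In outline (summarising the code below): header PHIs are checked with
vectorizable_induction, vectorizable_reduction or vectorizable_recurr,
loop-closed PHIs of an outer loop with vectorizable_lc_phi, live PHIs
with vectorizable_live_operation, and every other non-debug statement
with vect_analyze_stmt; the costs of all of them are accumulated into
a single cost vector. */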
1778
1779 static opt_result
1780 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1781 {
1782 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1783 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1784 int nbbs = loop->num_nodes;
1785 int i;
1786 stmt_vec_info stmt_info;
1787 bool need_to_vectorize = false;
1788 bool ok;
1789
1790 DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
1791
1792 auto_vec<stmt_info_for_cost> cost_vec;
1793
1794 for (i = 0; i < nbbs; i++)
1795 {
1796 basic_block bb = bbs[i];
1797
1798 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1799 gsi_next (&si))
1800 {
1801 gphi *phi = si.phi ();
1802 ok = true;
1803
1804 stmt_info = loop_vinfo->lookup_stmt (phi);
1805 if (dump_enabled_p ())
1806 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: %G",
1807 (gimple *) phi);
1808 if (virtual_operand_p (gimple_phi_result (phi)))
1809 continue;
1810
1811 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1812 (i.e., a phi in the tail of the outer-loop). */
1813 if (! is_loop_header_bb_p (bb))
1814 {
1815 /* FORNOW: we currently don't support the case in which these phis
1816 are not used in the outer loop (unless it is a double reduction,
1817 i.e., this phi is vect_reduction_def), because this case
1818 requires actually doing something here. */
1819 if (STMT_VINFO_LIVE_P (stmt_info)
1820 && !vect_active_double_reduction_p (stmt_info))
1821 return opt_result::failure_at (phi,
1822 "Unsupported loop-closed phi"
1823 " in outer-loop.\n");
1824
1825 /* If PHI is used in the outer loop, we check that its operand
1826 is defined in the inner loop. */
1827 if (STMT_VINFO_RELEVANT_P (stmt_info))
1828 {
1829 tree phi_op;
1830
1831 if (gimple_phi_num_args (phi) != 1)
1832 return opt_result::failure_at (phi, "unsupported phi");
1833
1834 phi_op = PHI_ARG_DEF (phi, 0);
1835 stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op);
1836 if (!op_def_info)
1837 return opt_result::failure_at (phi, "unsupported phi\n");
1838
1839 if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer
1840 && (STMT_VINFO_RELEVANT (op_def_info)
1841 != vect_used_in_outer_by_reduction))
1842 return opt_result::failure_at (phi, "unsupported phi\n");
1843
1844 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
1845 || (STMT_VINFO_DEF_TYPE (stmt_info)
1846 == vect_double_reduction_def))
1847 && !vectorizable_lc_phi (loop_vinfo,
1848 stmt_info, NULL, NULL))
1849 return opt_result::failure_at (phi, "unsupported phi\n");
1850 }
1851
1852 continue;
1853 }
1854
1855 gcc_assert (stmt_info);
1856
1857 if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1858 || STMT_VINFO_LIVE_P (stmt_info))
1859 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def
1860 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_first_order_recurrence)
1861 /* A scalar-dependence cycle that we don't support. */
1862 return opt_result::failure_at (phi,
1863 "not vectorized:"
1864 " scalar dependence cycle.\n");
1865
1866 if (STMT_VINFO_RELEVANT_P (stmt_info))
1867 {
1868 need_to_vectorize = true;
1869 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
1870 && ! PURE_SLP_STMT (stmt_info))
1871 ok = vectorizable_induction (loop_vinfo,
1872 stmt_info, NULL, NULL,
1873 &cost_vec);
1874 else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1875 || (STMT_VINFO_DEF_TYPE (stmt_info)
1876 == vect_double_reduction_def)
1877 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
1878 && ! PURE_SLP_STMT (stmt_info))
1879 ok = vectorizable_reduction (loop_vinfo,
1880 stmt_info, NULL, NULL, &cost_vec);
1881 else if ((STMT_VINFO_DEF_TYPE (stmt_info)
1882 == vect_first_order_recurrence)
1883 && ! PURE_SLP_STMT (stmt_info))
1884 ok = vectorizable_recurr (loop_vinfo, stmt_info, NULL, NULL,
1885 &cost_vec);
1886 }
1887
1888 /* SLP PHIs are tested by vect_slp_analyze_node_operations. */
1889 if (ok
1890 && STMT_VINFO_LIVE_P (stmt_info)
1891 && !PURE_SLP_STMT (stmt_info))
1892 ok = vectorizable_live_operation (loop_vinfo,
1893 stmt_info, NULL, NULL, NULL,
1894 -1, false, &cost_vec);
1895
1896 if (!ok)
1897 return opt_result::failure_at (phi,
1898 "not vectorized: relevant phi not "
1899 "supported: %G",
1900 static_cast <gimple *> (phi));
1901 }
1902
1903 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1904 gsi_next (&si))
1905 {
1906 gimple *stmt = gsi_stmt (si);
1907 if (!gimple_clobber_p (stmt)
1908 && !is_gimple_debug (stmt))
1909 {
1910 opt_result res
1911 = vect_analyze_stmt (loop_vinfo,
1912 loop_vinfo->lookup_stmt (stmt),
1913 &need_to_vectorize,
1914 NULL, NULL, &cost_vec);
1915 if (!res)
1916 return res;
1917 }
1918 }
1919 } /* bbs */
1920
1921 add_stmt_costs (loop_vinfo->vector_costs, &cost_vec);
1922
1923 /* All operations in the loop are either irrelevant (they deal with loop
1924 control, or are dead), or only used outside the loop and can be moved
1925 out of the loop (e.g. invariants, inductions). The loop can be
1926 optimized away by scalar optimizations. We're better off not
1927 touching this loop. */
1928 if (!need_to_vectorize)
1929 {
1930 if (dump_enabled_p ())
1931 dump_printf_loc (MSG_NOTE, vect_location,
1932 "All the computation can be taken out of the loop.\n");
1933 return opt_result::failure_at
1934 (vect_location,
1935 "not vectorized: redundant loop. no profit to vectorize.\n");
1936 }
1937
1938 return opt_result::success ();
1939 }
1940
1941 /* Return true if we know that the iteration count is smaller than the
1942 vectorization factor. Return false if it isn't, or if we can't be sure
1943 either way. */
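/* For example (numbers purely illustrative): with an assumed VF of 4,
a loop whose iteration count is known to be 3, or whose maximum
statement-execution estimate is 3, makes this return true. */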
1944
1945 static bool
1946 vect_known_niters_smaller_than_vf (loop_vec_info loop_vinfo)
1947 {
1948 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1949
1950 HOST_WIDE_INT max_niter;
1951 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1952 max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
1953 else
1954 max_niter = max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1955
1956 if (max_niter != -1 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
1957 return true;
1958
1959 return false;
1960 }
1961
1962 /* Analyze the cost of the loop described by LOOP_VINFO. Decide if it
1963 is worthwhile to vectorize. Return 1 if definitely yes, 0 if
1964 definitely no, or -1 if it's worth retrying. */
1965
1966 static int
1967 vect_analyze_loop_costing (loop_vec_info loop_vinfo,
1968 unsigned *suggested_unroll_factor)
1969 {
1970 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1971 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1972
1973 /* Only loops that can handle partially-populated vectors can have iteration
1974 counts less than the vectorization factor. */
1975 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
1976 {
1977 if (vect_known_niters_smaller_than_vf (loop_vinfo))
1978 {
1979 if (dump_enabled_p ())
1980 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1981 "not vectorized: iteration count smaller than "
1982 "vectorization factor.\n");
1983 return 0;
1984 }
1985 }
1986
1987 /* If using the "very cheap" model, reject cases in which we'd keep
1988 a copy of the scalar code (even if we might be able to vectorize it). */
1989 if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
1990 && (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1991 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1992 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)))
1993 {
1994 if (dump_enabled_p ())
1995 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1996 "some scalar iterations would need to be peeled\n");
1997 return 0;
1998 }
1999
2000 int min_profitable_iters, min_profitable_estimate;
2001 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
2002 &min_profitable_estimate,
2003 suggested_unroll_factor);
2004
2005 if (min_profitable_iters < 0)
2006 {
2007 if (dump_enabled_p ())
2008 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2009 "not vectorized: vectorization not profitable.\n");
2010 if (dump_enabled_p ())
2011 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2012 "not vectorized: vector version will never be "
2013 "profitable.\n");
2014 return -1;
2015 }
2016
2017 int min_scalar_loop_bound = (param_min_vect_loop_bound
2018 * assumed_vf);
2019
2020 /* Use the cost model only if it is more conservative than the
2021 user-specified threshold. */
2022 unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
2023 min_profitable_iters);
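/* For example (numbers purely illustrative): with
--param min-vect-loop-bound=2 and an assumed VF of 4,
min_scalar_loop_bound is 8; if the cost model computed
min_profitable_iters of 5, the threshold becomes MAX (8, 5) == 8. */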
2024
2025 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
2026
2027 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2028 && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
2029 {
2030 if (dump_enabled_p ())
2031 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2032 "not vectorized: vectorization not profitable.\n");
2033 if (dump_enabled_p ())
2034 dump_printf_loc (MSG_NOTE, vect_location,
2035 "not vectorized: iteration count smaller than user "
2036 "specified loop bound parameter or minimum profitable "
2037 "iterations (whichever is more conservative).\n");
2038 return 0;
2039 }
2040
2041 /* The static profitability threshold min_profitable_estimate includes
2042 the cost of having to check at runtime whether the scalar loop
2043 should be used instead. If it turns out that we don't need or want
2044 such a check, the threshold we should use for the static estimate
2045 is simply the point at which the vector loop becomes more profitable
2046 than the scalar loop. */
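/* For example (numbers purely illustrative): the cost model might
report min_profitable_iters == 8 but min_profitable_estimate == 10
because the latter also accounts for the runtime scalar-vs-vector
check; when no such check will be emitted, 8 is the right static
threshold, so min_profitable_estimate is lowered below. */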
2047 if (min_profitable_estimate > min_profitable_iters
2048 && !LOOP_REQUIRES_VERSIONING (loop_vinfo)
2049 && !LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2050 && !LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2051 && !vect_apply_runtime_profitability_check_p (loop_vinfo))
2052 {
2053 if (dump_enabled_p ())
2054 dump_printf_loc (MSG_NOTE, vect_location, "no need for a runtime"
2055 " choice between the scalar and vector loops\n");
2056 min_profitable_estimate = min_profitable_iters;
2057 }
2058
2059 /* If the vector loop needs multiple iterations to be beneficial then
2060 things are probably too close to call, and the conservative thing
2061 would be to stick with the scalar code. */
2062 if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
2063 && min_profitable_estimate > (int) vect_vf_for_cost (loop_vinfo))
2064 {
2065 if (dump_enabled_p ())
2066 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2067 "one iteration of the vector loop would be"
2068 " more expensive than the equivalent number of"
2069 " iterations of the scalar loop\n");
2070 return 0;
2071 }
2072
2073 HOST_WIDE_INT estimated_niter;
2074
2075 /* If we are vectorizing an epilogue then we know the maximum number of
2076 scalar iterations it will cover is at least one lower than the
2077 vectorization factor of the main loop. */
2078 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2079 estimated_niter
2080 = vect_vf_for_cost (LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo)) - 1;
2081 else
2082 {
2083 estimated_niter = estimated_stmt_executions_int (loop);
2084 if (estimated_niter == -1)
2085 estimated_niter = likely_max_stmt_executions_int (loop);
2086 }
2087 if (estimated_niter != -1
2088 && ((unsigned HOST_WIDE_INT) estimated_niter
2089 < MAX (th, (unsigned) min_profitable_estimate)))
2090 {
2091 if (dump_enabled_p ())
2092 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2093 "not vectorized: estimated iteration count too "
2094 "small.\n");
2095 if (dump_enabled_p ())
2096 dump_printf_loc (MSG_NOTE, vect_location,
2097 "not vectorized: estimated iteration count smaller "
2098 "than specified loop bound parameter or minimum "
2099 "profitable iterations (whichever is more "
2100 "conservative).\n");
2101 return -1;
2102 }
2103
2104 return 1;
2105 }
2106
2107 static opt_result
2108 vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
2109 vec<data_reference_p> *datarefs,
2110 unsigned int *n_stmts)
2111 {
2112 *n_stmts = 0;
2113 for (unsigned i = 0; i < loop->num_nodes; i++)
2114 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
2115 !gsi_end_p (gsi); gsi_next (&gsi))
2116 {
2117 gimple *stmt = gsi_stmt (gsi);
2118 if (is_gimple_debug (stmt))
2119 continue;
2120 ++(*n_stmts);
2121 opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs,
2122 NULL, 0);
2123 if (!res)
2124 {
2125 if (is_gimple_call (stmt) && loop->safelen)
2126 {
2127 tree fndecl = gimple_call_fndecl (stmt), op;
2128 if (fndecl == NULL_TREE
2129 && gimple_call_internal_p (stmt, IFN_MASK_CALL))
2130 {
2131 fndecl = gimple_call_arg (stmt, 0);
2132 gcc_checking_assert (TREE_CODE (fndecl) == ADDR_EXPR);
2133 fndecl = TREE_OPERAND (fndecl, 0);
2134 gcc_checking_assert (TREE_CODE (fndecl) == FUNCTION_DECL);
2135 }
2136 if (fndecl != NULL_TREE)
2137 {
2138 cgraph_node *node = cgraph_node::get (fndecl);
2139 if (node != NULL && node->simd_clones != NULL)
2140 {
2141 unsigned int j, n = gimple_call_num_args (stmt);
2142 for (j = 0; j < n; j++)
2143 {
2144 op = gimple_call_arg (stmt, j);
2145 if (DECL_P (op)
2146 || (REFERENCE_CLASS_P (op)
2147 && get_base_address (op)))
2148 break;
2149 }
2150 op = gimple_call_lhs (stmt);
2151 /* Ignore #pragma omp declare simd functions
2152 if they don't have data references in the
2153 call stmt itself. */
2154 if (j == n
2155 && !(op
2156 && (DECL_P (op)
2157 || (REFERENCE_CLASS_P (op)
2158 && get_base_address (op)))))
2159 continue;
2160 }
2161 }
2162 }
2163 return res;
2164 }
2165 /* If dependence analysis will give up due to the limit on the
2166 number of datarefs, stop here and fail fatally. */
2167 if (datarefs->length ()
2168 > (unsigned)param_loop_max_datarefs_for_datadeps)
2169 return opt_result::failure_at (stmt, "exceeded param "
2170 "loop-max-datarefs-for-datadeps\n");
2171 }
2172 return opt_result::success ();
2173 }
2174
2175 /* Look for SLP-only access groups and turn each individual access into its own
2176 group. */
2177 static void
2178 vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo)
2179 {
2180 unsigned int i;
2181 struct data_reference *dr;
2182
2183 DUMP_VECT_SCOPE ("vect_dissolve_slp_only_groups");
2184
2185 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2186 FOR_EACH_VEC_ELT (datarefs, i, dr)
2187 {
2188 gcc_assert (DR_REF (dr));
2189 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (DR_STMT (dr));
2190
2191 /* Check if the load is a part of an interleaving chain. */
2192 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2193 {
2194 stmt_vec_info first_element = DR_GROUP_FIRST_ELEMENT (stmt_info);
2195 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_element);
2196 unsigned int group_size = DR_GROUP_SIZE (first_element);
2197
2198 /* Check whether this is an SLP-only group. */
2199 if (!STMT_SLP_TYPE (stmt_info)
2200 && STMT_VINFO_SLP_VECT_ONLY (first_element))
2201 {
2202 /* Dissolve the group. */
2203 STMT_VINFO_SLP_VECT_ONLY (first_element) = false;
2204
2205 stmt_vec_info vinfo = first_element;
2206 while (vinfo)
2207 {
2208 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (vinfo);
2209 DR_GROUP_FIRST_ELEMENT (vinfo) = vinfo;
2210 DR_GROUP_NEXT_ELEMENT (vinfo) = NULL;
2211 DR_GROUP_SIZE (vinfo) = 1;
2212 if (STMT_VINFO_STRIDED_P (first_element))
2213 DR_GROUP_GAP (vinfo) = 0;
2214 else
2215 DR_GROUP_GAP (vinfo) = group_size - 1;
2216 /* Duplicate and adjust the alignment info; it needs to
2217 be present on each group leader, see dr_misalignment. */
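/* For example (numbers purely illustrative): if the old group leader
has misalignment 4 wrt a target alignment of 16 and this element's
DR_INIT is 8 bytes further on, the new leader's misalignment becomes
(4 + 8) % 16 == 12. */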
2218 if (vinfo != first_element)
2219 {
2220 dr_vec_info *dr_info2 = STMT_VINFO_DR_INFO (vinfo);
2221 dr_info2->target_alignment = dr_info->target_alignment;
2222 int misalignment = dr_info->misalignment;
2223 if (misalignment != DR_MISALIGNMENT_UNKNOWN)
2224 {
2225 HOST_WIDE_INT diff
2226 = (TREE_INT_CST_LOW (DR_INIT (dr_info2->dr))
2227 - TREE_INT_CST_LOW (DR_INIT (dr_info->dr)));
2228 unsigned HOST_WIDE_INT align_c
2229 = dr_info->target_alignment.to_constant ();
2230 misalignment = (misalignment + diff) % align_c;
2231 }
2232 dr_info2->misalignment = misalignment;
2233 }
2234 vinfo = next;
2235 }
2236 }
2237 }
2238 }
2239 }
2240
2241 /* Determine if operating on full vectors for LOOP_VINFO might leave
2242 some scalar iterations still to do. If so, decide how we should
2243 handle those scalar iterations. The possibilities are:
2244
2245 (1) Make LOOP_VINFO operate on partial vectors instead of full vectors.
2246 In this case:
2247
2248 LOOP_VINFO_USING_PARTIAL_VECTORS_P == true
2249 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2250 LOOP_VINFO_PEELING_FOR_NITER == false
2251
2252 (2) Make LOOP_VINFO operate on full vectors and use an epilogue loop
2253 to handle the remaining scalar iterations. In this case:
2254
2255 LOOP_VINFO_USING_PARTIAL_VECTORS_P == false
2256 LOOP_VINFO_PEELING_FOR_NITER == true
2257
2258 There are two choices:
2259
2260 (2a) Consider vectorizing the epilogue loop at the same VF as the
2261 main loop, but using partial vectors instead of full vectors.
2262 In this case:
2263
2264 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == true
2265
2266 (2b) Consider vectorizing the epilogue loop at lower VFs only.
2267 In this case:
2268
2269 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2270
2271 When FOR_EPILOGUE_P is true, make this determination based on the
2272 assumption that LOOP_VINFO is an epilogue loop, otherwise make it
2273 based on the assumption that LOOP_VINFO is the main loop. The caller
2274 has made sure that the number of iterations is set appropriately for
2275 this value of FOR_EPILOGUE_P. */
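/* As an illustration (numbers purely illustrative): a loop of 10 scalar
iterations with VF 8 leaves 2 scalar iterations over. Under (1) a
single masked or length-controlled vector loop covers all 10
iterations; under (2) the full-vector loop runs once for 8 iterations
and the remaining 2 go to an epilogue, which is itself either
vectorized with partial vectors at the same VF (2a) or considered at
lower VFs only (2b). */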
2276
2277 opt_result
2278 vect_determine_partial_vectors_and_peeling (loop_vec_info loop_vinfo,
2279 bool for_epilogue_p)
2280 {
2281 /* Determine whether there would be any scalar iterations left over. */
2282 bool need_peeling_or_partial_vectors_p
2283 = vect_need_peeling_or_partial_vectors_p (loop_vinfo);
2284
2285 /* Decide whether to vectorize the loop with partial vectors. */
2286 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2287 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2288 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2289 && need_peeling_or_partial_vectors_p)
2290 {
2291 /* For partial-vector-usage=1, try to push the handling of partial
2292 vectors to the epilogue, with the main loop continuing to operate
2293 on full vectors.
2294
2295 If we are unrolling we also do not want to use partial vectors. This
2296 is to avoid the overhead of generating multiple masks and also to
2297 avoid having to execute entire iterations of FALSE masked instructions
2298 when dealing with one or fewer full iterations.
2299
2300 ??? We could then end up failing to use partial vectors if we
2301 decide to peel iterations into a prologue, and if the main loop
2302 then ends up processing fewer than VF iterations. */
2303 if ((param_vect_partial_vector_usage == 1
2304 || loop_vinfo->suggested_unroll_factor > 1)
2305 && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2306 && !vect_known_niters_smaller_than_vf (loop_vinfo))
2307 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2308 else
2309 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2310 }
2311
2312 if (dump_enabled_p ())
2313 {
2314 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2315 dump_printf_loc (MSG_NOTE, vect_location,
2316 "operating on partial vectors%s.\n",
2317 for_epilogue_p ? " for epilogue loop" : "");
2318 else
2319 dump_printf_loc (MSG_NOTE, vect_location,
2320 "operating only on full vectors%s.\n",
2321 for_epilogue_p ? " for epilogue loop" : "");
2322 }
2323
2324 if (for_epilogue_p)
2325 {
2326 loop_vec_info orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2327 gcc_assert (orig_loop_vinfo);
2328 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2329 gcc_assert (known_lt (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
2330 LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo)));
2331 }
2332
2333 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2334 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2335 {
2336 /* Check that the loop processes at least one full vector. */
2337 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2338 tree scalar_niters = LOOP_VINFO_NITERS (loop_vinfo);
2339 if (known_lt (wi::to_widest (scalar_niters), vf))
2340 return opt_result::failure_at (vect_location,
2341 "loop does not have enough iterations"
2342 " to support vectorization.\n");
2343
2344 /* If we need to peel an extra epilogue iteration to handle data
2345 accesses with gaps, check that there are enough scalar iterations
2346 available.
2347
2348 The check above is redundant with this one when peeling for gaps,
2349 but the distinction is useful for diagnostics. */
2350 tree scalar_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
2351 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2352 && known_lt (wi::to_widest (scalar_nitersm1), vf))
2353 return opt_result::failure_at (vect_location,
2354 "loop does not have enough iterations"
2355 " to support peeling for gaps.\n");
2356 }
2357
2358 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2359 = (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2360 && need_peeling_or_partial_vectors_p);
2361
2362 return opt_result::success ();
2363 }
2364
2365 /* Function vect_analyze_loop_2.
2366
2367 Apply a set of analyses on LOOP specified by LOOP_VINFO; the different
2368 analyses will record information in some members of LOOP_VINFO. FATAL
2369 indicates whether some analysis hit a fatal error. If the non-NULL
2370 pointer SUGGESTED_UNROLL_FACTOR is provided, it is intended to be filled
2371 with the suggested unroll factor that was worked out, while a NULL
2372 pointer means the suggested unroll factor is being applied.
2373 SLP_DONE_FOR_SUGGESTED_UF holds the SLP decision made when the
2374 suggested unroll factor was worked out. */
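/* For example, vect_analyze_loop_1 below first calls this function with
a non-NULL SUGGESTED_UNROLL_FACTOR so that the target's finish_cost
can propose a factor; if a factor greater than 1 comes back, the
analysis is re-run on a fresh loop_vec_info with
loop_vinfo->suggested_unroll_factor set and a NULL pointer passed
here, which applies the factor (see the start_over code). */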
2375 static opt_result
2376 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal,
2377 unsigned *suggested_unroll_factor,
2378 bool& slp_done_for_suggested_uf)
2379 {
2380 opt_result ok = opt_result::success ();
2381 int res;
2382 unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
2383 poly_uint64 min_vf = 2;
2384 loop_vec_info orig_loop_vinfo = NULL;
2385
2386 /* If we are dealing with an epilogue then orig_loop_vinfo points to the
2387 loop_vec_info of the first vectorized loop. */
2388 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2389 orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2390 else
2391 orig_loop_vinfo = loop_vinfo;
2392 gcc_assert (orig_loop_vinfo);
2393
2394 /* The first group of checks is independent of the vector size. */
2395 fatal = true;
2396
2397 if (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)
2398 && integer_zerop (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)))
2399 return opt_result::failure_at (vect_location,
2400 "not vectorized: simd if(0)\n");
2401
2402 /* Find all data references in the loop (which correspond to vdefs/vuses)
2403 and analyze their evolution in the loop. */
2404
2405 loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
2406
2407 /* Gather the data references and count stmts in the loop. */
2408 if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
2409 {
2410 opt_result res
2411 = vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
2412 &LOOP_VINFO_DATAREFS (loop_vinfo),
2413 &LOOP_VINFO_N_STMTS (loop_vinfo));
2414 if (!res)
2415 {
2416 if (dump_enabled_p ())
2417 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2418 "not vectorized: loop contains function "
2419 "calls or data references that cannot "
2420 "be analyzed\n");
2421 return res;
2422 }
2423 loop_vinfo->shared->save_datarefs ();
2424 }
2425 else
2426 loop_vinfo->shared->check_datarefs ();
2427
2428 /* Analyze the data references and also adjust the minimal
2429 vectorization factor according to the loads and stores. */
2430
2431 ok = vect_analyze_data_refs (loop_vinfo, &min_vf, &fatal);
2432 if (!ok)
2433 {
2434 if (dump_enabled_p ())
2435 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2436 "bad data references.\n");
2437 return ok;
2438 }
2439
2440 /* Check if we are applying the suggested unroll factor now. */
2441 bool applying_suggested_uf = loop_vinfo->suggested_unroll_factor > 1;
2442 gcc_assert (!applying_suggested_uf || !suggested_unroll_factor);
2443
2444 /* If the SLP decision was false when the suggested unroll factor was
2445 worked out, and we are applying that unroll factor now, we can simply
2446 skip all SLP-related analyses this time. */
2447 bool slp = !applying_suggested_uf || slp_done_for_suggested_uf;
2448
2449 /* Classify all cross-iteration scalar data-flow cycles.
2450 Cross-iteration cycles caused by virtual phis are analyzed separately. */
2451 vect_analyze_scalar_cycles (loop_vinfo, slp);
2452
2453 vect_pattern_recog (loop_vinfo);
2454
2455 vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
2456
2457 /* Analyze the access patterns of the data-refs in the loop (consecutive,
2458 complex, etc.). FORNOW: Only handle consecutive access pattern. */
2459
2460 ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
2461 if (!ok)
2462 {
2463 if (dump_enabled_p ())
2464 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2465 "bad data access.\n");
2466 return ok;
2467 }
2468
2469 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
2470
2471 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo, &fatal);
2472 if (!ok)
2473 {
2474 if (dump_enabled_p ())
2475 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2476 "unexpected pattern.\n");
2477 return ok;
2478 }
2479
2480 /* The rest of the analysis below depends on the vector size in some way, so failures are no longer necessarily fatal. */
2481 fatal = false;
2482
2483 /* Analyze data dependences between the data-refs in the loop
2484 and adjust the maximum vectorization factor according to
2485 the dependences.
2486 FORNOW: fail at the first data dependence that we encounter. */
2487
2488 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
2489 if (!ok)
2490 {
2491 if (dump_enabled_p ())
2492 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2493 "bad data dependence.\n");
2494 return ok;
2495 }
2496 if (max_vf != MAX_VECTORIZATION_FACTOR
2497 && maybe_lt (max_vf, min_vf))
2498 return opt_result::failure_at (vect_location, "bad data dependence.\n");
2499 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
2500
2501 ok = vect_determine_vectorization_factor (loop_vinfo);
2502 if (!ok)
2503 {
2504 if (dump_enabled_p ())
2505 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2506 "can't determine vectorization factor.\n");
2507 return ok;
2508 }
2509 if (max_vf != MAX_VECTORIZATION_FACTOR
2510 && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2511 return opt_result::failure_at (vect_location, "bad data dependence.\n");
2512
2513 /* Compute the scalar iteration cost. */
2514 vect_compute_single_scalar_iteration_cost (loop_vinfo);
2515
2516 poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2517
2518 if (slp)
2519 {
2520 /* Check the SLP opportunities in the loop, analyze and build
2521 SLP trees. */
2522 ok = vect_analyze_slp (loop_vinfo, LOOP_VINFO_N_STMTS (loop_vinfo));
2523 if (!ok)
2524 return ok;
2525
2526 /* If there are any SLP instances mark them as pure_slp. */
2527 slp = vect_make_slp_decision (loop_vinfo);
2528 if (slp)
2529 {
2530 /* Find stmts that need to be both vectorized and SLPed. */
2531 vect_detect_hybrid_slp (loop_vinfo);
2532
2533 /* Update the vectorization factor based on the SLP decision. */
2534 vect_update_vf_for_slp (loop_vinfo);
2535
2536 /* Optimize the SLP graph with the vectorization factor fixed. */
2537 vect_optimize_slp (loop_vinfo);
2538
2539 /* Gather the loads reachable from the SLP graph entries. */
2540 vect_gather_slp_loads (loop_vinfo);
2541 }
2542 }
2543
2544 bool saved_can_use_partial_vectors_p
2545 = LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo);
2546
2547 /* We don't expect to have to roll back to anything other than an empty
2548 set of rgroups. */
2549 gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
2550
2551 /* This is the point where we can re-start analysis with SLP forced off. */
2552 start_over:
2553
2554 /* Apply the suggested unrolling factor; this was determined by the backend
2555 during finish_cost the first time we ran the analysis for this
2556 vector mode. */
2557 if (applying_suggested_uf)
2558 LOOP_VINFO_VECT_FACTOR (loop_vinfo) *= loop_vinfo->suggested_unroll_factor;
2559
2560 /* Now the vectorization factor is final. */
2561 poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2562 gcc_assert (known_ne (vectorization_factor, 0U));
2563
2564 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
2565 {
2566 dump_printf_loc (MSG_NOTE, vect_location,
2567 "vectorization_factor = ");
2568 dump_dec (MSG_NOTE, vectorization_factor);
2569 dump_printf (MSG_NOTE, ", niters = %wd\n",
2570 LOOP_VINFO_INT_NITERS (loop_vinfo));
2571 }
2572
2573 loop_vinfo->vector_costs = init_cost (loop_vinfo, false);
2574
2575 /* Analyze the alignment of the data-refs in the loop.
2576 Fail if a data reference is found that cannot be vectorized. */
2577
2578 ok = vect_analyze_data_refs_alignment (loop_vinfo);
2579 if (!ok)
2580 {
2581 if (dump_enabled_p ())
2582 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2583 "bad data alignment.\n");
2584 return ok;
2585 }
2586
2587 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2588 It is important to call pruning after vect_analyze_data_ref_accesses,
2589 since we use grouping information gathered by interleaving analysis. */
2590 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2591 if (!ok)
2592 return ok;
2593
2594 /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2595 vectorization, since we do not want to add extra peeling or
2596 add versioning for alignment. */
2597 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2598 /* This pass will decide on using loop versioning and/or loop peeling in
2599 order to enhance the alignment of data references in the loop. */
2600 ok = vect_enhance_data_refs_alignment (loop_vinfo);
2601 if (!ok)
2602 return ok;
2603
2604 if (slp)
2605 {
2606 /* Analyze operations in the SLP instances. Note this may
2607 remove unsupported SLP instances which makes the above
2608 SLP kind detection invalid. */
2609 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2610 vect_slp_analyze_operations (loop_vinfo);
2611 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2612 {
2613 ok = opt_result::failure_at (vect_location,
2614 "unsupported SLP instances\n");
2615 goto again;
2616 }
2617
2618 /* Check whether any load in ALL SLP instances is possibly permuted. */
2619 slp_tree load_node, slp_root;
2620 unsigned i, x;
2621 slp_instance instance;
2622 bool can_use_lanes = true;
2623 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), x, instance)
2624 {
2625 slp_root = SLP_INSTANCE_TREE (instance);
2626 int group_size = SLP_TREE_LANES (slp_root);
2627 tree vectype = SLP_TREE_VECTYPE (slp_root);
2628 bool loads_permuted = false;
2629 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
2630 {
2631 if (!SLP_TREE_LOAD_PERMUTATION (load_node).exists ())
2632 continue;
2633 unsigned j;
2634 stmt_vec_info load_info;
2635 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load_info)
2636 if (SLP_TREE_LOAD_PERMUTATION (load_node)[j] != j)
2637 {
2638 loads_permuted = true;
2639 break;
2640 }
2641 }
2642
2643 /* If the loads and stores can be handled with load/store-lane
2644 instructions record it and move on to the next instance. */
2645 if (loads_permuted
2646 && SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
2647 && vect_store_lanes_supported (vectype, group_size, false))
2648 {
2649 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
2650 {
2651 stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT
2652 (SLP_TREE_SCALAR_STMTS (load_node)[0]);
2653 /* Use SLP for strided accesses (or if we can't
2654 load-lanes). */
2655 if (STMT_VINFO_STRIDED_P (stmt_vinfo)
2656 || ! vect_load_lanes_supported
2657 (STMT_VINFO_VECTYPE (stmt_vinfo),
2658 DR_GROUP_SIZE (stmt_vinfo), false))
2659 break;
2660 }
2661
2662 can_use_lanes
2663 = can_use_lanes && i == SLP_INSTANCE_LOADS (instance).length ();
2664
2665 if (can_use_lanes && dump_enabled_p ())
2666 dump_printf_loc (MSG_NOTE, vect_location,
2667 "SLP instance %p can use load/store-lanes\n",
2668 (void *) instance);
2669 }
2670 else
2671 {
2672 can_use_lanes = false;
2673 break;
2674 }
2675 }
2676
2677 /* If all SLP instances can use load/store-lanes, abort SLP and try again
2678 with SLP disabled. */
2679 if (can_use_lanes)
2680 {
2681 ok = opt_result::failure_at (vect_location,
2682 "Built SLP cancelled: can use "
2683 "load/store-lanes\n");
2684 if (dump_enabled_p ())
2685 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2686 "Built SLP cancelled: all SLP instances support "
2687 "load/store-lanes\n");
2688 goto again;
2689 }
2690 }
2691
2692 /* Dissolve SLP-only groups. */
2693 vect_dissolve_slp_only_groups (loop_vinfo);
2694
2695 /* Scan all the remaining operations in the loop that are not subject
2696 to SLP and make sure they are vectorizable. */
2697 ok = vect_analyze_loop_operations (loop_vinfo);
2698 if (!ok)
2699 {
2700 if (dump_enabled_p ())
2701 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2702 "bad operation or unsupported loop bound.\n");
2703 return ok;
2704 }
2705
2706 /* For now, we don't expect to mix both masking and length approaches for one
2707 loop; disable the use of partial vectors if both are recorded. */
2708 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2709 && !LOOP_VINFO_MASKS (loop_vinfo).is_empty ()
2710 && !LOOP_VINFO_LENS (loop_vinfo).is_empty ())
2711 {
2712 if (dump_enabled_p ())
2713 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2714 "can't vectorize a loop with partial vectors"
2715 " because we don't expect to mix different"
2716 " approaches with partial vectors for the"
2717 " same loop.\n");
2718 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2719 }
2720
2721 /* If we still have the option of using partial vectors,
2722 check whether we can generate the necessary loop controls. */
2723 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2724 && !vect_verify_full_masking (loop_vinfo)
2725 && !vect_verify_loop_lens (loop_vinfo))
2726 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2727
2728 /* If we're vectorizing an epilogue loop, the vectorized loop either needs
2729 to be able to handle fewer than VF scalars, or needs to have a lower VF
2730 than the main loop. */
2731 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2732 && !LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2733 && maybe_ge (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
2734 LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo)))
2735 return opt_result::failure_at (vect_location,
2736 "Vectorization factor too high for"
2737 " epilogue loop.\n");
2738
2739 /* Decide whether this loop_vinfo should use partial vectors or peeling,
2740 assuming that the loop will be used as a main loop. We will redo
2741 this analysis later if we instead decide to use the loop as an
2742 epilogue loop. */
2743 ok = vect_determine_partial_vectors_and_peeling (loop_vinfo, false);
2744 if (!ok)
2745 return ok;
2746
2747 /* Check the costings of the loop make vectorizing worthwhile. */
2748 res = vect_analyze_loop_costing (loop_vinfo, suggested_unroll_factor);
2749 if (res < 0)
2750 {
2751 ok = opt_result::failure_at (vect_location,
2752 "Loop costings may not be worthwhile.\n");
2753 goto again;
2754 }
2755 if (!res)
2756 return opt_result::failure_at (vect_location,
2757 "Loop costings not worthwhile.\n");
2758
2759 /* If an epilogue loop is required make sure we can create one. */
2760 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2761 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2762 {
2763 if (dump_enabled_p ())
2764 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2765 if (!vect_can_advance_ivs_p (loop_vinfo)
2766 || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2767 single_exit (LOOP_VINFO_LOOP
2768 (loop_vinfo))))
2769 {
2770 ok = opt_result::failure_at (vect_location,
2771 "not vectorized: can't create required "
2772 "epilog loop\n");
2773 goto again;
2774 }
2775 }
2776
2777 /* During peeling, we need to check if the number of loop iterations is
2778 enough for both the peeled prolog loop and the vector loop. This check
2779 can be merged with the threshold check of loop versioning, so
2780 increase the threshold for this case if necessary.
2781
2782 If we are analyzing an epilogue we still want to check what its
2783 versioning threshold would be. If we decide to vectorize the epilogues we
2784 will want to use the lowest versioning threshold of all epilogues and main
2785 loop. This will enable us to enter a vectorized epilogue even when
2786 versioning the loop. We can't simply check whether the epilogue requires
2787 versioning though since we may have skipped some versioning checks when
2788 analyzing the epilogue. For instance, checks for alias versioning will be
2789 skipped when dealing with epilogues as we assume we already checked them
2790 for the main loop. So instead we always check the 'orig_loop_vinfo'. */
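/* For example (numbers purely illustrative): with VF 4, a prologue peel
of 3 iterations for alignment and peeling for gaps enabled, niters_th
is 3 + 4 + 1 == 8; if the runtime profitability threshold TH is
larger and comparable, niters_th is raised to TH instead. */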
2791 if (LOOP_REQUIRES_VERSIONING (orig_loop_vinfo))
2792 {
2793 poly_uint64 niters_th = 0;
2794 unsigned int th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
2795
2796 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2797 {
2798 /* Niters for peeled prolog loop. */
2799 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2800 {
2801 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2802 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
2803 niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
2804 }
2805 else
2806 niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2807 }
2808
2809 /* Niters for at least one iteration of vectorized loop. */
2810 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2811 niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2812 /* One additional iteration because of peeling for gap. */
2813 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2814 niters_th += 1;
2815
2816 /* Use the same condition as vect_transform_loop to decide when to use
2817 the cost to determine a versioning threshold. */
2818 if (vect_apply_runtime_profitability_check_p (loop_vinfo)
2819 && ordered_p (th, niters_th))
2820 niters_th = ordered_max (poly_uint64 (th), niters_th);
2821
2822 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
2823 }
2824
2825 gcc_assert (known_eq (vectorization_factor,
2826 LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
2827
2828 slp_done_for_suggested_uf = slp;
2829
2830 /* Ok to vectorize! */
2831 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2832 return opt_result::success ();
2833
2834 again:
2835 /* Ensure that "ok" is false (with an opt_problem if dumping is enabled). */
2836 gcc_assert (!ok);
2837
2838 /* Try again with SLP forced off, but if we didn't do any SLP there is
2839 no point in re-trying. */
2840 if (!slp)
2841 return ok;
2842
2843 /* If the SLP decision was true when the suggested unroll factor was
2844 worked out, and we are applying that unroll factor now, we don't
2845 need to re-try any more. */
2846 if (applying_suggested_uf && slp_done_for_suggested_uf)
2847 return ok;
2848
2849 /* If there are reduction chains re-trying will fail anyway. */
2850 if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2851 return ok;
2852
2853 /* Likewise if the grouped loads or stores in the SLP cannot be handled
2854 via interleaving or lane instructions. */
2855 slp_instance instance;
2856 slp_tree node;
2857 unsigned i, j;
2858 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2859 {
2860 stmt_vec_info vinfo;
2861 vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
2862 if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2863 continue;
2864 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2865 unsigned int size = DR_GROUP_SIZE (vinfo);
2866 tree vectype = STMT_VINFO_VECTYPE (vinfo);
2867 if (! vect_store_lanes_supported (vectype, size, false)
2868 && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
2869 && ! vect_grouped_store_supported (vectype, size))
2870 return opt_result::failure_at (vinfo->stmt,
2871 "unsupported grouped store\n");
2872 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2873 {
2874 vinfo = SLP_TREE_SCALAR_STMTS (node)[0];
2875 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2876 bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
2877 size = DR_GROUP_SIZE (vinfo);
2878 vectype = STMT_VINFO_VECTYPE (vinfo);
2879 if (! vect_load_lanes_supported (vectype, size, false)
2880 && ! vect_grouped_load_supported (vectype, single_element_p,
2881 size))
2882 return opt_result::failure_at (vinfo->stmt,
2883 "unsupported grouped load\n");
2884 }
2885 }
2886
2887 if (dump_enabled_p ())
2888 dump_printf_loc (MSG_NOTE, vect_location,
2889 "re-trying with SLP disabled\n");
2890
2891 /* Roll back state appropriately. No SLP this time. */
2892 slp = false;
2893 /* Restore the vectorization factor as it was without SLP. */
2894 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2895 /* Free the SLP instances. */
2896 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2897 vect_free_slp_instance (instance);
2898 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2899 /* Reset SLP type to loop_vect on all stmts. */
2900 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2901 {
2902 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2903 for (gimple_stmt_iterator si = gsi_start_phis (bb);
2904 !gsi_end_p (si); gsi_next (&si))
2905 {
2906 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2907 STMT_SLP_TYPE (stmt_info) = loop_vect;
2908 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
2909 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
2910 {
2911 /* vectorizable_reduction adjusts reduction stmt def-types;
2912 restore them to that of the PHI. */
2913 STMT_VINFO_DEF_TYPE (STMT_VINFO_REDUC_DEF (stmt_info))
2914 = STMT_VINFO_DEF_TYPE (stmt_info);
2915 STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize
2916 (STMT_VINFO_REDUC_DEF (stmt_info)))
2917 = STMT_VINFO_DEF_TYPE (stmt_info);
2918 }
2919 }
2920 for (gimple_stmt_iterator si = gsi_start_bb (bb);
2921 !gsi_end_p (si); gsi_next (&si))
2922 {
2923 if (is_gimple_debug (gsi_stmt (si)))
2924 continue;
2925 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2926 STMT_SLP_TYPE (stmt_info) = loop_vect;
2927 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2928 {
2929 stmt_vec_info pattern_stmt_info
2930 = STMT_VINFO_RELATED_STMT (stmt_info);
2931 if (STMT_VINFO_SLP_VECT_ONLY_PATTERN (pattern_stmt_info))
2932 STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
2933
2934 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
2935 STMT_SLP_TYPE (pattern_stmt_info) = loop_vect;
2936 for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
2937 !gsi_end_p (pi); gsi_next (&pi))
2938 STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi)))
2939 = loop_vect;
2940 }
2941 }
2942 }
2943 /* Free optimized alias test DDRS. */
2944 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
2945 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2946 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
2947 /* Reset target cost data. */
2948 delete loop_vinfo->vector_costs;
2949 loop_vinfo->vector_costs = nullptr;
2950 /* Reset accumulated rgroup information. */
2951 release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo));
2952 release_vec_loop_controls (&LOOP_VINFO_LENS (loop_vinfo));
2953 /* Reset assorted flags. */
2954 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2955 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2956 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2957 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
2958 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2959 = saved_can_use_partial_vectors_p;
2960
2961 goto start_over;
2962 }
2963
2964 /* Return true if vectorizing a loop using NEW_LOOP_VINFO appears
2965 to be better than vectorizing it using OLD_LOOP_VINFO. Assume that
2966 OLD_LOOP_VINFO is better unless something specifically indicates
2967 otherwise.
2968
2969 Note that this deliberately isn't a partial order. */
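/* For example, if the loop has simdlen 8, a candidate whose VF equals 8
is always preferred over one whose VF does not; when neither or both
candidates match the simdlen, the decision is delegated to the
target's vector_costs hooks below. */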
2970
2971 static bool
2972 vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo,
2973 loop_vec_info old_loop_vinfo)
2974 {
2975 struct loop *loop = LOOP_VINFO_LOOP (new_loop_vinfo);
2976 gcc_assert (LOOP_VINFO_LOOP (old_loop_vinfo) == loop);
2977
2978 poly_int64 new_vf = LOOP_VINFO_VECT_FACTOR (new_loop_vinfo);
2979 poly_int64 old_vf = LOOP_VINFO_VECT_FACTOR (old_loop_vinfo);
2980
2981 /* Always prefer a VF of loop->simdlen over any other VF. */
2982 if (loop->simdlen)
2983 {
2984 bool new_simdlen_p = known_eq (new_vf, loop->simdlen);
2985 bool old_simdlen_p = known_eq (old_vf, loop->simdlen);
2986 if (new_simdlen_p != old_simdlen_p)
2987 return new_simdlen_p;
2988 }
2989
2990 const auto *old_costs = old_loop_vinfo->vector_costs;
2991 const auto *new_costs = new_loop_vinfo->vector_costs;
2992 if (loop_vec_info main_loop = LOOP_VINFO_ORIG_LOOP_INFO (old_loop_vinfo))
2993 return new_costs->better_epilogue_loop_than_p (old_costs, main_loop);
2994
2995 return new_costs->better_main_loop_than_p (old_costs);
2996 }
2997
2998 /* Decide whether to replace OLD_LOOP_VINFO with NEW_LOOP_VINFO. Return
2999 true if we should. */
3000
3001 static bool
3002 vect_joust_loop_vinfos (loop_vec_info new_loop_vinfo,
3003 loop_vec_info old_loop_vinfo)
3004 {
3005 if (!vect_better_loop_vinfo_p (new_loop_vinfo, old_loop_vinfo))
3006 return false;
3007
3008 if (dump_enabled_p ())
3009 dump_printf_loc (MSG_NOTE, vect_location,
3010 "***** Preferring vector mode %s to vector mode %s\n",
3011 GET_MODE_NAME (new_loop_vinfo->vector_mode),
3012 GET_MODE_NAME (old_loop_vinfo->vector_mode));
3013 return true;
3014 }
3015
3016 /* Analyze LOOP with VECTOR_MODES[MODE_I] and as epilogue if MAIN_LOOP_VINFO is
3017 not NULL. Set AUTODETECTED_VECTOR_MODE if VOIDmode and advance
3018 MODE_I to the next mode useful to analyze.
3019 Return the loop_vinfo on success and wrapped null on failure. */
3020
3021 static opt_loop_vec_info
3022 vect_analyze_loop_1 (class loop *loop, vec_info_shared *shared,
3023 const vect_loop_form_info *loop_form_info,
3024 loop_vec_info main_loop_vinfo,
3025 const vector_modes &vector_modes, unsigned &mode_i,
3026 machine_mode &autodetected_vector_mode,
3027 bool &fatal)
3028 {
3029 loop_vec_info loop_vinfo
3030 = vect_create_loop_vinfo (loop, shared, loop_form_info, main_loop_vinfo);
3031
3032 machine_mode vector_mode = vector_modes[mode_i];
3033 loop_vinfo->vector_mode = vector_mode;
3034 unsigned int suggested_unroll_factor = 1;
3035 bool slp_done_for_suggested_uf;
3036
3037 /* Run the main analysis. */
3038 opt_result res = vect_analyze_loop_2 (loop_vinfo, fatal,
3039 &suggested_unroll_factor,
3040 slp_done_for_suggested_uf);
3041 if (dump_enabled_p ())
3042 dump_printf_loc (MSG_NOTE, vect_location,
3043 "***** Analysis %s with vector mode %s\n",
3044 res ? "succeeded" : " failed",
3045 GET_MODE_NAME (loop_vinfo->vector_mode));
3046
3047 if (!main_loop_vinfo && suggested_unroll_factor > 1)
3048 {
3049 if (dump_enabled_p ())
3050 dump_printf_loc (MSG_NOTE, vect_location,
3051 "***** Re-trying analysis for unrolling"
3052 " with unroll factor %d and slp %s.\n",
3053 suggested_unroll_factor,
3054 slp_done_for_suggested_uf ? "on" : "off");
3055 loop_vec_info unroll_vinfo
3056 = vect_create_loop_vinfo (loop, shared, loop_form_info, main_loop_vinfo);
3057 unroll_vinfo->vector_mode = vector_mode;
3058 unroll_vinfo->suggested_unroll_factor = suggested_unroll_factor;
3059 opt_result new_res = vect_analyze_loop_2 (unroll_vinfo, fatal, NULL,
3060 slp_done_for_suggested_uf);
3061 if (new_res)
3062 {
3063 delete loop_vinfo;
3064 loop_vinfo = unroll_vinfo;
3065 }
3066 else
3067 delete unroll_vinfo;
3068 }
3069
3070 /* Remember the autodetected vector mode. */
3071 if (vector_mode == VOIDmode)
3072 autodetected_vector_mode = loop_vinfo->vector_mode;
3073
3074 /* Advance mode_i, first skipping modes that would result in the
3075 same analysis result. */
3076 while (mode_i + 1 < vector_modes.length ()
3077 && vect_chooses_same_modes_p (loop_vinfo,
3078 vector_modes[mode_i + 1]))
3079 {
3080 if (dump_enabled_p ())
3081 dump_printf_loc (MSG_NOTE, vect_location,
3082 "***** The result for vector mode %s would"
3083 " be the same\n",
3084 GET_MODE_NAME (vector_modes[mode_i + 1]));
3085 mode_i += 1;
3086 }
3087 if (mode_i + 1 < vector_modes.length ()
3088 && VECTOR_MODE_P (autodetected_vector_mode)
3089 && (related_vector_mode (vector_modes[mode_i + 1],
3090 GET_MODE_INNER (autodetected_vector_mode))
3091 == autodetected_vector_mode)
3092 && (related_vector_mode (autodetected_vector_mode,
3093 GET_MODE_INNER (vector_modes[mode_i + 1]))
3094 == vector_modes[mode_i + 1]))
3095 {
3096 if (dump_enabled_p ())
3097 dump_printf_loc (MSG_NOTE, vect_location,
3098 "***** Skipping vector mode %s, which would"
3099 " repeat the analysis for %s\n",
3100 GET_MODE_NAME (vector_modes[mode_i + 1]),
3101 GET_MODE_NAME (autodetected_vector_mode));
3102 mode_i += 1;
3103 }
3104 mode_i++;
3105
3106 if (!res)
3107 {
3108 delete loop_vinfo;
3109 if (fatal)
3110 gcc_checking_assert (main_loop_vinfo == NULL);
3111 return opt_loop_vec_info::propagate_failure (res);
3112 }
3113
3114 return opt_loop_vec_info::success (loop_vinfo);
3115 }
3116
3117 /* Function vect_analyze_loop.
3118
3119 Apply a set of analyses on LOOP, and create a loop_vec_info struct
3120 for it. The different analyses will record information in the
3121 loop_vec_info struct. */
3122 opt_loop_vec_info
3123 vect_analyze_loop (class loop *loop, vec_info_shared *shared)
3124 {
3125 DUMP_VECT_SCOPE ("analyze_loop_nest");
3126
3127 if (loop_outer (loop)
3128 && loop_vec_info_for_loop (loop_outer (loop))
3129 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
3130 return opt_loop_vec_info::failure_at (vect_location,
3131 "outer-loop already vectorized.\n");
3132
3133 if (!find_loop_nest (loop, &shared->loop_nest))
3134 return opt_loop_vec_info::failure_at
3135 (vect_location,
3136 "not vectorized: loop nest containing two or more consecutive inner"
3137 " loops cannot be vectorized\n");
3138
3139 /* Analyze the loop form. */
3140 vect_loop_form_info loop_form_info;
3141 opt_result res = vect_analyze_loop_form (loop, &loop_form_info);
3142 if (!res)
3143 {
3144 if (dump_enabled_p ())
3145 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3146 "bad loop form.\n");
3147 return opt_loop_vec_info::propagate_failure (res);
3148 }
3149 if (!integer_onep (loop_form_info.assumptions))
3150 {
3151 /* We consider vectorizing this loop by versioning it under
3152 some assumptions. In order to do this, we need to clear
3153 existing information computed by the scev and niter analyzers. */
3154 scev_reset_htab ();
3155 free_numbers_of_iterations_estimates (loop);
3156 /* Also set a flag for this loop so that the following scev and niter
3157 analyses are done under the assumptions. */
3158 loop_constraint_set (loop, LOOP_C_FINITE);
3159 }
3160
3161 auto_vector_modes vector_modes;
3162 /* Autodetect first vector size we try. */
3163 vector_modes.safe_push (VOIDmode);
3164 unsigned int autovec_flags
3165 = targetm.vectorize.autovectorize_vector_modes (&vector_modes,
3166 loop->simdlen != 0);
3167 bool pick_lowest_cost_p = ((autovec_flags & VECT_COMPARE_COSTS)
3168 && !unlimited_cost_model (loop));
3169 machine_mode autodetected_vector_mode = VOIDmode;
3170 opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL);
3171 unsigned int mode_i = 0;
3172 unsigned HOST_WIDE_INT simdlen = loop->simdlen;
3173
3174 /* Keep track of the VF for each mode. Initialize all to 0 which indicates
3175 a mode has not been analyzed. */
3176 auto_vec<poly_uint64, 8> cached_vf_per_mode;
3177 for (unsigned i = 0; i < vector_modes.length (); ++i)
3178 cached_vf_per_mode.safe_push (0);
3179
3180 /* First determine the main loop vectorization mode, either the first
3181 one that works, starting with auto-detecting the vector mode and then
3182 following the targets order of preference, or the one with the
3183 lowest cost if pick_lowest_cost_p. */
3184 while (1)
3185 {
3186 bool fatal;
3187 unsigned int last_mode_i = mode_i;
3188 /* Set cached VF to -1 prior to analysis, which indicates a mode has
3189 failed. */
3190 cached_vf_per_mode[last_mode_i] = -1;
3191 opt_loop_vec_info loop_vinfo
3192 = vect_analyze_loop_1 (loop, shared, &loop_form_info,
3193 NULL, vector_modes, mode_i,
3194 autodetected_vector_mode, fatal);
3195 if (fatal)
3196 break;
3197
3198 if (loop_vinfo)
3199 {
3200 /* Analysis has been successful, so update the VF value. The
3201 VF should always be a multiple of unroll_factor and we want to
3202 capture the original VF here. */
3203 cached_vf_per_mode[last_mode_i]
3204 = exact_div (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
3205 loop_vinfo->suggested_unroll_factor);
3206 /* Once we hit the desired simdlen for the first time,
3207 discard any previous attempts. */
3208 if (simdlen
3209 && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), simdlen))
3210 {
3211 delete first_loop_vinfo;
3212 first_loop_vinfo = opt_loop_vec_info::success (NULL);
3213 simdlen = 0;
3214 }
3215 else if (pick_lowest_cost_p
3216 && first_loop_vinfo
3217 && vect_joust_loop_vinfos (loop_vinfo, first_loop_vinfo))
3218 {
3219 /* Pick loop_vinfo over first_loop_vinfo. */
3220 delete first_loop_vinfo;
3221 first_loop_vinfo = opt_loop_vec_info::success (NULL);
3222 }
3223 if (first_loop_vinfo == NULL)
3224 first_loop_vinfo = loop_vinfo;
3225 else
3226 {
3227 delete loop_vinfo;
3228 loop_vinfo = opt_loop_vec_info::success (NULL);
3229 }
3230
3231 /* Commit to first_loop_vinfo if we have no reason to try
3232 alternatives. */
3233 if (!simdlen && !pick_lowest_cost_p)
3234 break;
3235 }
3236 if (mode_i == vector_modes.length ()
3237 || autodetected_vector_mode == VOIDmode)
3238 break;
3239
3240 /* Try the next biggest vector size. */
3241 if (dump_enabled_p ())
3242 dump_printf_loc (MSG_NOTE, vect_location,
3243 "***** Re-trying analysis with vector mode %s\n",
3244 GET_MODE_NAME (vector_modes[mode_i]));
3245 }
3246 if (!first_loop_vinfo)
3247 return opt_loop_vec_info::propagate_failure (res);
3248
3249 if (dump_enabled_p ())
3250 dump_printf_loc (MSG_NOTE, vect_location,
3251 "***** Choosing vector mode %s\n",
3252 GET_MODE_NAME (first_loop_vinfo->vector_mode));
3253
3254 /* Only vectorize epilogues if PARAM_VECT_EPILOGUES_NOMASK is
3255 enabled, SIMDUID is not set, it is the innermost loop and we have
3256 either already found the loop's SIMDLEN or there was no SIMDLEN to
3257 begin with.
3258 TODO: Enable epilogue vectorization for loops with SIMDUID set. */
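/* As a purely illustrative example of the payoff: a loop of 1003
   iterations with a main-loop VF of 16 leaves 11 tail iterations; a
   vectorized epilogue with VF 8 handles 8 of them, so only 3 iterations
   remain for the scalar epilogue.  */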
3259 bool vect_epilogues = (!simdlen
3260 && loop->inner == NULL
3261 && param_vect_epilogues_nomask
3262 && LOOP_VINFO_PEELING_FOR_NITER (first_loop_vinfo)
3263 && !loop->simduid);
3264 if (!vect_epilogues)
3265 return first_loop_vinfo;
3266
3267 /* Now analyze first_loop_vinfo for epilogue vectorization. */
3268 poly_uint64 lowest_th = LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo);
3269
3270 /* For epilogues start the analysis from the first mode. The motivation
3271 behind starting from the beginning comes from cases where the VECTOR_MODES
3272 array may contain length-agnostic and length-specific modes. Their
3273 ordering is not guaranteed, so we could end up picking a mode for the main
3274 loop that is after the epilogue's optimal mode. */
3275 vector_modes[0] = autodetected_vector_mode;
3276 mode_i = 0;
3277
3278 bool supports_partial_vectors =
3279 partial_vectors_supported_p () && param_vect_partial_vector_usage != 0;
3280 poly_uint64 first_vinfo_vf = LOOP_VINFO_VECT_FACTOR (first_loop_vinfo);
3281
3282 while (1)
3283 {
3284 /* If the target does not support partial vectors we can shorten the
3285 number of modes to analyze for the epilogue: we know we cannot pick a
3286 mode whose cached VF is at least as big as FIRST_VINFO_VF, since the
3287 epilogue needs a smaller VF than the main loop. */
3288 if (!supports_partial_vectors
3289 && maybe_ge (cached_vf_per_mode[mode_i], first_vinfo_vf))
3290 {
3291 mode_i++;
3292 if (mode_i == vector_modes.length ())
3293 break;
3294 continue;
3295 }
3296
3297 if (dump_enabled_p ())
3298 dump_printf_loc (MSG_NOTE, vect_location,
3299 "***** Re-trying epilogue analysis with vector "
3300 "mode %s\n", GET_MODE_NAME (vector_modes[mode_i]));
3301
3302 bool fatal;
3303 opt_loop_vec_info loop_vinfo
3304 = vect_analyze_loop_1 (loop, shared, &loop_form_info,
3305 first_loop_vinfo,
3306 vector_modes, mode_i,
3307 autodetected_vector_mode, fatal);
3308 if (fatal)
3309 break;
3310
3311 if (loop_vinfo)
3312 {
3313 if (pick_lowest_cost_p)
3314 {
3315 /* Keep trying to roll back vectorization attempts while the
3316 loop_vec_infos they produced were worse than this one. */
3317 vec<loop_vec_info> &vinfos = first_loop_vinfo->epilogue_vinfos;
3318 while (!vinfos.is_empty ()
3319 && vect_joust_loop_vinfos (loop_vinfo, vinfos.last ()))
3320 {
3321 gcc_assert (vect_epilogues);
3322 delete vinfos.pop ();
3323 }
3324 }
3325 /* For now only allow one epilogue loop. */
3326 if (first_loop_vinfo->epilogue_vinfos.is_empty ())
3327 {
3328 first_loop_vinfo->epilogue_vinfos.safe_push (loop_vinfo);
3329 poly_uint64 th = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
3330 gcc_assert (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
3331 || maybe_ne (lowest_th, 0U));
3332 /* Keep track of the known smallest versioning
3333 threshold. */
3334 if (ordered_p (lowest_th, th))
3335 lowest_th = ordered_min (lowest_th, th);
3336 }
3337 else
3338 {
3339 delete loop_vinfo;
3340 loop_vinfo = opt_loop_vec_info::success (NULL);
3341 }
3342
3343 /* For now only allow one epilogue loop, but allow
3344 pick_lowest_cost_p to replace it, so commit to the
3345 first epilogue if we have no reason to try alternatives. */
3346 if (!pick_lowest_cost_p)
3347 break;
3348 }
3349
3350 if (mode_i == vector_modes.length ())
3351 break;
3352
3353 }
3354
3355 if (!first_loop_vinfo->epilogue_vinfos.is_empty ())
3356 {
3357 LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo) = lowest_th;
3358 if (dump_enabled_p ())
3359 dump_printf_loc (MSG_NOTE, vect_location,
3360 "***** Choosing epilogue vector mode %s\n",
3361 GET_MODE_NAME
3362 (first_loop_vinfo->epilogue_vinfos[0]->vector_mode));
3363 }
3364
3365 return first_loop_vinfo;
3366 }
3367
3368 /* Return true if there is an in-order reduction function for CODE, storing
3369 it in *REDUC_FN if so. */
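/* The typical consumer is an in-order floating-point accumulation such as

     double s = 0.0;
     for (int i = 0; i < n; ++i)
       s += a[i];

   compiled without -fassociative-math: the additions must happen in
   source order, which IFN_FOLD_LEFT_PLUS provides when the target
   implements it.  */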
3370
3371 static bool
3372 fold_left_reduction_fn (code_helper code, internal_fn *reduc_fn)
3373 {
3374 if (code == PLUS_EXPR)
3375 {
3376 *reduc_fn = IFN_FOLD_LEFT_PLUS;
3377 return true;
3378 }
3379 return false;
3380 }
3381
3382 /* Function reduction_fn_for_scalar_code
3383
3384 Input:
3385 CODE - the code of a reduction operation.
3386
3387 Output:
3388 REDUC_FN - the corresponding internal function to be used to reduce the
3389 vector of partial results into a single scalar result, or IFN_LAST
3390 if the operation is a supported reduction operation, but does not have
3391 such an internal function.
3392
3393 Return FALSE if CODE currently cannot be vectorized as a reduction. */
3394
3395 bool
3396 reduction_fn_for_scalar_code (code_helper code, internal_fn *reduc_fn)
3397 {
3398 if (code.is_tree_code ())
3399 switch (tree_code (code))
3400 {
3401 case MAX_EXPR:
3402 *reduc_fn = IFN_REDUC_MAX;
3403 return true;
3404
3405 case MIN_EXPR:
3406 *reduc_fn = IFN_REDUC_MIN;
3407 return true;
3408
3409 case PLUS_EXPR:
3410 *reduc_fn = IFN_REDUC_PLUS;
3411 return true;
3412
3413 case BIT_AND_EXPR:
3414 *reduc_fn = IFN_REDUC_AND;
3415 return true;
3416
3417 case BIT_IOR_EXPR:
3418 *reduc_fn = IFN_REDUC_IOR;
3419 return true;
3420
3421 case BIT_XOR_EXPR:
3422 *reduc_fn = IFN_REDUC_XOR;
3423 return true;
3424
3425 case MULT_EXPR:
3426 case MINUS_EXPR:
3427 *reduc_fn = IFN_LAST;
3428 return true;
3429
3430 default:
3431 return false;
3432 }
3433 else
3434 switch (combined_fn (code))
3435 {
3436 CASE_CFN_FMAX:
3437 *reduc_fn = IFN_REDUC_FMAX;
3438 return true;
3439
3440 CASE_CFN_FMIN:
3441 *reduc_fn = IFN_REDUC_FMIN;
3442 return true;
3443
3444 default:
3445 return false;
3446 }
3447 }
3448
3449 /* If there is a neutral value X such that a reduction would not be affected
3450 by the introduction of additional X elements, return that X, otherwise
3451 return null. CODE is the code of the reduction and SCALAR_TYPE is the type
3452 of the scalar elements. If the reduction has just a single initial value
3453 then INITIAL_VALUE is that value, otherwise it is null. */
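/* For instance, zero is neutral for addition and for bitwise IOR/XOR,
   one for multiplication and an all-ones value for bitwise AND, while
   MIN/MAX have no neutral value in general and fall back to the single
   initial value when one is available.  */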
3454
3455 tree
3456 neutral_op_for_reduction (tree scalar_type, code_helper code,
3457 tree initial_value)
3458 {
3459 if (code.is_tree_code ())
3460 switch (tree_code (code))
3461 {
3462 case WIDEN_SUM_EXPR:
3463 case DOT_PROD_EXPR:
3464 case SAD_EXPR:
3465 case PLUS_EXPR:
3466 case MINUS_EXPR:
3467 case BIT_IOR_EXPR:
3468 case BIT_XOR_EXPR:
3469 return build_zero_cst (scalar_type);
3470
3471 case MULT_EXPR:
3472 return build_one_cst (scalar_type);
3473
3474 case BIT_AND_EXPR:
3475 return build_all_ones_cst (scalar_type);
3476
3477 case MAX_EXPR:
3478 case MIN_EXPR:
3479 return initial_value;
3480
3481 default:
3482 return NULL_TREE;
3483 }
3484 else
3485 switch (combined_fn (code))
3486 {
3487 CASE_CFN_FMIN:
3488 CASE_CFN_FMAX:
3489 return initial_value;
3490
3491 default:
3492 return NULL_TREE;
3493 }
3494 }
3495
3496 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
3497 STMT is printed with a message MSG. */
3498
3499 static void
3500 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
3501 {
3502 dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt);
3503 }
3504
3505 /* Return true if we need an in-order (fold-left) reduction for
3506 operation CODE on type TYPE, i.e. if the operation cannot be
3507 reassociated for this type under the current flags. */
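/* For example, a floating-point summation compiled without
   -fassociative-math must be reduced in order, whereas fmin/fmax and
   integer min/max are insensitive to association; integer operations
   only need it when overflow could trap (e.g. with -ftrapv).  */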
3508
3509 bool
3510 needs_fold_left_reduction_p (tree type, code_helper code)
3511 {
3512 /* CHECKME: check for !flag_finite_math_only too? */
3513 if (SCALAR_FLOAT_TYPE_P (type))
3514 {
3515 if (code.is_tree_code ())
3516 switch (tree_code (code))
3517 {
3518 case MIN_EXPR:
3519 case MAX_EXPR:
3520 return false;
3521
3522 default:
3523 return !flag_associative_math;
3524 }
3525 else
3526 switch (combined_fn (code))
3527 {
3528 CASE_CFN_FMIN:
3529 CASE_CFN_FMAX:
3530 return false;
3531
3532 default:
3533 return !flag_associative_math;
3534 }
3535 }
3536
3537 if (INTEGRAL_TYPE_P (type))
3538 return (!code.is_tree_code ()
3539 || !operation_no_trapping_overflow (type, tree_code (code)));
3540
3541 if (SAT_FIXED_POINT_TYPE_P (type))
3542 return true;
3543
3544 return false;
3545 }
3546
3547 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG and
3548 has a handled computation expression. Store the main reduction
3549 operation in *CODE. */
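/* For a loop like

     for (i = 0; i < n; ++i)
       sum = sum + a[i] * b[i];

   the walk from the latch definition of sum back to its PHI goes through
   just the PLUS_EXPR statement (the multiplication feeds it but is not
   part of the cycle), and *CODE ends up as PLUS_EXPR.  */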
3550
3551 static bool
3552 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
3553 tree loop_arg, code_helper *code,
3554 vec<std::pair<ssa_op_iter, use_operand_p> > &path)
3555 {
3556 auto_bitmap visited;
3557 tree lookfor = PHI_RESULT (phi);
3558 ssa_op_iter curri;
3559 use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
3560 while (USE_FROM_PTR (curr) != loop_arg)
3561 curr = op_iter_next_use (&curri);
3562 curri.i = curri.numops;
3563 do
3564 {
3565 path.safe_push (std::make_pair (curri, curr));
3566 tree use = USE_FROM_PTR (curr);
3567 if (use == lookfor)
3568 break;
3569 gimple *def = SSA_NAME_DEF_STMT (use);
3570 if (gimple_nop_p (def)
3571 || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
3572 {
3573 pop:
3574 do
3575 {
3576 std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
3577 curri = x.first;
3578 curr = x.second;
3579 do
3580 curr = op_iter_next_use (&curri);
3581 /* Skip already visited or non-SSA operands (from iterating
3582 over PHI args). */
3583 while (curr != NULL_USE_OPERAND_P
3584 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3585 || ! bitmap_set_bit (visited,
3586 SSA_NAME_VERSION
3587 (USE_FROM_PTR (curr)))));
3588 }
3589 while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
3590 if (curr == NULL_USE_OPERAND_P)
3591 break;
3592 }
3593 else
3594 {
3595 if (gimple_code (def) == GIMPLE_PHI)
3596 curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
3597 else
3598 curr = op_iter_init_use (&curri, def, SSA_OP_USE);
3599 while (curr != NULL_USE_OPERAND_P
3600 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3601 || ! bitmap_set_bit (visited,
3602 SSA_NAME_VERSION
3603 (USE_FROM_PTR (curr)))))
3604 curr = op_iter_next_use (&curri);
3605 if (curr == NULL_USE_OPERAND_P)
3606 goto pop;
3607 }
3608 }
3609 while (1);
3610 if (dump_file && (dump_flags & TDF_DETAILS))
3611 {
3612 dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
3613 unsigned i;
3614 std::pair<ssa_op_iter, use_operand_p> *x;
3615 FOR_EACH_VEC_ELT (path, i, x)
3616 dump_printf (MSG_NOTE, "%T ", USE_FROM_PTR (x->second));
3617 dump_printf (MSG_NOTE, "\n");
3618 }
3619
3620 /* Check whether the reduction path detected is valid. */
3621 bool fail = path.length () == 0;
3622 bool neg = false;
3623 int sign = -1;
3624 *code = ERROR_MARK;
3625 for (unsigned i = 1; i < path.length (); ++i)
3626 {
3627 gimple *use_stmt = USE_STMT (path[i].second);
3628 gimple_match_op op;
3629 if (!gimple_extract_op (use_stmt, &op))
3630 {
3631 fail = true;
3632 break;
3633 }
3634 unsigned int opi = op.num_ops;
3635 if (gassign *assign = dyn_cast<gassign *> (use_stmt))
3636 {
3637 /* The following makes sure we can compute the operand index
3638 easily; it also mostly disallows chaining via COND_EXPR condition
3639 operands. */
3640 for (opi = 0; opi < op.num_ops; ++opi)
3641 if (gimple_assign_rhs1_ptr (assign) + opi == path[i].second->use)
3642 break;
3643 }
3644 else if (gcall *call = dyn_cast<gcall *> (use_stmt))
3645 {
3646 for (opi = 0; opi < op.num_ops; ++opi)
3647 if (gimple_call_arg_ptr (call, opi) == path[i].second->use)
3648 break;
3649 }
3650 if (opi == op.num_ops)
3651 {
3652 fail = true;
3653 break;
3654 }
3655 op.code = canonicalize_code (op.code, op.type);
3656 if (op.code == MINUS_EXPR)
3657 {
3658 op.code = PLUS_EXPR;
3659 /* Track whether we negate the reduction value each iteration. */
3660 if (op.ops[1] == op.ops[opi])
3661 neg = ! neg;
3662 }
3663 if (CONVERT_EXPR_CODE_P (op.code)
3664 && tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0])))
3665 ;
3666 else if (*code == ERROR_MARK)
3667 {
3668 *code = op.code;
3669 sign = TYPE_SIGN (op.type);
3670 }
3671 else if (op.code != *code)
3672 {
3673 fail = true;
3674 break;
3675 }
3676 else if ((op.code == MIN_EXPR
3677 || op.code == MAX_EXPR)
3678 && sign != TYPE_SIGN (op.type))
3679 {
3680 fail = true;
3681 break;
3682 }
3683 /* Check that the op is used in only a single stmt. For the
3684 non-value-changing tail and the last stmt allow out-of-loop uses.
3685 ??? We could relax this and handle arbitrary live stmts by
3686 forcing a scalar epilogue for example. */
3687 imm_use_iterator imm_iter;
3688 gimple *op_use_stmt;
3689 unsigned cnt = 0;
3690 FOR_EACH_IMM_USE_STMT (op_use_stmt, imm_iter, op.ops[opi])
3691 if (!is_gimple_debug (op_use_stmt)
3692 && (*code != ERROR_MARK
3693 || flow_bb_inside_loop_p (loop, gimple_bb (op_use_stmt))))
3694 {
3695 /* We want to allow x + x but not x < 1 ? x : 2. */
3696 if (is_gimple_assign (op_use_stmt)
3697 && gimple_assign_rhs_code (op_use_stmt) == COND_EXPR)
3698 {
3699 use_operand_p use_p;
3700 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
3701 cnt++;
3702 }
3703 else
3704 cnt++;
3705 }
3706 if (cnt != 1)
3707 {
3708 fail = true;
3709 break;
3710 }
3711 }
3712 return ! fail && ! neg && *code != ERROR_MARK;
3713 }
3714
3715 bool
3716 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
3717 tree loop_arg, enum tree_code code)
3718 {
3719 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3720 code_helper code_;
3721 return (check_reduction_path (loc, loop, phi, loop_arg, &code_, path)
3722 && code_ == code);
3723 }
3724
3725
3726
3727 /* Function vect_is_simple_reduction
3728
3729 (1) Detect a cross-iteration def-use cycle that represents a simple
3730 reduction computation. We look for the following pattern:
3731
3732 loop_header:
3733 a1 = phi < a0, a2 >
3734 a3 = ...
3735 a2 = operation (a3, a1)
3736
3737 or
3738
3739 a3 = ...
3740 loop_header:
3741 a1 = phi < a0, a2 >
3742 a2 = operation (a3, a1)
3743
3744 such that:
3745 1. operation is commutative and associative and it is safe to
3746 change the order of the computation
3747 2. no uses for a2 in the loop (a2 is used out of the loop)
3748 3. no uses of a1 in the loop besides the reduction operation
3749 4. no uses of a1 outside the loop.
3750
3751 Conditions 1,4 are tested here.
3752 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
3753
3754 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
3755 nested cycles.
3756
3757 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
3758 reductions:
3759
3760 a1 = phi < a0, a2 >
3761 inner loop (def of a3)
3762 a2 = phi < a3 >
3763
3764 (4) Detect condition expressions, i.e.:
3765 for (int i = 0; i < N; i++)
3766 if (a[i] < val)
3767 ret_val = a[i];
3768
3769 */
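/* A classic source-level example of (3) is

     for (i = 0; i < n; ++i)
       for (j = 0; j < m; ++j)
         sum += a[i][j];

   where the inner-loop PHI of sum feeds back into the outer-loop cycle
   through a loop-closed PHI.  */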
3770
3771 static stmt_vec_info
3772 vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
3773 bool *double_reduc, bool *reduc_chain_p, bool slp)
3774 {
3775 gphi *phi = as_a <gphi *> (phi_info->stmt);
3776 gimple *phi_use_stmt = NULL;
3777 imm_use_iterator imm_iter;
3778 use_operand_p use_p;
3779
3780 *double_reduc = false;
3781 *reduc_chain_p = false;
3782 STMT_VINFO_REDUC_TYPE (phi_info) = TREE_CODE_REDUCTION;
3783
3784 tree phi_name = PHI_RESULT (phi);
3785 /* ??? If there are no uses of the PHI result the inner loop reduction
3786 won't be detected as possibly double-reduction by vectorizable_reduction
3787 because that tries to walk the PHI arg from the preheader edge which
3788 can be constant. See PR60382. */
3789 if (has_zero_uses (phi_name))
3790 return NULL;
3791 class loop *loop = (gimple_bb (phi))->loop_father;
3792 unsigned nphi_def_loop_uses = 0;
3793 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
3794 {
3795 gimple *use_stmt = USE_STMT (use_p);
3796 if (is_gimple_debug (use_stmt))
3797 continue;
3798
3799 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3800 {
3801 if (dump_enabled_p ())
3802 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3803 "intermediate value used outside loop.\n");
3804
3805 return NULL;
3806 }
3807
3808 nphi_def_loop_uses++;
3809 phi_use_stmt = use_stmt;
3810 }
3811
3812 tree latch_def = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
3813 if (TREE_CODE (latch_def) != SSA_NAME)
3814 {
3815 if (dump_enabled_p ())
3816 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3817 "reduction: not ssa_name: %T\n", latch_def);
3818 return NULL;
3819 }
3820
3821 stmt_vec_info def_stmt_info = loop_info->lookup_def (latch_def);
3822 if (!def_stmt_info
3823 || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)))
3824 return NULL;
3825
3826 bool nested_in_vect_loop
3827 = flow_loop_nested_p (LOOP_VINFO_LOOP (loop_info), loop);
3828 unsigned nlatch_def_loop_uses = 0;
3829 auto_vec<gphi *, 3> lcphis;
3830 bool inner_loop_of_double_reduc = false;
3831 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, latch_def)
3832 {
3833 gimple *use_stmt = USE_STMT (use_p);
3834 if (is_gimple_debug (use_stmt))
3835 continue;
3836 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3837 nlatch_def_loop_uses++;
3838 else
3839 {
3840 /* We can have more than one loop-closed PHI. */
3841 lcphis.safe_push (as_a <gphi *> (use_stmt));
3842 if (nested_in_vect_loop
3843 && (STMT_VINFO_DEF_TYPE (loop_info->lookup_stmt (use_stmt))
3844 == vect_double_reduction_def))
3845 inner_loop_of_double_reduc = true;
3846 }
3847 }
3848
3849 /* If we are vectorizing an inner reduction, we execute it in the
3850 original order only in case we are not dealing with a
3851 double reduction. */
3852 if (nested_in_vect_loop && !inner_loop_of_double_reduc)
3853 {
3854 if (dump_enabled_p ())
3855 report_vect_op (MSG_NOTE, def_stmt_info->stmt,
3856 "detected nested cycle: ");
3857 return def_stmt_info;
3858 }
3859
3860 /* When the inner loop of a double reduction ends up with more than
3861 one loop-closed PHI we have failed to classify the alternate
3862 PHIs as double reductions, leading to wrong code. See PR103237. */
3863 if (inner_loop_of_double_reduc && lcphis.length () != 1)
3864 {
3865 if (dump_enabled_p ())
3866 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3867 "unhandled double reduction\n");
3868 return NULL;
3869 }
3870
3871 /* If this isn't a nested cycle or if the nested cycle reduction value
3872 is used outside of the inner loop we cannot handle uses of the reduction
3873 value. */
3874 if (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1)
3875 {
3876 if (dump_enabled_p ())
3877 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3878 "reduction used in loop.\n");
3879 return NULL;
3880 }
3881
3882 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
3883 defined in the inner loop. */
3884 if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
3885 {
3886 tree op1 = PHI_ARG_DEF (def_stmt, 0);
3887 if (gimple_phi_num_args (def_stmt) != 1
3888 || TREE_CODE (op1) != SSA_NAME)
3889 {
3890 if (dump_enabled_p ())
3891 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3892 "unsupported phi node definition.\n");
3893
3894 return NULL;
3895 }
3896
3897 /* Verify there is an inner cycle composed of the PHI phi_use_stmt
3898 and the latch definition op1. */
3899 gimple *def1 = SSA_NAME_DEF_STMT (op1);
3900 if (gimple_bb (def1)
3901 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3902 && loop->inner
3903 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
3904 && (is_gimple_assign (def1) || is_gimple_call (def1))
3905 && is_a <gphi *> (phi_use_stmt)
3906 && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt))
3907 && (op1 == PHI_ARG_DEF_FROM_EDGE (phi_use_stmt,
3908 loop_latch_edge (loop->inner))))
3909 {
3910 if (dump_enabled_p ())
3911 report_vect_op (MSG_NOTE, def_stmt,
3912 "detected double reduction: ");
3913
3914 *double_reduc = true;
3915 return def_stmt_info;
3916 }
3917
3918 return NULL;
3919 }
3920
3921 /* Look for the expression computing latch_def from the loop PHI result. */
3922 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3923 code_helper code;
3924 if (check_reduction_path (vect_location, loop, phi, latch_def, &code,
3925 path))
3926 {
3927 STMT_VINFO_REDUC_CODE (phi_info) = code;
3928 if (code == COND_EXPR && !nested_in_vect_loop)
3929 STMT_VINFO_REDUC_TYPE (phi_info) = COND_REDUCTION;
3930
3931 /* Fill in STMT_VINFO_REDUC_IDX and gather stmts for an SLP
3932 reduction chain for which the additional restriction is that
3933 all operations in the chain are the same. */
3934 auto_vec<stmt_vec_info, 8> reduc_chain;
3935 unsigned i;
3936 bool is_slp_reduc = !nested_in_vect_loop && code != COND_EXPR;
3937 for (i = path.length () - 1; i >= 1; --i)
3938 {
3939 gimple *stmt = USE_STMT (path[i].second);
3940 stmt_vec_info stmt_info = loop_info->lookup_stmt (stmt);
3941 gimple_match_op op;
3942 if (!gimple_extract_op (stmt, &op))
3943 gcc_unreachable ();
3944 if (gassign *assign = dyn_cast<gassign *> (stmt))
3945 STMT_VINFO_REDUC_IDX (stmt_info)
3946 = path[i].second->use - gimple_assign_rhs1_ptr (assign);
3947 else
3948 {
3949 gcall *call = as_a<gcall *> (stmt);
3950 STMT_VINFO_REDUC_IDX (stmt_info)
3951 = path[i].second->use - gimple_call_arg_ptr (call, 0);
3952 }
3953 bool leading_conversion = (CONVERT_EXPR_CODE_P (op.code)
3954 && (i == 1 || i == path.length () - 1));
3955 if ((op.code != code && !leading_conversion)
3956 /* We can only handle the final value in epilogue
3957 generation for reduction chains. */
3958 || (i != 1 && !has_single_use (gimple_get_lhs (stmt))))
3959 is_slp_reduc = false;
3960 /* For reduction chains we support trailing/leading
3961 conversions. We do not store those in the actual chain. */
3962 if (leading_conversion)
3963 continue;
3964 reduc_chain.safe_push (stmt_info);
3965 }
3966 if (slp && is_slp_reduc && reduc_chain.length () > 1)
3967 {
3968 for (unsigned i = 0; i < reduc_chain.length () - 1; ++i)
3969 {
3970 REDUC_GROUP_FIRST_ELEMENT (reduc_chain[i]) = reduc_chain[0];
3971 REDUC_GROUP_NEXT_ELEMENT (reduc_chain[i]) = reduc_chain[i+1];
3972 }
3973 REDUC_GROUP_FIRST_ELEMENT (reduc_chain.last ()) = reduc_chain[0];
3974 REDUC_GROUP_NEXT_ELEMENT (reduc_chain.last ()) = NULL;
3975
3976 /* Save the chain for further analysis in SLP detection. */
3977 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (reduc_chain[0]);
3978 REDUC_GROUP_SIZE (reduc_chain[0]) = reduc_chain.length ();
3979
3980 *reduc_chain_p = true;
3981 if (dump_enabled_p ())
3982 dump_printf_loc (MSG_NOTE, vect_location,
3983 "reduction: detected reduction chain\n");
3984 }
3985 else if (dump_enabled_p ())
3986 dump_printf_loc (MSG_NOTE, vect_location,
3987 "reduction: detected reduction\n");
3988
3989 return def_stmt_info;
3990 }
3991
3992 if (dump_enabled_p ())
3993 dump_printf_loc (MSG_NOTE, vect_location,
3994 "reduction: unknown pattern\n");
3995
3996 return NULL;
3997 }
3998
3999 /* Estimate the number of peeled epilogue iterations for LOOP_VINFO.
4000 PEEL_ITERS_PROLOGUE is the number of peeled prologue iterations,
4001 or -1 if not known. */
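/* For instance, with 100 known iterations, an assumed VF of 8 and 3
   prologue iterations peeled for alignment, the epilogue gets
   (100 - 3) % 8 = 1 iteration; if peeling for gaps is required and that
   remainder were zero, a full VF of 8 iterations would be used instead.  */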
4002
4003 static int
4004 vect_get_peel_iters_epilogue (loop_vec_info loop_vinfo, int peel_iters_prologue)
4005 {
4006 int assumed_vf = vect_vf_for_cost (loop_vinfo);
4007 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) || peel_iters_prologue == -1)
4008 {
4009 if (dump_enabled_p ())
4010 dump_printf_loc (MSG_NOTE, vect_location,
4011 "cost model: epilogue peel iters set to vf/2 "
4012 "because loop iterations are unknown.\n");
4013 return assumed_vf / 2;
4014 }
4015 else
4016 {
4017 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
4018 peel_iters_prologue = MIN (niters, peel_iters_prologue);
4019 int peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
4020 /* If we need to peel for gaps but no epilogue peeling would otherwise
4021 be required, we have to peel VF iterations. */
4022 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !peel_iters_epilogue)
4023 peel_iters_epilogue = assumed_vf;
4024 return peel_iters_epilogue;
4025 }
4026 }
4027
4028 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
4029 int
4030 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
4031 int *peel_iters_epilogue,
4032 stmt_vector_for_cost *scalar_cost_vec,
4033 stmt_vector_for_cost *prologue_cost_vec,
4034 stmt_vector_for_cost *epilogue_cost_vec)
4035 {
4036 int retval = 0;
4037
4038 *peel_iters_epilogue
4039 = vect_get_peel_iters_epilogue (loop_vinfo, peel_iters_prologue);
4040
4041 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
4042 {
4043 /* If peeled iterations are known but the number of scalar loop
4044 iterations is unknown, count a taken branch per peeled loop. */
4045 if (peel_iters_prologue > 0)
4046 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
4047 vect_prologue);
4048 if (*peel_iters_epilogue > 0)
4049 retval += record_stmt_cost (epilogue_cost_vec, 1, cond_branch_taken,
4050 vect_epilogue);
4051 }
4052
4053 stmt_info_for_cost *si;
4054 int j;
4055 if (peel_iters_prologue)
4056 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
4057 retval += record_stmt_cost (prologue_cost_vec,
4058 si->count * peel_iters_prologue,
4059 si->kind, si->stmt_info, si->misalign,
4060 vect_prologue);
4061 if (*peel_iters_epilogue)
4062 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
4063 retval += record_stmt_cost (epilogue_cost_vec,
4064 si->count * *peel_iters_epilogue,
4065 si->kind, si->stmt_info, si->misalign,
4066 vect_epilogue);
4067
4068 return retval;
4069 }
4070
4071 /* Function vect_estimate_min_profitable_iters
4072
4073 Return the number of iterations required for the vector version of the
4074 loop to be profitable relative to the cost of the scalar version of the
4075 loop.
4076
4077 *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
4078 of iterations for vectorization. A value of -1 means loop
4079 vectorization is not profitable. This returned value may be used
4080 for a dynamic profitability check.
4081
4082 *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
4083 for static check against estimated number of iterations. */
4084
4085 static void
4086 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
4087 int *ret_min_profitable_niters,
4088 int *ret_min_profitable_estimate,
4089 unsigned *suggested_unroll_factor)
4090 {
4091 int min_profitable_iters;
4092 int min_profitable_estimate;
4093 int peel_iters_prologue;
4094 int peel_iters_epilogue;
4095 unsigned vec_inside_cost = 0;
4096 int vec_outside_cost = 0;
4097 unsigned vec_prologue_cost = 0;
4098 unsigned vec_epilogue_cost = 0;
4099 int scalar_single_iter_cost = 0;
4100 int scalar_outside_cost = 0;
4101 int assumed_vf = vect_vf_for_cost (loop_vinfo);
4102 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
4103 vector_costs *target_cost_data = loop_vinfo->vector_costs;
4104
4105 /* Cost model disabled. */
4106 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
4107 {
4108 if (dump_enabled_p ())
4109 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
4110 *ret_min_profitable_niters = 0;
4111 *ret_min_profitable_estimate = 0;
4112 return;
4113 }
4114
4115 /* Requires loop versioning tests to handle misalignment. */
4116 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
4117 {
4118 /* FIXME: Make cost depend on complexity of individual check. */
4119 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
4120 (void) add_stmt_cost (target_cost_data, len, scalar_stmt, vect_prologue);
4121 if (dump_enabled_p ())
4122 dump_printf (MSG_NOTE,
4123 "cost model: Adding cost of checks for loop "
4124 "versioning to treat misalignment.\n");
4125 }
4126
4127 /* Requires loop versioning with alias checks. */
4128 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
4129 {
4130 /* FIXME: Make cost depend on complexity of individual check. */
4131 unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
4132 (void) add_stmt_cost (target_cost_data, len, scalar_stmt, vect_prologue);
4133 len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
4134 if (len)
4135 /* Count LEN - 1 ANDs and LEN comparisons. */
4136 (void) add_stmt_cost (target_cost_data, len * 2 - 1,
4137 scalar_stmt, vect_prologue);
4138 len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
4139 if (len)
4140 {
4141 /* Count LEN - 1 ANDs and LEN comparisons. */
4142 unsigned int nstmts = len * 2 - 1;
4143 /* +1 for each bias that needs adding. */
4144 for (unsigned int i = 0; i < len; ++i)
4145 if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
4146 nstmts += 1;
4147 (void) add_stmt_cost (target_cost_data, nstmts,
4148 scalar_stmt, vect_prologue);
4149 }
4150 if (dump_enabled_p ())
4151 dump_printf (MSG_NOTE,
4152 "cost model: Adding cost of checks for loop "
4153 "versioning aliasing.\n");
4154 }
4155
4156 /* Requires loop versioning with niter checks. */
4157 if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
4158 {
4159 /* FIXME: Make cost depend on complexity of individual check. */
4160 (void) add_stmt_cost (target_cost_data, 1, vector_stmt,
4161 NULL, NULL, NULL_TREE, 0, vect_prologue);
4162 if (dump_enabled_p ())
4163 dump_printf (MSG_NOTE,
4164 "cost model: Adding cost of checks for loop "
4165 "versioning niters.\n");
4166 }
4167
4168 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
4169 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4170 vect_prologue);
4171
4172 /* Count statements in scalar loop. Using this as scalar cost for a single
4173 iteration for now.
4174
4175 TODO: Add outer loop support.
4176
4177 TODO: Consider assigning different costs to different scalar
4178 statements. */
4179
4180 scalar_single_iter_cost = loop_vinfo->scalar_costs->total_cost ();
4181
4182 /* Add additional cost for the peeled instructions in prologue and epilogue
4183 loop. (For fully-masked loops there will be no peeling.)
4184
4185 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
4186 at compile time, we assume it's vf/2 (the worst would be vf-1).
4187
4188 TODO: Build an expression that represents peel_iters for prologue and
4189 epilogue to be used in a run-time test. */
4190
4191 bool prologue_need_br_taken_cost = false;
4192 bool prologue_need_br_not_taken_cost = false;
4193
4194 /* Calculate peel_iters_prologue. */
4195 if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
4196 peel_iters_prologue = 0;
4197 else if (npeel < 0)
4198 {
4199 peel_iters_prologue = assumed_vf / 2;
4200 if (dump_enabled_p ())
4201 dump_printf (MSG_NOTE, "cost model: "
4202 "prologue peel iters set to vf/2.\n");
4203
4204 /* If peeled iterations are unknown, count a taken branch and a not taken
4205 branch per peeled loop. Even if scalar loop iterations are known,
4206 vector iterations are not known since peeled prologue iterations are
4207 not known. Hence guards remain the same. */
4208 prologue_need_br_taken_cost = true;
4209 prologue_need_br_not_taken_cost = true;
4210 }
4211 else
4212 {
4213 peel_iters_prologue = npeel;
4214 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_prologue > 0)
4215 /* If peeled iterations are known but the number of scalar loop
4216 iterations is unknown, count a taken branch per peeled loop. */
4217 prologue_need_br_taken_cost = true;
4218 }
4219
4220 bool epilogue_need_br_taken_cost = false;
4221 bool epilogue_need_br_not_taken_cost = false;
4222
4223 /* Calculate peel_iters_epilogue. */
4224 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4225 /* We need to peel exactly one iteration for gaps. */
4226 peel_iters_epilogue = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
4227 else if (npeel < 0)
4228 {
4229 /* If peeling for alignment is unknown, the loop bound of the main
4230 loop becomes unknown. */
4231 peel_iters_epilogue = assumed_vf / 2;
4232 if (dump_enabled_p ())
4233 dump_printf (MSG_NOTE, "cost model: "
4234 "epilogue peel iters set to vf/2 because "
4235 "peeling for alignment is unknown.\n");
4236
4237 /* See the same reason above in peel_iters_prologue calculation. */
4238 epilogue_need_br_taken_cost = true;
4239 epilogue_need_br_not_taken_cost = true;
4240 }
4241 else
4242 {
4243 peel_iters_epilogue = vect_get_peel_iters_epilogue (loop_vinfo, npeel);
4244 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_epilogue > 0)
4245 /* If peeled iterations are known but the number of scalar loop
4246 iterations is unknown, count a taken branch per peeled loop. */
4247 epilogue_need_br_taken_cost = true;
4248 }
4249
4250 stmt_info_for_cost *si;
4251 int j;
4252 /* Add costs associated with peel_iters_prologue. */
4253 if (peel_iters_prologue)
4254 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4255 {
4256 (void) add_stmt_cost (target_cost_data,
4257 si->count * peel_iters_prologue, si->kind,
4258 si->stmt_info, si->node, si->vectype,
4259 si->misalign, vect_prologue);
4260 }
4261
4262 /* Add costs associated with peel_iters_epilogue. */
4263 if (peel_iters_epilogue)
4264 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4265 {
4266 (void) add_stmt_cost (target_cost_data,
4267 si->count * peel_iters_epilogue, si->kind,
4268 si->stmt_info, si->node, si->vectype,
4269 si->misalign, vect_epilogue);
4270 }
4271
4272 /* Add possible cond_branch_taken/cond_branch_not_taken cost. */
4273
4274 if (prologue_need_br_taken_cost)
4275 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4276 vect_prologue);
4277
4278 if (prologue_need_br_not_taken_cost)
4279 (void) add_stmt_cost (target_cost_data, 1,
4280 cond_branch_not_taken, vect_prologue);
4281
4282 if (epilogue_need_br_taken_cost)
4283 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4284 vect_epilogue);
4285
4286 if (epilogue_need_br_not_taken_cost)
4287 (void) add_stmt_cost (target_cost_data, 1,
4288 cond_branch_not_taken, vect_epilogue);
4289
4290 /* Take care of special costs for rgroup controls of partial vectors. */
4291 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
4292 {
4293 /* Calculate how many masks we need to generate. */
4294 unsigned int num_masks = 0;
4295 rgroup_controls *rgm;
4296 unsigned int num_vectors_m1;
4297 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), num_vectors_m1, rgm)
4298 if (rgm->type)
4299 num_masks += num_vectors_m1 + 1;
4300 gcc_assert (num_masks > 0);
4301
4302 /* In the worst case, we need to generate each mask in the prologue
4303 and in the loop body. One of the loop body mask instructions
4304 replaces the comparison in the scalar loop, and since we don't
4305 count the scalar comparison against the scalar body, we shouldn't
4306 count that vector instruction against the vector body either.
4307
4308 Sometimes we can use unpacks instead of generating prologue
4309 masks and sometimes the prologue mask will fold to a constant,
4310 so the actual prologue cost might be smaller. However, it's
4311 simpler and safer to use the worst-case cost; if this ends up
4312 being the tie-breaker between vectorizing or not, then it's
4313 probably better not to vectorize. */
4314 (void) add_stmt_cost (target_cost_data, num_masks,
4315 vector_stmt, NULL, NULL, NULL_TREE, 0,
4316 vect_prologue);
4317 (void) add_stmt_cost (target_cost_data, num_masks - 1,
4318 vector_stmt, NULL, NULL, NULL_TREE, 0,
4319 vect_body);
4320 }
4321 else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
4322 {
4323 /* Referring to the functions vect_set_loop_condition_partial_vectors
4324 and vect_set_loop_controls_directly, we need to generate each
4325 length in the prologue and in the loop body if required. Although
4326 there are some possible optimizations, we consider the worst case
4327 here. */
4328
4329 bool niters_known_p = LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo);
4330 signed char partial_load_store_bias
4331 = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
4332 bool need_iterate_p
4333 = (!LOOP_VINFO_EPILOGUE_P (loop_vinfo)
4334 && !vect_known_niters_smaller_than_vf (loop_vinfo));
4335
4336 /* Calculate how many statements to be added. */
4337 unsigned int prologue_stmts = 0;
4338 unsigned int body_stmts = 0;
4339
4340 rgroup_controls *rgc;
4341 unsigned int num_vectors_m1;
4342 FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), num_vectors_m1, rgc)
4343 if (rgc->type)
4344 {
4345 /* May need one SHIFT for nitems_total computation. */
4346 unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
4347 if (nitems != 1 && !niters_known_p)
4348 prologue_stmts += 1;
4349
4350 /* May need one MAX and one MINUS for wrap around. */
4351 if (vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc))
4352 prologue_stmts += 2;
4353
4354 /* Need one MAX and one MINUS for each batch limit except for
4355 the 1st one. */
4356 prologue_stmts += num_vectors_m1 * 2;
4357
4358 unsigned int num_vectors = num_vectors_m1 + 1;
4359
4360 /* Need to set up lengths in prologue, only one MIN required
4361 for each since start index is zero. */
4362 prologue_stmts += num_vectors;
4363
4364 /* If we have a non-zero partial load bias, we need one PLUS
4365 to adjust the load length. */
4366 if (partial_load_store_bias != 0)
4367 body_stmts += 1;
4368
4369 /* Each may need two MINs and one MINUS to update lengths in body
4370 for next iteration. */
4371 if (need_iterate_p)
4372 body_stmts += 3 * num_vectors;
4373 }
4374
4375 (void) add_stmt_cost (target_cost_data, prologue_stmts,
4376 scalar_stmt, vect_prologue);
4377 (void) add_stmt_cost (target_cost_data, body_stmts,
4378 scalar_stmt, vect_body);
4379 }
4380
4381 /* FORNOW: The scalar outside cost is incremented in one of the
4382 following ways:
4383
4384 1. The vectorizer checks for alignment and aliasing and generates
4385 a condition that allows dynamic vectorization. A cost model
4386 check is ANDED with the versioning condition. Hence scalar code
4387 path now has the added cost of the versioning check.
4388
4389 if (cost > th & versioning_check)
4390 jmp to vector code
4391
4392 Hence run-time scalar is incremented by not-taken branch cost.
4393
4394 2. The vectorizer then checks if a prologue is required. If the
4395 cost model check was not done before during versioning, it has to
4396 be done before the prologue check.
4397
4398 if (cost <= th)
4399 prologue = scalar_iters
4400 if (prologue == 0)
4401 jmp to vector code
4402 else
4403 execute prologue
4404 if (prologue == num_iters)
4405 go to exit
4406
4407 Hence the run-time scalar cost is incremented by a taken branch,
4408 plus a not-taken branch, plus a taken branch cost.
4409
4410 3. The vectorizer then checks if an epilogue is required. If the
4411 cost model check was not done before during prologue check, it
4412 has to be done with the epilogue check.
4413
4414 if (prologue == 0)
4415 jmp to vector code
4416 else
4417 execute prologue
4418 if (prologue == num_iters)
4419 go to exit
4420 vector code:
4421 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
4422 jmp to epilogue
4423
4424 Hence the run-time scalar cost should be incremented by 2 taken
4425 branches.
4426
4427 TODO: The back end may reorder the BBS's differently and reverse
4428 conditions/branch directions. Change the estimates below to
4429 something more reasonable. */
4430
4431 /* If the number of iterations is known and we do not do versioning, we can
4432 decide whether to vectorize at compile time. Hence the scalar version
4433 does not carry cost model guard costs. */
4434 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
4435 || LOOP_REQUIRES_VERSIONING (loop_vinfo))
4436 {
4437 /* Cost model check occurs at versioning. */
4438 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
4439 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
4440 else
4441 {
4442 /* Cost model check occurs at prologue generation. */
4443 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
4444 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
4445 + vect_get_stmt_cost (cond_branch_not_taken);
4446 /* Cost model check occurs at epilogue generation. */
4447 else
4448 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
4449 }
4450 }
4451
4452 /* Complete the target-specific cost calculations. */
4453 finish_cost (loop_vinfo->vector_costs, loop_vinfo->scalar_costs,
4454 &vec_prologue_cost, &vec_inside_cost, &vec_epilogue_cost,
4455 suggested_unroll_factor);
4456
4457 if (suggested_unroll_factor && *suggested_unroll_factor > 1
4458 && LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) != MAX_VECTORIZATION_FACTOR
4459 && !known_le (LOOP_VINFO_VECT_FACTOR (loop_vinfo) *
4460 *suggested_unroll_factor,
4461 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo)))
4462 {
4463 if (dump_enabled_p ())
4464 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4465 "can't unroll as the unrolled vectorization factor is larger"
4466 " than maximum vectorization factor: "
4467 HOST_WIDE_INT_PRINT_UNSIGNED "\n",
4468 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo));
4469 *suggested_unroll_factor = 1;
4470 }
4471
4472 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
4473
4474 if (dump_enabled_p ())
4475 {
4476 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
4477 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
4478 vec_inside_cost);
4479 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
4480 vec_prologue_cost);
4481 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
4482 vec_epilogue_cost);
4483 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
4484 scalar_single_iter_cost);
4485 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
4486 scalar_outside_cost);
4487 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
4488 vec_outside_cost);
4489 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
4490 peel_iters_prologue);
4491 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
4492 peel_iters_epilogue);
4493 }
4494
4495 /* Calculate number of iterations required to make the vector version
4496 profitable, relative to the loop bodies only. The following condition
4497 must hold true:
4498 SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC
4499 where
4500 SIC = scalar iteration cost, VIC = vector iteration cost,
4501 VOC = vector outside cost, VF = vectorization factor,
4502 NPEEL = prologue iterations + epilogue iterations,
4503 SOC = scalar outside cost for run time cost model check. */
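/* As a purely illustrative example: with SIC = 4, VIC = 6, VOC = 20,
   SOC = 0, NPEEL = 0 and VF = 4, and treating the division as exact,
   the condition 4 * niters > 6 * (niters / 4) + 20 first holds for
   niters > 8, so roughly nine scalar iterations are needed before the
   vector version pays off.  */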
4504
4505 int saving_per_viter = (scalar_single_iter_cost * assumed_vf
4506 - vec_inside_cost);
4507 if (saving_per_viter <= 0)
4508 {
4509 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
4510 warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
4511 "vectorization did not happen for a simd loop");
4512
4513 if (dump_enabled_p ())
4514 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4515 "cost model: the vector iteration cost = %d "
4516 "divided by the scalar iteration cost = %d "
4517 "is greater than or equal to the vectorization factor = %d"
4518 ".\n",
4519 vec_inside_cost, scalar_single_iter_cost, assumed_vf);
4520 *ret_min_profitable_niters = -1;
4521 *ret_min_profitable_estimate = -1;
4522 return;
4523 }
4524
4525 /* ??? The "if" arm is written to handle all cases; see below for what
4526 we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
4527 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4528 {
4529 /* Rewriting the condition above in terms of the number of
4530 vector iterations (vniters) rather than the number of
4531 scalar iterations (niters) gives:
4532
4533 SIC * (vniters * VF + NPEEL) + SOC > VIC * vniters + VOC
4534
4535 <==> vniters * (SIC * VF - VIC) > VOC - SIC * NPEEL - SOC
4536
4537 For integer N, X and Y when X > 0:
4538
4539 N * X > Y <==> N >= (Y /[floor] X) + 1. */
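/* E.g. Y = 25, X = 10: N >= 25 / 10 + 1 = 3, and indeed 3 * 10 > 25
   while 2 * 10 is not.  */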
4540 int outside_overhead = (vec_outside_cost
4541 - scalar_single_iter_cost * peel_iters_prologue
4542 - scalar_single_iter_cost * peel_iters_epilogue
4543 - scalar_outside_cost);
4544 /* We're only interested in cases that require at least one
4545 vector iteration. */
4546 int min_vec_niters = 1;
4547 if (outside_overhead > 0)
4548 min_vec_niters = outside_overhead / saving_per_viter + 1;
4549
4550 if (dump_enabled_p ())
4551 dump_printf (MSG_NOTE, " Minimum number of vector iterations: %d\n",
4552 min_vec_niters);
4553
4554 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4555 {
4556 /* Now that we know the minimum number of vector iterations,
4557 find the minimum niters for which the scalar cost is larger:
4558
4559 SIC * niters > VIC * vniters + VOC - SOC
4560
4561 We know that the minimum niters is no more than
4562 vniters * VF + NPEEL, but it might be (and often is) less
4563 than that if a partial vector iteration is cheaper than the
4564 equivalent scalar code. */
4565 int threshold = (vec_inside_cost * min_vec_niters
4566 + vec_outside_cost
4567 - scalar_outside_cost);
4568 if (threshold <= 0)
4569 min_profitable_iters = 1;
4570 else
4571 min_profitable_iters = threshold / scalar_single_iter_cost + 1;
4572 }
4573 else
4574 /* Convert the number of vector iterations into a number of
4575 scalar iterations. */
4576 min_profitable_iters = (min_vec_niters * assumed_vf
4577 + peel_iters_prologue
4578 + peel_iters_epilogue);
4579 }
4580 else
4581 {
4582 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
4583 * assumed_vf
4584 - vec_inside_cost * peel_iters_prologue
4585 - vec_inside_cost * peel_iters_epilogue);
4586 if (min_profitable_iters <= 0)
4587 min_profitable_iters = 0;
4588 else
4589 {
4590 min_profitable_iters /= saving_per_viter;
4591
4592 if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
4593 <= (((int) vec_inside_cost * min_profitable_iters)
4594 + (((int) vec_outside_cost - scalar_outside_cost)
4595 * assumed_vf)))
4596 min_profitable_iters++;
4597 }
4598 }
4599
4600 if (dump_enabled_p ())
4601 dump_printf (MSG_NOTE,
4602 " Calculated minimum iters for profitability: %d\n",
4603 min_profitable_iters);
4604
4605 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
4606 && min_profitable_iters < (assumed_vf + peel_iters_prologue))
4607 /* We want the vectorized loop to execute at least once. */
4608 min_profitable_iters = assumed_vf + peel_iters_prologue;
4609 else if (min_profitable_iters < peel_iters_prologue)
4610 /* For LOOP_VINFO_USING_PARTIAL_VECTORS_P, we need to ensure the
4611 vectorized loop executes at least once. */
4612 min_profitable_iters = peel_iters_prologue;
4613
4614 if (dump_enabled_p ())
4615 dump_printf_loc (MSG_NOTE, vect_location,
4616 " Runtime profitability threshold = %d\n",
4617 min_profitable_iters);
4618
4619 *ret_min_profitable_niters = min_profitable_iters;
4620
4621 /* Calculate number of iterations required to make the vector version
4622 profitable, relative to the loop bodies only.
4623
4624 The cost of the non-vectorized variant is SIC * niters and it must
4625 win over the vector variant on the expected loop trip count, i.e.
4626 SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC */
4627
4628 if (vec_outside_cost <= 0)
4629 min_profitable_estimate = 0;
4630 /* ??? This "else if" arm is written to handle all cases; see below for
4631 what we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
4632 else if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4633 {
4634 /* This is a repeat of the code above, but with + SOC rather
4635 than - SOC. */
4636 int outside_overhead = (vec_outside_cost
4637 - scalar_single_iter_cost * peel_iters_prologue
4638 - scalar_single_iter_cost * peel_iters_epilogue
4639 + scalar_outside_cost);
4640 int min_vec_niters = 1;
4641 if (outside_overhead > 0)
4642 min_vec_niters = outside_overhead / saving_per_viter + 1;
4643
4644 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4645 {
4646 int threshold = (vec_inside_cost * min_vec_niters
4647 + vec_outside_cost
4648 + scalar_outside_cost);
4649 min_profitable_estimate = threshold / scalar_single_iter_cost + 1;
4650 }
4651 else
4652 min_profitable_estimate = (min_vec_niters * assumed_vf
4653 + peel_iters_prologue
4654 + peel_iters_epilogue);
4655 }
4656 else
4657 {
4658 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
4659 * assumed_vf
4660 - vec_inside_cost * peel_iters_prologue
4661 - vec_inside_cost * peel_iters_epilogue)
4662 / ((scalar_single_iter_cost * assumed_vf)
4663 - vec_inside_cost);
4664 }
4665 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
4666 if (dump_enabled_p ())
4667 dump_printf_loc (MSG_NOTE, vect_location,
4668 " Static estimate profitability threshold = %d\n",
4669 min_profitable_estimate);
4670
4671 *ret_min_profitable_estimate = min_profitable_estimate;
4672 }
4673
4674 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
4675 vector elements (not bits) for a vector with NELT elements. */
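/* For example, OFFSET 2 and NELT 8 encode the selector
   { 2, 3, 4, 5, 6, 7, 8, 9 }; elements 8 and 9 select from the second
   vec_perm input, which callers typically use to shift in zero or
   don't-care values.  */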
4676 static void
4677 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
4678 vec_perm_builder *sel)
4679 {
4680 /* The encoding is a single stepped pattern. Any wrap-around is handled
4681 by vec_perm_indices. */
4682 sel->new_vector (nelt, 1, 3);
4683 for (unsigned int i = 0; i < 3; i++)
4684 sel->quick_push (i + offset);
4685 }
4686
4687 /* Checks whether the target supports whole-vector shifts for vectors of mode
4688 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
4689 it supports vec_perm_const with masks for all necessary shift amounts. */
4690 static bool
4691 have_whole_vector_shift (machine_mode mode)
4692 {
4693 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
4694 return true;
4695
4696 /* Variable-length vectors should be handled via the optab. */
4697 unsigned int nelt;
4698 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
4699 return false;
4700
4701 vec_perm_builder sel;
4702 vec_perm_indices indices;
4703 for (unsigned int i = nelt / 2; i >= 1; i /= 2)
4704 {
4705 calc_vec_perm_mask_for_shift (i, nelt, &sel);
4706 indices.new_vector (sel, 2, nelt);
4707 if (!can_vec_perm_const_p (mode, mode, indices, false))
4708 return false;
4709 }
4710 return true;
4711 }
4712
4713 /* Return true if (a) STMT_INFO is a DOT_PROD_EXPR reduction whose
4714 multiplication operands have differing signs and (b) we intend
4715 to emulate the operation using a series of signed DOT_PROD_EXPRs.
4716 See vect_emulate_mixed_dot_prod for the actual sequence used. */
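/* A typical source of such a statement is

     int acc = 0;
     for (int i = 0; i < n; ++i)
       acc += s[i] * u[i];

   with s signed char and u unsigned char, on a target that lacks a
   mixed-sign dot-product instruction.  */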
4717
4718 static bool
4719 vect_is_emulated_mixed_dot_prod (loop_vec_info loop_vinfo,
4720 stmt_vec_info stmt_info)
4721 {
4722 gassign *assign = dyn_cast<gassign *> (stmt_info->stmt);
4723 if (!assign || gimple_assign_rhs_code (assign) != DOT_PROD_EXPR)
4724 return false;
4725
4726 tree rhs1 = gimple_assign_rhs1 (assign);
4727 tree rhs2 = gimple_assign_rhs2 (assign);
4728 if (TYPE_SIGN (TREE_TYPE (rhs1)) == TYPE_SIGN (TREE_TYPE (rhs2)))
4729 return false;
4730
4731 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
4732 gcc_assert (reduc_info->is_reduc_info);
4733 return !directly_supported_p (DOT_PROD_EXPR,
4734 STMT_VINFO_REDUC_VECTYPE_IN (reduc_info),
4735 optab_vector_mixed_sign);
4736 }
4737
4738 /* TODO: There is a close dependency between vect_model_*_cost and
4739 vectorizable_* functions. Design better to avoid maintenance issues. */
4740
4741 /* Function vect_model_reduction_cost.
4742
4743 Models cost for a reduction operation, including the vector ops
4744 generated within the strip-mine loop in some cases, the initial
4745 definition before the loop, and the epilogue code that must be generated. */
4746
4747 static void
4748 vect_model_reduction_cost (loop_vec_info loop_vinfo,
4749 stmt_vec_info stmt_info, internal_fn reduc_fn,
4750 vect_reduction_type reduction_type,
4751 int ncopies, stmt_vector_for_cost *cost_vec)
4752 {
4753 int prologue_cost = 0, epilogue_cost = 0, inside_cost = 0;
4754 tree vectype;
4755 machine_mode mode;
4756 class loop *loop = NULL;
4757
4758 if (loop_vinfo)
4759 loop = LOOP_VINFO_LOOP (loop_vinfo);
4760
4761 /* Condition reductions generate two reductions in the loop. */
4762 if (reduction_type == COND_REDUCTION)
4763 ncopies *= 2;
4764
4765 vectype = STMT_VINFO_VECTYPE (stmt_info);
4766 mode = TYPE_MODE (vectype);
4767 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
4768
4769 gimple_match_op op;
4770 if (!gimple_extract_op (orig_stmt_info->stmt, &op))
4771 gcc_unreachable ();
4772
4773 bool emulated_mixed_dot_prod
4774 = vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info);
4775 if (reduction_type == EXTRACT_LAST_REDUCTION)
4776 /* No extra instructions are needed in the prologue. The loop body
4777 operations are costed in vectorizable_condition. */
4778 inside_cost = 0;
4779 else if (reduction_type == FOLD_LEFT_REDUCTION)
4780 {
4781 /* No extra instructions needed in the prologue. */
4782 prologue_cost = 0;
4783
4784 if (reduc_fn != IFN_LAST)
4785 /* Count one reduction-like operation per vector. */
4786 inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
4787 stmt_info, 0, vect_body);
4788 else
4789 {
4790 /* Use NELEMENTS extracts and NELEMENTS scalar ops. */
4791 unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
4792 inside_cost = record_stmt_cost (cost_vec, nelements,
4793 vec_to_scalar, stmt_info, 0,
4794 vect_body);
4795 inside_cost += record_stmt_cost (cost_vec, nelements,
4796 scalar_stmt, stmt_info, 0,
4797 vect_body);
4798 }
4799 }
4800 else
4801 {
4802 /* Add in the cost of the initial definitions. */
4803 int prologue_stmts;
4804 if (reduction_type == COND_REDUCTION)
4805 /* For cond reductions we have four vectors: initial index, step,
4806 initial result of the data reduction, initial value of the index
4807 reduction. */
4808 prologue_stmts = 4;
4809 else if (emulated_mixed_dot_prod)
4810 /* We need the initial reduction value and two invariants:
4811 one that contains the minimum signed value and one that
4812 contains half of its negative. */
4813 prologue_stmts = 3;
4814 else
4815 prologue_stmts = 1;
4816 prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
4817 scalar_to_vec, stmt_info, 0,
4818 vect_prologue);
4819 }
4820
4821 /* Determine cost of epilogue code.
4822
4823 We have a reduction operator that will reduce the vector in one statement.
4824 Also requires scalar extract. */
4825
4826 if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
4827 {
4828 if (reduc_fn != IFN_LAST)
4829 {
4830 if (reduction_type == COND_REDUCTION)
4831 {
4832 /* An EQ stmt and a COND_EXPR stmt. */
4833 epilogue_cost += record_stmt_cost (cost_vec, 2,
4834 vector_stmt, stmt_info, 0,
4835 vect_epilogue);
4836 /* Reduction of the max index and a reduction of the found
4837 values. */
4838 epilogue_cost += record_stmt_cost (cost_vec, 2,
4839 vec_to_scalar, stmt_info, 0,
4840 vect_epilogue);
4841 /* A broadcast of the max value. */
4842 epilogue_cost += record_stmt_cost (cost_vec, 1,
4843 scalar_to_vec, stmt_info, 0,
4844 vect_epilogue);
4845 }
4846 else
4847 {
4848 epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
4849 stmt_info, 0, vect_epilogue);
4850 epilogue_cost += record_stmt_cost (cost_vec, 1,
4851 vec_to_scalar, stmt_info, 0,
4852 vect_epilogue);
4853 }
4854 }
4855 else if (reduction_type == COND_REDUCTION)
4856 {
4857 unsigned estimated_nunits = vect_nunits_for_cost (vectype);
4858 /* Extraction of scalar elements. */
4859 epilogue_cost += record_stmt_cost (cost_vec,
4860 2 * estimated_nunits,
4861 vec_to_scalar, stmt_info, 0,
4862 vect_epilogue);
4863 /* Scalar max reductions via COND_EXPR / MAX_EXPR. */
4864 epilogue_cost += record_stmt_cost (cost_vec,
4865 2 * estimated_nunits - 3,
4866 scalar_stmt, stmt_info, 0,
4867 vect_epilogue);
4868 }
4869 else if (reduction_type == EXTRACT_LAST_REDUCTION
4870 || reduction_type == FOLD_LEFT_REDUCTION)
4871 /* No extra instructions are needed in the epilogue. */
4872 ;
4873 else
4874 {
4875 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4876 tree bitsize = TYPE_SIZE (op.type);
4877 int element_bitsize = tree_to_uhwi (bitsize);
4878 int nelements = vec_size_in_bits / element_bitsize;
4879
4880 if (op.code == COND_EXPR)
4881 op.code = MAX_EXPR;
4882
4883 /* We have a whole vector shift available. */
4884 if (VECTOR_MODE_P (mode)
4885 && directly_supported_p (op.code, vectype)
4886 && have_whole_vector_shift (mode))
4887 {
4888 /* Final reduction via vector shifts and the reduction operator.
4889 Also requires scalar extract. */
4890 epilogue_cost += record_stmt_cost (cost_vec,
4891 exact_log2 (nelements) * 2,
4892 vector_stmt, stmt_info, 0,
4893 vect_epilogue);
4894 epilogue_cost += record_stmt_cost (cost_vec, 1,
4895 vec_to_scalar, stmt_info, 0,
4896 vect_epilogue);
4897 }
4898 else
4899 /* Use extracts and reduction op for final reduction. For N
4900 elements, we have N extracts and N-1 reduction ops. */
4901 epilogue_cost += record_stmt_cost (cost_vec,
4902 nelements + nelements - 1,
4903 vector_stmt, stmt_info, 0,
4904 vect_epilogue);
4905 }
4906 }
4907
4908 if (dump_enabled_p ())
4909 dump_printf (MSG_NOTE,
4910 "vect_model_reduction_cost: inside_cost = %d, "
4911 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
4912 prologue_cost, epilogue_cost);
4913 }
4914
4915 /* SEQ is a sequence of instructions that initialize the reduction
4916 described by REDUC_INFO. Emit them in the appropriate place. */
4917
4918 static void
4919 vect_emit_reduction_init_stmts (loop_vec_info loop_vinfo,
4920 stmt_vec_info reduc_info, gimple *seq)
4921 {
4922 if (reduc_info->reused_accumulator)
4923 {
4924 /* When reusing an accumulator from the main loop, we only need
4925 initialization instructions if the main loop can be skipped.
4926 In that case, emit the initialization instructions at the end
4927 of the guard block that does the skip. */
4928 edge skip_edge = loop_vinfo->skip_main_loop_edge;
4929 gcc_assert (skip_edge);
4930 gimple_stmt_iterator gsi = gsi_last_bb (skip_edge->src);
4931 gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
4932 }
4933 else
4934 {
4935 /* The normal case: emit the initialization instructions on the
4936 preheader edge. */
4937 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4938 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), seq);
4939 }
4940 }
4941
4942 /* Function get_initial_def_for_reduction
4943
4944 Input:
4945 REDUC_INFO - the info_for_reduction
4946 INIT_VAL - the initial value of the reduction variable
4947 NEUTRAL_OP - a value that has no effect on the reduction, as per
4948 neutral_op_for_reduction
4949
4950 Output:
4951 Return a vector variable, initialized according to the reduction that
4952 REDUC_INFO describes. This vector will be used as the initial value
4953 of the vector of partial results.
4954
4955 The value we need is a vector in which element 0 has value INIT_VAL
4956 and every other element has value NEUTRAL_OP. */
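   /* For example (hypothetical values), a PLUS reduction of ints with
      INIT_VAL 10 and a 4-element vector type yields { 10, 0, 0, 0 },
      since 0 is the neutral value for PLUS.  */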
4957
4958 static tree
4959 get_initial_def_for_reduction (loop_vec_info loop_vinfo,
4960 stmt_vec_info reduc_info,
4961 tree init_val, tree neutral_op)
4962 {
4963 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4964 tree scalar_type = TREE_TYPE (init_val);
4965 tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
4966 tree init_def;
4967 gimple_seq stmts = NULL;
4968
4969 gcc_assert (vectype);
4970
4971 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
4972 || SCALAR_FLOAT_TYPE_P (scalar_type));
4973
4974 gcc_assert (nested_in_vect_loop_p (loop, reduc_info)
4975 || loop == (gimple_bb (reduc_info->stmt))->loop_father);
4976
4977 if (operand_equal_p (init_val, neutral_op))
4978 {
4979 /* If both elements are equal then the vector described above is
4980 just a splat. */
4981 neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op);
4982 init_def = gimple_build_vector_from_val (&stmts, vectype, neutral_op);
4983 }
4984 else
4985 {
4986 neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op);
4987 init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
4988 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
4989 {
4990 /* Construct a splat of NEUTRAL_OP and insert INIT_VAL into
4991 element 0. */
4992 init_def = gimple_build_vector_from_val (&stmts, vectype,
4993 neutral_op);
4994 init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
4995 vectype, init_def, init_val);
4996 }
4997 else
4998 {
4999 /* Build {INIT_VAL, NEUTRAL_OP, NEUTRAL_OP, ...}. */
5000 tree_vector_builder elts (vectype, 1, 2);
5001 elts.quick_push (init_val);
5002 elts.quick_push (neutral_op);
5003 init_def = gimple_build_vector (&stmts, &elts);
5004 }
5005 }
5006
5007 if (stmts)
5008 vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, stmts);
5009 return init_def;
5010 }
5011
5012 /* Get at the initial defs for the reduction PHIs for REDUC_INFO,
5013 which performs a reduction involving GROUP_SIZE scalar statements.
5014 NUMBER_OF_VECTORS is the number of vector defs to create. If NEUTRAL_OP
5015 is nonnull, introducing extra elements of that value will not change the
5016 result. */
5017
5018 static void
5019 get_initial_defs_for_reduction (loop_vec_info loop_vinfo,
5020 stmt_vec_info reduc_info,
5021 vec<tree> *vec_oprnds,
5022 unsigned int number_of_vectors,
5023 unsigned int group_size, tree neutral_op)
5024 {
5025 vec<tree> &initial_values = reduc_info->reduc_initial_values;
5026 unsigned HOST_WIDE_INT nunits;
5027 unsigned j, number_of_places_left_in_vector;
5028 tree vector_type = STMT_VINFO_VECTYPE (reduc_info);
5029 unsigned int i;
5030
5031 gcc_assert (group_size == initial_values.length () || neutral_op);
5032
5033 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
5034 created vectors. It is greater than 1 if unrolling is performed.
5035
5036 For example, we have two scalar operands, s1 and s2 (e.g., group of
5037 strided accesses of size two), while NUNITS is four (i.e., four scalars
5038 of this type can be packed in a vector). The output vector will contain
5039 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
5040 will be 2).
5041
5042 If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
5043 vectors containing the operands.
5044
5045 For example, NUNITS is four as before, and the group size is 8
5046 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
5047 {s5, s6, s7, s8}. */
5048
5049 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
5050 nunits = group_size;
5051
5052 number_of_places_left_in_vector = nunits;
5053 bool constant_p = true;
5054 tree_vector_builder elts (vector_type, nunits, 1);
5055 elts.quick_grow (nunits);
5056 gimple_seq ctor_seq = NULL;
5057 for (j = 0; j < nunits * number_of_vectors; ++j)
5058 {
5059 tree op;
5060 i = j % group_size;
5061
5062 /* Get the def before the loop. In a reduction chain we have only
5063 one initial value; otherwise we have as many as there are PHIs in the group. */
5064 if (i >= initial_values.length () || (j > i && neutral_op))
5065 op = neutral_op;
5066 else
5067 op = initial_values[i];
5068
5069 /* Create 'vect_ = {op0,op1,...,opn}'. */
5070 number_of_places_left_in_vector--;
5071 elts[nunits - number_of_places_left_in_vector - 1] = op;
5072 if (!CONSTANT_CLASS_P (op))
5073 constant_p = false;
5074
5075 if (number_of_places_left_in_vector == 0)
5076 {
5077 tree init;
5078 if (constant_p && !neutral_op
5079 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
5080 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
5081 /* Build the vector directly from ELTS. */
5082 init = gimple_build_vector (&ctor_seq, &elts);
5083 else if (neutral_op)
5084 {
5085 /* Build a vector of the neutral value and shift the
5086 other elements into place. */
5087 init = gimple_build_vector_from_val (&ctor_seq, vector_type,
5088 neutral_op);
5089 int k = nunits;
5090 while (k > 0 && elts[k - 1] == neutral_op)
5091 k -= 1;
5092 while (k > 0)
5093 {
5094 k -= 1;
5095 init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
5096 vector_type, init, elts[k]);
5097 }
5098 }
5099 else
5100 {
5101 /* First time round, duplicate ELTS to fill the
5102 required number of vectors. */
5103 duplicate_and_interleave (loop_vinfo, &ctor_seq, vector_type,
5104 elts, number_of_vectors, *vec_oprnds);
5105 break;
5106 }
5107 vec_oprnds->quick_push (init);
5108
5109 number_of_places_left_in_vector = nunits;
5110 elts.new_vector (vector_type, nunits, 1);
5111 elts.quick_grow (nunits);
5112 constant_p = true;
5113 }
5114 }
5115 if (ctor_seq != NULL)
5116 vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, ctor_seq);
5117 }
5118
5119 /* For a statement STMT_INFO taking part in a reduction operation return
5120 the stmt_vec_info the meta information is stored on. */
5121
5122 stmt_vec_info
5123 info_for_reduction (vec_info *vinfo, stmt_vec_info stmt_info)
5124 {
5125 stmt_info = vect_orig_stmt (stmt_info);
5126 gcc_assert (STMT_VINFO_REDUC_DEF (stmt_info));
5127 if (!is_a <gphi *> (stmt_info->stmt)
5128 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
5129 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
5130 gphi *phi = as_a <gphi *> (stmt_info->stmt);
5131 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
5132 {
5133 if (gimple_phi_num_args (phi) == 1)
5134 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
5135 }
5136 else if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
5137 {
5138 stmt_vec_info info = vinfo->lookup_def (vect_phi_initial_value (phi));
5139 if (info && STMT_VINFO_DEF_TYPE (info) == vect_double_reduction_def)
5140 stmt_info = info;
5141 }
5142 return stmt_info;
5143 }
5144
5145 /* See if LOOP_VINFO is an epilogue loop whose main loop had a reduction that
5146 REDUC_INFO can build on. Adjust REDUC_INFO and return true if so, otherwise
5147 return false. */
5148
5149 static bool
5150 vect_find_reusable_accumulator (loop_vec_info loop_vinfo,
5151 stmt_vec_info reduc_info)
5152 {
5153 loop_vec_info main_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
5154 if (!main_loop_vinfo)
5155 return false;
5156
5157 if (STMT_VINFO_REDUC_TYPE (reduc_info) != TREE_CODE_REDUCTION)
5158 return false;
5159
5160 unsigned int num_phis = reduc_info->reduc_initial_values.length ();
5161 auto_vec<tree, 16> main_loop_results (num_phis);
5162 auto_vec<tree, 16> initial_values (num_phis);
5163 if (edge main_loop_edge = loop_vinfo->main_loop_edge)
5164 {
5165 /* The epilogue loop can be entered either from the main loop or
5166 from an earlier guard block. */
5167 edge skip_edge = loop_vinfo->skip_main_loop_edge;
5168 for (tree incoming_value : reduc_info->reduc_initial_values)
5169 {
5170 /* Look for:
5171
5172 INCOMING_VALUE = phi<MAIN_LOOP_RESULT(main loop),
5173 INITIAL_VALUE(guard block)>. */
5174 gcc_assert (TREE_CODE (incoming_value) == SSA_NAME);
5175
5176 gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (incoming_value));
5177 gcc_assert (gimple_bb (phi) == main_loop_edge->dest);
5178
5179 tree from_main_loop = PHI_ARG_DEF_FROM_EDGE (phi, main_loop_edge);
5180 tree from_skip = PHI_ARG_DEF_FROM_EDGE (phi, skip_edge);
5181
5182 main_loop_results.quick_push (from_main_loop);
5183 initial_values.quick_push (from_skip);
5184 }
5185 }
5186 else
5187 /* The main loop dominates the epilogue loop. */
5188 main_loop_results.splice (reduc_info->reduc_initial_values);
5189
5190 /* See if the main loop has the kind of accumulator we need. */
5191 vect_reusable_accumulator *accumulator
5192 = main_loop_vinfo->reusable_accumulators.get (main_loop_results[0]);
5193 if (!accumulator
5194 || num_phis != accumulator->reduc_info->reduc_scalar_results.length ()
5195 || !std::equal (main_loop_results.begin (), main_loop_results.end (),
5196 accumulator->reduc_info->reduc_scalar_results.begin ()))
5197 return false;
5198
5199 /* Handle the case where we can reduce wider vectors to narrower ones. */
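   /* For example (a hypothetical case), if the main loop accumulated in a
      V8SI vector and this epilogue loop uses V4SI, M is 2 and a single
      halving step suffices; the loop below verifies that a V4SI reduction
      operation and a V8SI -> V4SI vec_extract are both available.  */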
5200 tree vectype = STMT_VINFO_VECTYPE (reduc_info);
5201 tree old_vectype = TREE_TYPE (accumulator->reduc_input);
5202 unsigned HOST_WIDE_INT m;
5203 if (!constant_multiple_p (TYPE_VECTOR_SUBPARTS (old_vectype),
5204 TYPE_VECTOR_SUBPARTS (vectype), &m))
5205 return false;
5206 /* Check the intermediate vector types and operations are available. */
5207 tree prev_vectype = old_vectype;
5208 poly_uint64 intermediate_nunits = TYPE_VECTOR_SUBPARTS (old_vectype);
5209 while (known_gt (intermediate_nunits, TYPE_VECTOR_SUBPARTS (vectype)))
5210 {
5211 intermediate_nunits = exact_div (intermediate_nunits, 2);
5212 tree intermediate_vectype = get_related_vectype_for_scalar_type
5213 (TYPE_MODE (vectype), TREE_TYPE (vectype), intermediate_nunits);
5214 if (!intermediate_vectype
5215 || !directly_supported_p (STMT_VINFO_REDUC_CODE (reduc_info),
5216 intermediate_vectype)
5217 || !can_vec_extract (TYPE_MODE (prev_vectype),
5218 TYPE_MODE (intermediate_vectype)))
5219 return false;
5220 prev_vectype = intermediate_vectype;
5221 }
5222
5223 /* Non-SLP reductions might apply an adjustment after the reduction
5224 operation, in order to simplify the initialization of the accumulator.
5225 If the epilogue loop carries on from where the main loop left off,
5226 it should apply the same adjustment to the final reduction result.
5227
5228 If the epilogue loop can also be entered directly (rather than via
5229 the main loop), we need to be able to handle that case in the same way,
5230 with the same adjustment. (In principle we could add a PHI node
5231 to select the correct adjustment, but in practice that shouldn't be
5232 necessary.) */
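   /* For instance (values hypothetical), a sum reduction with scalar
      initial value 10 may have used a zero-initialized accumulator in the
      main loop together with an epilogue adjustment of 10; below we keep
      that adjustment and, for the direct-entry path, replace the recorded
      initial value with the neutral value 0.  */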
5233 tree main_adjustment
5234 = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (accumulator->reduc_info);
5235 if (loop_vinfo->main_loop_edge && main_adjustment)
5236 {
5237 gcc_assert (num_phis == 1);
5238 tree initial_value = initial_values[0];
5239 /* Check that we can use INITIAL_VALUE as the adjustment and
5240 initialize the accumulator with a neutral value instead. */
5241 if (!operand_equal_p (initial_value, main_adjustment))
5242 return false;
5243 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
5244 initial_values[0] = neutral_op_for_reduction (TREE_TYPE (initial_value),
5245 code, initial_value);
5246 }
5247 STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info) = main_adjustment;
5248 reduc_info->reduc_initial_values.truncate (0);
5249 reduc_info->reduc_initial_values.splice (initial_values);
5250 reduc_info->reused_accumulator = accumulator;
5251 return true;
5252 }
5253
5254 /* Reduce the vector VEC_DEF down to VECTYPE with reduction operation
5255 CODE, appending any new statements to SEQ. Returns a vector def of VECTYPE. */
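   /* For example (hypothetical types), reducing a V8SI VEC_DEF to a V4SI
      VECTYPE with PLUS extracts the low and high V4SI halves (directly,
      or via an integer-mode punning fallback) and adds them, repeating
      until the requested number of elements is reached.  */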
5256
5257 static tree
5258 vect_create_partial_epilog (tree vec_def, tree vectype, code_helper code,
5259 gimple_seq *seq)
5260 {
5261 unsigned nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (vec_def)).to_constant ();
5262 unsigned nunits1 = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5263 tree stype = TREE_TYPE (vectype);
5264 tree new_temp = vec_def;
5265 while (nunits > nunits1)
5266 {
5267 nunits /= 2;
5268 tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5269 stype, nunits);
5270 unsigned int bitsize = tree_to_uhwi (TYPE_SIZE (vectype1));
5271
5272 /* The target has to make sure we support lowpart/highpart
5273 extraction, either via direct vector extract or through
5274 an integer mode punning. */
5275 tree dst1, dst2;
5276 gimple *epilog_stmt;
5277 if (convert_optab_handler (vec_extract_optab,
5278 TYPE_MODE (TREE_TYPE (new_temp)),
5279 TYPE_MODE (vectype1))
5280 != CODE_FOR_nothing)
5281 {
5282 /* Extract sub-vectors directly once vec_extract becomes
5283 a conversion optab. */
5284 dst1 = make_ssa_name (vectype1);
5285 epilog_stmt
5286 = gimple_build_assign (dst1, BIT_FIELD_REF,
5287 build3 (BIT_FIELD_REF, vectype1,
5288 new_temp, TYPE_SIZE (vectype1),
5289 bitsize_int (0)));
5290 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5291 dst2 = make_ssa_name (vectype1);
5292 epilog_stmt
5293 = gimple_build_assign (dst2, BIT_FIELD_REF,
5294 build3 (BIT_FIELD_REF, vectype1,
5295 new_temp, TYPE_SIZE (vectype1),
5296 bitsize_int (bitsize)));
5297 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5298 }
5299 else
5300 {
5301 /* Extract via punning to appropriately sized integer mode
5302 vector. */
5303 tree eltype = build_nonstandard_integer_type (bitsize, 1);
5304 tree etype = build_vector_type (eltype, 2);
5305 gcc_assert (convert_optab_handler (vec_extract_optab,
5306 TYPE_MODE (etype),
5307 TYPE_MODE (eltype))
5308 != CODE_FOR_nothing);
5309 tree tem = make_ssa_name (etype);
5310 epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5311 build1 (VIEW_CONVERT_EXPR,
5312 etype, new_temp));
5313 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5314 new_temp = tem;
5315 tem = make_ssa_name (eltype);
5316 epilog_stmt
5317 = gimple_build_assign (tem, BIT_FIELD_REF,
5318 build3 (BIT_FIELD_REF, eltype,
5319 new_temp, TYPE_SIZE (eltype),
5320 bitsize_int (0)));
5321 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5322 dst1 = make_ssa_name (vectype1);
5323 epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5324 build1 (VIEW_CONVERT_EXPR,
5325 vectype1, tem));
5326 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5327 tem = make_ssa_name (eltype);
5328 epilog_stmt
5329 = gimple_build_assign (tem, BIT_FIELD_REF,
5330 build3 (BIT_FIELD_REF, eltype,
5331 new_temp, TYPE_SIZE (eltype),
5332 bitsize_int (bitsize)));
5333 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5334 dst2 = make_ssa_name (vectype1);
5335 epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5336 build1 (VIEW_CONVERT_EXPR,
5337 vectype1, tem));
5338 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5339 }
5340
5341 new_temp = gimple_build (seq, code, vectype1, dst1, dst2);
5342 }
5343
5344 return new_temp;
5345 }
5346
5347 /* Function vect_create_epilog_for_reduction
5348
5349 Create code at the loop-epilog to finalize the result of a reduction
5350 computation.
5351
5352 STMT_INFO is the scalar reduction stmt that is being vectorized.
5353 SLP_NODE is an SLP node containing a group of reduction statements. The
5354 first one in this group is STMT_INFO.
5355 SLP_NODE_INSTANCE is the SLP node instance containing SLP_NODE
5356 REDUC_INDEX says which rhs operand of the STMT_INFO is the reduction phi
5357 (counting from 0)
5358
5359 This function:
5360 1. Completes the reduction def-use cycles.
5361 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
5362 by calling the function specified by REDUC_FN if available, or by
5363 other means (whole-vector shifts or a scalar loop).
5364 The function also creates a new phi node at the loop exit to preserve
5365 loop-closed form, as illustrated below.
5366
5367 The flow at the entry to this function:
5368
5369 loop:
5370 vec_def = phi <vec_init, null> # REDUCTION_PHI
5371 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
5372 s_loop = scalar_stmt # (scalar) STMT_INFO
5373 loop_exit:
5374 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5375 use <s_out0>
5376 use <s_out0>
5377
5378 The above is transformed by this function into:
5379
5380 loop:
5381 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
5382 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
5383 s_loop = scalar_stmt # (scalar) STMT_INFO
5384 loop_exit:
5385 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5386 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5387 v_out2 = reduce <v_out1>
5388 s_out3 = extract_field <v_out2, 0>
5389 s_out4 = adjust_result <s_out3>
5390 use <s_out4>
5391 use <s_out4>
5392 */
5393
5394 static void
5395 vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
5396 stmt_vec_info stmt_info,
5397 slp_tree slp_node,
5398 slp_instance slp_node_instance)
5399 {
5400 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
5401 gcc_assert (reduc_info->is_reduc_info);
5402 /* For double reductions we need to get at the inner loop reduction
5403 stmt which has the meta info attached. Our stmt_info is that of the
5404 loop-closed PHI of the inner loop which we remember as
5405 def for the reduction PHI generation. */
5406 bool double_reduc = false;
5407 stmt_vec_info rdef_info = stmt_info;
5408 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
5409 {
5410 gcc_assert (!slp_node);
5411 double_reduc = true;
5412 stmt_info = loop_vinfo->lookup_def (gimple_phi_arg_def
5413 (stmt_info->stmt, 0));
5414 stmt_info = vect_stmt_to_vectorize (stmt_info);
5415 }
5416 gphi *reduc_def_stmt
5417 = as_a <gphi *> (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))->stmt);
5418 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
5419 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
5420 tree vectype;
5421 machine_mode mode;
5422 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
5423 basic_block exit_bb;
5424 tree scalar_dest;
5425 tree scalar_type;
5426 gimple *new_phi = NULL, *phi;
5427 gimple_stmt_iterator exit_gsi;
5428 tree new_temp = NULL_TREE, new_name, new_scalar_dest;
5429 gimple *epilog_stmt = NULL;
5430 gimple *exit_phi;
5431 tree bitsize;
5432 tree def;
5433 tree orig_name, scalar_result;
5434 imm_use_iterator imm_iter, phi_imm_iter;
5435 use_operand_p use_p, phi_use_p;
5436 gimple *use_stmt;
5437 auto_vec<tree> reduc_inputs;
5438 int j, i;
5439 vec<tree> &scalar_results = reduc_info->reduc_scalar_results;
5440 unsigned int group_size = 1, k;
5441 auto_vec<gimple *> phis;
5442 /* SLP reduction without reduction chain, e.g.,
5443 # a1 = phi <a2, a0>
5444 # b1 = phi <b2, b0>
5445 a2 = operation (a1)
5446 b2 = operation (b1) */
5447 bool slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info));
5448 bool direct_slp_reduc;
5449 tree induction_index = NULL_TREE;
5450
5451 if (slp_node)
5452 group_size = SLP_TREE_LANES (slp_node);
5453
5454 if (nested_in_vect_loop_p (loop, stmt_info))
5455 {
5456 outer_loop = loop;
5457 loop = loop->inner;
5458 gcc_assert (!slp_node && double_reduc);
5459 }
5460
5461 vectype = STMT_VINFO_REDUC_VECTYPE (reduc_info);
5462 gcc_assert (vectype);
5463 mode = TYPE_MODE (vectype);
5464
5465 tree induc_val = NULL_TREE;
5466 tree adjustment_def = NULL;
5467 if (slp_node)
5468 ;
5469 else
5470 {
5471 /* Optimize: for induction condition reduction, if we can't use zero
5472 for induc_val, use initial_def. */
5473 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5474 induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
5475 else if (double_reduc)
5476 ;
5477 else
5478 adjustment_def = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info);
5479 }
5480
5481 stmt_vec_info single_live_out_stmt[] = { stmt_info };
5482 array_slice<const stmt_vec_info> live_out_stmts = single_live_out_stmt;
5483 if (slp_reduc)
5484 /* All statements produce live-out values. */
5485 live_out_stmts = SLP_TREE_SCALAR_STMTS (slp_node);
5486 else if (slp_node)
5487 {
5488 /* The last statement in the reduction chain produces the live-out
5489 value. Note SLP optimization can shuffle scalar stmts to
5490 optimize permutations so we have to search for the last stmt. */
5491 for (k = 0; k < group_size; ++k)
5492 if (!REDUC_GROUP_NEXT_ELEMENT (SLP_TREE_SCALAR_STMTS (slp_node)[k]))
5493 {
5494 single_live_out_stmt[0] = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5495 break;
5496 }
5497 }
5498
5499 unsigned vec_num;
5500 int ncopies;
5501 if (slp_node)
5502 {
5503 vec_num = SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis).length ();
5504 ncopies = 1;
5505 }
5506 else
5507 {
5508 stmt_vec_info reduc_info = loop_vinfo->lookup_stmt (reduc_def_stmt);
5509 vec_num = 1;
5510 ncopies = STMT_VINFO_VEC_STMTS (reduc_info).length ();
5511 }
5512
5513 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
5514 which is updated with the current index of the loop for every match of
5515 the original loop's cond_expr (VEC_STMT). This results in a vector
5516 containing the last time the condition passed for that vector lane.
5517 The first match will be a 1 to allow 0 to be used for non-matching
5518 indexes. If there are no matches at all then the vector will be all
5519 zeroes.
5520
5521 PR92772: This algorithm is broken for architectures that support
5522 masked vectors, but do not provide fold_extract_last. */
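  /* For example (hypothetical values), with four lanes and two vector
     iterations the index induction variable takes the values {1,2,3,4}
     and then {5,6,7,8}; if lane 2 matched only in the first iteration
     and lane 0 only in the second, the final index vector is
     {5, 0, 3, 0}.  */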
5523 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
5524 {
5525 auto_vec<std::pair<tree, bool>, 2> ccompares;
5526 stmt_vec_info cond_info = STMT_VINFO_REDUC_DEF (reduc_info);
5527 cond_info = vect_stmt_to_vectorize (cond_info);
5528 while (cond_info != reduc_info)
5529 {
5530 if (gimple_assign_rhs_code (cond_info->stmt) == COND_EXPR)
5531 {
5532 gimple *vec_stmt = STMT_VINFO_VEC_STMTS (cond_info)[0];
5533 gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
5534 ccompares.safe_push
5535 (std::make_pair (unshare_expr (gimple_assign_rhs1 (vec_stmt)),
5536 STMT_VINFO_REDUC_IDX (cond_info) == 2));
5537 }
5538 cond_info
5539 = loop_vinfo->lookup_def (gimple_op (cond_info->stmt,
5540 1 + STMT_VINFO_REDUC_IDX
5541 (cond_info)));
5542 cond_info = vect_stmt_to_vectorize (cond_info);
5543 }
5544 gcc_assert (ccompares.length () != 0);
5545
5546 tree indx_before_incr, indx_after_incr;
5547 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
5548 int scalar_precision
5549 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
5550 tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
5551 tree cr_index_vector_type = get_related_vectype_for_scalar_type
5552 (TYPE_MODE (vectype), cr_index_scalar_type,
5553 TYPE_VECTOR_SUBPARTS (vectype));
5554
5555 /* First we create a simple vector induction variable which starts
5556 with the values {1,2,3,...} (SERIES_VECT) and increments by the
5557 vector size (STEP). */
5558
5559 /* Create a {1,2,3,...} vector. */
5560 tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
5561
5562 /* Create a vector of the step value. */
5563 tree step = build_int_cst (cr_index_scalar_type, nunits_out);
5564 tree vec_step = build_vector_from_val (cr_index_vector_type, step);
5565
5566 /* Create an induction variable. */
5567 gimple_stmt_iterator incr_gsi;
5568 bool insert_after;
5569 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
5570 create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
5571 insert_after, &indx_before_incr, &indx_after_incr);
5572
5573 /* Next create a new phi node vector (NEW_PHI_TREE) which starts
5574 filled with zeros (VEC_ZERO). */
5575
5576 /* Create a vector of 0s. */
5577 tree zero = build_zero_cst (cr_index_scalar_type);
5578 tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
5579
5580 /* Create a vector phi node. */
5581 tree new_phi_tree = make_ssa_name (cr_index_vector_type);
5582 new_phi = create_phi_node (new_phi_tree, loop->header);
5583 add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
5584 loop_preheader_edge (loop), UNKNOWN_LOCATION);
5585
5586 /* Now take the condition from the loop's original cond_exprs
5587 and produce a new cond_exprs (INDEX_COND_EXPR) which for
5588 every match uses values from the induction variable
5589 (INDEX_BEFORE_INCR) otherwise uses values from the phi node
5590 (NEW_PHI_TREE).
5591 Finally, we update the phi (NEW_PHI_TREE) to take the value of
5592 the new cond_expr (INDEX_COND_EXPR). */
5593 gimple_seq stmts = NULL;
5594 for (int i = ccompares.length () - 1; i != -1; --i)
5595 {
5596 tree ccompare = ccompares[i].first;
5597 if (ccompares[i].second)
5598 new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
5599 cr_index_vector_type,
5600 ccompare,
5601 indx_before_incr, new_phi_tree);
5602 else
5603 new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
5604 cr_index_vector_type,
5605 ccompare,
5606 new_phi_tree, indx_before_incr);
5607 }
5608 gsi_insert_seq_before (&incr_gsi, stmts, GSI_SAME_STMT);
5609
5610 /* Update the phi with the vec cond. */
5611 induction_index = new_phi_tree;
5612 add_phi_arg (as_a <gphi *> (new_phi), induction_index,
5613 loop_latch_edge (loop), UNKNOWN_LOCATION);
5614 }
5615
5616 /* 2. Create epilog code.
5617 The reduction epilog code operates across the elements of the vector
5618 of partial results computed by the vectorized loop.
5619 The reduction epilog code consists of:
5620
5621 step 1: compute the scalar result in a vector (v_out2)
5622 step 2: extract the scalar result (s_out3) from the vector (v_out2)
5623 step 3: adjust the scalar result (s_out3) if needed.
5624
5625 Step 1 can be accomplished using one of the following three schemes:
5626 (scheme 1) using reduc_fn, if available.
5627 (scheme 2) using whole-vector shifts, if available.
5628 (scheme 3) using a scalar loop. In this case steps 1+2 above are
5629 combined.
5630
5631 The overall epilog code looks like this:
5632
5633 s_out0 = phi <s_loop> # original EXIT_PHI
5634 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5635 v_out2 = reduce <v_out1> # step 1
5636 s_out3 = extract_field <v_out2, 0> # step 2
5637 s_out4 = adjust_result <s_out3> # step 3
5638
5639 (step 3 is optional, and steps 1 and 2 may be combined).
5640 Lastly, the uses of s_out0 are replaced by s_out4. */
5641
5642
5643 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
5644 v_out1 = phi <VECT_DEF>
5645 Store them in NEW_PHIS. */
5646 if (double_reduc)
5647 loop = outer_loop;
5648 exit_bb = single_exit (loop)->dest;
5649 exit_gsi = gsi_after_labels (exit_bb);
5650 reduc_inputs.create (slp_node ? vec_num : ncopies);
5651 for (unsigned i = 0; i < vec_num; i++)
5652 {
5653 gimple_seq stmts = NULL;
5654 if (slp_node)
5655 def = vect_get_slp_vect_def (slp_node, i);
5656 else
5657 def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[0]);
5658 for (j = 0; j < ncopies; j++)
5659 {
5660 tree new_def = copy_ssa_name (def);
5661 phi = create_phi_node (new_def, exit_bb);
5662 if (j)
5663 def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[j]);
5664 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
5665 new_def = gimple_convert (&stmts, vectype, new_def);
5666 reduc_inputs.quick_push (new_def);
5667 }
5668 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5669 }
5670
5671 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
5672 (i.e. when reduc_fn is not available) and in the final adjustment
5673 code (if needed). Also get the original scalar reduction variable as
5674 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
5675 represents a reduction pattern), the tree-code and scalar-def are
5676 taken from the original stmt that the pattern-stmt (STMT) replaces.
5677 Otherwise (it is a regular reduction) - the tree-code and scalar-def
5678 are taken from STMT. */
5679
5680 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
5681 if (orig_stmt_info != stmt_info)
5682 {
5683 /* Reduction pattern */
5684 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
5685 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
5686 }
5687
5688 scalar_dest = gimple_get_lhs (orig_stmt_info->stmt);
5689 scalar_type = TREE_TYPE (scalar_dest);
5690 scalar_results.truncate (0);
5691 scalar_results.reserve_exact (group_size);
5692 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
5693 bitsize = TYPE_SIZE (scalar_type);
5694
5695 /* True if we should implement SLP_REDUC using native reduction operations
5696 instead of scalar operations. */
5697 direct_slp_reduc = (reduc_fn != IFN_LAST
5698 && slp_reduc
5699 && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
5700
5701 /* In case of reduction chain, e.g.,
5702 # a1 = phi <a3, a0>
5703 a2 = operation (a1)
5704 a3 = operation (a2),
5705
5706 we may end up with more than one vector result. Here we reduce them
5707 to one vector.
5708
5709 The same is true for a SLP reduction, e.g.,
5710 # a1 = phi <a2, a0>
5711 # b1 = phi <b2, b0>
5712 a2 = operation (a1)
5713 b2 = operation (b1),
5714
5715 where we can end up with more than one vector as well. We can
5716 easily accumulate vectors when the number of vector elements is
5717 a multiple of the SLP group size.
5718
5719 The same is true if we couldn't use a single def-use cycle. */
5720 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info)
5721 || direct_slp_reduc
5722 || (slp_reduc
5723 && constant_multiple_p (TYPE_VECTOR_SUBPARTS (vectype), group_size))
5724 || ncopies > 1)
5725 {
5726 gimple_seq stmts = NULL;
5727 tree single_input = reduc_inputs[0];
5728 for (k = 1; k < reduc_inputs.length (); k++)
5729 single_input = gimple_build (&stmts, code, vectype,
5730 single_input, reduc_inputs[k]);
5731 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5732
5733 reduc_inputs.truncate (0);
5734 reduc_inputs.safe_push (single_input);
5735 }
5736
5737 tree orig_reduc_input = reduc_inputs[0];
5738
5739 /* If this loop is an epilogue loop that can be skipped after the
5740 main loop, we can only share a reduction operation between the
5741 main loop and the epilogue if we put it at the target of the
5742 skip edge.
5743
5744 We can still reuse accumulators if this check fails. Doing so has
5745 the minor(?) benefit of making the epilogue loop's scalar result
5746 independent of the main loop's scalar result. */
5747 bool unify_with_main_loop_p = false;
5748 if (reduc_info->reused_accumulator
5749 && loop_vinfo->skip_this_loop_edge
5750 && single_succ_p (exit_bb)
5751 && single_succ (exit_bb) == loop_vinfo->skip_this_loop_edge->dest)
5752 {
5753 unify_with_main_loop_p = true;
5754
5755 basic_block reduc_block = loop_vinfo->skip_this_loop_edge->dest;
5756 reduc_inputs[0] = make_ssa_name (vectype);
5757 gphi *new_phi = create_phi_node (reduc_inputs[0], reduc_block);
5758 add_phi_arg (new_phi, orig_reduc_input, single_succ_edge (exit_bb),
5759 UNKNOWN_LOCATION);
5760 add_phi_arg (new_phi, reduc_info->reused_accumulator->reduc_input,
5761 loop_vinfo->skip_this_loop_edge, UNKNOWN_LOCATION);
5762 exit_gsi = gsi_after_labels (reduc_block);
5763 }
5764
5765 /* Shouldn't be used beyond this point. */
5766 exit_bb = nullptr;
5767
5768 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
5769 && reduc_fn != IFN_LAST)
5770 {
5771 /* For condition reductions, we have a vector (REDUC_INPUTS 0) containing
5772 various data values where the condition matched and another vector
5773 (INDUCTION_INDEX) containing all the indexes of those matches. We
5774 need to extract the last matching index (which will be the index with
5775 highest value) and use this to index into the data vector.
5776 For the case where there were no matches, the data vector will contain
5777 all default values and the index vector will be all zeros. */
5778
5779 /* Get various versions of the type of the vector of indexes. */
5780 tree index_vec_type = TREE_TYPE (induction_index);
5781 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
5782 tree index_scalar_type = TREE_TYPE (index_vec_type);
5783 tree index_vec_cmp_type = truth_type_for (index_vec_type);
5784
5785 /* Get an unsigned integer version of the type of the data vector. */
5786 int scalar_precision
5787 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
5788 tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
5789 tree vectype_unsigned = get_same_sized_vectype (scalar_type_unsigned,
5790 vectype);
5791
5792 /* First we need to create a vector (ZERO_VEC) of zeros and another
5793 vector (MAX_INDEX_VEC) filled with the last matching index, which we
5794 can create using a MAX reduction and then expanding.
5795 In the case where the loop never made any matches, the max index will
5796 be zero. */
5797
5798 /* Vector of {0, 0, 0,...}. */
5799 tree zero_vec = build_zero_cst (vectype);
5800
5801 /* Find maximum value from the vector of found indexes. */
5802 tree max_index = make_ssa_name (index_scalar_type);
5803 gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5804 1, induction_index);
5805 gimple_call_set_lhs (max_index_stmt, max_index);
5806 gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
5807
5808 /* Vector of {max_index, max_index, max_index,...}. */
5809 tree max_index_vec = make_ssa_name (index_vec_type);
5810 tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
5811 max_index);
5812 gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
5813 max_index_vec_rhs);
5814 gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
5815
5816 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
5817 with the vector (INDUCTION_INDEX) of found indexes, choosing values
5818 from the data vector (REDUC_INPUTS 0) for matches, 0 (ZERO_VEC)
5819 otherwise. Only one value should match, resulting in a vector
5820 (VEC_COND) with one data value and the rest zeros.
5821 In the case where the loop never made any matches, every index will
5822 match, resulting in a vector with all data values (which will all be
5823 the default value). */
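      /* For illustration (hypothetical values): if INDUCTION_INDEX is
         {5, 0, 3, 0} and REDUC_INPUTS[0] is {a, b, c, d}, then MAX_INDEX
         is 5, the comparison selects lane 0 only, VEC_COND becomes
         {a, 0, 0, 0} and the unsigned MAX reduction below yields a.  */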
5824
5825 /* Compare the max index vector to the vector of found indexes to find
5826 the position of the max value. */
5827 tree vec_compare = make_ssa_name (index_vec_cmp_type);
5828 gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
5829 induction_index,
5830 max_index_vec);
5831 gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
5832
5833 /* Use the compare to choose either values from the data vector or
5834 zero. */
5835 tree vec_cond = make_ssa_name (vectype);
5836 gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
5837 vec_compare,
5838 reduc_inputs[0],
5839 zero_vec);
5840 gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
5841
5842 /* Finally we need to extract the data value from the vector (VEC_COND)
5843 into a scalar (MATCHED_DATA_REDUC). Logically we want to do an OR
5844 reduction, but because this doesn't exist, we can use a MAX reduction
5845 instead. The data value might be signed or a float so we need to cast
5846 it first.
5847 In the case where the loop never made any matches, the data values are
5848 all identical, and so will reduce down correctly. */
5849
5850 /* Make the matched data values unsigned. */
5851 tree vec_cond_cast = make_ssa_name (vectype_unsigned);
5852 tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
5853 vec_cond);
5854 gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
5855 VIEW_CONVERT_EXPR,
5856 vec_cond_cast_rhs);
5857 gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
5858
5859 /* Reduce down to a scalar value. */
5860 tree data_reduc = make_ssa_name (scalar_type_unsigned);
5861 gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5862 1, vec_cond_cast);
5863 gimple_call_set_lhs (data_reduc_stmt, data_reduc);
5864 gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
5865
5866 /* Convert the reduced value back to the result type and set as the
5867 result. */
5868 gimple_seq stmts = NULL;
5869 new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
5870 data_reduc);
5871 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5872 scalar_results.safe_push (new_temp);
5873 }
5874 else if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
5875 && reduc_fn == IFN_LAST)
5876 {
5877 /* Condition reduction without supported IFN_REDUC_MAX. Generate
5878 idx = 0;
5879 idx_val = induction_index[0];
5880 val = data_reduc[0];
5881 for (idx = 0, val = init, i = 0; i < nelts; ++i)
5882 if (induction_index[i] > idx_val)
5883 val = data_reduc[i], idx_val = induction_index[i];
5884 return val; */
5885
5886 tree data_eltype = TREE_TYPE (vectype);
5887 tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
5888 unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
5889 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
5890 /* Enforced by vectorizable_reduction, which ensures we have target
5891 support before allowing a conditional reduction on variable-length
5892 vectors. */
5893 unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
5894 tree idx_val = NULL_TREE, val = NULL_TREE;
5895 for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
5896 {
5897 tree old_idx_val = idx_val;
5898 tree old_val = val;
5899 idx_val = make_ssa_name (idx_eltype);
5900 epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
5901 build3 (BIT_FIELD_REF, idx_eltype,
5902 induction_index,
5903 bitsize_int (el_size),
5904 bitsize_int (off)));
5905 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5906 val = make_ssa_name (data_eltype);
5907 epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
5908 build3 (BIT_FIELD_REF,
5909 data_eltype,
5910 reduc_inputs[0],
5911 bitsize_int (el_size),
5912 bitsize_int (off)));
5913 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5914 if (off != 0)
5915 {
5916 tree new_idx_val = idx_val;
5917 if (off != v_size - el_size)
5918 {
5919 new_idx_val = make_ssa_name (idx_eltype);
5920 epilog_stmt = gimple_build_assign (new_idx_val,
5921 MAX_EXPR, idx_val,
5922 old_idx_val);
5923 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5924 }
5925 tree cond = make_ssa_name (boolean_type_node);
5926 epilog_stmt = gimple_build_assign (cond, GT_EXPR,
5927 idx_val, old_idx_val);
5928 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5929 tree new_val = make_ssa_name (data_eltype);
5930 epilog_stmt = gimple_build_assign (new_val, COND_EXPR,
5931 cond, val, old_val);
5932 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5933 idx_val = new_idx_val;
5934 val = new_val;
5935 }
5936 }
5937 /* Convert the reduced value back to the result type and set as the
5938 result. */
5939 gimple_seq stmts = NULL;
5940 val = gimple_convert (&stmts, scalar_type, val);
5941 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5942 scalar_results.safe_push (val);
5943 }
5944
5945 /* 2.3 Create the reduction code, using one of the three schemes described
5946 above. In SLP we simply need to extract all the elements from the
5947 vector (without reducing them), so we use scalar shifts. */
5948 else if (reduc_fn != IFN_LAST && !slp_reduc)
5949 {
5950 tree tmp;
5951 tree vec_elem_type;
5952
5953 /* Case 1: Create:
5954 v_out2 = reduc_expr <v_out1> */
5955
5956 if (dump_enabled_p ())
5957 dump_printf_loc (MSG_NOTE, vect_location,
5958 "Reduce using direct vector reduction.\n");
5959
5960 gimple_seq stmts = NULL;
5961 vec_elem_type = TREE_TYPE (vectype);
5962 new_temp = gimple_build (&stmts, as_combined_fn (reduc_fn),
5963 vec_elem_type, reduc_inputs[0]);
5964 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
5965 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5966
5967 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5968 && induc_val)
5969 {
5970 /* Earlier we set the initial value to be a vector of induc_val
5971 values. Check the result and if it is induc_val then replace
5972 it with the original initial value, unless induc_val is
5973 the same as initial_def already. */
5974 tree zcompare = make_ssa_name (boolean_type_node);
5975 epilog_stmt = gimple_build_assign (zcompare, EQ_EXPR,
5976 new_temp, induc_val);
5977 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5978 tree initial_def = reduc_info->reduc_initial_values[0];
5979 tmp = make_ssa_name (new_scalar_dest);
5980 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5981 initial_def, new_temp);
5982 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5983 new_temp = tmp;
5984 }
5985
5986 scalar_results.safe_push (new_temp);
5987 }
5988 else if (direct_slp_reduc)
5989 {
5990 /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
5991 with the elements for other SLP statements replaced with the
5992 neutral value. We can then do a normal reduction on each vector. */
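      /* A hypothetical example: for a two-statement SLP reduction with
         vectype V4SI, REDUC_INPUTS[0] holds {a0, b0, a1, b1}; masking the
         index vector with GROUP_SIZE - 1 gives {0, 1, 0, 1}, so for i == 0
         the select keeps {a0, id, a1, id} (id being the neutral or initial
         value) and the full-vector reduction yields the first scalar
         result.  */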
5993
5994 /* Enforced by vectorizable_reduction. */
5995 gcc_assert (reduc_inputs.length () == 1);
5996 gcc_assert (pow2p_hwi (group_size));
5997
5998 gimple_seq seq = NULL;
5999
6000 /* Build a vector {0, 1, 2, ...}, with the same number of elements
6001 and the same element size as VECTYPE. */
6002 tree index = build_index_vector (vectype, 0, 1);
6003 tree index_type = TREE_TYPE (index);
6004 tree index_elt_type = TREE_TYPE (index_type);
6005 tree mask_type = truth_type_for (index_type);
6006
6007 /* Create a vector that, for each element, identifies which of
6008 the REDUC_GROUP_SIZE results should use it. */
6009 tree index_mask = build_int_cst (index_elt_type, group_size - 1);
6010 index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
6011 build_vector_from_val (index_type, index_mask));
6012
6013 /* Get a neutral vector value. This is simply a splat of the neutral
6014 scalar value if we have one, otherwise the initial scalar value
6015 is itself a neutral value. */
6016 tree vector_identity = NULL_TREE;
6017 tree neutral_op = NULL_TREE;
6018 if (slp_node)
6019 {
6020 tree initial_value = NULL_TREE;
6021 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6022 initial_value = reduc_info->reduc_initial_values[0];
6023 neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype), code,
6024 initial_value);
6025 }
6026 if (neutral_op)
6027 vector_identity = gimple_build_vector_from_val (&seq, vectype,
6028 neutral_op);
6029 for (unsigned int i = 0; i < group_size; ++i)
6030 {
6031 /* If there's no universal neutral value, we can use the
6032 initial scalar value from the original PHI. This is used
6033 for MIN and MAX reduction, for example. */
6034 if (!neutral_op)
6035 {
6036 tree scalar_value = reduc_info->reduc_initial_values[i];
6037 scalar_value = gimple_convert (&seq, TREE_TYPE (vectype),
6038 scalar_value);
6039 vector_identity = gimple_build_vector_from_val (&seq, vectype,
6040 scalar_value);
6041 }
6042
6043 /* Calculate the equivalent of:
6044
6045 sel[j] = (index[j] == i);
6046
6047 which selects the elements of REDUC_INPUTS[0] that should
6048 be included in the result. */
6049 tree compare_val = build_int_cst (index_elt_type, i);
6050 compare_val = build_vector_from_val (index_type, compare_val);
6051 tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
6052 index, compare_val);
6053
6054 /* Calculate the equivalent of:
6055
6056 vec = sel ? reduc_inputs[0] : vector_identity;
6057
6058 VEC is now suitable for a full vector reduction. */
6059 tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
6060 sel, reduc_inputs[0], vector_identity);
6061
6062 /* Do the reduction and convert it to the appropriate type. */
6063 tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
6064 TREE_TYPE (vectype), vec);
6065 scalar = gimple_convert (&seq, scalar_type, scalar);
6066 scalar_results.safe_push (scalar);
6067 }
6068 gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
6069 }
6070 else
6071 {
6072 bool reduce_with_shift;
6073 tree vec_temp;
6074
6075 gcc_assert (slp_reduc || reduc_inputs.length () == 1);
6076
6077 /* See if the target wants to do the final (shift) reduction
6078 in a vector mode of smaller size and first reduce upper/lower
6079 halves against each other. */
6080 enum machine_mode mode1 = mode;
6081 tree stype = TREE_TYPE (vectype);
6082 unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
6083 unsigned nunits1 = nunits;
6084 if ((mode1 = targetm.vectorize.split_reduction (mode)) != mode
6085 && reduc_inputs.length () == 1)
6086 {
6087 nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
6088 /* For SLP reductions we have to make sure lanes match up, but
6089 since we're doing individual element final reduction, reducing
6090 vector width here is even more important.
6091 ??? We can also separate lanes with permutes, for the common
6092 case of power-of-two group-size odd/even extracts would work. */
6093 if (slp_reduc && nunits != nunits1)
6094 {
6095 nunits1 = least_common_multiple (nunits1, group_size);
6096 gcc_assert (exact_log2 (nunits1) != -1 && nunits1 <= nunits);
6097 }
6098 }
6099 if (!slp_reduc
6100 && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
6101 nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
6102
6103 tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
6104 stype, nunits1);
6105 reduce_with_shift = have_whole_vector_shift (mode1);
6106 if (!VECTOR_MODE_P (mode1)
6107 || !directly_supported_p (code, vectype1))
6108 reduce_with_shift = false;
6109
6110 /* First reduce the vector to the desired vector size on which we
6111 should do the shift reduction, by combining upper and lower halves. */
6112 gimple_seq stmts = NULL;
6113 new_temp = vect_create_partial_epilog (reduc_inputs[0], vectype1,
6114 code, &stmts);
6115 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6116 reduc_inputs[0] = new_temp;
6117
6118 if (reduce_with_shift && !slp_reduc)
6119 {
6120 int element_bitsize = tree_to_uhwi (bitsize);
6121 /* Enforced by vectorizable_reduction, which disallows SLP reductions
6122 for variable-length vectors and also requires direct target support
6123 for loop reductions. */
6124 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
6125 int nelements = vec_size_in_bits / element_bitsize;
6126 vec_perm_builder sel;
6127 vec_perm_indices indices;
6128
6129 int elt_offset;
6130
6131 tree zero_vec = build_zero_cst (vectype1);
6132 /* Case 2: Create:
6133 for (offset = nelements/2; offset >= 1; offset/=2)
6134 {
6135 Create: va' = vec_shift <va, offset>
6136 Create: va = vop <va, va'>
6137 } */
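          /* For example (hypothetical V4SI input {a, b, c, d} reduced with
             PLUS): shifting by 2 and adding gives {a+c, b+d, x, x},
             shifting by 1 and adding gives {a+b+c+d, x, x, x}, and the
             scalar result is then taken from element 0 (lanes written x
             here do not matter).  */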
6138
6139 tree rhs;
6140
6141 if (dump_enabled_p ())
6142 dump_printf_loc (MSG_NOTE, vect_location,
6143 "Reduce using vector shifts\n");
6144
6145 gimple_seq stmts = NULL;
6146 new_temp = gimple_convert (&stmts, vectype1, new_temp);
6147 for (elt_offset = nelements / 2;
6148 elt_offset >= 1;
6149 elt_offset /= 2)
6150 {
6151 calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
6152 indices.new_vector (sel, 2, nelements);
6153 tree mask = vect_gen_perm_mask_any (vectype1, indices);
6154 new_name = gimple_build (&stmts, VEC_PERM_EXPR, vectype1,
6155 new_temp, zero_vec, mask);
6156 new_temp = gimple_build (&stmts, code,
6157 vectype1, new_name, new_temp);
6158 }
6159 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6160
6161 /* 2.4 Extract the final scalar result. Create:
6162 s_out3 = extract_field <v_out2, bitpos> */
6163
6164 if (dump_enabled_p ())
6165 dump_printf_loc (MSG_NOTE, vect_location,
6166 "extract scalar result\n");
6167
6168 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
6169 bitsize, bitsize_zero_node);
6170 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
6171 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
6172 gimple_assign_set_lhs (epilog_stmt, new_temp);
6173 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6174 scalar_results.safe_push (new_temp);
6175 }
6176 else
6177 {
6178 /* Case 3: Create:
6179 s = extract_field <v_out2, 0>
6180 for (offset = element_size;
6181 offset < vector_size;
6182 offset += element_size;)
6183 {
6184 Create: s' = extract_field <v_out2, offset>
6185 Create: s = op <s, s'> // For non SLP cases
6186 } */
6187
6188 if (dump_enabled_p ())
6189 dump_printf_loc (MSG_NOTE, vect_location,
6190 "Reduce using scalar code.\n");
6191
6192 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
6193 int element_bitsize = tree_to_uhwi (bitsize);
6194 tree compute_type = TREE_TYPE (vectype);
6195 gimple_seq stmts = NULL;
6196 FOR_EACH_VEC_ELT (reduc_inputs, i, vec_temp)
6197 {
6198 int bit_offset;
6199 new_temp = gimple_build (&stmts, BIT_FIELD_REF, compute_type,
6200 vec_temp, bitsize, bitsize_zero_node);
6201
6202 /* In SLP we don't need to apply reduction operation, so we just
6203 collect s' values in SCALAR_RESULTS. */
6204 if (slp_reduc)
6205 scalar_results.safe_push (new_temp);
6206
6207 for (bit_offset = element_bitsize;
6208 bit_offset < vec_size_in_bits;
6209 bit_offset += element_bitsize)
6210 {
6211 tree bitpos = bitsize_int (bit_offset);
6212 new_name = gimple_build (&stmts, BIT_FIELD_REF,
6213 compute_type, vec_temp,
6214 bitsize, bitpos);
6215 if (slp_reduc)
6216 {
6217 /* In SLP we don't need to apply reduction operation, so
6218 we just collect s' values in SCALAR_RESULTS. */
6219 new_temp = new_name;
6220 scalar_results.safe_push (new_name);
6221 }
6222 else
6223 new_temp = gimple_build (&stmts, code, compute_type,
6224 new_name, new_temp);
6225 }
6226 }
6227
6228 /* The only case where we need to reduce scalar results in SLP is
6229 unrolling. If the size of SCALAR_RESULTS is greater than
6230 REDUC_GROUP_SIZE, we reduce them by combining elements modulo
6231 REDUC_GROUP_SIZE. */
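              /* For instance (hypothetical), with REDUC_GROUP_SIZE 2 and
                 four extracted scalars r0..r3, r2 is combined into r0 and
                 r3 into r1, leaving the two per-statement results.  */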
6232 if (slp_reduc)
6233 {
6234 tree res, first_res, new_res;
6235
6236 /* Reduce multiple scalar results in case of SLP unrolling. */
6237 for (j = group_size; scalar_results.iterate (j, &res);
6238 j++)
6239 {
6240 first_res = scalar_results[j % group_size];
6241 new_res = gimple_build (&stmts, code, compute_type,
6242 first_res, res);
6243 scalar_results[j % group_size] = new_res;
6244 }
6245 scalar_results.truncate (group_size);
6246 for (k = 0; k < group_size; k++)
6247 scalar_results[k] = gimple_convert (&stmts, scalar_type,
6248 scalar_results[k]);
6249 }
6250 else
6251 {
6252 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
6253 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6254 scalar_results.safe_push (new_temp);
6255 }
6256
6257 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6258 }
6259
6260 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
6261 && induc_val)
6262 {
6263 /* Earlier we set the initial value to be a vector of induc_val
6264 values. Check the result and if it is induc_val then replace
6265 it with the original initial value, unless induc_val is
6266 the same as initial_def already. */
6267 tree zcompare = make_ssa_name (boolean_type_node);
6268 epilog_stmt = gimple_build_assign (zcompare, EQ_EXPR, new_temp,
6269 induc_val);
6270 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6271 tree initial_def = reduc_info->reduc_initial_values[0];
6272 tree tmp = make_ssa_name (new_scalar_dest);
6273 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
6274 initial_def, new_temp);
6275 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6276 scalar_results[0] = tmp;
6277 }
6278 }
6279
6280 /* 2.5 Adjust the final result by the initial value of the reduction
6281 variable. (When such adjustment is not needed, then
6282 'adjustment_def' is zero). For example, if code is PLUS we create:
6283 new_temp = loop_exit_def + adjustment_def */
6284
6285 if (adjustment_def)
6286 {
6287 gcc_assert (!slp_reduc);
6288 gimple_seq stmts = NULL;
6289 if (double_reduc)
6290 {
6291 gcc_assert (VECTOR_TYPE_P (TREE_TYPE (adjustment_def)));
6292 adjustment_def = gimple_convert (&stmts, vectype, adjustment_def);
6293 new_temp = gimple_build (&stmts, code, vectype,
6294 reduc_inputs[0], adjustment_def);
6295 }
6296 else
6297 {
6298 new_temp = scalar_results[0];
6299 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
6300 adjustment_def = gimple_convert (&stmts, scalar_type, adjustment_def);
6301 new_temp = gimple_build (&stmts, code, scalar_type,
6302 new_temp, adjustment_def);
6303 }
6304
6305 epilog_stmt = gimple_seq_last_stmt (stmts);
6306 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6307 scalar_results[0] = new_temp;
6308 }
6309
6310 /* Record this operation if it could be reused by the epilogue loop. */
6311 if (STMT_VINFO_REDUC_TYPE (reduc_info) == TREE_CODE_REDUCTION
6312 && reduc_inputs.length () == 1)
6313 loop_vinfo->reusable_accumulators.put (scalar_results[0],
6314 { orig_reduc_input, reduc_info });
6315
6316 if (double_reduc)
6317 loop = outer_loop;
6318
6319 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
6320 phis with new adjusted scalar results, i.e., replace use <s_out0>
6321 with use <s_out4>.
6322
6323 Transform:
6324 loop_exit:
6325 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
6326 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
6327 v_out2 = reduce <v_out1>
6328 s_out3 = extract_field <v_out2, 0>
6329 s_out4 = adjust_result <s_out3>
6330 use <s_out0>
6331 use <s_out0>
6332
6333 into:
6334
6335 loop_exit:
6336 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
6337 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
6338 v_out2 = reduce <v_out1>
6339 s_out3 = extract_field <v_out2, 0>
6340 s_out4 = adjust_result <s_out3>
6341 use <s_out4>
6342 use <s_out4> */
6343
6344 gcc_assert (live_out_stmts.size () == scalar_results.length ());
6345 for (k = 0; k < live_out_stmts.size (); k++)
6346 {
6347 stmt_vec_info scalar_stmt_info = vect_orig_stmt (live_out_stmts[k]);
6348 scalar_dest = gimple_get_lhs (scalar_stmt_info->stmt);
6349
6350 phis.create (3);
6351 /* Find the loop-closed-use at the loop exit of the original scalar
6352 result. (The reduction result is expected to have two immediate uses,
6353 one at the latch block, and one at the loop exit). For double
6354 reductions we are looking for exit phis of the outer loop. */
6355 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
6356 {
6357 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
6358 {
6359 if (!is_gimple_debug (USE_STMT (use_p)))
6360 phis.safe_push (USE_STMT (use_p));
6361 }
6362 else
6363 {
6364 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
6365 {
6366 tree phi_res = PHI_RESULT (USE_STMT (use_p));
6367
6368 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
6369 {
6370 if (!flow_bb_inside_loop_p (loop,
6371 gimple_bb (USE_STMT (phi_use_p)))
6372 && !is_gimple_debug (USE_STMT (phi_use_p)))
6373 phis.safe_push (USE_STMT (phi_use_p));
6374 }
6375 }
6376 }
6377 }
6378
6379 FOR_EACH_VEC_ELT (phis, i, exit_phi)
6380 {
6381 /* Replace the uses: */
6382 orig_name = PHI_RESULT (exit_phi);
6383
6384 /* Look for a single use at the target of the skip edge. */
6385 if (unify_with_main_loop_p)
6386 {
6387 use_operand_p use_p;
6388 gimple *user;
6389 if (!single_imm_use (orig_name, &use_p, &user))
6390 gcc_unreachable ();
6391 orig_name = gimple_get_lhs (user);
6392 }
6393
6394 scalar_result = scalar_results[k];
6395 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
6396 {
6397 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
6398 SET_USE (use_p, scalar_result);
6399 update_stmt (use_stmt);
6400 }
6401 }
6402
6403 phis.release ();
6404 }
6405 }
6406
6407 /* Return a vector of type VECTYPE that is equal to the vector select
6408 operation "MASK ? VEC : IDENTITY". Insert the select statements
6409 before GSI. */
6410
6411 static tree
6412 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
6413 tree vec, tree identity)
6414 {
6415 tree cond = make_temp_ssa_name (vectype, NULL, "cond");
6416 gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
6417 mask, vec, identity);
6418 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6419 return cond;
6420 }
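
/* Illustrative sketch of the statement built above (GIMPLE-like; SSA
   names invented for the example), with the zero identity used for the
   masked in-order additions further below:

     cond_1 = VEC_COND_EXPR <loop_mask_2, vect_def_3, { 0, ... }>;

   Lanes switched off by MASK therefore feed IDENTITY into the
   reduction.  */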
6421
6422 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
6423 order, starting with LHS. Insert the extraction statements before GSI and
6424 associate the new scalar SSA names with variable SCALAR_DEST.
6425 Return the SSA name for the result. */
6426
6427 static tree
6428 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
6429 tree_code code, tree lhs, tree vector_rhs)
6430 {
6431 tree vectype = TREE_TYPE (vector_rhs);
6432 tree scalar_type = TREE_TYPE (vectype);
6433 tree bitsize = TYPE_SIZE (scalar_type);
6434 unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
6435 unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
6436
6437 for (unsigned HOST_WIDE_INT bit_offset = 0;
6438 bit_offset < vec_size_in_bits;
6439 bit_offset += element_bitsize)
6440 {
6441 tree bitpos = bitsize_int (bit_offset);
6442 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
6443 bitsize, bitpos);
6444
6445 gassign *stmt = gimple_build_assign (scalar_dest, rhs);
6446 rhs = make_ssa_name (scalar_dest, stmt);
6447 gimple_assign_set_lhs (stmt, rhs);
6448 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6449
6450 stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
6451 tree new_name = make_ssa_name (scalar_dest, stmt);
6452 gimple_assign_set_lhs (stmt, new_name);
6453 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6454 lhs = new_name;
6455 }
6456 return lhs;
6457 }
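
/* Illustrative sketch of the expansion above for a four-element vector
   V (statement names invented for the example):

     s_1 = BIT_FIELD_REF <V, bitsize, 0 * bitsize>;  r_1 = CODE (LHS, s_1);
     s_2 = BIT_FIELD_REF <V, bitsize, 1 * bitsize>;  r_2 = CODE (r_1, s_2);
     s_3 = BIT_FIELD_REF <V, bitsize, 2 * bitsize>;  r_3 = CODE (r_2, s_3);
     s_4 = BIT_FIELD_REF <V, bitsize, 3 * bitsize>;  r_4 = CODE (r_3, s_4);

   i.e. the vector elements are folded into LHS strictly left to
   right.  */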
6458
6459 /* Get a masked internal function equivalent to REDUC_FN. VECTYPE_IN is the
6460 type of the vector input. */
6461
6462 static internal_fn
6463 get_masked_reduction_fn (internal_fn reduc_fn, tree vectype_in)
6464 {
6465 internal_fn mask_reduc_fn;
6466
6467 switch (reduc_fn)
6468 {
6469 case IFN_FOLD_LEFT_PLUS:
6470 mask_reduc_fn = IFN_MASK_FOLD_LEFT_PLUS;
6471 break;
6472
6473 default:
6474 return IFN_LAST;
6475 }
6476
6477 if (direct_internal_fn_supported_p (mask_reduc_fn, vectype_in,
6478 OPTIMIZE_FOR_SPEED))
6479 return mask_reduc_fn;
6480 return IFN_LAST;
6481 }
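
/* For example IFN_FOLD_LEFT_PLUS maps to IFN_MASK_FOLD_LEFT_PLUS, which
   as called further below takes the loop mask as a third operand
   (illustrative sketch; SSA names invented for the example):

     acc_2 = .MASK_FOLD_LEFT_PLUS (acc_1, vect_def_3, loop_mask_4);  */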
6482
6483 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT_INFO is the
6484 statement that sets the live-out value. REDUC_DEF_STMT is the phi
6485 statement. CODE is the operation performed by STMT_INFO and OPS are
6486 its scalar operands. REDUC_INDEX is the index of the operand in
6487 OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that
6488 implements in-order reduction, or IFN_LAST if we should open-code it.
6489 VECTYPE_IN is the type of the vector input. MASKS specifies the masks
6490 that should be used to control the operation in a fully-masked loop. */
6491
6492 static bool
6493 vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
6494 stmt_vec_info stmt_info,
6495 gimple_stmt_iterator *gsi,
6496 gimple **vec_stmt, slp_tree slp_node,
6497 gimple *reduc_def_stmt,
6498 tree_code code, internal_fn reduc_fn,
6499 tree ops[3], tree vectype_in,
6500 int reduc_index, vec_loop_masks *masks)
6501 {
6502 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6503 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6504 internal_fn mask_reduc_fn = get_masked_reduction_fn (reduc_fn, vectype_in);
6505
6506 int ncopies;
6507 if (slp_node)
6508 ncopies = 1;
6509 else
6510 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6511
6512 gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
6513 gcc_assert (ncopies == 1);
6514 gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
6515
6516 if (slp_node)
6517 gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
6518 TYPE_VECTOR_SUBPARTS (vectype_in)));
6519
6520 tree op0 = ops[1 - reduc_index];
6521
6522 int group_size = 1;
6523 stmt_vec_info scalar_dest_def_info;
6524 auto_vec<tree> vec_oprnds0;
6525 if (slp_node)
6526 {
6527 auto_vec<vec<tree> > vec_defs (2);
6528 vect_get_slp_defs (loop_vinfo, slp_node, &vec_defs);
6529 vec_oprnds0.safe_splice (vec_defs[1 - reduc_index]);
6530 vec_defs[0].release ();
6531 vec_defs[1].release ();
6532 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
6533 scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
6534 }
6535 else
6536 {
6537 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
6538 op0, &vec_oprnds0);
6539 scalar_dest_def_info = stmt_info;
6540 }
6541
6542 tree scalar_dest = gimple_assign_lhs (scalar_dest_def_info->stmt);
6543 tree scalar_type = TREE_TYPE (scalar_dest);
6544 tree reduc_var = gimple_phi_result (reduc_def_stmt);
6545
6546 int vec_num = vec_oprnds0.length ();
6547 gcc_assert (vec_num == 1 || slp_node);
6548 tree vec_elem_type = TREE_TYPE (vectype_out);
6549 gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
6550
6551 tree vector_identity = NULL_TREE;
6552 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6553 vector_identity = build_zero_cst (vectype_out);
6554
6555 tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
6556 int i;
6557 tree def0;
6558 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
6559 {
6560 gimple *new_stmt;
6561 tree mask = NULL_TREE;
6562 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6563 mask = vect_get_loop_mask (gsi, masks, vec_num, vectype_in, i);
6564
6565 /* Handle MINUS by adding the negative. */
6566 if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
6567 {
6568 tree negated = make_ssa_name (vectype_out);
6569 new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
6570 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6571 def0 = negated;
6572 }
6573
6574 if (mask && mask_reduc_fn == IFN_LAST)
6575 def0 = merge_with_identity (gsi, mask, vectype_out, def0,
6576 vector_identity);
6577
6578 /* On the first iteration the input is simply the scalar phi
6579 result, and for subsequent iterations it is the output of
6580 the preceding operation. */
6581 if (reduc_fn != IFN_LAST || (mask && mask_reduc_fn != IFN_LAST))
6582 {
6583 if (mask && mask_reduc_fn != IFN_LAST)
6584 new_stmt = gimple_build_call_internal (mask_reduc_fn, 3, reduc_var,
6585 def0, mask);
6586 else
6587 new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var,
6588 def0);
6589 /* For chained SLP reductions the output of the previous reduction
6590 operation serves as the input of the next. For the final statement
6591 the output cannot be a temporary - we reuse the original
6592 scalar destination of the last statement. */
6593 if (i != vec_num - 1)
6594 {
6595 gimple_set_lhs (new_stmt, scalar_dest_var);
6596 reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
6597 gimple_set_lhs (new_stmt, reduc_var);
6598 }
6599 }
6600 else
6601 {
6602 reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
6603 reduc_var, def0);
6604 new_stmt = SSA_NAME_DEF_STMT (reduc_var);
6605 /* Remove the statement, so that we can use the same code paths
6606 as for statements that we've just created. */
6607 gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
6608 gsi_remove (&tmp_gsi, true);
6609 }
6610
6611 if (i == vec_num - 1)
6612 {
6613 gimple_set_lhs (new_stmt, scalar_dest);
6614 vect_finish_replace_stmt (loop_vinfo,
6615 scalar_dest_def_info,
6616 new_stmt);
6617 }
6618 else
6619 vect_finish_stmt_generation (loop_vinfo,
6620 scalar_dest_def_info,
6621 new_stmt, gsi);
6622
6623 if (slp_node)
6624 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
6625 else
6626 {
6627 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
6628 *vec_stmt = new_stmt;
6629 }
6630 }
6631
6632 return true;
6633 }
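
/* Illustrative sketch (not generated code): an in-order reduction keeps
   the scalar evaluation order, so for a four-element vector the result
   is equivalent to

     res = (((res + v[0]) + v[1]) + v[2]) + v[3];

   rather than a reassociated form such as
   ((v[0] + v[1]) + (v[2] + v[3])) + res that a tree reduction would
   compute; this is what makes FOLD_LEFT reductions safe for
   non-associative floating-point code.  */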
6634
6635 /* Function is_nonwrapping_integer_induction.
6636
6637 Check if STMT_VINFO (which is part of loop LOOP) both increments and
6638 does not cause overflow. */
6639
6640 static bool
6641 is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, class loop *loop)
6642 {
6643 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
6644 tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
6645 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
6646 tree lhs_type = TREE_TYPE (gimple_phi_result (phi));
6647 widest_int ni, max_loop_value, lhs_max;
6648 wi::overflow_type overflow = wi::OVF_NONE;
6649
6650 /* Make sure the loop is integer based. */
6651 if (TREE_CODE (base) != INTEGER_CST
6652 || TREE_CODE (step) != INTEGER_CST)
6653 return false;
6654
6655 /* Check that the max size of the loop will not wrap. */
6656
6657 if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
6658 return true;
6659
6660 if (! max_stmt_executions (loop, &ni))
6661 return false;
6662
6663 max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
6664 &overflow);
6665 if (overflow)
6666 return false;
6667
6668 max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
6669 TYPE_SIGN (lhs_type), &overflow);
6670 if (overflow)
6671 return false;
6672
6673 return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
6674 <= TYPE_PRECISION (lhs_type));
6675 }
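
/* Worked example of the bound computed above (values chosen for
   illustration): for an induction with base 10 and step 3 in a 16-bit
   unsigned type, and a loop bounded by 1000 iterations, the largest
   value reached is 10 + 3 * 1000 = 3010, which needs 12 bits
   <= TYPE_PRECISION (16), so the induction cannot wrap.  */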
6676
6677 /* Check if masking can be supported by inserting a conditional expression.
6678 CODE is the code for the operation. COND_FN is the conditional internal
6679 function, if it exists. VECTYPE_IN is the type of the vector input. */
6680 static bool
6681 use_mask_by_cond_expr_p (code_helper code, internal_fn cond_fn,
6682 tree vectype_in)
6683 {
6684 if (cond_fn != IFN_LAST
6685 && direct_internal_fn_supported_p (cond_fn, vectype_in,
6686 OPTIMIZE_FOR_SPEED))
6687 return false;
6688
6689 if (code.is_tree_code ())
6690 switch (tree_code (code))
6691 {
6692 case DOT_PROD_EXPR:
6693 case SAD_EXPR:
6694 return true;
6695
6696 default:
6697 break;
6698 }
6699 return false;
6700 }
6701
6702 /* Insert a conditional expression to enable masked vectorization. CODE is the
6703 code for the operation. VOP is the array of operands. MASK is the loop
6704 mask. GSI is a statement iterator used to place the new conditional
6705 expression. */
6706 static void
6707 build_vect_cond_expr (code_helper code, tree vop[3], tree mask,
6708 gimple_stmt_iterator *gsi)
6709 {
6710 switch (tree_code (code))
6711 {
6712 case DOT_PROD_EXPR:
6713 {
6714 tree vectype = TREE_TYPE (vop[1]);
6715 tree zero = build_zero_cst (vectype);
6716 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
6717 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
6718 mask, vop[1], zero);
6719 gsi_insert_before (gsi, select, GSI_SAME_STMT);
6720 vop[1] = masked_op1;
6721 break;
6722 }
6723
6724 case SAD_EXPR:
6725 {
6726 tree vectype = TREE_TYPE (vop[1]);
6727 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
6728 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
6729 mask, vop[1], vop[0]);
6730 gsi_insert_before (gsi, select, GSI_SAME_STMT);
6731 vop[1] = masked_op1;
6732 break;
6733 }
6734
6735 default:
6736 gcc_unreachable ();
6737 }
6738 }
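
/* Illustrative sketch of the effect (GIMPLE-like; SSA names invented
   for the example): for DOT_PROD_EXPR one multiplication operand is
   zeroed in the inactive lanes,

     masked_op1_1 = VEC_COND_EXPR <loop_mask_2, vop1_3, { 0, ... }>;

   so those lanes add 0 to the accumulator; for SAD_EXPR the operand is
   replaced by the other input, making the absolute difference of an
   inactive lane 0.  */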
6739
6740 /* Function vectorizable_reduction.
6741
6742 Check if STMT_INFO performs a reduction operation that can be vectorized.
6743 If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
6744 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
6745 Return true if STMT_INFO is vectorizable in this way.
6746
6747 This function also handles reduction idioms (patterns) that have been
6748 recognized in advance during vect_pattern_recog. In this case, STMT_INFO
6749 may be of this form:
6750 X = pattern_expr (arg0, arg1, ..., X)
6751 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
6752 sequence that had been detected and replaced by the pattern-stmt
6753 (STMT_INFO).
6754
6755 This function also handles reduction of condition expressions, for example:
6756 for (int i = 0; i < N; i++)
6757 if (a[i] < value)
6758 last = a[i];
6759 This is handled by vectorising the loop and creating an additional vector
6760 containing the loop indexes for which "a[i] < value" was true. In the
6761 function epilogue this is reduced to a single max value and then used to
6762 index into the vector of results.
6763
6764 In some cases of reduction patterns, the type of the reduction variable X is
6765 different than the type of the other arguments of STMT_INFO.
6766 In such cases, the vectype that is used when transforming STMT_INFO into
6767 a vector stmt is different than the vectype that is used to determine the
6768 vectorization factor, because it consists of a different number of elements
6769 than the actual number of elements that are being operated upon in parallel.
6770
6771 For example, consider an accumulation of shorts into an int accumulator.
6772 On some targets it's possible to vectorize this pattern operating on 8
6773 shorts at a time (hence, the vectype for purposes of determining the
6774 vectorization factor should be V8HI); on the other hand, the vectype that
6775 is used to create the vector form is actually V4SI (the type of the result).
6776
6777 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
6778 indicates what is the actual level of parallelism (V8HI in the example), so
6779 that the right vectorization factor would be derived. This vectype
6780 corresponds to the type of arguments to the reduction stmt, and should *NOT*
6781 be used to create the vectorized stmt. The right vectype for the vectorized
6782 stmt is obtained from the type of the result X:
6783 get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
6784
6785 This means that, contrary to "regular" reductions (or "regular" stmts in
6786 general), the following equation:
6787 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
6788 does *NOT* necessarily hold for reduction patterns. */
6789
6790 bool
6791 vectorizable_reduction (loop_vec_info loop_vinfo,
6792 stmt_vec_info stmt_info, slp_tree slp_node,
6793 slp_instance slp_node_instance,
6794 stmt_vector_for_cost *cost_vec)
6795 {
6796 tree vectype_in = NULL_TREE;
6797 tree vectype_op[3] = { NULL_TREE, NULL_TREE, NULL_TREE };
6798 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6799 enum vect_def_type cond_reduc_dt = vect_unknown_def_type;
6800 stmt_vec_info cond_stmt_vinfo = NULL;
6801 int i;
6802 int ncopies;
6803 bool single_defuse_cycle = false;
6804 bool nested_cycle = false;
6805 bool double_reduc = false;
6806 int vec_num;
6807 tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
6808 tree cond_reduc_val = NULL_TREE;
6809
6810 /* Make sure it was already recognized as a reduction computation. */
6811 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
6812 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
6813 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
6814 return false;
6815
6816 /* The stmt we store reduction analysis meta on. */
6817 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
6818 reduc_info->is_reduc_info = true;
6819
6820 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
6821 {
6822 if (is_a <gphi *> (stmt_info->stmt))
6823 {
6824 if (slp_node)
6825 {
6826 /* We eventually need to set a vector type on invariant
6827 arguments. */
6828 unsigned j;
6829 slp_tree child;
6830 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
6831 if (!vect_maybe_update_slp_op_vectype
6832 (child, SLP_TREE_VECTYPE (slp_node)))
6833 {
6834 if (dump_enabled_p ())
6835 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6836 "incompatible vector types for "
6837 "invariants\n");
6838 return false;
6839 }
6840 }
6841 /* Analysis for double-reduction is done on the outer
6842 loop PHI, nested cycles have no further restrictions. */
6843 STMT_VINFO_TYPE (stmt_info) = cycle_phi_info_type;
6844 }
6845 else
6846 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6847 return true;
6848 }
6849
6850 stmt_vec_info orig_stmt_of_analysis = stmt_info;
6851 stmt_vec_info phi_info = stmt_info;
6852 if (!is_a <gphi *> (stmt_info->stmt))
6853 {
6854 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6855 return true;
6856 }
6857 if (slp_node)
6858 {
6859 slp_node_instance->reduc_phis = slp_node;
6860 /* ??? We're leaving slp_node to point to the PHIs, we only
6861 need it to get at the number of vector stmts which wasn't
6862 yet initialized for the instance root. */
6863 }
6864 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
6865 {
6866 use_operand_p use_p;
6867 gimple *use_stmt;
6868 bool res = single_imm_use (gimple_phi_result (stmt_info->stmt),
6869 &use_p, &use_stmt);
6870 gcc_assert (res);
6871 phi_info = loop_vinfo->lookup_stmt (use_stmt);
6872 }
6873
6874 /* PHIs should not participate in patterns. */
6875 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
6876 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
6877
6878 /* Verify following REDUC_IDX from the latch def leads us back to the PHI
6879 and compute the reduction chain length. Discover the real
6880 reduction operation stmt on the way (stmt_info and slp_for_stmt_info). */
6881 tree reduc_def
6882 = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
6883 loop_latch_edge
6884 (gimple_bb (reduc_def_phi)->loop_father));
6885 unsigned reduc_chain_length = 0;
6886 bool only_slp_reduc_chain = true;
6887 stmt_info = NULL;
6888 slp_tree slp_for_stmt_info = slp_node ? slp_node_instance->root : NULL;
6889 while (reduc_def != PHI_RESULT (reduc_def_phi))
6890 {
6891 stmt_vec_info def = loop_vinfo->lookup_def (reduc_def);
6892 stmt_vec_info vdef = vect_stmt_to_vectorize (def);
6893 if (STMT_VINFO_REDUC_IDX (vdef) == -1)
6894 {
6895 if (dump_enabled_p ())
6896 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6897 "reduction chain broken by patterns.\n");
6898 return false;
6899 }
6900 if (!REDUC_GROUP_FIRST_ELEMENT (vdef))
6901 only_slp_reduc_chain = false;
6902 /* For epilogue generation live members of the chain need
6903 to point back to the PHI via their original stmt for
6904 info_for_reduction to work. For SLP we need to look at
6905 all lanes here - even though we only will vectorize from
6906 the SLP node with live lane zero the other live lanes also
6907 need to be identified as part of a reduction to be able
6908 to skip code generation for them. */
6909 if (slp_for_stmt_info)
6910 {
6911 for (auto s : SLP_TREE_SCALAR_STMTS (slp_for_stmt_info))
6912 if (STMT_VINFO_LIVE_P (s))
6913 STMT_VINFO_REDUC_DEF (vect_orig_stmt (s)) = phi_info;
6914 }
6915 else if (STMT_VINFO_LIVE_P (vdef))
6916 STMT_VINFO_REDUC_DEF (def) = phi_info;
6917 gimple_match_op op;
6918 if (!gimple_extract_op (vdef->stmt, &op))
6919 {
6920 if (dump_enabled_p ())
6921 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6922 "reduction chain includes unsupported"
6923 " statement type.\n");
6924 return false;
6925 }
6926 if (CONVERT_EXPR_CODE_P (op.code))
6927 {
6928 if (!tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0])))
6929 {
6930 if (dump_enabled_p ())
6931 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6932 "conversion in the reduction chain.\n");
6933 return false;
6934 }
6935 }
6936 else if (!stmt_info)
6937 /* First non-conversion stmt. */
6938 stmt_info = vdef;
6939 reduc_def = op.ops[STMT_VINFO_REDUC_IDX (vdef)];
6940 reduc_chain_length++;
6941 if (!stmt_info && slp_node)
6942 slp_for_stmt_info = SLP_TREE_CHILDREN (slp_for_stmt_info)[0];
6943 }
6944 /* PHIs should not participate in patterns. */
6945 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
6946
6947 if (nested_in_vect_loop_p (loop, stmt_info))
6948 {
6949 loop = loop->inner;
6950 nested_cycle = true;
6951 }
6952
6953 /* STMT_VINFO_REDUC_DEF doesn't point to the first but the last
6954 element. */
6955 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6956 {
6957 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (stmt_info));
6958 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
6959 }
6960 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6961 gcc_assert (slp_node
6962 && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info);
6963
6964 /* 1. Is vectorizable reduction? */
6965 /* Not supportable if the reduction variable is used in the loop, unless
6966 it's a reduction chain. */
6967 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
6968 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6969 return false;
6970
6971 /* Reductions that are not used even in an enclosing outer-loop,
6972 are expected to be "live" (used out of the loop). */
6973 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
6974 && !STMT_VINFO_LIVE_P (stmt_info))
6975 return false;
6976
6977 /* 2. Has this been recognized as a reduction pattern?
6978
6979 Check if STMT represents a pattern that has been recognized
6980 in earlier analysis stages. For stmts that represent a pattern,
6981 the STMT_VINFO_RELATED_STMT field records the last stmt in
6982 the original sequence that constitutes the pattern. */
6983
6984 stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
6985 if (orig_stmt_info)
6986 {
6987 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6988 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
6989 }
6990
6991 /* 3. Check the operands of the operation. The first operands are defined
6992 inside the loop body. The last operand is the reduction variable,
6993 which is defined by the loop-header-phi. */
6994
6995 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6996 STMT_VINFO_REDUC_VECTYPE (reduc_info) = vectype_out;
6997 gimple_match_op op;
6998 if (!gimple_extract_op (stmt_info->stmt, &op))
6999 gcc_unreachable ();
7000 bool lane_reduc_code_p = (op.code == DOT_PROD_EXPR
7001 || op.code == WIDEN_SUM_EXPR
7002 || op.code == SAD_EXPR);
7003
7004 if (!POINTER_TYPE_P (op.type) && !INTEGRAL_TYPE_P (op.type)
7005 && !SCALAR_FLOAT_TYPE_P (op.type))
7006 return false;
7007
7008 /* Do not try to vectorize bit-precision reductions. */
7009 if (!type_has_mode_precision_p (op.type))
7010 return false;
7011
7012 /* For lane-reducing ops we're reducing the number of reduction PHIs,
7013 which means their only use may be in the lane-reducing operation. */
7014 if (lane_reduc_code_p
7015 && reduc_chain_length != 1
7016 && !only_slp_reduc_chain)
7017 {
7018 if (dump_enabled_p ())
7019 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7020 "lane-reducing reduction with extra stmts.\n");
7021 return false;
7022 }
7023
7024 /* All uses but the last are expected to be defined in the loop.
7025 The last use is the reduction variable. In case of nested cycle this
7026 assumption is not true: we use reduc_index to record the index of the
7027 reduction variable. */
7028 slp_tree *slp_op = XALLOCAVEC (slp_tree, op.num_ops);
7029 /* We need to skip an extra operand for COND_EXPRs with embedded
7030 comparison. */
7031 unsigned opno_adjust = 0;
7032 if (op.code == COND_EXPR && COMPARISON_CLASS_P (op.ops[0]))
7033 opno_adjust = 1;
7034 for (i = 0; i < (int) op.num_ops; i++)
7035 {
7036 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
7037 if (i == 0 && op.code == COND_EXPR)
7038 continue;
7039
7040 stmt_vec_info def_stmt_info;
7041 enum vect_def_type dt;
7042 if (!vect_is_simple_use (loop_vinfo, stmt_info, slp_for_stmt_info,
7043 i + opno_adjust, &op.ops[i], &slp_op[i], &dt,
7044 &vectype_op[i], &def_stmt_info))
7045 {
7046 if (dump_enabled_p ())
7047 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7048 "use not simple.\n");
7049 return false;
7050 }
7051 if (i == STMT_VINFO_REDUC_IDX (stmt_info))
7052 continue;
7053
7054 /* There should be only one cycle def in the stmt, the one
7055 leading to reduc_def. */
7056 if (VECTORIZABLE_CYCLE_DEF (dt))
7057 return false;
7058
7059 if (!vectype_op[i])
7060 vectype_op[i]
7061 = get_vectype_for_scalar_type (loop_vinfo,
7062 TREE_TYPE (op.ops[i]), slp_op[i]);
7063
7064 /* To properly compute ncopies we are interested in the widest
7065 non-reduction input type in case we're looking at a widening
7066 accumulation that we later handle in vect_transform_reduction. */
7067 if (lane_reduc_code_p
7068 && vectype_op[i]
7069 && (!vectype_in
7070 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
7071 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_op[i]))))))
7072 vectype_in = vectype_op[i];
7073
7074 if (op.code == COND_EXPR)
7075 {
7076 /* Record how the non-reduction-def value of COND_EXPR is defined. */
7077 if (dt == vect_constant_def)
7078 {
7079 cond_reduc_dt = dt;
7080 cond_reduc_val = op.ops[i];
7081 }
7082 if (dt == vect_induction_def
7083 && def_stmt_info
7084 && is_nonwrapping_integer_induction (def_stmt_info, loop))
7085 {
7086 cond_reduc_dt = dt;
7087 cond_stmt_vinfo = def_stmt_info;
7088 }
7089 }
7090 }
7091 if (!vectype_in)
7092 vectype_in = STMT_VINFO_VECTYPE (phi_info);
7093 STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in;
7094
7095 enum vect_reduction_type v_reduc_type = STMT_VINFO_REDUC_TYPE (phi_info);
7096 STMT_VINFO_REDUC_TYPE (reduc_info) = v_reduc_type;
7097 /* If we have a condition reduction, see if we can simplify it further. */
7098 if (v_reduc_type == COND_REDUCTION)
7099 {
7100 if (slp_node)
7101 return false;
7102
7103 /* When the reduction value is itself used in the condition, fail. */
7104 if (STMT_VINFO_REDUC_IDX (stmt_info) == 0)
7105 {
7106 if (dump_enabled_p ())
7107 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7108 "condition depends on previous iteration\n");
7109 return false;
7110 }
7111
7112 if (reduc_chain_length == 1
7113 && direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST,
7114 vectype_in, OPTIMIZE_FOR_SPEED))
7115 {
7116 if (dump_enabled_p ())
7117 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7118 "optimizing condition reduction with"
7119 " FOLD_EXTRACT_LAST.\n");
7120 STMT_VINFO_REDUC_TYPE (reduc_info) = EXTRACT_LAST_REDUCTION;
7121 }
7122 else if (cond_reduc_dt == vect_induction_def)
7123 {
7124 tree base
7125 = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
7126 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
7127
7128 gcc_assert (TREE_CODE (base) == INTEGER_CST
7129 && TREE_CODE (step) == INTEGER_CST);
7130 cond_reduc_val = NULL_TREE;
7131 enum tree_code cond_reduc_op_code = ERROR_MARK;
7132 tree res = PHI_RESULT (STMT_VINFO_STMT (cond_stmt_vinfo));
7133 if (!types_compatible_p (TREE_TYPE (res), TREE_TYPE (base)))
7134 ;
7135 /* Find a suitable value: below base for MAX_EXPR, above base for
7136 MIN_EXPR; punt for now if base is the minimum value of the type
7137 for MAX_EXPR or the maximum value of the type for MIN_EXPR. */
7138 else if (tree_int_cst_sgn (step) == -1)
7139 {
7140 cond_reduc_op_code = MIN_EXPR;
7141 if (tree_int_cst_sgn (base) == -1)
7142 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
7143 else if (tree_int_cst_lt (base,
7144 TYPE_MAX_VALUE (TREE_TYPE (base))))
7145 cond_reduc_val
7146 = int_const_binop (PLUS_EXPR, base, integer_one_node);
7147 }
7148 else
7149 {
7150 cond_reduc_op_code = MAX_EXPR;
7151 if (tree_int_cst_sgn (base) == 1)
7152 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
7153 else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
7154 base))
7155 cond_reduc_val
7156 = int_const_binop (MINUS_EXPR, base, integer_one_node);
7157 }
7158 if (cond_reduc_val)
7159 {
7160 if (dump_enabled_p ())
7161 dump_printf_loc (MSG_NOTE, vect_location,
7162 "condition expression based on "
7163 "integer induction.\n");
7164 STMT_VINFO_REDUC_CODE (reduc_info) = cond_reduc_op_code;
7165 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info)
7166 = cond_reduc_val;
7167 STMT_VINFO_REDUC_TYPE (reduc_info) = INTEGER_INDUC_COND_REDUCTION;
7168 }
7169 }
7170 else if (cond_reduc_dt == vect_constant_def)
7171 {
7172 enum vect_def_type cond_initial_dt;
7173 tree cond_initial_val = vect_phi_initial_value (reduc_def_phi);
7174 vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
7175 if (cond_initial_dt == vect_constant_def
7176 && types_compatible_p (TREE_TYPE (cond_initial_val),
7177 TREE_TYPE (cond_reduc_val)))
7178 {
7179 tree e = fold_binary (LE_EXPR, boolean_type_node,
7180 cond_initial_val, cond_reduc_val);
7181 if (e && (integer_onep (e) || integer_zerop (e)))
7182 {
7183 if (dump_enabled_p ())
7184 dump_printf_loc (MSG_NOTE, vect_location,
7185 "condition expression based on "
7186 "compile time constant.\n");
7187 /* Record reduction code at analysis stage. */
7188 STMT_VINFO_REDUC_CODE (reduc_info)
7189 = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
7190 STMT_VINFO_REDUC_TYPE (reduc_info) = CONST_COND_REDUCTION;
7191 }
7192 }
7193 }
7194 }
7195
7196 if (STMT_VINFO_LIVE_P (phi_info))
7197 return false;
7198
7199 if (slp_node)
7200 ncopies = 1;
7201 else
7202 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7203
7204 gcc_assert (ncopies >= 1);
7205
7206 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
7207
7208 if (nested_cycle)
7209 {
7210 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info)
7211 == vect_double_reduction_def);
7212 double_reduc = true;
7213 }
7214
7215 /* 4.2. Check support for the epilog operation.
7216
7217 If STMT represents a reduction pattern, then the type of the
7218 reduction variable may be different than the type of the rest
7219 of the arguments. For example, consider the case of accumulation
7220 of shorts into an int accumulator. The original code:
7221 S1: int_a = (int) short_a;
7222 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
7223
7224 was replaced with:
7225 STMT: int_acc = widen_sum <short_a, int_acc>
7226
7227 This means that:
7228 1. The tree-code that is used to create the vector operation in the
7229 epilog code (that reduces the partial results) is not the
7230 tree-code of STMT, but is rather the tree-code of the original
7231 stmt from the pattern that STMT is replacing. I.e, in the example
7232 above we want to use 'widen_sum' in the loop, but 'plus' in the
7233 epilog.
7234 2. The type (mode) we use to check available target support
7235 for the vector operation to be created in the *epilog*, is
7236 determined by the type of the reduction variable (in the example
7237 above we'd check this: optab_handler (plus_optab, vect_int_mode)).
7238 However the type (mode) we use to check available target support
7239 for the vector operation to be created *inside the loop*, is
7240 determined by the type of the other arguments to STMT (in the
7241 example we'd check this: optab_handler (widen_sum_optab,
7242 vect_short_mode)).
7243
7244 This is contrary to "regular" reductions, in which the types of all
7245 the arguments are the same as the type of the reduction variable.
7246 For "regular" reductions we can therefore use the same vector type
7247 (and also the same tree-code) when generating the epilog code and
7248 when generating the code inside the loop. */
7249
7250 code_helper orig_code = STMT_VINFO_REDUC_CODE (phi_info);
7251 STMT_VINFO_REDUC_CODE (reduc_info) = orig_code;
7252
7253 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
7254 if (reduction_type == TREE_CODE_REDUCTION)
7255 {
7256 /* Check whether it's ok to change the order of the computation.
7257 Generally, when vectorizing a reduction we change the order of the
7258 computation. This may change the behavior of the program in some
7259 cases, so we need to check that this is ok. One exception is when
7260 vectorizing an outer-loop: the inner-loop is executed sequentially,
7261 and therefore vectorizing reductions in the inner-loop during
7262 outer-loop vectorization is safe. Likewise when we are vectorizing
7263 a series of reductions using SLP and the VF is one, the reductions
7264 are performed in scalar order. */
7265 if (slp_node
7266 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
7267 && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1u))
7268 ;
7269 else if (needs_fold_left_reduction_p (op.type, orig_code))
7270 {
7271 /* When vectorizing a reduction chain w/o SLP the reduction PHI
7272 is not directly used in the stmt. */
7273 if (!only_slp_reduc_chain
7274 && reduc_chain_length != 1)
7275 {
7276 if (dump_enabled_p ())
7277 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7278 "in-order reduction chain without SLP.\n");
7279 return false;
7280 }
7281 STMT_VINFO_REDUC_TYPE (reduc_info)
7282 = reduction_type = FOLD_LEFT_REDUCTION;
7283 }
7284 else if (!commutative_binary_op_p (orig_code, op.type)
7285 || !associative_binary_op_p (orig_code, op.type))
7286 {
7287 if (dump_enabled_p ())
7288 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7289 "reduction: not commutative/associative");
7290 return false;
7291 }
7292 }
7293
7294 if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
7295 && ncopies > 1)
7296 {
7297 if (dump_enabled_p ())
7298 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7299 "multiple types in double reduction or condition "
7300 "reduction or fold-left reduction.\n");
7301 return false;
7302 }
7303
7304 internal_fn reduc_fn = IFN_LAST;
7305 if (reduction_type == TREE_CODE_REDUCTION
7306 || reduction_type == FOLD_LEFT_REDUCTION
7307 || reduction_type == INTEGER_INDUC_COND_REDUCTION
7308 || reduction_type == CONST_COND_REDUCTION)
7309 {
7310 if (reduction_type == FOLD_LEFT_REDUCTION
7311 ? fold_left_reduction_fn (orig_code, &reduc_fn)
7312 : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
7313 {
7314 if (reduc_fn != IFN_LAST
7315 && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
7316 OPTIMIZE_FOR_SPEED))
7317 {
7318 if (dump_enabled_p ())
7319 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7320 "reduc op not supported by target.\n");
7321
7322 reduc_fn = IFN_LAST;
7323 }
7324 }
7325 else
7326 {
7327 if (!nested_cycle || double_reduc)
7328 {
7329 if (dump_enabled_p ())
7330 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7331 "no reduc code for scalar code.\n");
7332
7333 return false;
7334 }
7335 }
7336 }
7337 else if (reduction_type == COND_REDUCTION)
7338 {
7339 int scalar_precision
7340 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (op.type));
7341 cr_index_scalar_type = make_unsigned_type (scalar_precision);
7342 cr_index_vector_type = get_same_sized_vectype (cr_index_scalar_type,
7343 vectype_out);
7344
7345 if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
7346 OPTIMIZE_FOR_SPEED))
7347 reduc_fn = IFN_REDUC_MAX;
7348 }
7349 STMT_VINFO_REDUC_FN (reduc_info) = reduc_fn;
7350
7351 if (reduction_type != EXTRACT_LAST_REDUCTION
7352 && (!nested_cycle || double_reduc)
7353 && reduc_fn == IFN_LAST
7354 && !nunits_out.is_constant ())
7355 {
7356 if (dump_enabled_p ())
7357 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7358 "missing target support for reduction on"
7359 " variable-length vectors.\n");
7360 return false;
7361 }
7362
7363 /* For SLP reductions, see if there is a neutral value we can use. */
7364 tree neutral_op = NULL_TREE;
7365 if (slp_node)
7366 {
7367 tree initial_value = NULL_TREE;
7368 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL)
7369 initial_value = vect_phi_initial_value (reduc_def_phi);
7370 neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype_out),
7371 orig_code, initial_value);
7372 }
7373
7374 if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
7375 {
7376 /* We can't support in-order reductions of code such as this:
7377
7378 for (int i = 0; i < n1; ++i)
7379 for (int j = 0; j < n2; ++j)
7380 l += a[j];
7381
7382 since GCC effectively transforms the loop when vectorizing:
7383
7384 for (int i = 0; i < n1 / VF; ++i)
7385 for (int j = 0; j < n2; ++j)
7386 for (int k = 0; k < VF; ++k)
7387 l += a[j];
7388
7389 which is a reassociation of the original operation. */
7390 if (dump_enabled_p ())
7391 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7392 "in-order double reduction not supported.\n");
7393
7394 return false;
7395 }
7396
7397 if (reduction_type == FOLD_LEFT_REDUCTION
7398 && slp_node
7399 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7400 {
7401 /* We cannot use in-order reductions in this case because there is
7402 an implicit reassociation of the operations involved. */
7403 if (dump_enabled_p ())
7404 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7405 "in-order unchained SLP reductions not supported.\n");
7406 return false;
7407 }
7408
7409 /* For double reductions, and for SLP reductions with a neutral value,
7410 we construct a variable-length initial vector by loading a vector
7411 full of the neutral value and then shift-and-inserting the start
7412 values into the low-numbered elements. */
7413 if ((double_reduc || neutral_op)
7414 && !nunits_out.is_constant ()
7415 && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
7416 vectype_out, OPTIMIZE_FOR_SPEED))
7417 {
7418 if (dump_enabled_p ())
7419 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7420 "reduction on variable-length vectors requires"
7421 " target support for a vector-shift-and-insert"
7422 " operation.\n");
7423 return false;
7424 }
7425
7426 /* Check extra constraints for variable-length unchained SLP reductions. */
7427 if (slp_node
7428 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
7429 && !nunits_out.is_constant ())
7430 {
7431 /* We checked above that we could build the initial vector when
7432 there's a neutral element value. Check here for the case in
7433 which each SLP statement has its own initial value and in which
7434 that value needs to be repeated for every instance of the
7435 statement within the initial vector. */
7436 unsigned int group_size = SLP_TREE_LANES (slp_node);
7437 if (!neutral_op
7438 && !can_duplicate_and_interleave_p (loop_vinfo, group_size,
7439 TREE_TYPE (vectype_out)))
7440 {
7441 if (dump_enabled_p ())
7442 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7443 "unsupported form of SLP reduction for"
7444 " variable-length vectors: cannot build"
7445 " initial vector.\n");
7446 return false;
7447 }
7448 /* The epilogue code relies on the number of elements being a multiple
7449 of the group size. The duplicate-and-interleave approach to setting
7450 up the initial vector does too. */
7451 if (!multiple_p (nunits_out, group_size))
7452 {
7453 if (dump_enabled_p ())
7454 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7455 "unsupported form of SLP reduction for"
7456 " variable-length vectors: the vector size"
7457 " is not a multiple of the number of results.\n");
7458 return false;
7459 }
7460 }
7461
7462 if (reduction_type == COND_REDUCTION)
7463 {
7464 widest_int ni;
7465
7466 if (! max_loop_iterations (loop, &ni))
7467 {
7468 if (dump_enabled_p ())
7469 dump_printf_loc (MSG_NOTE, vect_location,
7470 "loop count not known, cannot create cond "
7471 "reduction.\n");
7472 return false;
7473 }
7474 /* Convert backedges to iterations. */
7475 ni += 1;
7476
7477 /* The additional index will be the same type as the condition. Check
7478 that the loop can fit into this less one (because we'll use up the
7479 zero slot for when there are no matches). */
7480 tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
7481 if (wi::geu_p (ni, wi::to_widest (max_index)))
7482 {
7483 if (dump_enabled_p ())
7484 dump_printf_loc (MSG_NOTE, vect_location,
7485 "loop size is greater than data size.\n");
7486 return false;
7487 }
7488 }
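
  /* Worked example of the check above (values chosen for illustration):
     with a 32-bit index type, max_index is 0xffffffff, so a loop of at
     most 0xfffffffe iterations is accepted; index 0 stays reserved for
     the "no match" case.  */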
7489
7490 /* In case the vectorization factor (VF) is bigger than the number
7491 of elements that we can fit in a vectype (nunits), we have to generate
7492 more than one vector stmt - i.e., we need to "unroll" the
7493 vector stmt by a factor VF/nunits. For more details see documentation
7494 in vectorizable_operation. */
7495
7496 /* If the reduction is used in an outer loop we need to generate
7497 VF intermediate results, like so (e.g. for ncopies=2):
7498 r0 = phi (init, r0)
7499 r1 = phi (init, r1)
7500 r0 = x0 + r0;
7501 r1 = x1 + r1;
7502 (i.e. we generate VF results in 2 registers).
7503 In this case we have a separate def-use cycle for each copy, and therefore
7504 for each copy we get the vector def for the reduction variable from the
7505 respective phi node created for this copy.
7506
7507 Otherwise (the reduction is unused in the loop nest), we can combine
7508 together intermediate results, like so (e.g. for ncopies=2):
7509 r = phi (init, r)
7510 r = x0 + r;
7511 r = x1 + r;
7512 (i.e. we generate VF/2 results in a single register).
7513 In this case for each copy we get the vector def for the reduction variable
7514 from the vectorized reduction operation generated in the previous iteration.
7515
7516 This only works when we see both the reduction PHI and its only consumer
7517 in vectorizable_reduction and there are no intermediate stmts
7518 participating. When unrolling we want each unrolled iteration to have its
7519 own reduction accumulator since one of the main goals of unrolling a
7520 reduction is to reduce the aggregate loop-carried latency. */
7521 if (ncopies > 1
7522 && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
7523 && reduc_chain_length == 1
7524 && loop_vinfo->suggested_unroll_factor == 1)
7525 single_defuse_cycle = true;
7526
7527 if (single_defuse_cycle || lane_reduc_code_p)
7528 {
7529 gcc_assert (op.code != COND_EXPR);
7530
7531 /* 4. Supportable by target? */
7532 bool ok = true;
7533
7534 /* 4.1. check support for the operation in the loop
7535
7536 This isn't necessary for the lane reduction codes, since they
7537 can only be produced by pattern matching, and it's up to the
7538 pattern matcher to test for support. The main reason for
7539 specifically skipping this step is to avoid rechecking whether
7540 mixed-sign dot-products can be implemented using signed
7541 dot-products. */
7542 machine_mode vec_mode = TYPE_MODE (vectype_in);
7543 if (!lane_reduc_code_p
7544 && !directly_supported_p (op.code, vectype_in, optab_vector))
7545 {
7546 if (dump_enabled_p ())
7547 dump_printf (MSG_NOTE, "op not supported by target.\n");
7548 if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
7549 || !vect_can_vectorize_without_simd_p (op.code))
7550 ok = false;
7551 else
7552 if (dump_enabled_p ())
7553 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
7554 }
7555
7556 if (vect_emulated_vector_p (vectype_in)
7557 && !vect_can_vectorize_without_simd_p (op.code))
7558 {
7559 if (dump_enabled_p ())
7560 dump_printf (MSG_NOTE, "using word mode not possible.\n");
7561 return false;
7562 }
7563
7564 /* lane-reducing operations have to go through vect_transform_reduction.
7565 For the other cases try without the single cycle optimization. */
7566 if (!ok)
7567 {
7568 if (lane_reduc_code_p)
7569 return false;
7570 else
7571 single_defuse_cycle = false;
7572 }
7573 }
7574 STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info) = single_defuse_cycle;
7575
7576 /* If the reduction stmt is one of the patterns that have lane
7577 reduction embedded we cannot handle the case of ! single_defuse_cycle. */
7578 if ((ncopies > 1 && ! single_defuse_cycle)
7579 && lane_reduc_code_p)
7580 {
7581 if (dump_enabled_p ())
7582 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7583 "multi def-use cycle not possible for lane-reducing "
7584 "reduction operation\n");
7585 return false;
7586 }
7587
7588 if (slp_node
7589 && !(!single_defuse_cycle
7590 && !lane_reduc_code_p
7591 && reduction_type != FOLD_LEFT_REDUCTION))
7592 for (i = 0; i < (int) op.num_ops; i++)
7593 if (!vect_maybe_update_slp_op_vectype (slp_op[i], vectype_op[i]))
7594 {
7595 if (dump_enabled_p ())
7596 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7597 "incompatible vector types for invariants\n");
7598 return false;
7599 }
7600
7601 if (slp_node)
7602 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7603 else
7604 vec_num = 1;
7605
7606 vect_model_reduction_cost (loop_vinfo, stmt_info, reduc_fn,
7607 reduction_type, ncopies, cost_vec);
7608 /* Cost the reduction op inside the loop if transformed via
7609 vect_transform_reduction. Otherwise this is costed by the
7610 separate vectorizable_* routines. */
7611 if (single_defuse_cycle || lane_reduc_code_p)
7612 {
7613 int factor = 1;
7614 if (vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info))
7615 /* Three dot-products and a subtraction. */
7616 factor = 4;
7617 record_stmt_cost (cost_vec, ncopies * factor, vector_stmt,
7618 stmt_info, 0, vect_body);
7619 }
7620
7621 if (dump_enabled_p ()
7622 && reduction_type == FOLD_LEFT_REDUCTION)
7623 dump_printf_loc (MSG_NOTE, vect_location,
7624 "using an in-order (fold-left) reduction.\n");
7625 STMT_VINFO_TYPE (orig_stmt_of_analysis) = cycle_phi_info_type;
7626 /* All but single defuse-cycle optimized, lane-reducing and fold-left
7627 reductions go through their own vectorizable_* routines. */
7628 if (!single_defuse_cycle
7629 && !lane_reduc_code_p
7630 && reduction_type != FOLD_LEFT_REDUCTION)
7631 {
7632 stmt_vec_info tem
7633 = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
7634 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (tem))
7635 {
7636 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (tem));
7637 tem = REDUC_GROUP_FIRST_ELEMENT (tem);
7638 }
7639 STMT_VINFO_DEF_TYPE (vect_orig_stmt (tem)) = vect_internal_def;
7640 STMT_VINFO_DEF_TYPE (tem) = vect_internal_def;
7641 }
7642 else if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
7643 {
7644 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
7645 internal_fn cond_fn = get_conditional_internal_fn (op.code, op.type);
7646
7647 if (reduction_type != FOLD_LEFT_REDUCTION
7648 && !use_mask_by_cond_expr_p (op.code, cond_fn, vectype_in)
7649 && (cond_fn == IFN_LAST
7650 || !direct_internal_fn_supported_p (cond_fn, vectype_in,
7651 OPTIMIZE_FOR_SPEED)))
7652 {
7653 if (dump_enabled_p ())
7654 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7655 "can't operate on partial vectors because"
7656 " no conditional operation is available.\n");
7657 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
7658 }
7659 else if (reduction_type == FOLD_LEFT_REDUCTION
7660 && reduc_fn == IFN_LAST
7661 && !expand_vec_cond_expr_p (vectype_in,
7662 truth_type_for (vectype_in),
7663 SSA_NAME))
7664 {
7665 if (dump_enabled_p ())
7666 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7667 "can't operate on partial vectors because"
7668 " no conditional operation is available.\n");
7669 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
7670 }
7671 else
7672 vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
7673 vectype_in, NULL);
7674 }
7675 return true;
7676 }
7677
7678 /* STMT_INFO is a dot-product reduction whose multiplication operands
7679 have different signs. Emit a sequence to emulate the operation
7680 using a series of signed DOT_PROD_EXPRs and return the last
7681 statement generated. VEC_DEST is the result of the vector operation
7682 and VOP lists its inputs. */
7683
7684 static gassign *
7685 vect_emulate_mixed_dot_prod (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
7686 gimple_stmt_iterator *gsi, tree vec_dest,
7687 tree vop[3])
7688 {
7689 tree wide_vectype = signed_type_for (TREE_TYPE (vec_dest));
7690 tree narrow_vectype = signed_type_for (TREE_TYPE (vop[0]));
7691 tree narrow_elttype = TREE_TYPE (narrow_vectype);
7692 gimple *new_stmt;
7693
7694 /* Make VOP[0] the unsigned operand and VOP[1] the signed operand. */
7695 if (!TYPE_UNSIGNED (TREE_TYPE (vop[0])))
7696 std::swap (vop[0], vop[1]);
7697
7698 /* Convert all inputs to signed types. */
7699 for (int i = 0; i < 3; ++i)
7700 if (TYPE_UNSIGNED (TREE_TYPE (vop[i])))
7701 {
7702 tree tmp = make_ssa_name (signed_type_for (TREE_TYPE (vop[i])));
7703 new_stmt = gimple_build_assign (tmp, NOP_EXPR, vop[i]);
7704 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
7705 vop[i] = tmp;
7706 }
7707
7708 /* In the comments below we assume 8-bit inputs for simplicity,
7709 but the approach works for any full integer type. */
7710
7711 /* Create a vector of -128. */
7712 tree min_narrow_elttype = TYPE_MIN_VALUE (narrow_elttype);
7713 tree min_narrow = build_vector_from_val (narrow_vectype,
7714 min_narrow_elttype);
7715
7716 /* Create a vector of 64. */
7717 auto half_wi = wi::lrshift (wi::to_wide (min_narrow_elttype), 1);
7718 tree half_narrow = wide_int_to_tree (narrow_elttype, half_wi);
7719 half_narrow = build_vector_from_val (narrow_vectype, half_narrow);
7720
7721 /* Emit: SUB_RES = VOP[0] - 128. */
7722 tree sub_res = make_ssa_name (narrow_vectype);
7723 new_stmt = gimple_build_assign (sub_res, PLUS_EXPR, vop[0], min_narrow);
7724 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
7725
7726 /* Emit:
7727
7728 STAGE1 = DOT_PROD_EXPR <VOP[1], 64, VOP[2]>;
7729 STAGE2 = DOT_PROD_EXPR <VOP[1], 64, STAGE1>;
7730 STAGE3 = DOT_PROD_EXPR <SUB_RES, -128, STAGE2>;
7731
7732 on the basis that x * y == (x - 128) * y + 64 * y + 64 * y.
7733 Doing the two 64 * y steps first allows more time to compute x. */
7734 tree stage1 = make_ssa_name (wide_vectype);
7735 new_stmt = gimple_build_assign (stage1, DOT_PROD_EXPR,
7736 vop[1], half_narrow, vop[2]);
7737 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
7738
7739 tree stage2 = make_ssa_name (wide_vectype);
7740 new_stmt = gimple_build_assign (stage2, DOT_PROD_EXPR,
7741 vop[1], half_narrow, stage1);
7742 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
7743
7744 tree stage3 = make_ssa_name (wide_vectype);
7745 new_stmt = gimple_build_assign (stage3, DOT_PROD_EXPR,
7746 sub_res, vop[1], stage2);
7747 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
7748
7749 /* Convert STAGE3 to the reduction type. */
7750 return gimple_build_assign (vec_dest, CONVERT_EXPR, stage3);
7751 }
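
/* Numeric check of the identity used above (8-bit example, values
   chosen for illustration): for x = 200 (unsigned) and y = -3 (signed),
     (x - 128) * y + 64 * y + 64 * y = 72 * -3 - 192 - 192 = -600 = x * y,
   and every multiplication operand now lies in the signed range
   [-128, 127] required by the signed DOT_PROD_EXPRs.  */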
7752
7753 /* Transform the definition stmt STMT_INFO of a reduction PHI backedge
7754 value. */
7755
7756 bool
7757 vect_transform_reduction (loop_vec_info loop_vinfo,
7758 stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
7759 gimple **vec_stmt, slp_tree slp_node)
7760 {
7761 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7762 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7763 int i;
7764 int ncopies;
7765 int vec_num;
7766
7767 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
7768 gcc_assert (reduc_info->is_reduc_info);
7769
7770 if (nested_in_vect_loop_p (loop, stmt_info))
7771 {
7772 loop = loop->inner;
7773 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info) == vect_double_reduction_def);
7774 }
7775
7776 gimple_match_op op;
7777 if (!gimple_extract_op (stmt_info->stmt, &op))
7778 gcc_unreachable ();
7779
7780 /* All uses but the last are expected to be defined in the loop.
7781 The last use is the reduction variable. In case of nested cycle this
7782 assumption is not true: we use reduc_index to record the index of the
7783 reduction variable. */
7784 stmt_vec_info phi_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info));
7785 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
7786 int reduc_index = STMT_VINFO_REDUC_IDX (stmt_info);
7787 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
7788
7789 if (slp_node)
7790 {
7791 ncopies = 1;
7792 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7793 }
7794 else
7795 {
7796 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7797 vec_num = 1;
7798 }
7799
7800 code_helper code = canonicalize_code (op.code, op.type);
7801 internal_fn cond_fn = get_conditional_internal_fn (code, op.type);
7802 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
7803 bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
7804
7805 /* Transform. */
7806 tree new_temp = NULL_TREE;
7807 auto_vec<tree> vec_oprnds0;
7808 auto_vec<tree> vec_oprnds1;
7809 auto_vec<tree> vec_oprnds2;
7810 tree def0;
7811
7812 if (dump_enabled_p ())
7813 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
7814
7815 /* FORNOW: Multiple types are not supported for condition. */
7816 if (code == COND_EXPR)
7817 gcc_assert (ncopies == 1);
7818
7819 bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
7820
7821 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
7822 if (reduction_type == FOLD_LEFT_REDUCTION)
7823 {
7824 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
7825 gcc_assert (code.is_tree_code ());
7826 return vectorize_fold_left_reduction
7827 (loop_vinfo, stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi,
7828 tree_code (code), reduc_fn, op.ops, vectype_in, reduc_index, masks);
7829 }
7830
7831 bool single_defuse_cycle = STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info);
7832 gcc_assert (single_defuse_cycle
7833 || code == DOT_PROD_EXPR
7834 || code == WIDEN_SUM_EXPR
7835 || code == SAD_EXPR);
7836
7837 /* Create the destination vector */
7838 tree scalar_dest = gimple_get_lhs (stmt_info->stmt);
7839 tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
7840
7841 vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
7842 single_defuse_cycle && reduc_index == 0
7843 ? NULL_TREE : op.ops[0], &vec_oprnds0,
7844 single_defuse_cycle && reduc_index == 1
7845 ? NULL_TREE : op.ops[1], &vec_oprnds1,
7846 op.num_ops == 3
7847 && !(single_defuse_cycle && reduc_index == 2)
7848 ? op.ops[2] : NULL_TREE, &vec_oprnds2);
7849 if (single_defuse_cycle)
7850 {
7851 gcc_assert (!slp_node);
7852 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
7853 op.ops[reduc_index],
7854 reduc_index == 0 ? &vec_oprnds0
7855 : (reduc_index == 1 ? &vec_oprnds1
7856 : &vec_oprnds2));
7857 }
7858
7859 bool emulated_mixed_dot_prod
7860 = vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info);
7861 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
7862 {
7863 gimple *new_stmt;
7864 tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
7865 if (masked_loop_p && !mask_by_cond_expr)
7866 {
7867 /* No conditional ifns have been defined for dot-product yet. */
7868 gcc_assert (code != DOT_PROD_EXPR);
7869
7870 /* Make sure that the reduction accumulator is vop[0]. */
7871 if (reduc_index == 1)
7872 {
7873 gcc_assert (commutative_binary_op_p (code, op.type));
7874 std::swap (vop[0], vop[1]);
7875 }
7876 tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7877 vectype_in, i);
7878 gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
7879 vop[0], vop[1], vop[0]);
7880 new_temp = make_ssa_name (vec_dest, call);
7881 gimple_call_set_lhs (call, new_temp);
7882 gimple_call_set_nothrow (call, true);
7883 vect_finish_stmt_generation (loop_vinfo, stmt_info, call, gsi);
7884 new_stmt = call;
7885 }
7886 else
7887 {
7888 if (op.num_ops == 3)
7889 vop[2] = vec_oprnds2[i];
7890
7891 if (masked_loop_p && mask_by_cond_expr)
7892 {
7893 tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7894 vectype_in, i);
7895 build_vect_cond_expr (code, vop, mask, gsi);
7896 }
7897
7898 if (emulated_mixed_dot_prod)
7899 new_stmt = vect_emulate_mixed_dot_prod (loop_vinfo, stmt_info, gsi,
7900 vec_dest, vop);
7901 else if (code.is_internal_fn ())
7902 new_stmt = gimple_build_call_internal (internal_fn (code),
7903 op.num_ops,
7904 vop[0], vop[1], vop[2]);
7905 else
7906 new_stmt = gimple_build_assign (vec_dest, tree_code (op.code),
7907 vop[0], vop[1], vop[2]);
7908 new_temp = make_ssa_name (vec_dest, new_stmt);
7909 gimple_set_lhs (new_stmt, new_temp);
7910 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
7911 }
7912
7913 if (slp_node)
7914 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
7915 else if (single_defuse_cycle
7916 && i < ncopies - 1)
7917 {
7918 if (reduc_index == 0)
7919 vec_oprnds0.safe_push (gimple_get_lhs (new_stmt));
7920 else if (reduc_index == 1)
7921 vec_oprnds1.safe_push (gimple_get_lhs (new_stmt));
7922 else if (reduc_index == 2)
7923 vec_oprnds2.safe_push (gimple_get_lhs (new_stmt));
7924 }
7925 else
7926 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
7927 }
7928
7929 if (!slp_node)
7930 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
7931
7932 return true;
7933 }
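
/* Illustrative sketch of the fully-masked path above when a conditional
   internal function is available (GIMPLE-like; SSA names invented for
   the example): a masked vector add into the accumulator becomes

     vect_acc_2 = .COND_ADD (loop_mask_1, vect_acc_1, vect_x_3, vect_acc_1);

   i.e. lanes switched off by the mask keep the previous accumulator
   value, matching the (mask, vop[0], vop[1], vop[0]) operand order used
   above.  */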
7934
7935 /* Transform phase of a cycle PHI. */
7936
7937 bool
7938 vect_transform_cycle_phi (loop_vec_info loop_vinfo,
7939 stmt_vec_info stmt_info, gimple **vec_stmt,
7940 slp_tree slp_node, slp_instance slp_node_instance)
7941 {
7942 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7943 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7944 int i;
7945 int ncopies;
7946 int j;
7947 bool nested_cycle = false;
7948 int vec_num;
7949
7950 if (nested_in_vect_loop_p (loop, stmt_info))
7951 {
7952 loop = loop->inner;
7953 nested_cycle = true;
7954 }
7955
7956 stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
7957 reduc_stmt_info = vect_stmt_to_vectorize (reduc_stmt_info);
7958 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
7959 gcc_assert (reduc_info->is_reduc_info);
7960
7961 if (STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION
7962 || STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION)
7963 /* Leave the scalar phi in place. */
7964 return true;
7965
7966 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
7967 /* For a nested cycle we do not fill the above. */
7968 if (!vectype_in)
7969 vectype_in = STMT_VINFO_VECTYPE (stmt_info);
7970 gcc_assert (vectype_in);
7971
7972 if (slp_node)
7973 {
7974 /* The size vect_schedule_slp_instance computes is off for us. */
7975 vec_num = vect_get_num_vectors (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
7976 * SLP_TREE_LANES (slp_node), vectype_in);
7977 ncopies = 1;
7978 }
7979 else
7980 {
7981 vec_num = 1;
7982 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7983 }
7984
7985 /* Check whether we should use a single PHI node and accumulate
7986 vectors to one before the backedge. */
7987 if (STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info))
7988 ncopies = 1;
7989
7990 /* Create the destination vector */
7991 gphi *phi = as_a <gphi *> (stmt_info->stmt);
7992 tree vec_dest = vect_create_destination_var (gimple_phi_result (phi),
7993 vectype_out);
7994
7995 /* Get the loop-entry arguments. */
7996 tree vec_initial_def = NULL_TREE;
7997 auto_vec<tree> vec_initial_defs;
7998 if (slp_node)
7999 {
8000 vec_initial_defs.reserve (vec_num);
8001 if (nested_cycle)
8002 {
8003 unsigned phi_idx = loop_preheader_edge (loop)->dest_idx;
8004 vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[phi_idx],
8005 &vec_initial_defs);
8006 }
8007 else
8008 {
8009 gcc_assert (slp_node == slp_node_instance->reduc_phis);
8010 vec<tree> &initial_values = reduc_info->reduc_initial_values;
8011 vec<stmt_vec_info> &stmts = SLP_TREE_SCALAR_STMTS (slp_node);
8012
8013 unsigned int num_phis = stmts.length ();
8014 if (REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info))
8015 num_phis = 1;
8016 initial_values.reserve (num_phis);
8017 for (unsigned int i = 0; i < num_phis; ++i)
8018 {
8019 gphi *this_phi = as_a<gphi *> (stmts[i]->stmt);
8020 initial_values.quick_push (vect_phi_initial_value (this_phi));
8021 }
8022 if (vec_num == 1)
8023 vect_find_reusable_accumulator (loop_vinfo, reduc_info);
8024 if (!initial_values.is_empty ())
8025 {
8026 tree initial_value
8027 = (num_phis == 1 ? initial_values[0] : NULL_TREE);
8028 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
8029 tree neutral_op
8030 = neutral_op_for_reduction (TREE_TYPE (vectype_out),
8031 code, initial_value);
8032 get_initial_defs_for_reduction (loop_vinfo, reduc_info,
8033 &vec_initial_defs, vec_num,
8034 stmts.length (), neutral_op);
8035 }
8036 }
8037 }
8038 else
8039 {
8040 /* Get at the scalar def before the loop, that defines the initial
8041 value of the reduction variable. */
8042 tree initial_def = vect_phi_initial_value (phi);
8043 reduc_info->reduc_initial_values.safe_push (initial_def);
8044 /* Optimize: if for REDUC_MAX the initial_def is smaller than the base
8045 and we can't use zero for induc_val, use initial_def. Similarly
8046 for REDUC_MIN and an initial_def larger than the base. */
8047 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
8048 {
8049 tree induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
8050 if (TREE_CODE (initial_def) == INTEGER_CST
8051 && !integer_zerop (induc_val)
8052 && ((STMT_VINFO_REDUC_CODE (reduc_info) == MAX_EXPR
8053 && tree_int_cst_lt (initial_def, induc_val))
8054 || (STMT_VINFO_REDUC_CODE (reduc_info) == MIN_EXPR
8055 && tree_int_cst_lt (induc_val, initial_def))))
8056 {
8057 induc_val = initial_def;
8058 /* Communicate that we used the initial_def to epilogue
8059 generation. */
8060 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info) = NULL_TREE;
8061 }
8062 vec_initial_def = build_vector_from_val (vectype_out, induc_val);
8063 }
8064 else if (nested_cycle)
8065 {
8066 /* Do not use an adjustment def as that case is not supported
8067 correctly if ncopies is not one. */
8068 vect_get_vec_defs_for_operand (loop_vinfo, reduc_stmt_info,
8069 ncopies, initial_def,
8070 &vec_initial_defs);
8071 }
8072 else if (STMT_VINFO_REDUC_TYPE (reduc_info) == CONST_COND_REDUCTION
8073 || STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
8074 /* Fill the initial vector with the initial scalar value. */
8075 vec_initial_def
8076 = get_initial_def_for_reduction (loop_vinfo, reduc_stmt_info,
8077 initial_def, initial_def);
8078 else
8079 {
8080 if (ncopies == 1)
8081 vect_find_reusable_accumulator (loop_vinfo, reduc_info);
8082 if (!reduc_info->reduc_initial_values.is_empty ())
8083 {
8084 initial_def = reduc_info->reduc_initial_values[0];
8085 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
8086 tree neutral_op
8087 = neutral_op_for_reduction (TREE_TYPE (initial_def),
8088 code, initial_def);
8089 gcc_assert (neutral_op);
8090 /* Try to simplify the vector initialization by applying an
8091 adjustment after the reduction has been performed. */
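/* For example (illustrative): for a PLUS reduction with initial value 10
   we can start the vector accumulator from the neutral value 0 and add
   the 10 back as an epilogue adjustment.  */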
8092 if (!reduc_info->reused_accumulator
8093 && STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
8094 && !operand_equal_p (neutral_op, initial_def))
8095 {
8096 STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info)
8097 = initial_def;
8098 initial_def = neutral_op;
8099 }
8100 vec_initial_def
8101 = get_initial_def_for_reduction (loop_vinfo, reduc_info,
8102 initial_def, neutral_op);
8103 }
8104 }
8105 }
8106
8107 if (vec_initial_def)
8108 {
8109 vec_initial_defs.create (ncopies);
8110 for (i = 0; i < ncopies; ++i)
8111 vec_initial_defs.quick_push (vec_initial_def);
8112 }
8113
8114 if (auto *accumulator = reduc_info->reused_accumulator)
8115 {
8116 tree def = accumulator->reduc_input;
8117 if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
8118 {
8119 unsigned int nreduc;
8120 bool res = constant_multiple_p (TYPE_VECTOR_SUBPARTS
8121 (TREE_TYPE (def)),
8122 TYPE_VECTOR_SUBPARTS (vectype_out),
8123 &nreduc);
8124 gcc_assert (res);
8125 gimple_seq stmts = NULL;
8126 /* Reduce the single vector to a smaller one. */
8127 if (nreduc != 1)
8128 {
8129 /* Perform the reduction in the appropriate type. */
8130 tree rvectype = vectype_out;
8131 if (!useless_type_conversion_p (TREE_TYPE (vectype_out),
8132 TREE_TYPE (TREE_TYPE (def))))
8133 rvectype = build_vector_type (TREE_TYPE (TREE_TYPE (def)),
8134 TYPE_VECTOR_SUBPARTS
8135 (vectype_out));
8136 def = vect_create_partial_epilog (def, rvectype,
8137 STMT_VINFO_REDUC_CODE
8138 (reduc_info),
8139 &stmts);
8140 }
8141 /* The epilogue loop might use a different vector mode, like
8142 VNx2DI vs. V2DI. */
8143 if (TYPE_MODE (vectype_out) != TYPE_MODE (TREE_TYPE (def)))
8144 {
8145 tree reduc_type = build_vector_type_for_mode
8146 (TREE_TYPE (TREE_TYPE (def)), TYPE_MODE (vectype_out));
8147 def = gimple_convert (&stmts, reduc_type, def);
8148 }
8149 /* Adjust the input so we pick up the partially reduced value
8150 for the skip edge in vect_create_epilog_for_reduction. */
8151 accumulator->reduc_input = def;
8152 /* And the reduction could be carried out using a different sign. */
8153 if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
8154 def = gimple_convert (&stmts, vectype_out, def);
8155 if (loop_vinfo->main_loop_edge)
8156 {
8157 /* While we'd like to insert on the edge, this would split
8158 blocks and disturb bookkeeping; we will also eventually
8159 need this on the skip edge. Rely on sinking to
8160 fix up the optimal placement and insert in the pred. */
8161 gimple_stmt_iterator gsi
8162 = gsi_last_bb (loop_vinfo->main_loop_edge->src);
8163 /* Insert before a cond that eventually skips the
8164 epilogue. */
8165 if (!gsi_end_p (gsi) && stmt_ends_bb_p (gsi_stmt (gsi)))
8166 gsi_prev (&gsi);
8167 gsi_insert_seq_after (&gsi, stmts, GSI_CONTINUE_LINKING);
8168 }
8169 else
8170 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop),
8171 stmts);
8172 }
8173 if (loop_vinfo->main_loop_edge)
8174 vec_initial_defs[0]
8175 = vect_get_main_loop_result (loop_vinfo, def,
8176 vec_initial_defs[0]);
8177 else
8178 vec_initial_defs.safe_push (def);
8179 }
8180
8181 /* Generate the reduction PHIs upfront. */
8182 for (i = 0; i < vec_num; i++)
8183 {
8184 tree vec_init_def = vec_initial_defs[i];
8185 for (j = 0; j < ncopies; j++)
8186 {
8187 /* Create the reduction-phi that defines the reduction
8188 operand. */
8189 gphi *new_phi = create_phi_node (vec_dest, loop->header);
8190
8191 /* Set the loop-entry arg of the reduction-phi. */
8192 if (j != 0 && nested_cycle)
8193 vec_init_def = vec_initial_defs[j];
8194 add_phi_arg (new_phi, vec_init_def, loop_preheader_edge (loop),
8195 UNKNOWN_LOCATION);
8196
8197 /* The loop-latch arg is set in epilogue processing. */
8198
8199 if (slp_node)
8200 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
8201 else
8202 {
8203 if (j == 0)
8204 *vec_stmt = new_phi;
8205 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
8206 }
8207 }
8208 }
8209
8210 return true;
8211 }
8212
8213 /* Vectorizes LC PHIs. */
8214
8215 bool
8216 vectorizable_lc_phi (loop_vec_info loop_vinfo,
8217 stmt_vec_info stmt_info, gimple **vec_stmt,
8218 slp_tree slp_node)
8219 {
8220 if (!loop_vinfo
8221 || !is_a <gphi *> (stmt_info->stmt)
8222 || gimple_phi_num_args (stmt_info->stmt) != 1)
8223 return false;
8224
8225 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def
8226 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
8227 return false;
8228
8229 if (!vec_stmt) /* transformation not required. */
8230 {
8231 /* Deal with copies from externs or constants that are disguised as
8232 loop-closed PHI nodes (PR97886). */
8233 if (slp_node
8234 && !vect_maybe_update_slp_op_vectype (SLP_TREE_CHILDREN (slp_node)[0],
8235 SLP_TREE_VECTYPE (slp_node)))
8236 {
8237 if (dump_enabled_p ())
8238 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8239 "incompatible vector types for invariants\n");
8240 return false;
8241 }
8242 STMT_VINFO_TYPE (stmt_info) = lc_phi_info_type;
8243 return true;
8244 }
8245
8246 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
8247 tree scalar_dest = gimple_phi_result (stmt_info->stmt);
8248 basic_block bb = gimple_bb (stmt_info->stmt);
8249 edge e = single_pred_edge (bb);
8250 tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
8251 auto_vec<tree> vec_oprnds;
8252 vect_get_vec_defs (loop_vinfo, stmt_info, slp_node,
8253 !slp_node ? vect_get_num_copies (loop_vinfo, vectype) : 1,
8254 gimple_phi_arg_def (stmt_info->stmt, 0), &vec_oprnds);
8255 for (unsigned i = 0; i < vec_oprnds.length (); i++)
8256 {
8257 /* Create the vectorized LC PHI node. */
8258 gphi *new_phi = create_phi_node (vec_dest, bb);
8259 add_phi_arg (new_phi, vec_oprnds[i], e, UNKNOWN_LOCATION);
8260 if (slp_node)
8261 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
8262 else
8263 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
8264 }
8265 if (!slp_node)
8266 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
8267
8268 return true;
8269 }
8270
8271 /* Vectorizes PHIs. */
8272
8273 bool
8274 vectorizable_phi (vec_info *,
8275 stmt_vec_info stmt_info, gimple **vec_stmt,
8276 slp_tree slp_node, stmt_vector_for_cost *cost_vec)
8277 {
8278 if (!is_a <gphi *> (stmt_info->stmt) || !slp_node)
8279 return false;
8280
8281 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def)
8282 return false;
8283
8284 tree vectype = SLP_TREE_VECTYPE (slp_node);
8285
8286 if (!vec_stmt) /* transformation not required. */
8287 {
8288 slp_tree child;
8289 unsigned i;
8290 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), i, child)
8291 if (!child)
8292 {
8293 if (dump_enabled_p ())
8294 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8295 "PHI node with unvectorized backedge def\n");
8296 return false;
8297 }
8298 else if (!vect_maybe_update_slp_op_vectype (child, vectype))
8299 {
8300 if (dump_enabled_p ())
8301 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8302 "incompatible vector types for invariants\n");
8303 return false;
8304 }
8305 else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
8306 && !useless_type_conversion_p (vectype,
8307 SLP_TREE_VECTYPE (child)))
8308 {
8309 /* With bools we can have mask and non-mask precision vectors
8310 or different non-mask precisions. While pattern recognition is
8311 supposed to guarantee consistency here, bugs in it can cause
8312 mismatches (PR103489 and PR103800 for example).
8313 Deal with them here instead of ICEing later. */
8314 if (dump_enabled_p ())
8315 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8316 "incompatible vector type setup from "
8317 "bool pattern detection\n");
8318 return false;
8319 }
8320
8321 /* For single-argument PHIs assume coalescing which means zero cost
8322 for the scalar and the vector PHIs. This avoids artificially
8323 favoring the vector path (but may pessimize it in some cases). */
8324 if (gimple_phi_num_args (as_a <gphi *> (stmt_info->stmt)) > 1)
8325 record_stmt_cost (cost_vec, SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
8326 vector_stmt, stmt_info, vectype, 0, vect_body);
8327 STMT_VINFO_TYPE (stmt_info) = phi_info_type;
8328 return true;
8329 }
8330
8331 tree scalar_dest = gimple_phi_result (stmt_info->stmt);
8332 basic_block bb = gimple_bb (stmt_info->stmt);
8333 tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
8334 auto_vec<gphi *> new_phis;
8335 for (unsigned i = 0; i < gimple_phi_num_args (stmt_info->stmt); ++i)
8336 {
8337 slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
8338
8339 /* Skip not yet vectorized defs. */
8340 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
8341 && SLP_TREE_VEC_STMTS (child).is_empty ())
8342 continue;
8343
8344 auto_vec<tree> vec_oprnds;
8345 vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[i], &vec_oprnds);
8346 if (!new_phis.exists ())
8347 {
8348 new_phis.create (vec_oprnds.length ());
8349 for (unsigned j = 0; j < vec_oprnds.length (); j++)
8350 {
8351 /* Create the vectorized LC PHI node. */
8352 new_phis.quick_push (create_phi_node (vec_dest, bb));
8353 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phis[j]);
8354 }
8355 }
8356 edge e = gimple_phi_arg_edge (as_a <gphi *> (stmt_info->stmt), i);
8357 for (unsigned j = 0; j < vec_oprnds.length (); j++)
8358 add_phi_arg (new_phis[j], vec_oprnds[j], e, UNKNOWN_LOCATION);
8359 }
8360 /* We should have at least one already vectorized child. */
8361 gcc_assert (new_phis.exists ());
8362
8363 return true;
8364 }
8365
8366 /* Vectorizes first order recurrences. An overview of the transformation
8367 is described below. Suppose we have the following loop.
8368
8369 int t = 0;
8370 for (int i = 0; i < n; ++i)
8371 {
8372 b[i] = a[i] - t;
8373 t = a[i];
8374 }
8375
8376 There is a first-order recurrence on 'a'. For this loop, the scalar IR
8377 looks (simplified) like:
8378
8379 scalar.preheader:
8380 init = 0;
8381
8382 scalar.body:
8383 i = PHI <0(scalar.preheader), i+1(scalar.body)>
8384 _2 = PHI <init(scalar.preheader), _1(scalar.body)>
8385 _1 = a[i]
8386 b[i] = _1 - _2
8387 if (i < n) goto scalar.body
8388
8389 In this example, _2 is a recurrence because its value depends on the
8390 previous iteration. We vectorize this as (VF = 4)
8391
8392 vector.preheader:
8393 vect_init = vect_cst(..., ..., ..., 0)
8394
8395 vector.body
8396 i = PHI <0(vector.preheader), i+4(vector.body)>
8397 vect_1 = PHI <vect_init(vector.preheader), v2(vector.body)>
8398 vect_2 = a[i, i+1, i+2, i+3];
8399 vect_3 = vec_perm (vect_1, vect_2, { 3, 4, 5, 6 })
8400 b[i, i+1, i+2, i+3] = vect_2 - vect_3
8401 if (..) goto vector.body
8402
8403 In this function, vectorizable_recurr, we code generate both the
8404 vector PHI node and the permute since those together compute the
8405 vectorized value of the scalar PHI. We do not yet have the
8406 backedge value to fill in there nor into the vec_perm. Those
8407 are filled in maybe_set_vectorized_backedge_value and
8408 vect_schedule_scc.
8409
8410 TODO: Since the scalar loop does not have a use of the recurrence
8411 outside of the loop, the natural way to implement peeling via
8412 vectorizing the live value doesn't work. For now peeling of loops
8413 with a recurrence is not implemented. For SLP the supported cases
8414 are restricted to those requiring a single vector recurrence PHI. */
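/* Illustrative only (not generated code): what the VF = 4 transform
   above computes, written as plain scalar C with the { 3, 4, 5, 6 }
   permute expanded by hand and n assumed to be a multiple of 4:

     void recur (int *a, int *b, int n)
     {
       int t = 0;
       for (int i = 0; i < n; i += 4)
         {
           // vect_2 = { a[i], a[i+1], a[i+2], a[i+3] }
           // vect_3 = vec_perm (vect_1, vect_2, { 3, 4, 5, 6 })
           //        = { t, a[i], a[i+1], a[i+2] }
           b[i]     = a[i]     - t;
           b[i + 1] = a[i + 1] - a[i];
           b[i + 2] = a[i + 2] - a[i + 1];
           b[i + 3] = a[i + 3] - a[i + 2];
           t = a[i + 3];
         }
     }  */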
8415
8416 bool
8417 vectorizable_recurr (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
8418 gimple **vec_stmt, slp_tree slp_node,
8419 stmt_vector_for_cost *cost_vec)
8420 {
8421 if (!loop_vinfo || !is_a<gphi *> (stmt_info->stmt))
8422 return false;
8423
8424 gphi *phi = as_a<gphi *> (stmt_info->stmt);
8425
8426 /* So far we only support first-order recurrence auto-vectorization. */
8427 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_first_order_recurrence)
8428 return false;
8429
8430 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
8431 unsigned ncopies;
8432 if (slp_node)
8433 ncopies = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8434 else
8435 ncopies = vect_get_num_copies (loop_vinfo, vectype);
8436 poly_int64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8437 unsigned dist = slp_node ? SLP_TREE_LANES (slp_node) : 1;
8438 /* We need to be able to make progress with a single vector. */
8439 if (maybe_gt (dist * 2, nunits))
8440 {
8441 if (dump_enabled_p ())
8442 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8443 "first order recurrence exceeds half of "
8444 "a vector\n");
8445 return false;
8446 }
8447
8448 /* First-order recurrence autovectorization needs to handle permutation
8449 with indices = [nunits-1, nunits, nunits+1, ...]. */
8450 vec_perm_builder sel (nunits, 1, 3);
8451 for (int i = 0; i < 3; ++i)
8452 sel.quick_push (nunits - dist + i);
8453 vec_perm_indices indices (sel, 2, nunits);
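/* For example (illustrative): with nunits = 4 and dist = 1 this builds
   the selector { 3, 4, 5, 6 } used in the overview above; for an SLP
   group with dist = 2 it would be { 2, 3, 4, 5 }.  */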
8454
8455 if (!vec_stmt) /* transformation not required. */
8456 {
8457 if (!can_vec_perm_const_p (TYPE_MODE (vectype), TYPE_MODE (vectype),
8458 indices))
8459 return false;
8460
8461 if (slp_node)
8462 {
8463 /* We eventually need to set a vector type on invariant
8464 arguments. */
8465 unsigned j;
8466 slp_tree child;
8467 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
8468 if (!vect_maybe_update_slp_op_vectype
8469 (child, SLP_TREE_VECTYPE (slp_node)))
8470 {
8471 if (dump_enabled_p ())
8472 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8473 "incompatible vector types for "
8474 "invariants\n");
8475 return false;
8476 }
8477 }
8478 /* The recurrence costs the initialization vector and one permute
8479 for each copy. */
8480 unsigned prologue_cost = record_stmt_cost (cost_vec, 1, scalar_to_vec,
8481 stmt_info, 0, vect_prologue);
8482 unsigned inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
8483 stmt_info, 0, vect_body);
8484 if (dump_enabled_p ())
8485 dump_printf_loc (MSG_NOTE, vect_location,
8486 "vectorizable_recurr: inside_cost = %d, "
8487 "prologue_cost = %d .\n", inside_cost,
8488 prologue_cost);
8489
8490 STMT_VINFO_TYPE (stmt_info) = recurr_info_type;
8491 return true;
8492 }
8493
8494 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
8495 basic_block bb = gimple_bb (phi);
8496 tree preheader = PHI_ARG_DEF_FROM_EDGE (phi, pe);
8497 if (!useless_type_conversion_p (TREE_TYPE (vectype), TREE_TYPE (preheader)))
8498 {
8499 gimple_seq stmts = NULL;
8500 preheader = gimple_convert (&stmts, TREE_TYPE (vectype), preheader);
8501 gsi_insert_seq_on_edge_immediate (pe, stmts);
8502 }
8503 tree vec_init = build_vector_from_val (vectype, preheader);
8504 vec_init = vect_init_vector (loop_vinfo, stmt_info, vec_init, vectype, NULL);
8505
8506 /* Create the vectorized first-order PHI node. */
8507 tree vec_dest = vect_get_new_vect_var (vectype,
8508 vect_simple_var, "vec_recur_");
8509 gphi *new_phi = create_phi_node (vec_dest, bb);
8510 add_phi_arg (new_phi, vec_init, pe, UNKNOWN_LOCATION);
8511
8512 /* Insert the shuffles for the first-order recurrence autovectorization:
8513 result = VEC_PERM <vec_recur, vect_1, index[nunits-1, nunits, ...]>. */
8514 tree perm = vect_gen_perm_mask_checked (vectype, indices);
8515
8516 /* Insert the required permute after the latch definition. The
8517 second and later operands are tentative and will be updated when we have
8518 vectorized the latch definition. */
8519 edge le = loop_latch_edge (LOOP_VINFO_LOOP (loop_vinfo));
8520 gimple *latch_def = SSA_NAME_DEF_STMT (PHI_ARG_DEF_FROM_EDGE (phi, le));
8521 gimple_stmt_iterator gsi2 = gsi_for_stmt (latch_def);
8522 gsi_next (&gsi2);
8523
8524 for (unsigned i = 0; i < ncopies; ++i)
8525 {
8526 vec_dest = make_ssa_name (vectype);
8527 gassign *vperm
8528 = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
8529 i == 0 ? gimple_phi_result (new_phi) : NULL,
8530 NULL, perm);
8531 vect_finish_stmt_generation (loop_vinfo, stmt_info, vperm, &gsi2);
8532
8533 if (slp_node)
8534 SLP_TREE_VEC_STMTS (slp_node).quick_push (vperm);
8535 else
8536 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (vperm);
8537 }
8538
8539 if (!slp_node)
8540 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
8541 return true;
8542 }
8543
8544 /* Return true if VECTYPE represents a vector that requires lowering
8545 by the vector lowering pass. */
8546
8547 bool
8548 vect_emulated_vector_p (tree vectype)
8549 {
8550 return (!VECTOR_MODE_P (TYPE_MODE (vectype))
8551 && (!VECTOR_BOOLEAN_TYPE_P (vectype)
8552 || TYPE_PRECISION (TREE_TYPE (vectype)) != 1));
8553 }
8554
8555 /* Return true if we can emulate CODE on an integer mode representation
8556 of a vector. */
8557
8558 bool
8559 vect_can_vectorize_without_simd_p (tree_code code)
8560 {
8561 switch (code)
8562 {
8563 case PLUS_EXPR:
8564 case MINUS_EXPR:
8565 case NEGATE_EXPR:
8566 case BIT_AND_EXPR:
8567 case BIT_IOR_EXPR:
8568 case BIT_XOR_EXPR:
8569 case BIT_NOT_EXPR:
8570 return true;
8571
8572 default:
8573 return false;
8574 }
8575 }
8576
8577 /* Likewise, but taking a code_helper. */
8578
8579 bool
8580 vect_can_vectorize_without_simd_p (code_helper code)
8581 {
8582 return (code.is_tree_code ()
8583 && vect_can_vectorize_without_simd_p (tree_code (code)));
8584 }
8585
8586 /* Create vector init for vectorized iv. */
8587 static tree
8588 vect_create_nonlinear_iv_init (gimple_seq* stmts, tree init_expr,
8589 tree step_expr, poly_uint64 nunits,
8590 tree vectype,
8591 enum vect_induction_op_type induction_type)
8592 {
8593 unsigned HOST_WIDE_INT const_nunits;
8594 tree vec_shift, vec_init, new_name;
8595 unsigned i;
8596 tree itype = TREE_TYPE (vectype);
8597
8598 /* iv_loop is the loop to be vectorized. Create the initial vector for
8599 the nonlinear iv, e.g. [X, X*S, X*S^2, X*S^3] for mult (S = step_expr, X = init_expr). */
8600 new_name = gimple_convert (stmts, itype, init_expr);
8601 switch (induction_type)
8602 {
8603 case vect_step_op_shr:
8604 case vect_step_op_shl:
8605 /* Build the initial value by shifting init_expr by the series [0, S, 2*S, ...]. */
8606 vec_init = gimple_build_vector_from_val (stmts,
8607 vectype,
8608 new_name);
8609 vec_shift = gimple_build (stmts, VEC_SERIES_EXPR, vectype,
8610 build_zero_cst (itype), step_expr);
8611 vec_init = gimple_build (stmts,
8612 (induction_type == vect_step_op_shr
8613 ? RSHIFT_EXPR : LSHIFT_EXPR),
8614 vectype, vec_init, vec_shift);
8615 break;
8616
8617 case vect_step_op_neg:
8618 {
8619 vec_init = gimple_build_vector_from_val (stmts,
8620 vectype,
8621 new_name);
8622 tree vec_neg = gimple_build (stmts, NEGATE_EXPR,
8623 vectype, vec_init);
8624 /* The encoding has 2 interleaved stepped patterns. */
8625 vec_perm_builder sel (nunits, 2, 3);
8626 sel.quick_grow (6);
8627 for (i = 0; i < 3; i++)
8628 {
8629 sel[2 * i] = i;
8630 sel[2 * i + 1] = i + nunits;
8631 }
8632 vec_perm_indices indices (sel, 2, nunits);
8633 /* Don't use vect_gen_perm_mask_checked since can_vec_perm_const_p may
8634 fail when vec_init is const vector. In that situation vec_perm is not
8635 really needed. */
8636 tree perm_mask_even
8637 = vect_gen_perm_mask_any (vectype, indices);
8638 vec_init = gimple_build (stmts, VEC_PERM_EXPR,
8639 vectype,
8640 vec_init, vec_neg,
8641 perm_mask_even);
8642 }
8643 break;
8644
8645 case vect_step_op_mul:
8646 {
8647 /* Use unsigned mult to avoid undefined behavior on signed integer overflow. */
8648 gcc_assert (nunits.is_constant (&const_nunits));
8649 tree utype = unsigned_type_for (itype);
8650 tree uvectype = build_vector_type (utype,
8651 TYPE_VECTOR_SUBPARTS (vectype));
8652 new_name = gimple_convert (stmts, utype, new_name);
8653 vec_init = gimple_build_vector_from_val (stmts,
8654 uvectype,
8655 new_name);
8656 tree_vector_builder elts (uvectype, const_nunits, 1);
8657 tree elt_step = build_one_cst (utype);
8658
8659 elts.quick_push (elt_step);
8660 for (i = 1; i < const_nunits; i++)
8661 {
8662 /* Create: new_name_i = new_name + step_expr. */
8663 elt_step = gimple_build (stmts, MULT_EXPR,
8664 utype, elt_step, step_expr);
8665 elts.quick_push (elt_step);
8666 }
8667 /* Create a vector from [new_name_0, new_name_1, ...,
8668 new_name_nunits-1]. */
8669 tree vec_mul = gimple_build_vector (stmts, &elts);
8670 vec_init = gimple_build (stmts, MULT_EXPR, uvectype,
8671 vec_init, vec_mul);
8672 vec_init = gimple_convert (stmts, vectype, vec_init);
8673 }
8674 break;
8675
8676 default:
8677 gcc_unreachable ();
8678 }
8679
8680 return vec_init;
8681 }
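/* Worked examples (illustrative): with X = 16 and nunits = 4 the
   initial vectors built above are
     shr, S = 1:  [16 >> 0, 16 >> 1, 16 >> 2, 16 >> 3] = [16, 8, 4, 2]
     shl, S = 1:  [16 << 0, 16 << 1, 16 << 2, 16 << 3] = [16, 32, 64, 128]
     neg:         [16, -16, 16, -16]
     mul, S = 2:  [16 * 1, 16 * 2, 16 * 4, 16 * 8]     = [16, 32, 64, 128].  */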
8682
8683 /* Peel init_expr by skip_niters iterations for induction_type. */
8684 tree
8685 vect_peel_nonlinear_iv_init (gimple_seq* stmts, tree init_expr,
8686 tree skip_niters, tree step_expr,
8687 enum vect_induction_op_type induction_type)
8688 {
8689 gcc_assert (TREE_CODE (skip_niters) == INTEGER_CST);
8690 tree type = TREE_TYPE (init_expr);
8691 unsigned prec = TYPE_PRECISION (type);
8692 switch (induction_type)
8693 {
8694 case vect_step_op_neg:
8695 if (TREE_INT_CST_LOW (skip_niters) % 2)
8696 init_expr = gimple_build (stmts, NEGATE_EXPR, type, init_expr);
8697 /* else no change. */
8698 break;
8699
8700 case vect_step_op_shr:
8701 case vect_step_op_shl:
8702 skip_niters = gimple_convert (stmts, type, skip_niters);
8703 step_expr = gimple_build (stmts, MULT_EXPR, type, step_expr, skip_niters);
8704 /* When the shift amount >= precision, we need to avoid undefined behavior.
8705 In the original loop there's no UB, and according to the semantics,
8706 init_expr should be 0 for lshr, ashl, and >>= (prec - 1) for ashr. */
8707 if (!tree_fits_uhwi_p (step_expr)
8708 || tree_to_uhwi (step_expr) >= prec)
8709 {
8710 if (induction_type == vect_step_op_shl
8711 || TYPE_UNSIGNED (type))
8712 init_expr = build_zero_cst (type);
8713 else
8714 init_expr = gimple_build (stmts, RSHIFT_EXPR, type,
8715 init_expr,
8716 wide_int_to_tree (type, prec - 1));
8717 }
8718 else
8719 init_expr = gimple_build (stmts, (induction_type == vect_step_op_shr
8720 ? RSHIFT_EXPR : LSHIFT_EXPR),
8721 type, init_expr, step_expr);
8722 break;
8723
8724 case vect_step_op_mul:
8725 {
8726 tree utype = unsigned_type_for (type);
8727 init_expr = gimple_convert (stmts, utype, init_expr);
8728 unsigned skipn = TREE_INT_CST_LOW (skip_niters);
8729 wide_int begin = wi::to_wide (step_expr);
8730 for (unsigned i = 0; i != skipn - 1; i++)
8731 begin = wi::mul (begin, wi::to_wide (step_expr));
8732 tree mult_expr = wide_int_to_tree (utype, begin);
8733 init_expr = gimple_build (stmts, MULT_EXPR, utype, init_expr, mult_expr);
8734 init_expr = gimple_convert (stmts, type, init_expr);
8735 }
8736 break;
8737
8738 default:
8739 gcc_unreachable ();
8740 }
8741
8742 return init_expr;
8743 }
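/* Worked examples (illustrative): peeling skip_niters = 3 iterations of
   x *= 2 with init 3 yields 3 * 2^3 = 24; peeling 3 iterations of
   x >>= 1 with init 64 yields 64 >> 3 = 8; peeling an odd number of
   iterations of x = -x simply negates the initial value.  */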
8744
8745 /* Create vector step for vectorized iv. */
8746 static tree
8747 vect_create_nonlinear_iv_step (gimple_seq* stmts, tree step_expr,
8748 poly_uint64 vf,
8749 enum vect_induction_op_type induction_type)
8750 {
8751 tree expr = build_int_cst (TREE_TYPE (step_expr), vf);
8752 tree new_name = NULL;
8753 /* Step should be pow (step, vf) for mult induction. */
8754 if (induction_type == vect_step_op_mul)
8755 {
8756 gcc_assert (vf.is_constant ());
8757 wide_int begin = wi::to_wide (step_expr);
8758
8759 for (unsigned i = 0; i != vf.to_constant () - 1; i++)
8760 begin = wi::mul (begin, wi::to_wide (step_expr));
8761
8762 new_name = wide_int_to_tree (TREE_TYPE (step_expr), begin);
8763 }
8764 else if (induction_type == vect_step_op_neg)
8765 /* Do nothing. */
8766 ;
8767 else
8768 new_name = gimple_build (stmts, MULT_EXPR, TREE_TYPE (step_expr),
8769 expr, step_expr);
8770 return new_name;
8771 }
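/* E.g. (illustrative) for VF = 4: a mult induction with step S gets the
   vector step pow (S, 4), shift inductions get 4 * S, and a neg
   induction needs no step at all.  */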
8772
8773 static tree
8774 vect_create_nonlinear_iv_vec_step (loop_vec_info loop_vinfo,
8775 stmt_vec_info stmt_info,
8776 tree new_name, tree vectype,
8777 enum vect_induction_op_type induction_type)
8778 {
8779 /* No step is needed for neg induction. */
8780 if (induction_type == vect_step_op_neg)
8781 return NULL;
8782
8783 tree t = unshare_expr (new_name);
8784 gcc_assert (CONSTANT_CLASS_P (new_name)
8785 || TREE_CODE (new_name) == SSA_NAME);
8786 tree new_vec = build_vector_from_val (vectype, t);
8787 tree vec_step = vect_init_vector (loop_vinfo, stmt_info,
8788 new_vec, vectype, NULL);
8789 return vec_step;
8790 }
8791
8792 /* Update the vectorized iv INDUC_DEF by VEC_STEP according to INDUCTION_TYPE. */
8793 static tree
8794 vect_update_nonlinear_iv (gimple_seq* stmts, tree vectype,
8795 tree induc_def, tree vec_step,
8796 enum vect_induction_op_type induction_type)
8797 {
8798 tree vec_def = induc_def;
8799 switch (induction_type)
8800 {
8801 case vect_step_op_mul:
8802 {
8803 /* Use unsigned mult to avoid undefined behavior on signed integer overflow. */
8804 tree uvectype
8805 = build_vector_type (unsigned_type_for (TREE_TYPE (vectype)),
8806 TYPE_VECTOR_SUBPARTS (vectype));
8807 vec_def = gimple_convert (stmts, uvectype, vec_def);
8808 vec_step = gimple_convert (stmts, uvectype, vec_step);
8809 vec_def = gimple_build (stmts, MULT_EXPR, uvectype,
8810 vec_def, vec_step);
8811 vec_def = gimple_convert (stmts, vectype, vec_def);
8812 }
8813 break;
8814
8815 case vect_step_op_shr:
8816 vec_def = gimple_build (stmts, RSHIFT_EXPR, vectype,
8817 vec_def, vec_step);
8818 break;
8819
8820 case vect_step_op_shl:
8821 vec_def = gimple_build (stmts, LSHIFT_EXPR, vectype,
8822 vec_def, vec_step);
8823 break;
8824 case vect_step_op_neg:
8825 vec_def = induc_def;
8826 /* Do nothing. */
8827 break;
8828 default:
8829 gcc_unreachable ();
8830 }
8831
8832 return vec_def;
8833
8834 }
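/* E.g. (illustrative) with VF = 4: for x >>= 1 each vector iteration
   computes vec_def = vec_iv >> { 4, 4, 4, 4 }, and for x *= S it
   computes vec_def = vec_iv * { S^4, S^4, S^4, S^4 }.  */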
8835
8836 /* Function vectorizable_nonlinear_induction
8837
8838 Check if STMT_INFO performs a nonlinear induction computation that can be
8839 vectorized. If VEC_STMT is also passed, vectorize the induction PHI: create
8840 a vectorized phi to replace it, put it in VEC_STMT, and add it to the same
8841 basic block.
8842 Return true if STMT_INFO is vectorizable in this way. */
8843
8844 static bool
8845 vectorizable_nonlinear_induction (loop_vec_info loop_vinfo,
8846 stmt_vec_info stmt_info,
8847 gimple **vec_stmt, slp_tree slp_node,
8848 stmt_vector_for_cost *cost_vec)
8849 {
8850 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8851 unsigned ncopies;
8852 bool nested_in_vect_loop = false;
8853 class loop *iv_loop;
8854 tree vec_def;
8855 edge pe = loop_preheader_edge (loop);
8856 basic_block new_bb;
8857 tree vec_init, vec_step;
8858 tree new_name;
8859 gimple *new_stmt;
8860 gphi *induction_phi;
8861 tree induc_def, vec_dest;
8862 tree init_expr, step_expr;
8863 tree niters_skip;
8864 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8865 unsigned i;
8866 gimple_stmt_iterator si;
8867
8868 gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
8869
8870 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
8871 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8872 enum vect_induction_op_type induction_type
8873 = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info);
8874
8875 gcc_assert (induction_type > vect_step_op_add);
8876
8877 if (slp_node)
8878 ncopies = 1;
8879 else
8880 ncopies = vect_get_num_copies (loop_vinfo, vectype);
8881 gcc_assert (ncopies >= 1);
8882
8883 /* FORNOW. Only handle nonlinear induction in the same loop. */
8884 if (nested_in_vect_loop_p (loop, stmt_info))
8885 {
8886 if (dump_enabled_p ())
8887 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8888 "nonlinear induction in nested loop.\n");
8889 return false;
8890 }
8891
8892 iv_loop = loop;
8893 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
8894
8895 /* TODO: Support SLP for nonlinear ivs. There should be a separate vector iv
8896 update for each iv and a permutation to generate the wanted vector iv. */
8897 if (slp_node)
8898 {
8899 if (dump_enabled_p ())
8900 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8901 "SLP induction not supported for nonlinear"
8902 " induction.\n");
8903 return false;
8904 }
8905
8906 if (!INTEGRAL_TYPE_P (TREE_TYPE (vectype)))
8907 {
8908 if (dump_enabled_p ())
8909 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8910 "floating point nonlinear induction vectorization"
8911 " not supported.\n");
8912 return false;
8913 }
8914
8915 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
8916 init_expr = vect_phi_initial_value (phi);
8917 gcc_assert (step_expr != NULL_TREE && init_expr != NULL
8918 && TREE_CODE (step_expr) == INTEGER_CST);
8919 /* step_expr should have the same type as init_expr,
8920 i.e. for uint64 a >> 1 the step is int, but a vector<uint64> shift is used. */
8921 step_expr = fold_convert (TREE_TYPE (vectype), step_expr);
8922
8923 if (TREE_CODE (init_expr) == INTEGER_CST)
8924 init_expr = fold_convert (TREE_TYPE (vectype), init_expr);
8925 else
8926 gcc_assert (tree_nop_conversion_p (TREE_TYPE (vectype),
8927 TREE_TYPE (init_expr)));
8928
8929 switch (induction_type)
8930 {
8931 case vect_step_op_neg:
8932 if (TREE_CODE (init_expr) != INTEGER_CST
8933 && TREE_CODE (init_expr) != REAL_CST)
8934 {
8935 /* Check for backend support of NEGATE_EXPR and vec_perm. */
8936 if (!directly_supported_p (NEGATE_EXPR, vectype))
8937 return false;
8938
8939 /* The encoding has 2 interleaved stepped patterns. */
8940 vec_perm_builder sel (nunits, 2, 3);
8941 machine_mode mode = TYPE_MODE (vectype);
8942 sel.quick_grow (6);
8943 for (i = 0; i < 3; i++)
8944 {
8945 sel[i * 2] = i;
8946 sel[i * 2 + 1] = i + nunits;
8947 }
8948 vec_perm_indices indices (sel, 2, nunits);
8949 if (!can_vec_perm_const_p (mode, mode, indices))
8950 return false;
8951 }
8952 break;
8953
8954 case vect_step_op_mul:
8955 {
8956 /* Check for backend support of MULT_EXPR. */
8957 if (!directly_supported_p (MULT_EXPR, vectype))
8958 return false;
8959
8960 /* ??? How to construct the vector step for variable-length vectors:
8961 [ 1, step, pow (step, 2), pow (step, 3), ... ]. */
8962 if (!vf.is_constant ())
8963 return false;
8964 }
8965 break;
8966
8967 case vect_step_op_shr:
8968 /* Check for backend support of RSHIFT_EXPR. */
8969 if (!directly_supported_p (RSHIFT_EXPR, vectype, optab_vector))
8970 return false;
8971
8972 /* Don't shift more than the type precision to avoid undefined behavior. */
8973 if (!tree_fits_uhwi_p (step_expr)
8974 || maybe_ge (nunits * tree_to_uhwi (step_expr),
8975 TYPE_PRECISION (TREE_TYPE (init_expr))))
8976 return false;
8977 break;
8978
8979 case vect_step_op_shl:
8980 /* Check for backend support of LSHIFT_EXPR. */
8981 if (!directly_supported_p (LSHIFT_EXPR, vectype, optab_vector))
8982 return false;
8983
8984 /* Don't shift more than the type precision to avoid undefined behavior. */
8985 if (!tree_fits_uhwi_p (step_expr)
8986 || maybe_ge (nunits * tree_to_uhwi (step_expr),
8987 TYPE_PRECISION (TREE_TYPE (init_expr))))
8988 return false;
8989
8990 break;
8991
8992 default:
8993 gcc_unreachable ();
8994 }
8995
8996 if (!vec_stmt) /* transformation not required. */
8997 {
8998 unsigned inside_cost = 0, prologue_cost = 0;
8999 /* loop cost for vec_loop; the neg-induction special case is
9000 handled just below. */
9001 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
9002 stmt_info, 0, vect_body);
9003
9004 /* loop cost for vec_loop. Neg induction doesn't have any
9005 inside_cost. */
9006 if (induction_type == vect_step_op_neg)
9007 inside_cost = 0;
9008
9009 /* prologue cost for vec_init and vec_step. */
9010 prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
9011 stmt_info, 0, vect_prologue);
9012
9013 if (dump_enabled_p ())
9014 dump_printf_loc (MSG_NOTE, vect_location,
9015 "vect_model_induction_cost: inside_cost = %d, "
9016 "prologue_cost = %d. \n", inside_cost,
9017 prologue_cost);
9018
9019 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
9020 DUMP_VECT_SCOPE ("vectorizable_nonlinear_induction");
9021 return true;
9022 }
9023
9024 /* Transform. */
9025
9026 /* Compute a vector variable, initialized with the first VF values of
9027 the induction variable. E.g., for an iv with IV_PHI='X' and
9028 evolution S, for a vector of 4 units, we want to compute e.g.
9029 [X, X*S, X*S^2, X*S^3] for a mult iv. */
9030
9031 if (dump_enabled_p ())
9032 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
9033
9034 pe = loop_preheader_edge (iv_loop);
9035 /* Find the first insertion point in the BB. */
9036 basic_block bb = gimple_bb (phi);
9037 si = gsi_after_labels (bb);
9038
9039 gimple_seq stmts = NULL;
9040
9041 niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
9042 /* If we are using the loop mask to "peel" for alignment then we need
9043 to adjust the start value here. */
9044 if (niters_skip != NULL_TREE)
9045 init_expr = vect_peel_nonlinear_iv_init (&stmts, init_expr, niters_skip,
9046 step_expr, induction_type);
9047
9048 vec_init = vect_create_nonlinear_iv_init (&stmts, init_expr,
9049 step_expr, nunits, vectype,
9050 induction_type);
9051 if (stmts)
9052 {
9053 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
9054 gcc_assert (!new_bb);
9055 }
9056
9057 stmts = NULL;
9058 new_name = vect_create_nonlinear_iv_step (&stmts, step_expr,
9059 vf, induction_type);
9060 if (stmts)
9061 {
9062 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
9063 gcc_assert (!new_bb);
9064 }
9065
9066 vec_step = vect_create_nonlinear_iv_vec_step (loop_vinfo, stmt_info,
9067 new_name, vectype,
9068 induction_type);
9069 /* Create the following def-use cycle:
9070 loop prolog:
9071 vec_init = ...
9072 vec_step = ...
9073 loop:
9074 vec_iv = PHI <vec_init, vec_loop>
9075 ...
9076 STMT
9077 ...
9078 vec_loop = vec_iv + vec_step; */
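/* Illustrative instance of that cycle for x >>= 1 and VF = 4:
     loop prolog:
       vec_init = { x, x >> 1, x >> 2, x >> 3 }
       vec_step = { 4, 4, 4, 4 }
     loop:
       vec_iv = PHI <vec_init, vec_loop>
       ...
       vec_loop = vec_iv >> vec_step;  */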
9079
9080 /* Create the induction-phi that defines the induction-operand. */
9081 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
9082 induction_phi = create_phi_node (vec_dest, iv_loop->header);
9083 induc_def = PHI_RESULT (induction_phi);
9084
9085 /* Create the iv update inside the loop. */
9086 stmts = NULL;
9087 vec_def = vect_update_nonlinear_iv (&stmts, vectype,
9088 induc_def, vec_step,
9089 induction_type);
9090
9091 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9092 new_stmt = SSA_NAME_DEF_STMT (vec_def);
9093
9094 /* Set the arguments of the phi node: */
9095 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
9096 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
9097 UNKNOWN_LOCATION);
9098
9099 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi);
9100 *vec_stmt = induction_phi;
9101
9102 /* In case that vectorization factor (VF) is bigger than the number
9103 of elements that we can fit in a vectype (nunits), we have to generate
9104 more than one vector stmt - i.e - we need to "unroll" the
9105 vector stmt by a factor VF/nunits. For more details see documentation
9106 in vectorizable_operation. */
9107
9108 if (ncopies > 1)
9109 {
9110 stmts = NULL;
9111 /* FORNOW. This restriction should be relaxed. */
9112 gcc_assert (!nested_in_vect_loop);
9113
9114 new_name = vect_create_nonlinear_iv_step (&stmts, step_expr,
9115 nunits, induction_type);
9116
9117 vec_step = vect_create_nonlinear_iv_vec_step (loop_vinfo, stmt_info,
9118 new_name, vectype,
9119 induction_type);
9120 vec_def = induc_def;
9121 for (i = 1; i < ncopies; i++)
9122 {
9123 /* vec_i = vec_prev + vec_step. */
9124 stmts = NULL;
9125 vec_def = vect_update_nonlinear_iv (&stmts, vectype,
9126 vec_def, vec_step,
9127 induction_type);
9128 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9129 new_stmt = SSA_NAME_DEF_STMT (vec_def);
9130 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
9131 }
9132 }
9133
9134 if (dump_enabled_p ())
9135 dump_printf_loc (MSG_NOTE, vect_location,
9136 "transform induction: created def-use cycle: %G%G",
9137 (gimple *) induction_phi, SSA_NAME_DEF_STMT (vec_def));
9138
9139 return true;
9140 }
9141
9142 /* Function vectorizable_induction
9143
9144 Check if STMT_INFO performs an induction computation that can be vectorized.
9145 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
9146 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
9147 Return true if STMT_INFO is vectorizable in this way. */
9148
9149 bool
9150 vectorizable_induction (loop_vec_info loop_vinfo,
9151 stmt_vec_info stmt_info,
9152 gimple **vec_stmt, slp_tree slp_node,
9153 stmt_vector_for_cost *cost_vec)
9154 {
9155 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9156 unsigned ncopies;
9157 bool nested_in_vect_loop = false;
9158 class loop *iv_loop;
9159 tree vec_def;
9160 edge pe = loop_preheader_edge (loop);
9161 basic_block new_bb;
9162 tree new_vec, vec_init, vec_step, t;
9163 tree new_name;
9164 gimple *new_stmt;
9165 gphi *induction_phi;
9166 tree induc_def, vec_dest;
9167 tree init_expr, step_expr;
9168 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9169 unsigned i;
9170 tree expr;
9171 gimple_stmt_iterator si;
9172 enum vect_induction_op_type induction_type
9173 = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info);
9174
9175 gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
9176 if (!phi)
9177 return false;
9178
9179 if (!STMT_VINFO_RELEVANT_P (stmt_info))
9180 return false;
9181
9182 /* Make sure it was recognized as induction computation. */
9183 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
9184 return false;
9185
9186 /* Handle nonlinear induction in a separate place. */
9187 if (induction_type != vect_step_op_add)
9188 return vectorizable_nonlinear_induction (loop_vinfo, stmt_info,
9189 vec_stmt, slp_node, cost_vec);
9190
9191 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
9192 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
9193
9194 if (slp_node)
9195 ncopies = 1;
9196 else
9197 ncopies = vect_get_num_copies (loop_vinfo, vectype);
9198 gcc_assert (ncopies >= 1);
9199
9200 /* FORNOW. These restrictions should be relaxed. */
9201 if (nested_in_vect_loop_p (loop, stmt_info))
9202 {
9203 imm_use_iterator imm_iter;
9204 use_operand_p use_p;
9205 gimple *exit_phi;
9206 edge latch_e;
9207 tree loop_arg;
9208
9209 if (ncopies > 1)
9210 {
9211 if (dump_enabled_p ())
9212 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9213 "multiple types in nested loop.\n");
9214 return false;
9215 }
9216
9217 exit_phi = NULL;
9218 latch_e = loop_latch_edge (loop->inner);
9219 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
9220 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
9221 {
9222 gimple *use_stmt = USE_STMT (use_p);
9223 if (is_gimple_debug (use_stmt))
9224 continue;
9225
9226 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
9227 {
9228 exit_phi = use_stmt;
9229 break;
9230 }
9231 }
9232 if (exit_phi)
9233 {
9234 stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
9235 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
9236 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
9237 {
9238 if (dump_enabled_p ())
9239 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9240 "inner-loop induction only used outside "
9241 "of the outer vectorized loop.\n");
9242 return false;
9243 }
9244 }
9245
9246 nested_in_vect_loop = true;
9247 iv_loop = loop->inner;
9248 }
9249 else
9250 iv_loop = loop;
9251 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
9252
9253 if (slp_node && !nunits.is_constant ())
9254 {
9255 /* The current SLP code creates the step value element-by-element. */
9256 if (dump_enabled_p ())
9257 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9258 "SLP induction not supported for variable-length"
9259 " vectors.\n");
9260 return false;
9261 }
9262
9263 if (FLOAT_TYPE_P (vectype) && !param_vect_induction_float)
9264 {
9265 if (dump_enabled_p ())
9266 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9267 "floating point induction vectorization disabled\n");
9268 return false;
9269 }
9270
9271 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
9272 gcc_assert (step_expr != NULL_TREE);
9273 tree step_vectype = get_same_sized_vectype (TREE_TYPE (step_expr), vectype);
9274
9275 /* Check for backend support of PLUS/MINUS_EXPR. */
9276 if (!directly_supported_p (PLUS_EXPR, step_vectype)
9277 || !directly_supported_p (MINUS_EXPR, step_vectype))
9278 return false;
9279
9280 if (!vec_stmt) /* transformation not required. */
9281 {
9282 unsigned inside_cost = 0, prologue_cost = 0;
9283 if (slp_node)
9284 {
9285 /* We eventually need to set a vector type on invariant
9286 arguments. */
9287 unsigned j;
9288 slp_tree child;
9289 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
9290 if (!vect_maybe_update_slp_op_vectype
9291 (child, SLP_TREE_VECTYPE (slp_node)))
9292 {
9293 if (dump_enabled_p ())
9294 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9295 "incompatible vector types for "
9296 "invariants\n");
9297 return false;
9298 }
9299 /* loop cost for vec_loop. */
9300 inside_cost
9301 = record_stmt_cost (cost_vec,
9302 SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
9303 vector_stmt, stmt_info, 0, vect_body);
9304 /* prologue cost for vec_init (if not nested) and step. */
9305 prologue_cost = record_stmt_cost (cost_vec, 1 + !nested_in_vect_loop,
9306 scalar_to_vec,
9307 stmt_info, 0, vect_prologue);
9308 }
9309 else /* if (!slp_node) */
9310 {
9311 /* loop cost for vec_loop. */
9312 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
9313 stmt_info, 0, vect_body);
9314 /* prologue cost for vec_init and vec_step. */
9315 prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
9316 stmt_info, 0, vect_prologue);
9317 }
9318 if (dump_enabled_p ())
9319 dump_printf_loc (MSG_NOTE, vect_location,
9320 "vect_model_induction_cost: inside_cost = %d, "
9321 "prologue_cost = %d .\n", inside_cost,
9322 prologue_cost);
9323
9324 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
9325 DUMP_VECT_SCOPE ("vectorizable_induction");
9326 return true;
9327 }
9328
9329 /* Transform. */
9330
9331 /* Compute a vector variable, initialized with the first VF values of
9332 the induction variable. E.g., for an iv with IV_PHI='X' and
9333 evolution S, for a vector of 4 units, we want to compute:
9334 [X, X + S, X + 2*S, X + 3*S]. */
9335
9336 if (dump_enabled_p ())
9337 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
9338
9339 pe = loop_preheader_edge (iv_loop);
9340 /* Find the first insertion point in the BB. */
9341 basic_block bb = gimple_bb (phi);
9342 si = gsi_after_labels (bb);
9343
9344 /* For SLP induction we have to generate several IVs as for example
9345 with group size 3 we need
9346 [i0, i1, i2, i0 + S0] [i1 + S1, i2 + S2, i0 + 2*S0, i1 + 2*S1]
9347 [i2 + 2*S2, i0 + 3*S0, i1 + 3*S1, i2 + 3*S2]. */
9348 if (slp_node)
9349 {
9350 /* Enforced above. */
9351 unsigned int const_nunits = nunits.to_constant ();
9352
9353 /* The initial values are vectorized, but any lanes > group_size
9354 need adjustment. */
9355 slp_tree init_node
9356 = SLP_TREE_CHILDREN (slp_node)[pe->dest_idx];
9357
9358 /* Gather steps. Since we do not vectorize inductions as
9359 cycles we have to reconstruct the step from SCEV data. */
9360 unsigned group_size = SLP_TREE_LANES (slp_node);
9361 tree *steps = XALLOCAVEC (tree, group_size);
9362 tree *inits = XALLOCAVEC (tree, group_size);
9363 stmt_vec_info phi_info;
9364 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (slp_node), i, phi_info)
9365 {
9366 steps[i] = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
9367 if (!init_node)
9368 inits[i] = gimple_phi_arg_def (as_a<gphi *> (phi_info->stmt),
9369 pe->dest_idx);
9370 }
9371
9372 /* Now generate the IVs. */
9373 unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
9374 gcc_assert ((const_nunits * nvects) % group_size == 0);
9375 unsigned nivs;
9376 if (nested_in_vect_loop)
9377 nivs = nvects;
9378 else
9379 {
9380 /* Compute the number of distinct IVs we need. First reduce
9381 group_size if it is a multiple of const_nunits so we get
9382 one IV for a group_size of 4 but const_nunits 2. */
9383 unsigned group_sizep = group_size;
9384 if (group_sizep % const_nunits == 0)
9385 group_sizep = group_sizep / const_nunits;
9386 nivs = least_common_multiple (group_sizep,
9387 const_nunits) / const_nunits;
9388 }
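/* E.g. (illustrative): group_size 3 with const_nunits 4 gives
   nivs = lcm (3, 4) / 4 = 3 as in the comment above; group_size 4
   with const_nunits 2 reduces to group_sizep = 2 and nivs = 1.  */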
9389 tree stept = TREE_TYPE (step_vectype);
9390 tree lupdate_mul = NULL_TREE;
9391 if (!nested_in_vect_loop)
9392 {
9393 /* The number of iterations covered in one vector iteration. */
9394 unsigned lup_mul = (nvects * const_nunits) / group_size;
9395 lupdate_mul
9396 = build_vector_from_val (step_vectype,
9397 SCALAR_FLOAT_TYPE_P (stept)
9398 ? build_real_from_wide (stept, lup_mul,
9399 UNSIGNED)
9400 : build_int_cstu (stept, lup_mul));
9401 }
9402 tree peel_mul = NULL_TREE;
9403 gimple_seq init_stmts = NULL;
9404 if (LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo))
9405 {
9406 if (SCALAR_FLOAT_TYPE_P (stept))
9407 peel_mul = gimple_build (&init_stmts, FLOAT_EXPR, stept,
9408 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
9409 else
9410 peel_mul = gimple_convert (&init_stmts, stept,
9411 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
9412 peel_mul = gimple_build_vector_from_val (&init_stmts,
9413 step_vectype, peel_mul);
9414 }
9415 unsigned ivn;
9416 auto_vec<tree> vec_steps;
9417 for (ivn = 0; ivn < nivs; ++ivn)
9418 {
9419 tree_vector_builder step_elts (step_vectype, const_nunits, 1);
9420 tree_vector_builder init_elts (vectype, const_nunits, 1);
9421 tree_vector_builder mul_elts (step_vectype, const_nunits, 1);
9422 for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
9423 {
9424 /* The scalar steps of the IVs. */
9425 tree elt = steps[(ivn*const_nunits + eltn) % group_size];
9426 elt = gimple_convert (&init_stmts, TREE_TYPE (step_vectype), elt);
9427 step_elts.quick_push (elt);
9428 if (!init_node)
9429 {
9430 /* The scalar inits of the IVs if not vectorized. */
9431 elt = inits[(ivn*const_nunits + eltn) % group_size];
9432 if (!useless_type_conversion_p (TREE_TYPE (vectype),
9433 TREE_TYPE (elt)))
9434 elt = gimple_build (&init_stmts, VIEW_CONVERT_EXPR,
9435 TREE_TYPE (vectype), elt);
9436 init_elts.quick_push (elt);
9437 }
9438 /* The number of steps to add to the initial values. */
9439 unsigned mul_elt = (ivn*const_nunits + eltn) / group_size;
9440 mul_elts.quick_push (SCALAR_FLOAT_TYPE_P (stept)
9441 ? build_real_from_wide (stept,
9442 mul_elt, UNSIGNED)
9443 : build_int_cstu (stept, mul_elt));
9444 }
9445 vec_step = gimple_build_vector (&init_stmts, &step_elts);
9446 vec_steps.safe_push (vec_step);
9447 tree step_mul = gimple_build_vector (&init_stmts, &mul_elts);
9448 if (peel_mul)
9449 step_mul = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
9450 step_mul, peel_mul);
9451 if (!init_node)
9452 vec_init = gimple_build_vector (&init_stmts, &init_elts);
9453
9454 /* Create the induction-phi that defines the induction-operand. */
9455 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var,
9456 "vec_iv_");
9457 induction_phi = create_phi_node (vec_dest, iv_loop->header);
9458 induc_def = PHI_RESULT (induction_phi);
9459
9460 /* Create the iv update inside the loop */
9461 tree up = vec_step;
9462 if (lupdate_mul)
9463 up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
9464 vec_step, lupdate_mul);
9465 gimple_seq stmts = NULL;
9466 vec_def = gimple_convert (&stmts, step_vectype, induc_def);
9467 vec_def = gimple_build (&stmts,
9468 PLUS_EXPR, step_vectype, vec_def, up);
9469 vec_def = gimple_convert (&stmts, vectype, vec_def);
9470 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9471 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
9472 UNKNOWN_LOCATION);
9473
9474 if (init_node)
9475 vec_init = vect_get_slp_vect_def (init_node, ivn);
9476 if (!nested_in_vect_loop
9477 && !integer_zerop (step_mul))
9478 {
9479 vec_def = gimple_convert (&init_stmts, step_vectype, vec_init);
9480 up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
9481 vec_step, step_mul);
9482 vec_def = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
9483 vec_def, up);
9484 vec_init = gimple_convert (&init_stmts, vectype, vec_def);
9485 }
9486
9487 /* Set the arguments of the phi node: */
9488 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
9489
9490 SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi);
9491 }
9492 if (!nested_in_vect_loop)
9493 {
9494 /* Fill up to the number of vectors we need for the whole group. */
9495 nivs = least_common_multiple (group_size,
9496 const_nunits) / const_nunits;
9497 vec_steps.reserve (nivs-ivn);
9498 for (; ivn < nivs; ++ivn)
9499 {
9500 SLP_TREE_VEC_STMTS (slp_node)
9501 .quick_push (SLP_TREE_VEC_STMTS (slp_node)[0]);
9502 vec_steps.quick_push (vec_steps[0]);
9503 }
9504 }
9505
9506 /* Re-use IVs when we can. We are generating further vector
9507 stmts by adding VF' * stride to the IVs generated above. */
9508 if (ivn < nvects)
9509 {
9510 unsigned vfp
9511 = least_common_multiple (group_size, const_nunits) / group_size;
9512 tree lupdate_mul
9513 = build_vector_from_val (step_vectype,
9514 SCALAR_FLOAT_TYPE_P (stept)
9515 ? build_real_from_wide (stept,
9516 vfp, UNSIGNED)
9517 : build_int_cstu (stept, vfp));
9518 for (; ivn < nvects; ++ivn)
9519 {
9520 gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs];
9521 tree def = gimple_get_lhs (iv);
9522 if (ivn < 2*nivs)
9523 vec_steps[ivn - nivs]
9524 = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
9525 vec_steps[ivn - nivs], lupdate_mul);
9526 gimple_seq stmts = NULL;
9527 def = gimple_convert (&stmts, step_vectype, def);
9528 def = gimple_build (&stmts, PLUS_EXPR, step_vectype,
9529 def, vec_steps[ivn % nivs]);
9530 def = gimple_convert (&stmts, vectype, def);
9531 if (gimple_code (iv) == GIMPLE_PHI)
9532 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9533 else
9534 {
9535 gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
9536 gsi_insert_seq_after (&tgsi, stmts, GSI_CONTINUE_LINKING);
9537 }
9538 SLP_TREE_VEC_STMTS (slp_node)
9539 .quick_push (SSA_NAME_DEF_STMT (def));
9540 }
9541 }
9542
9543 new_bb = gsi_insert_seq_on_edge_immediate (pe, init_stmts);
9544 gcc_assert (!new_bb);
9545
9546 return true;
9547 }
9548
9549 init_expr = vect_phi_initial_value (phi);
9550
9551 gimple_seq stmts = NULL;
9552 if (!nested_in_vect_loop)
9553 {
9554 /* Convert the initial value to the IV update type. */
9555 tree new_type = TREE_TYPE (step_expr);
9556 init_expr = gimple_convert (&stmts, new_type, init_expr);
9557
9558 /* If we are using the loop mask to "peel" for alignment then we need
9559 to adjust the start value here. */
9560 tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
9561 if (skip_niters != NULL_TREE)
9562 {
9563 if (FLOAT_TYPE_P (vectype))
9564 skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
9565 skip_niters);
9566 else
9567 skip_niters = gimple_convert (&stmts, new_type, skip_niters);
9568 tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
9569 skip_niters, step_expr);
9570 init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
9571 init_expr, skip_step);
9572 }
9573 }
9574
9575 if (stmts)
9576 {
9577 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
9578 gcc_assert (!new_bb);
9579 }
9580
9581 /* Create the vector that holds the initial_value of the induction. */
9582 if (nested_in_vect_loop)
9583 {
9584 /* iv_loop is nested in the loop to be vectorized. init_expr had already
9585 been created during vectorization of previous stmts. We obtain it
9586 from the STMT_VINFO_VEC_STMT of the defining stmt. */
9587 auto_vec<tree> vec_inits;
9588 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
9589 init_expr, &vec_inits);
9590 vec_init = vec_inits[0];
9591 /* If the initial value is not of proper type, convert it. */
9592 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
9593 {
9594 new_stmt
9595 = gimple_build_assign (vect_get_new_ssa_name (vectype,
9596 vect_simple_var,
9597 "vec_iv_"),
9598 VIEW_CONVERT_EXPR,
9599 build1 (VIEW_CONVERT_EXPR, vectype,
9600 vec_init));
9601 vec_init = gimple_assign_lhs (new_stmt);
9602 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
9603 new_stmt);
9604 gcc_assert (!new_bb);
9605 }
9606 }
9607 else
9608 {
9609 /* iv_loop is the loop to be vectorized. Create:
9610 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
9611 stmts = NULL;
9612 new_name = gimple_convert (&stmts, TREE_TYPE (step_expr), init_expr);
9613
9614 unsigned HOST_WIDE_INT const_nunits;
9615 if (nunits.is_constant (&const_nunits))
9616 {
9617 tree_vector_builder elts (step_vectype, const_nunits, 1);
9618 elts.quick_push (new_name);
9619 for (i = 1; i < const_nunits; i++)
9620 {
9621 /* Create: new_name_i = new_name + step_expr */
9622 new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
9623 new_name, step_expr);
9624 elts.quick_push (new_name);
9625 }
9626 /* Create a vector from [new_name_0, new_name_1, ...,
9627 new_name_nunits-1] */
9628 vec_init = gimple_build_vector (&stmts, &elts);
9629 }
9630 else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
9631 /* Build the initial value directly from a VEC_SERIES_EXPR. */
9632 vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, step_vectype,
9633 new_name, step_expr);
9634 else
9635 {
9636 /* Build:
9637 [base, base, base, ...]
9638 + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */
9639 gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
9640 gcc_assert (flag_associative_math);
9641 tree index = build_index_vector (step_vectype, 0, 1);
9642 tree base_vec = gimple_build_vector_from_val (&stmts, step_vectype,
9643 new_name);
9644 tree step_vec = gimple_build_vector_from_val (&stmts, step_vectype,
9645 step_expr);
9646 vec_init = gimple_build (&stmts, FLOAT_EXPR, step_vectype, index);
9647 vec_init = gimple_build (&stmts, MULT_EXPR, step_vectype,
9648 vec_init, step_vec);
9649 vec_init = gimple_build (&stmts, PLUS_EXPR, step_vectype,
9650 vec_init, base_vec);
9651 }
9652 vec_init = gimple_convert (&stmts, vectype, vec_init);
9653
9654 if (stmts)
9655 {
9656 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
9657 gcc_assert (!new_bb);
9658 }
9659 }
9660
9661
9662 /* Create the vector that holds the step of the induction. */
9663 if (nested_in_vect_loop)
9664 /* iv_loop is nested in the loop to be vectorized. Generate:
9665 vec_step = [S, S, S, S] */
9666 new_name = step_expr;
9667 else
9668 {
9669 /* iv_loop is the loop to be vectorized. Generate:
9670 vec_step = [VF*S, VF*S, VF*S, VF*S] */
9671 gimple_seq seq = NULL;
9672 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
9673 {
9674 expr = build_int_cst (integer_type_node, vf);
9675 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
9676 }
9677 else
9678 expr = build_int_cst (TREE_TYPE (step_expr), vf);
9679 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
9680 expr, step_expr);
9681 if (seq)
9682 {
9683 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
9684 gcc_assert (!new_bb);
9685 }
9686 }
9687
9688 t = unshare_expr (new_name);
9689 gcc_assert (CONSTANT_CLASS_P (new_name)
9690 || TREE_CODE (new_name) == SSA_NAME);
9691 new_vec = build_vector_from_val (step_vectype, t);
9692 vec_step = vect_init_vector (loop_vinfo, stmt_info,
9693 new_vec, step_vectype, NULL);
9694
9695
9696 /* Create the following def-use cycle:
9697 loop prolog:
9698 vec_init = ...
9699 vec_step = ...
9700 loop:
9701 vec_iv = PHI <vec_init, vec_loop>
9702 ...
9703 STMT
9704 ...
9705 vec_loop = vec_iv + vec_step; */
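  /* As a purely illustrative example (assuming VF == nunits == 4, a scalar
     start value X and scalar step S, and no masking or peeling for
     alignment), the cycle created below looks roughly like:

       loop prolog:
	 vec_init = { X, X+S, X+2*S, X+3*S }
	 vec_step = { 4*S, 4*S, 4*S, 4*S }
       loop:
	 vec_iv = PHI <vec_init (preheader), vec_loop (latch)>
	 ...
	 vec_loop = vec_iv + vec_step;  */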
9706
9707 /* Create the induction-phi that defines the induction-operand. */
9708 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
9709 induction_phi = create_phi_node (vec_dest, iv_loop->header);
9710 induc_def = PHI_RESULT (induction_phi);
9711
9712 /* Create the iv update inside the loop */
9713 stmts = NULL;
9714 vec_def = gimple_convert (&stmts, step_vectype, induc_def);
9715 vec_def = gimple_build (&stmts, PLUS_EXPR, step_vectype, vec_def, vec_step);
9716 vec_def = gimple_convert (&stmts, vectype, vec_def);
9717 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9718 new_stmt = SSA_NAME_DEF_STMT (vec_def);
9719
9720 /* Set the arguments of the phi node: */
9721 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
9722 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
9723 UNKNOWN_LOCATION);
9724
9725 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi);
9726 *vec_stmt = induction_phi;
9727
9728   /* If the vectorization factor (VF) is bigger than the number
9729      of elements that we can fit in a vectype (nunits), we have to generate
9730      more than one vector stmt, i.e. we need to "unroll" the
9731      vector stmt by a factor of VF/nunits.  For more details see the
9732      documentation in vectorizable_operation.  */
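  /* A purely illustrative sketch with hypothetical numbers: if VF == 8 but
     the vectype has only 4 lanes, then ncopies == 2.  The step vector built
     below is { 4*S, 4*S, 4*S, 4*S } (nunits * S per lane) and each extra
     copy is the previous copy plus that step:
       vec_iv_0 = PHI <...>                    lanes 0..3
       vec_iv_1 = vec_iv_0 + { 4*S, ... }      lanes 4..7  */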
9733
9734 if (ncopies > 1)
9735 {
9736 gimple_seq seq = NULL;
9737 /* FORNOW. This restriction should be relaxed. */
9738 gcc_assert (!nested_in_vect_loop);
9739
9740 /* Create the vector that holds the step of the induction. */
9741 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
9742 {
9743 expr = build_int_cst (integer_type_node, nunits);
9744 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
9745 }
9746 else
9747 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
9748 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
9749 expr, step_expr);
9750 if (seq)
9751 {
9752 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
9753 gcc_assert (!new_bb);
9754 }
9755
9756 t = unshare_expr (new_name);
9757 gcc_assert (CONSTANT_CLASS_P (new_name)
9758 || TREE_CODE (new_name) == SSA_NAME);
9759 new_vec = build_vector_from_val (step_vectype, t);
9760 vec_step = vect_init_vector (loop_vinfo, stmt_info,
9761 new_vec, step_vectype, NULL);
9762
9763 vec_def = induc_def;
9764 for (i = 1; i < ncopies; i++)
9765 {
9766 /* vec_i = vec_prev + vec_step */
9767 gimple_seq stmts = NULL;
9768 vec_def = gimple_convert (&stmts, step_vectype, vec_def);
9769 vec_def = gimple_build (&stmts,
9770 PLUS_EXPR, step_vectype, vec_def, vec_step);
9771 vec_def = gimple_convert (&stmts, vectype, vec_def);
9772
9773 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9774 new_stmt = SSA_NAME_DEF_STMT (vec_def);
9775 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
9776 }
9777 }
9778
9779 if (dump_enabled_p ())
9780 dump_printf_loc (MSG_NOTE, vect_location,
9781 "transform induction: created def-use cycle: %G%G",
9782 (gimple *) induction_phi, SSA_NAME_DEF_STMT (vec_def));
9783
9784 return true;
9785 }
9786
9787 /* Function vectorizable_live_operation.
9788
9789 STMT_INFO computes a value that is used outside the loop. Check if
9790 it can be supported. */
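
/* A minimal, hypothetical example of such a live statement:

     for (i = 0; i < n; i++)
       last = a[i];
     ... = last;    <-- 'last' is used after the loop

   After vectorization the scalar value is recovered by extracting a lane
   from the vectorized definition, either with a BIT_FIELD_REF or, for
   fully-masked loops, with an EXTRACT_LAST using the final loop mask.  */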
9791
9792 bool
9793 vectorizable_live_operation (vec_info *vinfo,
9794 stmt_vec_info stmt_info,
9795 gimple_stmt_iterator *gsi,
9796 slp_tree slp_node, slp_instance slp_node_instance,
9797 int slp_index, bool vec_stmt_p,
9798 stmt_vector_for_cost *cost_vec)
9799 {
9800 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
9801 imm_use_iterator imm_iter;
9802 tree lhs, lhs_type, bitsize;
9803 tree vectype = (slp_node
9804 ? SLP_TREE_VECTYPE (slp_node)
9805 : STMT_VINFO_VECTYPE (stmt_info));
9806 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
9807 int ncopies;
9808 gimple *use_stmt;
9809 auto_vec<tree> vec_oprnds;
9810 int vec_entry = 0;
9811 poly_uint64 vec_index = 0;
9812
9813 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
9814
9815 /* If a stmt of a reduction is live, vectorize it via
9816 vect_create_epilog_for_reduction. vectorizable_reduction assessed
9817 validity so just trigger the transform here. */
9818 if (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info)))
9819 {
9820 if (!vec_stmt_p)
9821 return true;
9822 if (slp_node)
9823 {
9824 /* For reduction chains the meta-info is attached to
9825 the group leader. */
9826 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
9827 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
9828 /* For SLP reductions we vectorize the epilogue for
9829 all involved stmts together. */
9830 else if (slp_index != 0)
9831 return true;
9832 }
9833 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
9834 gcc_assert (reduc_info->is_reduc_info);
9835 if (STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION
9836 || STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION)
9837 return true;
9838 vect_create_epilog_for_reduction (loop_vinfo, stmt_info, slp_node,
9839 slp_node_instance);
9840 return true;
9841 }
9842
9843 /* If STMT is not relevant and it is a simple assignment and its inputs are
9844 invariant then it can remain in place, unvectorized. The original last
9845 scalar value that it computes will be used. */
9846 if (!STMT_VINFO_RELEVANT_P (stmt_info))
9847 {
9848 gcc_assert (is_simple_and_all_uses_invariant (stmt_info, loop_vinfo));
9849 if (dump_enabled_p ())
9850 dump_printf_loc (MSG_NOTE, vect_location,
9851 "statement is simple and uses invariant. Leaving in "
9852 "place.\n");
9853 return true;
9854 }
9855
9856 if (slp_node)
9857 ncopies = 1;
9858 else
9859 ncopies = vect_get_num_copies (loop_vinfo, vectype);
9860
9861 if (slp_node)
9862 {
9863 gcc_assert (slp_index >= 0);
9864
9865 /* Get the last occurrence of the scalar index from the concatenation of
9866 all the slp vectors. Calculate which slp vector it is and the index
9867 within. */
9868 int num_scalar = SLP_TREE_LANES (slp_node);
9869 int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
9870 poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
9871
9872 /* Calculate which vector contains the result, and which lane of
9873 that vector we need. */
9874 if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
9875 {
9876 if (dump_enabled_p ())
9877 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9878 "Cannot determine which vector holds the"
9879 " final result.\n");
9880 return false;
9881 }
9882 }
9883
9884 if (!vec_stmt_p)
9885 {
9886 /* No transformation required. */
9887 if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
9888 {
9889 if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
9890 OPTIMIZE_FOR_SPEED))
9891 {
9892 if (dump_enabled_p ())
9893 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9894 "can't operate on partial vectors "
9895 "because the target doesn't support extract "
9896 "last reduction.\n");
9897 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
9898 }
9899 else if (slp_node)
9900 {
9901 if (dump_enabled_p ())
9902 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9903 "can't operate on partial vectors "
9904 "because an SLP statement is live after "
9905 "the loop.\n");
9906 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
9907 }
9908 else if (ncopies > 1)
9909 {
9910 if (dump_enabled_p ())
9911 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9912 "can't operate on partial vectors "
9913 "because ncopies is greater than 1.\n");
9914 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
9915 }
9916 else
9917 {
9918 gcc_assert (ncopies == 1 && !slp_node);
9919 vect_record_loop_mask (loop_vinfo,
9920 &LOOP_VINFO_MASKS (loop_vinfo),
9921 1, vectype, NULL);
9922 }
9923 }
9924 /* ??? Enable for loop costing as well. */
9925 if (!loop_vinfo)
9926 record_stmt_cost (cost_vec, 1, vec_to_scalar, stmt_info, NULL_TREE,
9927 0, vect_epilogue);
9928 return true;
9929 }
9930
9931 /* Use the lhs of the original scalar statement. */
9932 gimple *stmt = vect_orig_stmt (stmt_info)->stmt;
9933 if (dump_enabled_p ())
9934 dump_printf_loc (MSG_NOTE, vect_location, "extracting lane for live "
9935 "stmt %G", stmt);
9936
9937 lhs = gimple_get_lhs (stmt);
9938 lhs_type = TREE_TYPE (lhs);
9939
9940 bitsize = vector_element_bits_tree (vectype);
9941
9942 /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
9943 tree vec_lhs, bitstart;
9944 gimple *vec_stmt;
9945 if (slp_node)
9946 {
9947 gcc_assert (!loop_vinfo || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
9948
9949 /* Get the correct slp vectorized stmt. */
9950 vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry];
9951 vec_lhs = gimple_get_lhs (vec_stmt);
9952
9953 /* Get entry to use. */
9954 bitstart = bitsize_int (vec_index);
9955 bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
9956 }
9957 else
9958 {
9959 /* For multiple copies, get the last copy. */
9960 vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info).last ();
9961 vec_lhs = gimple_get_lhs (vec_stmt);
9962
9963 /* Get the last lane in the vector. */
9964 bitstart = int_const_binop (MULT_EXPR, bitsize, bitsize_int (nunits - 1));
9965 }
9966
9967 if (loop_vinfo)
9968 {
9969       /* To ensure that VEC_LHS for the lane-extraction stmts satisfies the
9970 	 loop-closed PHI requirement, insert one phi node for it.  It looks like:
9971 loop;
9972 BB:
9973 # lhs' = PHI <lhs>
9974 ==>
9975 loop;
9976 BB:
9977 # vec_lhs' = PHI <vec_lhs>
9978 new_tree = lane_extract <vec_lhs', ...>;
9979 lhs' = new_tree; */
9980
9981 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9982 basic_block exit_bb = single_exit (loop)->dest;
9983 gcc_assert (single_pred_p (exit_bb));
9984
9985 tree vec_lhs_phi = copy_ssa_name (vec_lhs);
9986 gimple *phi = create_phi_node (vec_lhs_phi, exit_bb);
9987 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, vec_lhs);
9988
9989 gimple_seq stmts = NULL;
9990 tree new_tree;
9991 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
9992 {
9993 /* Emit:
9994
9995 SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
9996
9997 where VEC_LHS is the vectorized live-out result and MASK is
9998 the loop mask for the final iteration. */
9999 gcc_assert (ncopies == 1 && !slp_node);
10000 tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
10001 tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo),
10002 1, vectype, 0);
10003 tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
10004 mask, vec_lhs_phi);
10005
10006 /* Convert the extracted vector element to the scalar type. */
10007 new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
10008 }
10009 else
10010 {
10011 tree bftype = TREE_TYPE (vectype);
10012 if (VECTOR_BOOLEAN_TYPE_P (vectype))
10013 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
10014 new_tree = build3 (BIT_FIELD_REF, bftype,
10015 vec_lhs_phi, bitsize, bitstart);
10016 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
10017 &stmts, true, NULL_TREE);
10018 }
10019
10020 if (stmts)
10021 {
10022 gimple_stmt_iterator exit_gsi = gsi_after_labels (exit_bb);
10023 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
10024
10025 	  /* Remove the existing phi that defines lhs and create one copy from new_tree.  */
10026 tree lhs_phi = NULL_TREE;
10027 gimple_stmt_iterator gsi;
10028 for (gsi = gsi_start_phis (exit_bb);
10029 !gsi_end_p (gsi); gsi_next (&gsi))
10030 {
10031 gimple *phi = gsi_stmt (gsi);
10032 	      if (gimple_phi_arg_def (phi, 0) == lhs)
10033 {
10034 remove_phi_node (&gsi, false);
10035 lhs_phi = gimple_phi_result (phi);
10036 gimple *copy = gimple_build_assign (lhs_phi, new_tree);
10037 gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT);
10038 break;
10039 }
10040 }
10041 }
10042
10043       /* Replace uses of lhs with the newly computed result.  If the use stmt is
10044 	 a single-argument PHI, just replace all uses of the PHI result.  This is
10045 	 necessary because the LC SSA PHI defining lhs may precede the new stmt.  */
10046 use_operand_p use_p;
10047 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
10048 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
10049 && !is_gimple_debug (use_stmt))
10050 {
10051 if (gimple_code (use_stmt) == GIMPLE_PHI
10052 && gimple_phi_num_args (use_stmt) == 1)
10053 {
10054 replace_uses_by (gimple_phi_result (use_stmt), new_tree);
10055 }
10056 else
10057 {
10058 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
10059 SET_USE (use_p, new_tree);
10060 }
10061 update_stmt (use_stmt);
10062 }
10063 }
10064 else
10065 {
10066 /* For basic-block vectorization simply insert the lane-extraction. */
10067 tree bftype = TREE_TYPE (vectype);
10068 if (VECTOR_BOOLEAN_TYPE_P (vectype))
10069 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
10070 tree new_tree = build3 (BIT_FIELD_REF, bftype,
10071 vec_lhs, bitsize, bitstart);
10072 gimple_seq stmts = NULL;
10073 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
10074 &stmts, true, NULL_TREE);
10075 if (TREE_CODE (new_tree) == SSA_NAME
10076 && SSA_NAME_OCCURS_IN_ABNORMAL_PHI (lhs))
10077 SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_tree) = 1;
10078 if (is_a <gphi *> (vec_stmt))
10079 {
10080 gimple_stmt_iterator si = gsi_after_labels (gimple_bb (vec_stmt));
10081 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10082 }
10083 else
10084 {
10085 gimple_stmt_iterator si = gsi_for_stmt (vec_stmt);
10086 gsi_insert_seq_after (&si, stmts, GSI_SAME_STMT);
10087 }
10088
10089       /* Replace uses of lhs with the newly computed result.  If the use stmt is
10090 	 a single-argument PHI, just replace all uses of the PHI result.  This is
10091 	 necessary because the LC SSA PHI defining lhs may precede the new stmt.  */
10092 use_operand_p use_p;
10093 stmt_vec_info use_stmt_info;
10094 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
10095 if (!is_gimple_debug (use_stmt)
10096 && (!(use_stmt_info = vinfo->lookup_stmt (use_stmt))
10097 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info))))
10098 {
10099 /* ??? This can happen when the live lane ends up being
10100 used in a vector construction code-generated by an
10101 external SLP node (and code-generation for that already
10102 happened). See gcc.dg/vect/bb-slp-47.c.
10103 Doing this is what would happen if that vector CTOR
10104 were not code-generated yet so it is not too bad.
10105 ??? In fact we'd likely want to avoid this situation
10106 in the first place. */
10107 if (TREE_CODE (new_tree) == SSA_NAME
10108 && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
10109 && gimple_code (use_stmt) != GIMPLE_PHI
10110 && !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (new_tree),
10111 use_stmt))
10112 {
10113 enum tree_code code = gimple_assign_rhs_code (use_stmt);
10114 gcc_assert (code == CONSTRUCTOR
10115 || code == VIEW_CONVERT_EXPR
10116 || CONVERT_EXPR_CODE_P (code));
10117 if (dump_enabled_p ())
10118 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10119 "Using original scalar computation for "
10120 				     "live lane because use precedes vector "
10121 "def\n");
10122 continue;
10123 }
10124 /* ??? It can also happen that we end up pulling a def into
10125 a loop where replacing out-of-loop uses would require
10126 a new LC SSA PHI node. Retain the original scalar in
10127 those cases as well. PR98064. */
10128 if (TREE_CODE (new_tree) == SSA_NAME
10129 && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
10130 && (gimple_bb (use_stmt)->loop_father
10131 != gimple_bb (vec_stmt)->loop_father)
10132 && !flow_loop_nested_p (gimple_bb (vec_stmt)->loop_father,
10133 gimple_bb (use_stmt)->loop_father))
10134 {
10135 if (dump_enabled_p ())
10136 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10137 "Using original scalar computation for "
10138 "live lane because there is an out-of-loop "
10139 "definition for it\n");
10140 continue;
10141 }
10142 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
10143 SET_USE (use_p, new_tree);
10144 update_stmt (use_stmt);
10145 }
10146 }
10147
10148 return true;
10149 }
10150
10151 /* Kill any debug uses outside LOOP of SSA names defined in STMT_INFO. */
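
/* For illustration (hypothetical names): a debug bind outside LOOP such as

     # DEBUG x => i_7

   where i_7 is defined by a scalar statement that will not survive
   vectorization gets its value reset, becoming "# DEBUG x => NULL", so
   debug consumers simply see the variable as optimized away.  */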
10152
10153 static void
10154 vect_loop_kill_debug_uses (class loop *loop, stmt_vec_info stmt_info)
10155 {
10156 ssa_op_iter op_iter;
10157 imm_use_iterator imm_iter;
10158 def_operand_p def_p;
10159 gimple *ustmt;
10160
10161 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt_info->stmt, op_iter, SSA_OP_DEF)
10162 {
10163 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
10164 {
10165 basic_block bb;
10166
10167 if (!is_gimple_debug (ustmt))
10168 continue;
10169
10170 bb = gimple_bb (ustmt);
10171
10172 if (!flow_bb_inside_loop_p (loop, bb))
10173 {
10174 if (gimple_debug_bind_p (ustmt))
10175 {
10176 if (dump_enabled_p ())
10177 dump_printf_loc (MSG_NOTE, vect_location,
10178 "killing debug use\n");
10179
10180 gimple_debug_bind_reset_value (ustmt);
10181 update_stmt (ustmt);
10182 }
10183 else
10184 gcc_unreachable ();
10185 }
10186 }
10187 }
10188 }
10189
10190 /* Given the loop represented by LOOP_VINFO, return true if the computation of
10191 LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
10192 otherwise. */
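
/* For instance (illustrative only): if the niters type is 32-bit unsigned
   and the latch may execute 0xffffffff times, then NITERSM1 == 0xffffffff
   and NITERS == NITERSM1 + 1 wraps to 0, so neither check below succeeds
   and we return false.  With NITERSM1 == 999 and NITERS == 1000 the
   constant-case comparison succeeds and we return true.  */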
10193
10194 static bool
10195 loop_niters_no_overflow (loop_vec_info loop_vinfo)
10196 {
10197 /* Constant case. */
10198 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
10199 {
10200 tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
10201 tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
10202
10203 gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
10204 gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
10205 if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
10206 return true;
10207 }
10208
10209 widest_int max;
10210 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
10211 /* Check the upper bound of loop niters. */
10212 if (get_max_loop_iterations (loop, &max))
10213 {
10214 tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
10215 signop sgn = TYPE_SIGN (type);
10216 widest_int type_max = widest_int::from (wi::max_value (type), sgn);
10217 if (max < type_max)
10218 return true;
10219 }
10220 return false;
10221 }
10222
10223 /* Return a mask type with half as many elements as OLD_TYPE,
10224    given that it should have mode NEW_MODE.  */
10225
10226 tree
10227 vect_halve_mask_nunits (tree old_type, machine_mode new_mode)
10228 {
10229 poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (old_type), 2);
10230 return build_truth_vector_type_for_mode (nunits, new_mode);
10231 }
10232
10233 /* Return a mask type with twice as many elements as OLD_TYPE,
10234 given that it should have mode NEW_MODE. */
10235
10236 tree
10237 vect_double_mask_nunits (tree old_type, machine_mode new_mode)
10238 {
10239 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (old_type) * 2;
10240 return build_truth_vector_type_for_mode (nunits, new_mode);
10241 }
10242
10243 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
10244 contain a sequence of NVECTORS masks that each control a vector of type
10245 VECTYPE. If SCALAR_MASK is nonnull, the fully-masked loop would AND
10246 these vector masks with the vector version of SCALAR_MASK. */
10247
10248 void
10249 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
10250 unsigned int nvectors, tree vectype, tree scalar_mask)
10251 {
10252 gcc_assert (nvectors != 0);
10253 if (masks->length () < nvectors)
10254 masks->safe_grow_cleared (nvectors, true);
10255 rgroup_controls *rgm = &(*masks)[nvectors - 1];
10256 /* The number of scalars per iteration and the number of vectors are
10257 both compile-time constants. */
10258 unsigned int nscalars_per_iter
10259 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
10260 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
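  /* Worked example with hypothetical numbers: for a vectorization factor
     of 4 and an rgroup that needs 2 masks controlling vectors of 8
     elements each, nscalars_per_iter == 2 * 8 / 4 == 4, i.e. the rgroup's
     masks control 4 scalar values per iteration of the original loop.  */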
10261
10262 if (scalar_mask)
10263 {
10264 scalar_cond_masked_key cond (scalar_mask, nvectors);
10265 loop_vinfo->scalar_cond_masked_set.add (cond);
10266 }
10267
10268 if (rgm->max_nscalars_per_iter < nscalars_per_iter)
10269 {
10270 rgm->max_nscalars_per_iter = nscalars_per_iter;
10271 rgm->type = truth_type_for (vectype);
10272 rgm->factor = 1;
10273 }
10274 }
10275
10276 /* Given a complete set of masks MASKS, extract mask number INDEX
10277 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
10278 where 0 <= INDEX < NVECTORS. Insert any set-up statements before GSI.
10279
10280 See the comment above vec_loop_masks for more details about the mask
10281 arrangement. */
10282
10283 tree
10284 vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
10285 unsigned int nvectors, tree vectype, unsigned int index)
10286 {
10287 rgroup_controls *rgm = &(*masks)[nvectors - 1];
10288 tree mask_type = rgm->type;
10289
10290 /* Populate the rgroup's mask array, if this is the first time we've
10291 used it. */
10292 if (rgm->controls.is_empty ())
10293 {
10294 rgm->controls.safe_grow_cleared (nvectors, true);
10295 for (unsigned int i = 0; i < nvectors; ++i)
10296 {
10297 tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
10298 /* Provide a dummy definition until the real one is available. */
10299 SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
10300 rgm->controls[i] = mask;
10301 }
10302 }
10303
10304 tree mask = rgm->controls[index];
10305 if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
10306 TYPE_VECTOR_SUBPARTS (vectype)))
10307 {
10308 /* A loop mask for data type X can be reused for data type Y
10309 if X has N times more elements than Y and if Y's elements
10310 are N times bigger than X's. In this case each sequence
10311 of N elements in the loop mask will be all-zero or all-one.
10312 We can then view-convert the mask so that each sequence of
10313 N elements is replaced by a single element. */
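      /* For illustration: a mask created for 16 QImode elements can be
	 reused for 8 HImode elements of the same total size (N == 2).
	 Every pair of adjacent elements in the 16-element mask is known
	 to be all-zero or all-one, so the VIEW_CONVERT_EXPR below simply
	 collapses each pair into a single element of the 8-element
	 mask type.  */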
10314 gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
10315 TYPE_VECTOR_SUBPARTS (vectype)));
10316 gimple_seq seq = NULL;
10317 mask_type = truth_type_for (vectype);
10318 mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
10319 if (seq)
10320 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
10321 }
10322 return mask;
10323 }
10324
10325 /* Record that LOOP_VINFO would need LENS to contain a sequence of NVECTORS
10326 lengths for controlling an operation on VECTYPE. The operation splits
10327 each element of VECTYPE into FACTOR separate subelements, measuring the
10328 length as a number of these subelements. */
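
/* For illustration with hypothetical numbers: a load of 4 SImode elements
   whose length has to be measured in bytes would be recorded with
   FACTOR == 4, so a length of 8 subelements (bytes) covers the first
   2 SImode elements.  */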
10329
10330 void
10331 vect_record_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
10332 unsigned int nvectors, tree vectype, unsigned int factor)
10333 {
10334 gcc_assert (nvectors != 0);
10335 if (lens->length () < nvectors)
10336 lens->safe_grow_cleared (nvectors, true);
10337 rgroup_controls *rgl = &(*lens)[nvectors - 1];
10338
10339   /* The number of scalars per iteration, the number of bytes each scalar
10340      occupies and the number of vectors are all compile-time constants.  */
10341 unsigned int nscalars_per_iter
10342 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
10343 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
10344
10345 if (rgl->max_nscalars_per_iter < nscalars_per_iter)
10346 {
10347 /* For now, we only support cases in which all loads and stores fall back
10348 to VnQI or none do. */
10349 gcc_assert (!rgl->max_nscalars_per_iter
10350 || (rgl->factor == 1 && factor == 1)
10351 || (rgl->max_nscalars_per_iter * rgl->factor
10352 == nscalars_per_iter * factor));
10353 rgl->max_nscalars_per_iter = nscalars_per_iter;
10354 rgl->type = vectype;
10355 rgl->factor = factor;
10356 }
10357 }
10358
10359 /* Given a complete set of lengths LENS, extract length number INDEX for an
10360 rgroup that operates on NVECTORS vectors, where 0 <= INDEX < NVECTORS. */
10361
10362 tree
10363 vect_get_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
10364 unsigned int nvectors, unsigned int index)
10365 {
10366 rgroup_controls *rgl = &(*lens)[nvectors - 1];
10367 bool use_bias_adjusted_len =
10368 LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) != 0;
10369
10370 /* Populate the rgroup's len array, if this is the first time we've
10371 used it. */
10372 if (rgl->controls.is_empty ())
10373 {
10374 rgl->controls.safe_grow_cleared (nvectors, true);
10375 for (unsigned int i = 0; i < nvectors; ++i)
10376 {
10377 tree len_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
10378 gcc_assert (len_type != NULL_TREE);
10379
10380 tree len = make_temp_ssa_name (len_type, NULL, "loop_len");
10381
10382 /* Provide a dummy definition until the real one is available. */
10383 SSA_NAME_DEF_STMT (len) = gimple_build_nop ();
10384 rgl->controls[i] = len;
10385
10386 if (use_bias_adjusted_len)
10387 {
10388 gcc_assert (i == 0);
10389 tree adjusted_len =
10390 make_temp_ssa_name (len_type, NULL, "adjusted_loop_len");
10391 SSA_NAME_DEF_STMT (adjusted_len) = gimple_build_nop ();
10392 rgl->bias_adjusted_ctrl = adjusted_len;
10393 }
10394 }
10395 }
10396
10397 if (use_bias_adjusted_len)
10398 return rgl->bias_adjusted_ctrl;
10399 else
10400 return rgl->controls[index];
10401 }
10402
10403 /* Scale the profiling counters of LOOP, which is vectorized by factor VF,
10404    according to the estimated number of iterations.  */
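
/* Rough numeric illustration with made-up counts: if the preheader is
   entered 100 times, the header count is 1000 (about 10 iterations per
   entry) and VF == 4, then the new estimated iteration count is on the
   order of 2, the loop body counts are scaled by 100 * (2 + 1) / 1000
   (down to roughly 300) and the exit edge probability becomes 1 / (2 + 1).  */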
10405
10406 static void
10407 scale_profile_for_vect_loop (class loop *loop, unsigned vf)
10408 {
10409 edge preheader = loop_preheader_edge (loop);
10410 /* Reduce loop iterations by the vectorization factor. */
10411 gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
10412 profile_count freq_h = loop->header->count, freq_e = preheader->count ();
10413
10414 if (freq_h.nonzero_p ())
10415 {
10416 profile_probability p;
10417
10418 /* Avoid dropping loop body profile counter to 0 because of zero count
10419 in loop's preheader. */
10420 if (!(freq_e == profile_count::zero ()))
10421 freq_e = freq_e.force_nonzero ();
10422 p = (freq_e * (new_est_niter + 1)).probability_in (freq_h);
10423 scale_loop_frequencies (loop, p);
10424 }
10425
10426 edge exit_e = single_exit (loop);
10427 exit_e->probability = profile_probability::always () / (new_est_niter + 1);
10428
10429 edge exit_l = single_pred_edge (loop->latch);
10430 profile_probability prob = exit_l->probability;
10431 exit_l->probability = exit_e->probability.invert ();
10432 if (prob.initialized_p () && exit_l->probability.initialized_p ())
10433 scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
10434 }
10435
10436 /* For a vectorized stmt DEF_STMT_INFO, adjust all vectorized PHI
10437    latch-edge values that were originally defined by it.  */
10438
10439 static void
10440 maybe_set_vectorized_backedge_value (loop_vec_info loop_vinfo,
10441 stmt_vec_info def_stmt_info)
10442 {
10443 tree def = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
10444 if (!def || TREE_CODE (def) != SSA_NAME)
10445 return;
10446 stmt_vec_info phi_info;
10447 imm_use_iterator iter;
10448 use_operand_p use_p;
10449 FOR_EACH_IMM_USE_FAST (use_p, iter, def)
10450 {
10451 gphi *phi = dyn_cast <gphi *> (USE_STMT (use_p));
10452 if (!phi)
10453 continue;
10454 if (!(gimple_bb (phi)->loop_father->header == gimple_bb (phi)
10455 && (phi_info = loop_vinfo->lookup_stmt (phi))
10456 && STMT_VINFO_RELEVANT_P (phi_info)))
10457 continue;
10458 loop_p loop = gimple_bb (phi)->loop_father;
10459 edge e = loop_latch_edge (loop);
10460 if (PHI_ARG_DEF_FROM_EDGE (phi, e) != def)
10461 continue;
10462
10463 if (VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (phi_info))
10464 && STMT_VINFO_REDUC_TYPE (phi_info) != FOLD_LEFT_REDUCTION
10465 && STMT_VINFO_REDUC_TYPE (phi_info) != EXTRACT_LAST_REDUCTION)
10466 {
10467 vec<gimple *> &phi_defs = STMT_VINFO_VEC_STMTS (phi_info);
10468 vec<gimple *> &latch_defs = STMT_VINFO_VEC_STMTS (def_stmt_info);
10469 gcc_assert (phi_defs.length () == latch_defs.length ());
10470 for (unsigned i = 0; i < phi_defs.length (); ++i)
10471 add_phi_arg (as_a <gphi *> (phi_defs[i]),
10472 gimple_get_lhs (latch_defs[i]), e,
10473 gimple_phi_arg_location (phi, e->dest_idx));
10474 }
10475 else if (STMT_VINFO_DEF_TYPE (phi_info) == vect_first_order_recurrence)
10476 {
10477 /* For first order recurrences we have to update both uses of
10478 the latch definition, the one in the PHI node and the one
10479 in the generated VEC_PERM_EXPR. */
10480 vec<gimple *> &phi_defs = STMT_VINFO_VEC_STMTS (phi_info);
10481 vec<gimple *> &latch_defs = STMT_VINFO_VEC_STMTS (def_stmt_info);
10482 gcc_assert (phi_defs.length () == latch_defs.length ());
10483 tree phidef = gimple_assign_rhs1 (phi_defs[0]);
10484 gphi *vphi = as_a <gphi *> (SSA_NAME_DEF_STMT (phidef));
10485 for (unsigned i = 0; i < phi_defs.length (); ++i)
10486 {
10487 gassign *perm = as_a <gassign *> (phi_defs[i]);
10488 if (i > 0)
10489 gimple_assign_set_rhs1 (perm, gimple_get_lhs (latch_defs[i-1]));
10490 gimple_assign_set_rhs2 (perm, gimple_get_lhs (latch_defs[i]));
10491 update_stmt (perm);
10492 }
10493 add_phi_arg (vphi, gimple_get_lhs (latch_defs.last ()), e,
10494 gimple_phi_arg_location (phi, e->dest_idx));
10495 }
10496 }
10497 }
10498
10499 /* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI.
10500 When vectorizing STMT_INFO as a store, set *SEEN_STORE to its
10501 stmt_vec_info. */
10502
10503 static bool
10504 vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
10505 gimple_stmt_iterator *gsi, stmt_vec_info *seen_store)
10506 {
10507 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
10508 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
10509
10510 if (dump_enabled_p ())
10511 dump_printf_loc (MSG_NOTE, vect_location,
10512 "------>vectorizing statement: %G", stmt_info->stmt);
10513
10514 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
10515 vect_loop_kill_debug_uses (loop, stmt_info);
10516
10517 if (!STMT_VINFO_RELEVANT_P (stmt_info)
10518 && !STMT_VINFO_LIVE_P (stmt_info))
10519 return false;
10520
10521 if (STMT_VINFO_VECTYPE (stmt_info))
10522 {
10523 poly_uint64 nunits
10524 = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
10525 if (!STMT_SLP_TYPE (stmt_info)
10526 && maybe_ne (nunits, vf)
10527 && dump_enabled_p ())
10528 	/* For SLP, VF is set according to the unrolling factor and not to
10529 	   the vector size, hence this print is not valid for SLP.  */
10530 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
10531 }
10532
10533 /* Pure SLP statements have already been vectorized. We still need
10534 to apply loop vectorization to hybrid SLP statements. */
10535 if (PURE_SLP_STMT (stmt_info))
10536 return false;
10537
10538 if (dump_enabled_p ())
10539 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
10540
10541 if (vect_transform_stmt (loop_vinfo, stmt_info, gsi, NULL, NULL))
10542 *seen_store = stmt_info;
10543
10544 return true;
10545 }
10546
10547 /* Helper function to pass to simplify_replace_tree to enable replacing trees
10548    in the hash_map with their corresponding values.  */
10549
10550 static tree
10551 find_in_mapping (tree t, void *context)
10552 {
10553 hash_map<tree,tree>* mapping = (hash_map<tree, tree>*) context;
10554
10555 tree *value = mapping->get (t);
10556 return value ? *value : t;
10557 }
10558
10559 /* Update EPILOGUE's loop_vec_info. EPILOGUE was constructed as a copy of the
10560 original loop that has now been vectorized.
10561
10562 The inits of the data_references need to be advanced with the number of
10563 iterations of the main loop. This has been computed in vect_do_peeling and
10564 is stored in parameter ADVANCE. We first restore the data_references
10565    initial offset with the values recorded in ORIG_DRS_INIT.
10566
10567 Since the loop_vec_info of this EPILOGUE was constructed for the original
10568 loop, its stmt_vec_infos all point to the original statements. These need
10569 to be updated to point to their corresponding copies as well as the SSA_NAMES
10570 in their PATTERN_DEF_SEQs and RELATED_STMTs.
10571
10572    The data_references' connections also need to be updated.  Their
10573    corresponding dr_vec_infos need to be reconnected to the EPILOGUE's
10574    stmt_vec_infos and their statements need to point to their corresponding
10575    copy.  If they are gather loads or scatter stores, their reference also
10576    needs to be updated to point to its corresponding copy.  Finally, we set
10577    'base_misaligned' to false, as we have already peeled for alignment in
10578    the prologue of the main loop.  */
10579
10580 static void
10581 update_epilogue_loop_vinfo (class loop *epilogue, tree advance)
10582 {
10583 loop_vec_info epilogue_vinfo = loop_vec_info_for_loop (epilogue);
10584 auto_vec<gimple *> stmt_worklist;
10585 hash_map<tree,tree> mapping;
10586 gimple *orig_stmt, *new_stmt;
10587 gimple_stmt_iterator epilogue_gsi;
10588 gphi_iterator epilogue_phi_gsi;
10589 stmt_vec_info stmt_vinfo = NULL, related_vinfo;
10590 basic_block *epilogue_bbs = get_loop_body (epilogue);
10591 unsigned i;
10592
10593 free (LOOP_VINFO_BBS (epilogue_vinfo));
10594 LOOP_VINFO_BBS (epilogue_vinfo) = epilogue_bbs;
10595
10596 /* Advance data_reference's with the number of iterations of the previous
10597 loop and its prologue. */
10598 vect_update_inits_of_drs (epilogue_vinfo, advance, PLUS_EXPR);
10599
10600
10601 /* The EPILOGUE loop is a copy of the original loop so they share the same
10602 gimple UIDs. In this loop we update the loop_vec_info of the EPILOGUE to
10603 point to the copied statements. We also create a mapping of all LHS' in
10604 the original loop and all the LHS' in the EPILOGUE and create worklists to
10605      update the STMT_VINFO_PATTERN_DEF_SEQs and STMT_VINFO_RELATED_STMTs.  */
10606 for (unsigned i = 0; i < epilogue->num_nodes; ++i)
10607 {
10608 for (epilogue_phi_gsi = gsi_start_phis (epilogue_bbs[i]);
10609 !gsi_end_p (epilogue_phi_gsi); gsi_next (&epilogue_phi_gsi))
10610 {
10611 new_stmt = epilogue_phi_gsi.phi ();
10612
10613 gcc_assert (gimple_uid (new_stmt) > 0);
10614 stmt_vinfo
10615 = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
10616
10617 orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
10618 STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
10619
10620 mapping.put (gimple_phi_result (orig_stmt),
10621 gimple_phi_result (new_stmt));
10622 /* PHI nodes can not have patterns or related statements. */
10623 gcc_assert (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo) == NULL
10624 && STMT_VINFO_RELATED_STMT (stmt_vinfo) == NULL);
10625 }
10626
10627 for (epilogue_gsi = gsi_start_bb (epilogue_bbs[i]);
10628 !gsi_end_p (epilogue_gsi); gsi_next (&epilogue_gsi))
10629 {
10630 new_stmt = gsi_stmt (epilogue_gsi);
10631 if (is_gimple_debug (new_stmt))
10632 continue;
10633
10634 gcc_assert (gimple_uid (new_stmt) > 0);
10635 stmt_vinfo
10636 = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
10637
10638 orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
10639 STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
10640
10641 if (tree old_lhs = gimple_get_lhs (orig_stmt))
10642 mapping.put (old_lhs, gimple_get_lhs (new_stmt));
10643
10644 if (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo))
10645 {
10646 gimple_seq seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo);
10647 for (gimple_stmt_iterator gsi = gsi_start (seq);
10648 !gsi_end_p (gsi); gsi_next (&gsi))
10649 stmt_worklist.safe_push (gsi_stmt (gsi));
10650 }
10651
10652 related_vinfo = STMT_VINFO_RELATED_STMT (stmt_vinfo);
10653 if (related_vinfo != NULL && related_vinfo != stmt_vinfo)
10654 {
10655 gimple *stmt = STMT_VINFO_STMT (related_vinfo);
10656 stmt_worklist.safe_push (stmt);
10657 /* Set BB such that the assert in
10658 'get_initial_def_for_reduction' is able to determine that
10659 the BB of the related stmt is inside this loop. */
10660 gimple_set_bb (stmt,
10661 gimple_bb (new_stmt));
10662 related_vinfo = STMT_VINFO_RELATED_STMT (related_vinfo);
10663 gcc_assert (related_vinfo == NULL
10664 || related_vinfo == stmt_vinfo);
10665 }
10666 }
10667 }
10668
10669 /* The PATTERN_DEF_SEQs and RELATED_STMTs in the epilogue were constructed
10670 using the original main loop and thus need to be updated to refer to the
10671 cloned variables used in the epilogue. */
10672 for (unsigned i = 0; i < stmt_worklist.length (); ++i)
10673 {
10674 gimple *stmt = stmt_worklist[i];
10675 tree *new_op;
10676
10677 for (unsigned j = 1; j < gimple_num_ops (stmt); ++j)
10678 {
10679 tree op = gimple_op (stmt, j);
10680 if ((new_op = mapping.get(op)))
10681 gimple_set_op (stmt, j, *new_op);
10682 else
10683 {
10684 /* PR92429: The last argument of simplify_replace_tree disables
10685 folding when replacing arguments. This is required as
10686 otherwise you might end up with different statements than the
10687 ones analyzed in vect_loop_analyze, leading to different
10688 vectorization. */
10689 op = simplify_replace_tree (op, NULL_TREE, NULL_TREE,
10690 &find_in_mapping, &mapping, false);
10691 gimple_set_op (stmt, j, op);
10692 }
10693 }
10694 }
10695
10696 struct data_reference *dr;
10697 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (epilogue_vinfo);
10698 FOR_EACH_VEC_ELT (datarefs, i, dr)
10699 {
10700 orig_stmt = DR_STMT (dr);
10701 gcc_assert (gimple_uid (orig_stmt) > 0);
10702 stmt_vinfo = epilogue_vinfo->stmt_vec_infos[gimple_uid (orig_stmt) - 1];
10703       /* Data references for gather loads and scatter stores do not use the
10704 	 updated offset we set using ADVANCE.  Instead we have to make sure the
10705 	 reference in the data reference points to the corresponding copy of
10706 	 the original in the epilogue.  */
10707 if (STMT_VINFO_MEMORY_ACCESS_TYPE (vect_stmt_to_vectorize (stmt_vinfo))
10708 == VMAT_GATHER_SCATTER)
10709 {
10710 DR_REF (dr)
10711 = simplify_replace_tree (DR_REF (dr), NULL_TREE, NULL_TREE,
10712 &find_in_mapping, &mapping);
10713 DR_BASE_ADDRESS (dr)
10714 = simplify_replace_tree (DR_BASE_ADDRESS (dr), NULL_TREE, NULL_TREE,
10715 &find_in_mapping, &mapping);
10716 }
10717 DR_STMT (dr) = STMT_VINFO_STMT (stmt_vinfo);
10718 stmt_vinfo->dr_aux.stmt = stmt_vinfo;
10719       /* The vector size of the epilogue is smaller than that of the main loop,
10720 	 so the alignment is either the same or lower.  This means the DR will
10721 	 by definition be aligned.  */
10722 STMT_VINFO_DR_INFO (stmt_vinfo)->base_misaligned = false;
10723 }
10724
10725 epilogue_vinfo->shared->datarefs_copy.release ();
10726 epilogue_vinfo->shared->save_datarefs ();
10727 }
10728
10729 /* Function vect_transform_loop.
10730
10731 The analysis phase has determined that the loop is vectorizable.
10732    Vectorize the loop - create vectorized stmts to replace the scalar
10733    stmts in the loop, and update the loop exit condition.
10734    Returns the scalar epilogue loop, if any.  */
10735
10736 class loop *
10737 vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
10738 {
10739 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
10740 class loop *epilogue = NULL;
10741 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
10742 int nbbs = loop->num_nodes;
10743 int i;
10744 tree niters_vector = NULL_TREE;
10745 tree step_vector = NULL_TREE;
10746 tree niters_vector_mult_vf = NULL_TREE;
10747 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
10748 unsigned int lowest_vf = constant_lower_bound (vf);
10749 gimple *stmt;
10750 bool check_profitability = false;
10751 unsigned int th;
10752
10753 DUMP_VECT_SCOPE ("vec_transform_loop");
10754
10755 loop_vinfo->shared->check_datarefs ();
10756
10757   /* Use the more conservative vectorization threshold.  If the number
10758      of iterations is constant, assume the cost check has been performed
10759      by our caller.  If the threshold makes all loops profitable that
10760      run at least the (estimated) vectorization factor number of times,
10761      checking is pointless, too.  */
10762 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
10763 if (vect_apply_runtime_profitability_check_p (loop_vinfo))
10764 {
10765 if (dump_enabled_p ())
10766 dump_printf_loc (MSG_NOTE, vect_location,
10767 "Profitability threshold is %d loop iterations.\n",
10768 th);
10769 check_profitability = true;
10770 }
10771
10772 /* Make sure there exists a single-predecessor exit bb. Do this before
10773 versioning. */
10774 edge e = single_exit (loop);
10775 if (! single_pred_p (e->dest))
10776 {
10777 split_loop_exit_edge (e, true);
10778 if (dump_enabled_p ())
10779 dump_printf (MSG_NOTE, "split exit edge\n");
10780 }
10781
10782 /* Version the loop first, if required, so the profitability check
10783 comes first. */
10784
10785 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
10786 {
10787 class loop *sloop
10788 = vect_loop_versioning (loop_vinfo, loop_vectorized_call);
10789 sloop->force_vectorize = false;
10790 check_profitability = false;
10791 }
10792
10793   /* Make sure there exists a single-predecessor exit bb also on the
10794      scalar loop copy.  Do this after versioning but before peeling,
10795      so the CFG structure is fine for both the scalar and the if-converted
10796      loop and slpeel_duplicate_current_defs_from_edges sees matched
10797      loop-closed PHI nodes on the exit.  */
10798 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
10799 {
10800 e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
10801 if (! single_pred_p (e->dest))
10802 {
10803 split_loop_exit_edge (e, true);
10804 if (dump_enabled_p ())
10805 dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
10806 }
10807 }
10808
10809 tree niters = vect_build_loop_niters (loop_vinfo);
10810 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
10811 tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
10812 bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
10813 tree advance;
10814 drs_init_vec orig_drs_init;
10815
10816 epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
10817 &step_vector, &niters_vector_mult_vf, th,
10818 check_profitability, niters_no_overflow,
10819 &advance);
10820
10821 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)
10822 && LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo).initialized_p ())
10823 scale_loop_frequencies (LOOP_VINFO_SCALAR_LOOP (loop_vinfo),
10824 LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
10825
10826 if (niters_vector == NULL_TREE)
10827 {
10828 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
10829 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
10830 && known_eq (lowest_vf, vf))
10831 {
10832 niters_vector
10833 = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
10834 LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
10835 step_vector = build_one_cst (TREE_TYPE (niters));
10836 }
10837 else if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
10838 vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
10839 &step_vector, niters_no_overflow);
10840 else
10841 /* vect_do_peeling subtracted the number of peeled prologue
10842 iterations from LOOP_VINFO_NITERS. */
10843 vect_gen_vector_loop_niters (loop_vinfo, LOOP_VINFO_NITERS (loop_vinfo),
10844 &niters_vector, &step_vector,
10845 niters_no_overflow);
10846 }
10847
10848 /* 1) Make sure the loop header has exactly two entries
10849 2) Make sure we have a preheader basic block. */
10850
10851 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
10852
10853 split_edge (loop_preheader_edge (loop));
10854
10855 if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
10856 /* This will deal with any possible peeling. */
10857 vect_prepare_for_masked_peels (loop_vinfo);
10858
10859 /* Schedule the SLP instances first, then handle loop vectorization
10860 below. */
10861 if (!loop_vinfo->slp_instances.is_empty ())
10862 {
10863 DUMP_VECT_SCOPE ("scheduling SLP instances");
10864 vect_schedule_slp (loop_vinfo, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
10865 }
10866
10867   /* FORNOW: the vectorizer supports only loops whose body consists
10868      of one basic block (header + empty latch).  When the vectorizer
10869      supports more involved loop forms, the order in which the BBs are
10870      traversed needs to be reconsidered.  */
10871
10872 for (i = 0; i < nbbs; i++)
10873 {
10874 basic_block bb = bbs[i];
10875 stmt_vec_info stmt_info;
10876
10877 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
10878 gsi_next (&si))
10879 {
10880 gphi *phi = si.phi ();
10881 if (dump_enabled_p ())
10882 dump_printf_loc (MSG_NOTE, vect_location,
10883 "------>vectorizing phi: %G", (gimple *) phi);
10884 stmt_info = loop_vinfo->lookup_stmt (phi);
10885 if (!stmt_info)
10886 continue;
10887
10888 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
10889 vect_loop_kill_debug_uses (loop, stmt_info);
10890
10891 if (!STMT_VINFO_RELEVANT_P (stmt_info)
10892 && !STMT_VINFO_LIVE_P (stmt_info))
10893 continue;
10894
10895 if (STMT_VINFO_VECTYPE (stmt_info)
10896 && (maybe_ne
10897 (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
10898 && dump_enabled_p ())
10899 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
10900
10901 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
10902 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
10903 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
10904 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
10905 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_first_order_recurrence
10906 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
10907 && ! PURE_SLP_STMT (stmt_info))
10908 {
10909 if (dump_enabled_p ())
10910 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
10911 vect_transform_stmt (loop_vinfo, stmt_info, NULL, NULL, NULL);
10912 }
10913 }
10914
10915 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
10916 gsi_next (&si))
10917 {
10918 gphi *phi = si.phi ();
10919 stmt_info = loop_vinfo->lookup_stmt (phi);
10920 if (!stmt_info)
10921 continue;
10922
10923 if (!STMT_VINFO_RELEVANT_P (stmt_info)
10924 && !STMT_VINFO_LIVE_P (stmt_info))
10925 continue;
10926
10927 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
10928 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
10929 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
10930 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
10931 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
10932 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_first_order_recurrence)
10933 && ! PURE_SLP_STMT (stmt_info))
10934 maybe_set_vectorized_backedge_value (loop_vinfo, stmt_info);
10935 }
10936
10937 for (gimple_stmt_iterator si = gsi_start_bb (bb);
10938 !gsi_end_p (si);)
10939 {
10940 stmt = gsi_stmt (si);
10941 /* During vectorization remove existing clobber stmts. */
10942 if (gimple_clobber_p (stmt))
10943 {
10944 unlink_stmt_vdef (stmt);
10945 gsi_remove (&si, true);
10946 release_defs (stmt);
10947 }
10948 else
10949 {
10950 /* Ignore vector stmts created in the outer loop. */
10951 stmt_info = loop_vinfo->lookup_stmt (stmt);
10952
10953 /* vector stmts created in the outer-loop during vectorization of
10954 stmts in an inner-loop may not have a stmt_info, and do not
10955 need to be vectorized. */
10956 stmt_vec_info seen_store = NULL;
10957 if (stmt_info)
10958 {
10959 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
10960 {
10961 gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
10962 for (gimple_stmt_iterator subsi = gsi_start (def_seq);
10963 !gsi_end_p (subsi); gsi_next (&subsi))
10964 {
10965 stmt_vec_info pat_stmt_info
10966 = loop_vinfo->lookup_stmt (gsi_stmt (subsi));
10967 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
10968 &si, &seen_store);
10969 }
10970 stmt_vec_info pat_stmt_info
10971 = STMT_VINFO_RELATED_STMT (stmt_info);
10972 if (vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
10973 &si, &seen_store))
10974 maybe_set_vectorized_backedge_value (loop_vinfo,
10975 pat_stmt_info);
10976 }
10977 else
10978 {
10979 if (vect_transform_loop_stmt (loop_vinfo, stmt_info, &si,
10980 &seen_store))
10981 maybe_set_vectorized_backedge_value (loop_vinfo,
10982 stmt_info);
10983 }
10984 }
10985 gsi_next (&si);
10986 if (seen_store)
10987 {
10988 if (STMT_VINFO_GROUPED_ACCESS (seen_store))
10989 		/* Interleaving.  The vectorization of the
10990 		   interleaving chain was completed; free all
10991 		   the stores in the chain.  */
10992 vect_remove_stores (loop_vinfo,
10993 DR_GROUP_FIRST_ELEMENT (seen_store));
10994 else
10995 /* Free the attached stmt_vec_info and remove the stmt. */
10996 loop_vinfo->remove_stmt (stmt_info);
10997 }
10998 }
10999 }
11000
11001 /* Stub out scalar statements that must not survive vectorization.
11002 Doing this here helps with grouped statements, or statements that
11003 are involved in patterns. */
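      /* For instance (illustrative GIMPLE): a scalar call such as

	   _1 = .MASK_LOAD (ptr_2, 32B, mask_3);

	 whose value was only needed by vectorized code is replaced by the
	 dummy assignment "_1 = 0;", and a scalar conditional internal
	 function call such as .COND_ADD is replaced by an assignment of
	 its "else" argument.  */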
11004 for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
11005 !gsi_end_p (gsi); gsi_next (&gsi))
11006 {
11007 gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
11008 if (!call || !gimple_call_internal_p (call))
11009 continue;
11010 internal_fn ifn = gimple_call_internal_fn (call);
11011 if (ifn == IFN_MASK_LOAD)
11012 {
11013 tree lhs = gimple_get_lhs (call);
11014 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
11015 {
11016 tree zero = build_zero_cst (TREE_TYPE (lhs));
11017 gimple *new_stmt = gimple_build_assign (lhs, zero);
11018 gsi_replace (&gsi, new_stmt, true);
11019 }
11020 }
11021 else if (conditional_internal_fn_code (ifn) != ERROR_MARK)
11022 {
11023 tree lhs = gimple_get_lhs (call);
11024 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
11025 {
11026 tree else_arg
11027 = gimple_call_arg (call, gimple_call_num_args (call) - 1);
11028 gimple *new_stmt = gimple_build_assign (lhs, else_arg);
11029 gsi_replace (&gsi, new_stmt, true);
11030 }
11031 }
11032 }
11033 } /* BBs in loop */
11034
11035   /* The vectorization factor is always > 1, so if we use an IV increment of 1,
11036      a zero NITERS becomes a nonzero NITERS_VECTOR.  */
11037 if (integer_onep (step_vector))
11038 niters_no_overflow = true;
11039 vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector,
11040 niters_vector_mult_vf, !niters_no_overflow);
11041
11042 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
11043 scale_profile_for_vect_loop (loop, assumed_vf);
11044
11045 /* True if the final iteration might not handle a full vector's
11046 worth of scalar iterations. */
11047 bool final_iter_may_be_partial
11048 = LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo);
11049 /* The minimum number of iterations performed by the epilogue. This
11050 is 1 when peeling for gaps because we always need a final scalar
11051 iteration. */
11052 int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
11053 /* +1 to convert latch counts to loop iteration counts,
11054 -min_epilogue_iters to remove iterations that cannot be performed
11055 by the vector code. */
11056 int bias_for_lowest = 1 - min_epilogue_iters;
11057 int bias_for_assumed = bias_for_lowest;
11058 int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
11059 if (alignment_npeels && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
11060 {
11061 /* When the amount of peeling is known at compile time, the first
11062 iteration will have exactly alignment_npeels active elements.
11063 In the worst case it will have at least one. */
11064 int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
11065 bias_for_lowest += lowest_vf - min_first_active;
11066 bias_for_assumed += assumed_vf - min_first_active;
11067 }
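  /* Rough worked example with hypothetical values: with lowest_vf == 4,
     no peeling for gaps or alignment and no partial vectors,
     bias_for_lowest == 1, so a scalar latch-count bound of 11 (12
     iterations) becomes floor ((11 + 1) / 4) - 1 == 2 below, i.e. at most
     3 iterations of the vector loop.  */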
11068 /* In these calculations the "- 1" converts loop iteration counts
11069 back to latch counts. */
11070 if (loop->any_upper_bound)
11071 {
11072 loop_vec_info main_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
11073 loop->nb_iterations_upper_bound
11074 = (final_iter_may_be_partial
11075 ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
11076 lowest_vf) - 1
11077 : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
11078 lowest_vf) - 1);
11079 if (main_vinfo
11080 /* Both peeling for alignment and peeling for gaps can end up
11081 with the scalar epilogue running for more than VF-1 iterations. */
11082 && !main_vinfo->peeling_for_alignment
11083 && !main_vinfo->peeling_for_gaps)
11084 {
11085 unsigned int bound;
11086 poly_uint64 main_iters
11087 = upper_bound (LOOP_VINFO_VECT_FACTOR (main_vinfo),
11088 LOOP_VINFO_COST_MODEL_THRESHOLD (main_vinfo));
11089 main_iters
11090 = upper_bound (main_iters,
11091 LOOP_VINFO_VERSIONING_THRESHOLD (main_vinfo));
11092 if (can_div_away_from_zero_p (main_iters,
11093 LOOP_VINFO_VECT_FACTOR (loop_vinfo),
11094 &bound))
11095 loop->nb_iterations_upper_bound
11096 = wi::umin ((widest_int) (bound - 1),
11097 loop->nb_iterations_upper_bound);
11098 }
11099 }
11100 if (loop->any_likely_upper_bound)
11101 loop->nb_iterations_likely_upper_bound
11102 = (final_iter_may_be_partial
11103 ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
11104 + bias_for_lowest, lowest_vf) - 1
11105 : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
11106 + bias_for_lowest, lowest_vf) - 1);
11107 if (loop->any_estimate)
11108 loop->nb_iterations_estimate
11109 = (final_iter_may_be_partial
11110 ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
11111 assumed_vf) - 1
11112 : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
11113 assumed_vf) - 1);
11114
11115 if (dump_enabled_p ())
11116 {
11117 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
11118 {
11119 dump_printf_loc (MSG_NOTE, vect_location,
11120 "LOOP VECTORIZED\n");
11121 if (loop->inner)
11122 dump_printf_loc (MSG_NOTE, vect_location,
11123 "OUTER LOOP VECTORIZED\n");
11124 dump_printf (MSG_NOTE, "\n");
11125 }
11126 else
11127 dump_printf_loc (MSG_NOTE, vect_location,
11128 "LOOP EPILOGUE VECTORIZED (MODE=%s)\n",
11129 GET_MODE_NAME (loop_vinfo->vector_mode));
11130 }
11131
11132 /* Loops vectorized with a variable factor won't benefit from
11133 unrolling/peeling. */
11134 if (!vf.is_constant ())
11135 {
11136 loop->unroll = 1;
11137 if (dump_enabled_p ())
11138 dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to"
11139 " variable-length vectorization factor\n");
11140 }
11141 /* Free SLP instances here because otherwise stmt reference counting
11142 won't work. */
11143 slp_instance instance;
11144 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
11145 vect_free_slp_instance (instance);
11146 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
11147   /* Clear the safelen field since its value is invalid after vectorization:
11148      the vectorized loop can have loop-carried dependencies.  */
11149 loop->safelen = 0;
11150
11151 if (epilogue)
11152 {
11153 update_epilogue_loop_vinfo (epilogue, advance);
11154
11155 epilogue->simduid = loop->simduid;
11156 epilogue->force_vectorize = loop->force_vectorize;
11157 epilogue->dont_vectorize = false;
11158 }
11159
11160 return epilogue;
11161 }
11162
11163 /* The code below tries to perform a simple optimization - reverting
11164    if-conversion for masked stores, i.e. if the mask of a store is zero,
11165    do not perform the store and, if possible, skip the producers of the
11166    stored values as well.  For example,
11167 for (i=0; i<n; i++)
11168 if (c[i])
11169 {
11170 p1[i] += 1;
11171 p2[i] = p3[i] + 2;
11172 }
11173 this transformation will produce the following semi-hammock:
11174
11175 if (mask__ifc__42.18_165 != { 0, 0, 0, 0, 0, 0, 0, 0 })
11176 {
11177 vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
11178 vect__12.22_172 = vect__11.19_170 + vect_cst__171;
11179 MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
11180 vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
11181 vect__19.28_184 = vect__18.25_182 + vect_cst__183;
11182 MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
11183 }
11184 */
11185
11186 void
11187 optimize_mask_stores (class loop *loop)
11188 {
11189 basic_block *bbs = get_loop_body (loop);
11190 unsigned nbbs = loop->num_nodes;
11191 unsigned i;
11192 basic_block bb;
11193 class loop *bb_loop;
11194 gimple_stmt_iterator gsi;
11195 gimple *stmt;
11196 auto_vec<gimple *> worklist;
11197 auto_purge_vect_location sentinel;
11198
11199 vect_location = find_loop_location (loop);
11200 /* Pick up all masked stores in loop if any. */
11201 for (i = 0; i < nbbs; i++)
11202 {
11203 bb = bbs[i];
11204 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
11205 gsi_next (&gsi))
11206 {
11207 stmt = gsi_stmt (gsi);
11208 if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
11209 worklist.safe_push (stmt);
11210 }
11211 }
11212
11213 free (bbs);
11214 if (worklist.is_empty ())
11215 return;
11216
11217 /* Loop has masked stores. */
11218 while (!worklist.is_empty ())
11219 {
11220 gimple *last, *last_store;
11221 edge e, efalse;
11222 tree mask;
11223 basic_block store_bb, join_bb;
11224 gimple_stmt_iterator gsi_to;
11225 tree vdef, new_vdef;
11226 gphi *phi;
11227 tree vectype;
11228 tree zero;
11229
11230 last = worklist.pop ();
11231 mask = gimple_call_arg (last, 2);
11232 bb = gimple_bb (last);
11233 /* Create then_bb and an if-then structure in the CFG; then_bb belongs
11234 to the same loop as if_bb. That loop can differ from LOOP when a
11235 two-level loop nest is vectorized and the mask_store belongs to the
11236 inner loop. */
11237 e = split_block (bb, last);
11238 bb_loop = bb->loop_father;
11239 gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
11240 join_bb = e->dest;
11241 store_bb = create_empty_bb (bb);
11242 add_bb_to_loop (store_bb, bb_loop);
11243 e->flags = EDGE_TRUE_VALUE;
11244 efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
11245 /* Put STORE_BB on the unlikely path. */
11246 efalse->probability = profile_probability::unlikely ();
11247 store_bb->count = efalse->count ();
11248 make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
11249 if (dom_info_available_p (CDI_DOMINATORS))
11250 set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
11251 if (dump_enabled_p ())
11252 dump_printf_loc (MSG_NOTE, vect_location,
11253 "Create new block %d to sink mask stores.",
11254 store_bb->index);
11255 /* Create vector comparison with boolean result. */
11256 vectype = TREE_TYPE (mask);
11257 zero = build_zero_cst (vectype);
11258 stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
11259 gsi = gsi_last_bb (bb);
11260 gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
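	  /* Sketch of the structure just built (MASK stands for the full
	     SSA mask name):

	       bb:
		 ...
		 if (MASK == { 0, ... })
		   goto join_bb;		<- EDGE_TRUE_VALUE
		 else
		   goto store_bb;		<- EDGE_FALSE_VALUE (unlikely)

	       store_bb:			<- masked stores are sunk below
		 goto join_bb;			<- EDGE_FALLTHRU

	       join_bb:
		 ...				*/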
11261 /* Create a new PHI node for the vdef of the last masked store:
11262 .MEM_2 = VDEF <.MEM_1>
11263 will be converted to
11264 .MEM_3 = VDEF <.MEM_1>
11265 and a new PHI node will be created in the join bb
11266 .MEM_2 = PHI <.MEM_1, .MEM_3>
11267 */
11268 vdef = gimple_vdef (last);
11269 new_vdef = make_ssa_name (gimple_vop (cfun), last);
11270 gimple_set_vdef (last, new_vdef);
11271 phi = create_phi_node (vdef, join_bb);
11272 add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
11273
11274 /* Put all masked stores with the same mask to STORE_BB if possible. */
11275 while (true)
11276 {
11277 gimple_stmt_iterator gsi_from;
11278 gimple *stmt1 = NULL;
11279
11280 /* Move masked store to STORE_BB. */
11281 last_store = last;
11282 gsi = gsi_for_stmt (last);
11283 gsi_from = gsi;
11284 /* Shift GSI to the previous stmt for further traversal. */
11285 gsi_prev (&gsi);
11286 gsi_to = gsi_start_bb (store_bb);
11287 gsi_move_before (&gsi_from, &gsi_to);
11288 /* Set up GSI_TO at the start of the now non-empty STORE_BB. */
11289 gsi_to = gsi_start_bb (store_bb);
11290 if (dump_enabled_p ())
11291 dump_printf_loc (MSG_NOTE, vect_location,
11292 "Move stmt to created bb\n%G", last);
11293 /* Move all stored value producers if possible. */
11294 while (!gsi_end_p (gsi))
11295 {
11296 tree lhs;
11297 imm_use_iterator imm_iter;
11298 use_operand_p use_p;
11299 bool res;
11300
11301 /* Skip debug statements. */
11302 if (is_gimple_debug (gsi_stmt (gsi)))
11303 {
11304 gsi_prev (&gsi);
11305 continue;
11306 }
11307 stmt1 = gsi_stmt (gsi);
11308 /* Do not consider statements writing to memory or having
11309 a volatile operand. */
11310 if (gimple_vdef (stmt1)
11311 || gimple_has_volatile_ops (stmt1))
11312 break;
11313 gsi_from = gsi;
11314 gsi_prev (&gsi);
11315 lhs = gimple_get_lhs (stmt1);
11316 if (!lhs)
11317 break;
11318
11319 /* LHS of vectorized stmt must be SSA_NAME. */
11320 if (TREE_CODE (lhs) != SSA_NAME)
11321 break;
11322
11323 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
11324 {
11325 /* Remove dead scalar statement. */
11326 if (has_zero_uses (lhs))
11327 {
11328 gsi_remove (&gsi_from, true);
11329 continue;
11330 }
11331 }
11332
11333 /* Check that LHS does not have uses outside of STORE_BB. */
11334 res = true;
11335 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
11336 {
11337 gimple *use_stmt;
11338 use_stmt = USE_STMT (use_p);
11339 if (is_gimple_debug (use_stmt))
11340 continue;
11341 if (gimple_bb (use_stmt) != store_bb)
11342 {
11343 res = false;
11344 break;
11345 }
11346 }
11347 if (!res)
11348 break;
11349
11350 if (gimple_vuse (stmt1)
11351 && gimple_vuse (stmt1) != gimple_vuse (last_store))
11352 break;
11353
11354 /* Can move STMT1 to STORE_BB. */
11355 if (dump_enabled_p ())
11356 dump_printf_loc (MSG_NOTE, vect_location,
11357 "Move stmt to created bb\n%G", stmt1);
11358 gsi_move_before (&gsi_from, &gsi_to);
11359 /* Shift GSI_TO for further insertion. */
11360 gsi_prev (&gsi_to);
11361 }
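	  /* In the example from the function comment above, the vector adds
	     and the MASK_LOADs feeding them are sunk here as well: they have
	     vector SSA lhs's, no vdef, the loads share the store's vuse, and
	     their only non-debug uses are already in STORE_BB.  */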
11362 /* Sink any further masked stores that use the same mask into STORE_BB, provided the next one immediately precedes the statements already sunk. */
11363 if (worklist.is_empty ()
11364 || gimple_call_arg (worklist.last (), 2) != mask
11365 || worklist.last () != stmt1)
11366 break;
11367 last = worklist.pop ();
11368 }
11369 add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
11370 }
11371 }
11372
11373 /* Decide whether it is possible to use a zero-based induction variable
11374 when vectorizing LOOP_VINFO with partial vectors. If it is, return
11375 the value that the induction variable must be able to hold in order
11376 to ensure that the rgroups eventually have no active vector elements.
11377 Return -1 otherwise. */
11378
11379 widest_int
11380 vect_iv_limit_for_partial_vectors (loop_vec_info loop_vinfo)
11381 {
11382 tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
11383 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
11384 unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
11385
11386 /* Calculate the value that the induction variable must be able
11387 to hit in order to ensure that we end the loop with an all-false mask.
11388 This involves adding the maximum number of inactive trailing scalar
11389 iterations. */
11390 widest_int iv_limit = -1;
11391 if (max_loop_iterations (loop, &iv_limit))
11392 {
11393 if (niters_skip)
11394 {
11395 /* Add the maximum number of skipped iterations to the
11396 maximum iteration count. */
11397 if (TREE_CODE (niters_skip) == INTEGER_CST)
11398 iv_limit += wi::to_widest (niters_skip);
11399 else
11400 iv_limit += max_vf - 1;
11401 }
11402 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
11403 /* Make a conservatively-correct assumption. */
11404 iv_limit += max_vf - 1;
11405
11406 /* IV_LIMIT is the maximum number of latch iterations, which is also
11407 the maximum in-range IV value. Round this value down to the previous
11408 vector alignment boundary and then add an extra full iteration. */
11409 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
11410 iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
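      /* Worked example (illustrative numbers only): with a constant VF of 16,
	 known_alignment (vf) == 16 and max_vf == 16.  If the loop's maximum
	 latch count is 100 and there are no skipped iterations and no peeling
	 for alignment, the rounding gives (100 & -16) + 16 == 96 + 16 == 112,
	 so the IV must be able to reach 112 before all lanes are guaranteed
	 inactive.  */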
11411 }
11412 return iv_limit;
11413 }
11414
11415 /* For the given rgroup_controls RGC, check whether an induction variable
11416 would ever hit a value that produces a set of all-false masks or zero
11417 lengths before wrapping around. Return true if it's possible to wrap
11418 around before hitting the desired value, otherwise return false. */
11419
11420 bool
11421 vect_rgroup_iv_might_wrap_p (loop_vec_info loop_vinfo, rgroup_controls *rgc)
11422 {
11423 widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
11424
11425 if (iv_limit == -1)
11426 return true;
11427
11428 tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
11429 unsigned int compare_precision = TYPE_PRECISION (compare_type);
11430 unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
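  /* For example (illustrative numbers only): with iv_limit == 112, as in the
     sketch above, and an rgroup with max_nscalars_per_iter == 2 and
     factor == 1, the IV counts up to 112 * 2 == 224 scalar items, which
     needs 8 bits; if the chosen compare type is 16 bits wide the test below
     is false and the IV cannot wrap.  */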
11431
11432 if (wi::min_precision (iv_limit * nitems, UNSIGNED) > compare_precision)
11433 return true;
11434
11435 return false;
11436 }