2 Copyright (C) 2003-2023 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */
22 #define INCLUDE_ALGORITHM
25 #include "coretypes.h"
32 #include "tree-pass.h"
34 #include "optabs-tree.h"
35 #include "diagnostic-core.h"
36 #include "fold-const.h"
37 #include "stor-layout.h"
40 #include "gimple-iterator.h"
41 #include "gimplify-me.h"
42 #include "tree-ssa-loop-ivopts.h"
43 #include "tree-ssa-loop-manip.h"
44 #include "tree-ssa-loop-niter.h"
45 #include "tree-ssa-loop.h"
47 #include "tree-scalar-evolution.h"
48 #include "tree-vectorizer.h"
49 #include "gimple-fold.h"
52 #include "tree-if-conv.h"
53 #include "internal-fn.h"
54 #include "tree-vector-builder.h"
55 #include "vec-perm-indices.h"
57 #include "case-cfn-macros.h"
59 /* Loop Vectorization Pass.
61 This pass tries to vectorize loops.
63 For example, the vectorizer transforms the following simple loop:
65 short a[N]; short b[N]; short c[N]; int i;
71 as if it was manually vectorized by rewriting the source code into:
73 typedef int __attribute__((mode(V8HI))) v8hi;
74 short a[N]; short b[N]; short c[N]; int i;
75 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
78 for (i=0; i<N/8; i++){
85 The main entry to this pass is vectorize_loops(), in which
86 the vectorizer applies a set of analyses on a given set of loops,
87 followed by the actual vectorization transformation for the loops that
88 had successfully passed the analysis phase.
89 Throughout this pass we make a distinction between two types of
90 data: scalars (which are represented by SSA_NAMES), and memory references
91 ("data-refs"). These two types of data require different handling both
92 during analysis and transformation. The types of data-refs that the
93 vectorizer currently supports are ARRAY_REFS which base is an array DECL
94 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
95 accesses are required to have a simple (consecutive) access pattern.
99 The driver for the analysis phase is vect_analyze_loop().
100 It applies a set of analyses, some of which rely on the scalar evolution
101 analyzer (scev) developed by Sebastian Pop.
103 During the analysis phase the vectorizer records some information
104 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
105 loop, as well as general information about the loop as a whole, which is
106 recorded in a "loop_vec_info" struct attached to each loop.
108 Transformation phase:
109 =====================
110 The loop transformation phase scans all the stmts in the loop, and
111 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
112 the loop that needs to be vectorized. It inserts the vector code sequence
113 just before the scalar stmt S, and records a pointer to the vector code
114 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
115 attached to S). This pointer will be used for the vectorization of following
116 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
117 otherwise, we rely on dead code elimination for removing it.
119 For example, say stmt S1 was vectorized into stmt VS1:
122 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
125 To vectorize stmt S2, the vectorizer first finds the stmt that defines
126 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
127 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
128 resulting sequence would be:
131 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
133 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
135 Operands that are not SSA_NAMEs, are data-refs that appear in
136 load/store operations (like 'x[i]' in S1), and are handled differently.
140 Currently the only target specific information that is used is the
141 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
142 Targets that can support different sizes of vectors, for now will need
143 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
144 flexibility will be added in the future.
146 Since we only vectorize operations which vector form can be
147 expressed using existing tree codes, to verify that an operation is
148 supported, the vectorizer checks the relevant optab at the relevant
149 machine_mode (e.g, optab_handler (add_optab, V8HImode)). If
150 the value found is CODE_FOR_nothing, then there's no target support, and
151 we can't vectorize the stmt.
153 For additional information on this project see:
154 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
157 static void vect_estimate_min_profitable_iters (loop_vec_info
, int *, int *,
159 static stmt_vec_info
vect_is_simple_reduction (loop_vec_info
, stmt_vec_info
,
160 bool *, bool *, bool);
162 /* Subroutine of vect_determine_vf_for_stmt that handles only one
163 statement. VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
164 may already be set for general statements (not just data refs). */
167 vect_determine_vf_for_stmt_1 (vec_info
*vinfo
, stmt_vec_info stmt_info
,
168 bool vectype_maybe_set_p
,
171 gimple
*stmt
= stmt_info
->stmt
;
173 if ((!STMT_VINFO_RELEVANT_P (stmt_info
)
174 && !STMT_VINFO_LIVE_P (stmt_info
))
175 || gimple_clobber_p (stmt
))
177 if (dump_enabled_p ())
178 dump_printf_loc (MSG_NOTE
, vect_location
, "skip.\n");
179 return opt_result::success ();
182 tree stmt_vectype
, nunits_vectype
;
183 opt_result res
= vect_get_vector_types_for_stmt (vinfo
, stmt_info
,
191 if (STMT_VINFO_VECTYPE (stmt_info
))
192 /* The only case when a vectype had been already set is for stmts
193 that contain a data ref, or for "pattern-stmts" (stmts generated
194 by the vectorizer to represent/replace a certain idiom). */
195 gcc_assert ((STMT_VINFO_DATA_REF (stmt_info
)
196 || vectype_maybe_set_p
)
197 && STMT_VINFO_VECTYPE (stmt_info
) == stmt_vectype
);
199 STMT_VINFO_VECTYPE (stmt_info
) = stmt_vectype
;
203 vect_update_max_nunits (vf
, nunits_vectype
);
205 return opt_result::success ();
208 /* Subroutine of vect_determine_vectorization_factor. Set the vector
209 types of STMT_INFO and all attached pattern statements and update
210 the vectorization factor VF accordingly. Return true on success
211 or false if something prevented vectorization. */
214 vect_determine_vf_for_stmt (vec_info
*vinfo
,
215 stmt_vec_info stmt_info
, poly_uint64
*vf
)
217 if (dump_enabled_p ())
218 dump_printf_loc (MSG_NOTE
, vect_location
, "==> examining statement: %G",
220 opt_result res
= vect_determine_vf_for_stmt_1 (vinfo
, stmt_info
, false, vf
);
224 if (STMT_VINFO_IN_PATTERN_P (stmt_info
)
225 && STMT_VINFO_RELATED_STMT (stmt_info
))
227 gimple
*pattern_def_seq
= STMT_VINFO_PATTERN_DEF_SEQ (stmt_info
);
228 stmt_info
= STMT_VINFO_RELATED_STMT (stmt_info
);
230 /* If a pattern statement has def stmts, analyze them too. */
231 for (gimple_stmt_iterator si
= gsi_start (pattern_def_seq
);
232 !gsi_end_p (si
); gsi_next (&si
))
234 stmt_vec_info def_stmt_info
= vinfo
->lookup_stmt (gsi_stmt (si
));
235 if (dump_enabled_p ())
236 dump_printf_loc (MSG_NOTE
, vect_location
,
237 "==> examining pattern def stmt: %G",
238 def_stmt_info
->stmt
);
239 res
= vect_determine_vf_for_stmt_1 (vinfo
, def_stmt_info
, true, vf
);
244 if (dump_enabled_p ())
245 dump_printf_loc (MSG_NOTE
, vect_location
,
246 "==> examining pattern statement: %G",
248 res
= vect_determine_vf_for_stmt_1 (vinfo
, stmt_info
, true, vf
);
253 return opt_result::success ();
256 /* Function vect_determine_vectorization_factor
258 Determine the vectorization factor (VF). VF is the number of data elements
259 that are operated upon in parallel in a single iteration of the vectorized
260 loop. For example, when vectorizing a loop that operates on 4byte elements,
261 on a target with vector size (VS) 16byte, the VF is set to 4, since 4
262 elements can fit in a single vector register.
264 We currently support vectorization of loops in which all types operated upon
265 are of the same size. Therefore this function currently sets VF according to
266 the size of the types operated upon, and fails if there are multiple sizes
269 VF is also the factor by which the loop iterations are strip-mined, e.g.:
276 for (i=0; i<N; i+=VF){
277 a[i:VF] = b[i:VF] + c[i:VF];
282 vect_determine_vectorization_factor (loop_vec_info loop_vinfo
)
284 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
285 basic_block
*bbs
= LOOP_VINFO_BBS (loop_vinfo
);
286 unsigned nbbs
= loop
->num_nodes
;
287 poly_uint64 vectorization_factor
= 1;
288 tree scalar_type
= NULL_TREE
;
291 stmt_vec_info stmt_info
;
294 DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");
296 for (i
= 0; i
< nbbs
; i
++)
298 basic_block bb
= bbs
[i
];
300 for (gphi_iterator si
= gsi_start_phis (bb
); !gsi_end_p (si
);
304 stmt_info
= loop_vinfo
->lookup_stmt (phi
);
305 if (dump_enabled_p ())
306 dump_printf_loc (MSG_NOTE
, vect_location
, "==> examining phi: %G",
309 gcc_assert (stmt_info
);
311 if (STMT_VINFO_RELEVANT_P (stmt_info
)
312 || STMT_VINFO_LIVE_P (stmt_info
))
314 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info
));
315 scalar_type
= TREE_TYPE (PHI_RESULT (phi
));
317 if (dump_enabled_p ())
318 dump_printf_loc (MSG_NOTE
, vect_location
,
319 "get vectype for scalar type: %T\n",
322 vectype
= get_vectype_for_scalar_type (loop_vinfo
, scalar_type
);
324 return opt_result::failure_at (phi
,
325 "not vectorized: unsupported "
328 STMT_VINFO_VECTYPE (stmt_info
) = vectype
;
330 if (dump_enabled_p ())
331 dump_printf_loc (MSG_NOTE
, vect_location
, "vectype: %T\n",
334 if (dump_enabled_p ())
336 dump_printf_loc (MSG_NOTE
, vect_location
, "nunits = ");
337 dump_dec (MSG_NOTE
, TYPE_VECTOR_SUBPARTS (vectype
));
338 dump_printf (MSG_NOTE
, "\n");
341 vect_update_max_nunits (&vectorization_factor
, vectype
);
345 for (gimple_stmt_iterator si
= gsi_start_bb (bb
); !gsi_end_p (si
);
348 if (is_gimple_debug (gsi_stmt (si
)))
350 stmt_info
= loop_vinfo
->lookup_stmt (gsi_stmt (si
));
352 = vect_determine_vf_for_stmt (loop_vinfo
,
353 stmt_info
, &vectorization_factor
);
359 /* TODO: Analyze cost. Decide if worth while to vectorize. */
360 if (dump_enabled_p ())
362 dump_printf_loc (MSG_NOTE
, vect_location
, "vectorization factor = ");
363 dump_dec (MSG_NOTE
, vectorization_factor
);
364 dump_printf (MSG_NOTE
, "\n");
367 if (known_le (vectorization_factor
, 1U))
368 return opt_result::failure_at (vect_location
,
369 "not vectorized: unsupported data-type\n");
370 LOOP_VINFO_VECT_FACTOR (loop_vinfo
) = vectorization_factor
;
371 return opt_result::success ();
375 /* Function vect_is_simple_iv_evolution.
377 FORNOW: A simple evolution of an induction variables in the loop is
378 considered a polynomial evolution. */
381 vect_is_simple_iv_evolution (unsigned loop_nb
, tree access_fn
, tree
* init
,
386 tree evolution_part
= evolution_part_in_loop_num (access_fn
, loop_nb
);
389 /* When there is no evolution in this loop, the evolution function
391 if (evolution_part
== NULL_TREE
)
394 /* When the evolution is a polynomial of degree >= 2
395 the evolution function is not "simple". */
396 if (tree_is_chrec (evolution_part
))
399 step_expr
= evolution_part
;
400 init_expr
= unshare_expr (initial_condition_in_loop_num (access_fn
, loop_nb
));
402 if (dump_enabled_p ())
403 dump_printf_loc (MSG_NOTE
, vect_location
, "step: %T, init: %T\n",
404 step_expr
, init_expr
);
409 if (TREE_CODE (step_expr
) != INTEGER_CST
410 && (TREE_CODE (step_expr
) != SSA_NAME
411 || ((bb
= gimple_bb (SSA_NAME_DEF_STMT (step_expr
)))
412 && flow_bb_inside_loop_p (get_loop (cfun
, loop_nb
), bb
))
413 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr
))
414 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr
))
415 || !flag_associative_math
)))
416 && (TREE_CODE (step_expr
) != REAL_CST
417 || !flag_associative_math
))
419 if (dump_enabled_p ())
420 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
428 /* Function vect_is_nonlinear_iv_evolution
430 Only support nonlinear induction for integer type
433 3. lshift/rshift by constant.
435 For neg induction, return a fake step as integer -1. */
437 vect_is_nonlinear_iv_evolution (class loop
* loop
, stmt_vec_info stmt_info
,
438 gphi
* loop_phi_node
, tree
*init
, tree
*step
)
440 tree init_expr
, ev_expr
, result
, op1
, op2
;
443 if (gimple_phi_num_args (loop_phi_node
) != 2)
446 init_expr
= PHI_ARG_DEF_FROM_EDGE (loop_phi_node
, loop_preheader_edge (loop
));
447 ev_expr
= PHI_ARG_DEF_FROM_EDGE (loop_phi_node
, loop_latch_edge (loop
));
449 /* Support nonlinear induction only for integer type. */
450 if (!INTEGRAL_TYPE_P (TREE_TYPE (init_expr
)))
454 result
= PHI_RESULT (loop_phi_node
);
456 if (TREE_CODE (ev_expr
) != SSA_NAME
457 || ((def
= SSA_NAME_DEF_STMT (ev_expr
)), false)
458 || !is_gimple_assign (def
))
461 enum tree_code t_code
= gimple_assign_rhs_code (def
);
465 if (gimple_assign_rhs1 (def
) != result
)
467 *step
= build_int_cst (TREE_TYPE (init_expr
), -1);
468 STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info
) = vect_step_op_neg
;
474 op1
= gimple_assign_rhs1 (def
);
475 op2
= gimple_assign_rhs2 (def
);
476 if (TREE_CODE (op2
) != INTEGER_CST
480 if (t_code
== LSHIFT_EXPR
)
481 STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info
) = vect_step_op_shl
;
482 else if (t_code
== RSHIFT_EXPR
)
483 STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info
) = vect_step_op_shr
;
484 /* NEGATE_EXPR and MULT_EXPR are both vect_step_op_mul. */
486 STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info
) = vect_step_op_mul
;
493 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_info
) = *init
;
494 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info
) = *step
;
499 /* Return true if PHI, described by STMT_INFO, is the inner PHI in
500 what we are assuming is a double reduction. For example, given
501 a structure like this:
504 x_1 = PHI <x_4(outer2), ...>;
508 x_2 = PHI <x_1(outer1), ...>;
514 x_4 = PHI <x_3(inner)>;
517 outer loop analysis would treat x_1 as a double reduction phi and
518 this function would then return true for x_2. */
521 vect_inner_phi_in_double_reduction_p (loop_vec_info loop_vinfo
, gphi
*phi
)
525 FOR_EACH_PHI_ARG (use_p
, phi
, op_iter
, SSA_OP_USE
)
526 if (stmt_vec_info def_info
= loop_vinfo
->lookup_def (USE_FROM_PTR (use_p
)))
527 if (STMT_VINFO_DEF_TYPE (def_info
) == vect_double_reduction_def
)
532 /* Returns true if Phi is a first-order recurrence. A first-order
533 recurrence is a non-reduction recurrence relation in which the value of
534 the recurrence in the current loop iteration equals a value defined in
535 the previous iteration. */
538 vect_phi_first_order_recurrence_p (loop_vec_info loop_vinfo
, class loop
*loop
,
541 /* A nested cycle isn't vectorizable as first order recurrence. */
542 if (LOOP_VINFO_LOOP (loop_vinfo
) != loop
)
545 /* Ensure the loop latch definition is from within the loop. */
546 edge latch
= loop_latch_edge (loop
);
547 tree ldef
= PHI_ARG_DEF_FROM_EDGE (phi
, latch
);
548 if (TREE_CODE (ldef
) != SSA_NAME
549 || SSA_NAME_IS_DEFAULT_DEF (ldef
)
550 || is_a
<gphi
*> (SSA_NAME_DEF_STMT (ldef
))
551 || !flow_bb_inside_loop_p (loop
, gimple_bb (SSA_NAME_DEF_STMT (ldef
))))
554 tree def
= gimple_phi_result (phi
);
556 /* Ensure every use_stmt of the phi node is dominated by the latch
558 imm_use_iterator imm_iter
;
560 FOR_EACH_IMM_USE_FAST (use_p
, imm_iter
, def
)
561 if (!is_gimple_debug (USE_STMT (use_p
))
562 && (SSA_NAME_DEF_STMT (ldef
) == USE_STMT (use_p
)
563 || !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (ldef
),
567 /* First-order recurrence autovectorization needs shuffle vector. */
568 tree scalar_type
= TREE_TYPE (def
);
569 tree vectype
= get_vectype_for_scalar_type (loop_vinfo
, scalar_type
);
576 /* Function vect_analyze_scalar_cycles_1.
578 Examine the cross iteration def-use cycles of scalar variables
579 in LOOP. LOOP_VINFO represents the loop that is now being
580 considered for vectorization (can be LOOP, or an outer-loop
581 enclosing LOOP). SLP indicates there will be some subsequent
582 slp analyses or not. */
585 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo
, class loop
*loop
,
588 basic_block bb
= loop
->header
;
590 auto_vec
<stmt_vec_info
, 64> worklist
;
592 bool double_reduc
, reduc_chain
;
594 DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
596 /* First - identify all inductions. Reduction detection assumes that all the
597 inductions have been identified, therefore, this order must not be
599 for (gsi
= gsi_start_phis (bb
); !gsi_end_p (gsi
); gsi_next (&gsi
))
601 gphi
*phi
= gsi
.phi ();
602 tree access_fn
= NULL
;
603 tree def
= PHI_RESULT (phi
);
604 stmt_vec_info stmt_vinfo
= loop_vinfo
->lookup_stmt (phi
);
606 if (dump_enabled_p ())
607 dump_printf_loc (MSG_NOTE
, vect_location
, "Analyze phi: %G",
610 /* Skip virtual phi's. The data dependences that are associated with
611 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
612 if (virtual_operand_p (def
))
615 STMT_VINFO_DEF_TYPE (stmt_vinfo
) = vect_unknown_def_type
;
617 /* Analyze the evolution function. */
618 access_fn
= analyze_scalar_evolution (loop
, def
);
621 STRIP_NOPS (access_fn
);
622 if (dump_enabled_p ())
623 dump_printf_loc (MSG_NOTE
, vect_location
,
624 "Access function of PHI: %T\n", access_fn
);
625 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo
)
626 = initial_condition_in_loop_num (access_fn
, loop
->num
);
627 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo
)
628 = evolution_part_in_loop_num (access_fn
, loop
->num
);
632 || vect_inner_phi_in_double_reduction_p (loop_vinfo
, phi
)
633 || !vect_is_simple_iv_evolution (loop
->num
, access_fn
,
635 || (LOOP_VINFO_LOOP (loop_vinfo
) != loop
636 && TREE_CODE (step
) != INTEGER_CST
))
637 /* Only handle nonlinear iv for same loop. */
638 && (LOOP_VINFO_LOOP (loop_vinfo
) != loop
639 || !vect_is_nonlinear_iv_evolution (loop
, stmt_vinfo
,
642 worklist
.safe_push (stmt_vinfo
);
646 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo
)
648 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo
) != NULL_TREE
);
650 if (dump_enabled_p ())
651 dump_printf_loc (MSG_NOTE
, vect_location
, "Detected induction.\n");
652 STMT_VINFO_DEF_TYPE (stmt_vinfo
) = vect_induction_def
;
656 /* Second - identify all reductions and nested cycles. */
657 while (worklist
.length () > 0)
659 stmt_vec_info stmt_vinfo
= worklist
.pop ();
660 gphi
*phi
= as_a
<gphi
*> (stmt_vinfo
->stmt
);
661 tree def
= PHI_RESULT (phi
);
663 if (dump_enabled_p ())
664 dump_printf_loc (MSG_NOTE
, vect_location
, "Analyze phi: %G",
667 gcc_assert (!virtual_operand_p (def
)
668 && STMT_VINFO_DEF_TYPE (stmt_vinfo
) == vect_unknown_def_type
);
670 stmt_vec_info reduc_stmt_info
671 = vect_is_simple_reduction (loop_vinfo
, stmt_vinfo
, &double_reduc
,
675 STMT_VINFO_REDUC_DEF (stmt_vinfo
) = reduc_stmt_info
;
676 STMT_VINFO_REDUC_DEF (reduc_stmt_info
) = stmt_vinfo
;
679 if (dump_enabled_p ())
680 dump_printf_loc (MSG_NOTE
, vect_location
,
681 "Detected double reduction.\n");
683 STMT_VINFO_DEF_TYPE (stmt_vinfo
) = vect_double_reduction_def
;
684 STMT_VINFO_DEF_TYPE (reduc_stmt_info
) = vect_double_reduction_def
;
688 if (loop
!= LOOP_VINFO_LOOP (loop_vinfo
))
690 if (dump_enabled_p ())
691 dump_printf_loc (MSG_NOTE
, vect_location
,
692 "Detected vectorizable nested cycle.\n");
694 STMT_VINFO_DEF_TYPE (stmt_vinfo
) = vect_nested_cycle
;
698 if (dump_enabled_p ())
699 dump_printf_loc (MSG_NOTE
, vect_location
,
700 "Detected reduction.\n");
702 STMT_VINFO_DEF_TYPE (stmt_vinfo
) = vect_reduction_def
;
703 STMT_VINFO_DEF_TYPE (reduc_stmt_info
) = vect_reduction_def
;
704 /* Store the reduction cycles for possible vectorization in
705 loop-aware SLP if it was not detected as reduction
708 LOOP_VINFO_REDUCTIONS (loop_vinfo
).safe_push
713 else if (vect_phi_first_order_recurrence_p (loop_vinfo
, loop
, phi
))
714 STMT_VINFO_DEF_TYPE (stmt_vinfo
) = vect_first_order_recurrence
;
716 if (dump_enabled_p ())
717 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
718 "Unknown def-use cycle pattern.\n");
723 /* Function vect_analyze_scalar_cycles.
725 Examine the cross iteration def-use cycles of scalar variables, by
726 analyzing the loop-header PHIs of scalar variables. Classify each
727 cycle as one of the following: invariant, induction, reduction, unknown.
728 We do that for the loop represented by LOOP_VINFO, and also to its
729 inner-loop, if exists.
730 Examples for scalar cycles:
745 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo
, bool slp
)
747 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
749 vect_analyze_scalar_cycles_1 (loop_vinfo
, loop
, slp
);
751 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
752 Reductions in such inner-loop therefore have different properties than
753 the reductions in the nest that gets vectorized:
754 1. When vectorized, they are executed in the same order as in the original
755 scalar loop, so we can't change the order of computation when
757 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
758 current checks are too strict. */
761 vect_analyze_scalar_cycles_1 (loop_vinfo
, loop
->inner
, slp
);
764 /* Transfer group and reduction information from STMT_INFO to its
768 vect_fixup_reduc_chain (stmt_vec_info stmt_info
)
770 stmt_vec_info firstp
= STMT_VINFO_RELATED_STMT (stmt_info
);
772 gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp
)
773 && REDUC_GROUP_FIRST_ELEMENT (stmt_info
));
774 REDUC_GROUP_SIZE (firstp
) = REDUC_GROUP_SIZE (stmt_info
);
777 stmtp
= STMT_VINFO_RELATED_STMT (stmt_info
);
778 gcc_checking_assert (STMT_VINFO_DEF_TYPE (stmtp
)
779 == STMT_VINFO_DEF_TYPE (stmt_info
));
780 REDUC_GROUP_FIRST_ELEMENT (stmtp
) = firstp
;
781 stmt_info
= REDUC_GROUP_NEXT_ELEMENT (stmt_info
);
783 REDUC_GROUP_NEXT_ELEMENT (stmtp
)
784 = STMT_VINFO_RELATED_STMT (stmt_info
);
789 /* Fixup scalar cycles that now have their stmts detected as patterns. */
792 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo
)
797 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo
), i
, first
)
799 stmt_vec_info next
= REDUC_GROUP_NEXT_ELEMENT (first
);
802 if ((STMT_VINFO_IN_PATTERN_P (next
)
803 != STMT_VINFO_IN_PATTERN_P (first
))
804 || STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (next
)) == -1)
806 next
= REDUC_GROUP_NEXT_ELEMENT (next
);
808 /* If all reduction chain members are well-formed patterns adjust
809 the group to group the pattern stmts instead. */
811 && STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (first
)) != -1)
813 if (STMT_VINFO_IN_PATTERN_P (first
))
815 vect_fixup_reduc_chain (first
);
816 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo
)[i
]
817 = STMT_VINFO_RELATED_STMT (first
);
820 /* If not all stmt in the chain are patterns or if we failed
821 to update STMT_VINFO_REDUC_IDX dissolve the chain and handle
822 it as regular reduction instead. */
825 stmt_vec_info vinfo
= first
;
826 stmt_vec_info last
= NULL
;
829 next
= REDUC_GROUP_NEXT_ELEMENT (vinfo
);
830 REDUC_GROUP_FIRST_ELEMENT (vinfo
) = NULL
;
831 REDUC_GROUP_NEXT_ELEMENT (vinfo
) = NULL
;
835 STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize (first
))
837 loop_vinfo
->reductions
.safe_push (vect_stmt_to_vectorize (last
));
838 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo
).unordered_remove (i
);
844 /* Function vect_get_loop_niters.
846 Determine how many iterations the loop is executed and place it
847 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
848 in NUMBER_OF_ITERATIONSM1. Place the condition under which the
849 niter information holds in ASSUMPTIONS.
851 Return the loop exit condition. */
855 vect_get_loop_niters (class loop
*loop
, tree
*assumptions
,
856 tree
*number_of_iterations
, tree
*number_of_iterationsm1
)
858 edge exit
= single_exit (loop
);
859 class tree_niter_desc niter_desc
;
860 tree niter_assumptions
, niter
, may_be_zero
;
861 gcond
*cond
= get_loop_exit_condition (loop
);
863 *assumptions
= boolean_true_node
;
864 *number_of_iterationsm1
= chrec_dont_know
;
865 *number_of_iterations
= chrec_dont_know
;
866 DUMP_VECT_SCOPE ("get_loop_niters");
871 may_be_zero
= NULL_TREE
;
872 if (!number_of_iterations_exit_assumptions (loop
, exit
, &niter_desc
, NULL
)
873 || chrec_contains_undetermined (niter_desc
.niter
))
876 niter_assumptions
= niter_desc
.assumptions
;
877 may_be_zero
= niter_desc
.may_be_zero
;
878 niter
= niter_desc
.niter
;
880 if (may_be_zero
&& integer_zerop (may_be_zero
))
881 may_be_zero
= NULL_TREE
;
885 if (COMPARISON_CLASS_P (may_be_zero
))
887 /* Try to combine may_be_zero with assumptions, this can simplify
888 computation of niter expression. */
889 if (niter_assumptions
&& !integer_nonzerop (niter_assumptions
))
890 niter_assumptions
= fold_build2 (TRUTH_AND_EXPR
, boolean_type_node
,
892 fold_build1 (TRUTH_NOT_EXPR
,
896 niter
= fold_build3 (COND_EXPR
, TREE_TYPE (niter
), may_be_zero
,
897 build_int_cst (TREE_TYPE (niter
), 0),
898 rewrite_to_non_trapping_overflow (niter
));
900 may_be_zero
= NULL_TREE
;
902 else if (integer_nonzerop (may_be_zero
))
904 *number_of_iterationsm1
= build_int_cst (TREE_TYPE (niter
), 0);
905 *number_of_iterations
= build_int_cst (TREE_TYPE (niter
), 1);
912 *assumptions
= niter_assumptions
;
913 *number_of_iterationsm1
= niter
;
915 /* We want the number of loop header executions which is the number
916 of latch executions plus one.
917 ??? For UINT_MAX latch executions this number overflows to zero
918 for loops like do { n++; } while (n != 0); */
919 if (niter
&& !chrec_contains_undetermined (niter
))
920 niter
= fold_build2 (PLUS_EXPR
, TREE_TYPE (niter
), unshare_expr (niter
),
921 build_int_cst (TREE_TYPE (niter
), 1));
922 *number_of_iterations
= niter
;
927 /* Function bb_in_loop_p
929 Used as predicate for dfs order traversal of the loop bbs. */
932 bb_in_loop_p (const_basic_block bb
, const void *data
)
934 const class loop
*const loop
= (const class loop
*)data
;
935 if (flow_bb_inside_loop_p (loop
, bb
))
941 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
942 stmt_vec_info structs for all the stmts in LOOP_IN. */
944 _loop_vec_info::_loop_vec_info (class loop
*loop_in
, vec_info_shared
*shared
)
945 : vec_info (vec_info::loop
, shared
),
947 bbs (XCNEWVEC (basic_block
, loop
->num_nodes
)),
948 num_itersm1 (NULL_TREE
),
949 num_iters (NULL_TREE
),
950 num_iters_unchanged (NULL_TREE
),
951 num_iters_assumptions (NULL_TREE
),
952 vector_costs (nullptr),
953 scalar_costs (nullptr),
955 versioning_threshold (0),
956 vectorization_factor (0),
957 main_loop_edge (nullptr),
958 skip_main_loop_edge (nullptr),
959 skip_this_loop_edge (nullptr),
960 reusable_accumulators (),
961 suggested_unroll_factor (1),
962 max_vectorization_factor (0),
963 mask_skip_niters (NULL_TREE
),
964 rgroup_compare_type (NULL_TREE
),
965 simd_if_cond (NULL_TREE
),
967 peeling_for_alignment (0),
971 slp_unrolling_factor (1),
972 inner_loop_cost_factor (param_vect_inner_loop_cost_factor
),
973 vectorizable (false),
974 can_use_partial_vectors_p (param_vect_partial_vector_usage
!= 0),
975 using_partial_vectors_p (false),
976 epil_using_partial_vectors_p (false),
977 partial_load_store_bias (0),
978 peeling_for_gaps (false),
979 peeling_for_niter (false),
980 no_data_dependencies (false),
981 has_mask_store (false),
982 scalar_loop_scaling (profile_probability::uninitialized ()),
984 orig_loop_info (NULL
)
986 /* CHECKME: We want to visit all BBs before their successors (except for
987 latch blocks, for which this assertion wouldn't hold). In the simple
988 case of the loop forms we allow, a dfs order of the BBs would the same
989 as reversed postorder traversal, so we are safe. */
991 unsigned int nbbs
= dfs_enumerate_from (loop
->header
, 0, bb_in_loop_p
,
992 bbs
, loop
->num_nodes
, loop
);
993 gcc_assert (nbbs
== loop
->num_nodes
);
995 for (unsigned int i
= 0; i
< nbbs
; i
++)
997 basic_block bb
= bbs
[i
];
998 gimple_stmt_iterator si
;
1000 for (si
= gsi_start_phis (bb
); !gsi_end_p (si
); gsi_next (&si
))
1002 gimple
*phi
= gsi_stmt (si
);
1003 gimple_set_uid (phi
, 0);
1007 for (si
= gsi_start_bb (bb
); !gsi_end_p (si
); gsi_next (&si
))
1009 gimple
*stmt
= gsi_stmt (si
);
1010 gimple_set_uid (stmt
, 0);
1011 if (is_gimple_debug (stmt
))
1014 /* If .GOMP_SIMD_LANE call for the current loop has 3 arguments, the
1015 third argument is the #pragma omp simd if (x) condition, when 0,
1016 loop shouldn't be vectorized, when non-zero constant, it should
1017 be vectorized normally, otherwise versioned with vectorized loop
1018 done if the condition is non-zero at runtime. */
1019 if (loop_in
->simduid
1020 && is_gimple_call (stmt
)
1021 && gimple_call_internal_p (stmt
)
1022 && gimple_call_internal_fn (stmt
) == IFN_GOMP_SIMD_LANE
1023 && gimple_call_num_args (stmt
) >= 3
1024 && TREE_CODE (gimple_call_arg (stmt
, 0)) == SSA_NAME
1025 && (loop_in
->simduid
1026 == SSA_NAME_VAR (gimple_call_arg (stmt
, 0))))
1028 tree arg
= gimple_call_arg (stmt
, 2);
1029 if (integer_zerop (arg
) || TREE_CODE (arg
) == SSA_NAME
)
1032 gcc_assert (integer_nonzerop (arg
));
1037 epilogue_vinfos
.create (6);
1040 /* Free all levels of rgroup CONTROLS. */
1043 release_vec_loop_controls (vec
<rgroup_controls
> *controls
)
1045 rgroup_controls
*rgc
;
1047 FOR_EACH_VEC_ELT (*controls
, i
, rgc
)
1048 rgc
->controls
.release ();
1049 controls
->release ();
1052 /* Free all memory used by the _loop_vec_info, as well as all the
1053 stmt_vec_info structs of all the stmts in the loop. */
1055 _loop_vec_info::~_loop_vec_info ()
1059 release_vec_loop_controls (&masks
);
1060 release_vec_loop_controls (&lens
);
1063 epilogue_vinfos
.release ();
1064 delete scalar_costs
;
1065 delete vector_costs
;
1067 /* When we release an epiloge vinfo that we do not intend to use
1068 avoid clearing AUX of the main loop which should continue to
1069 point to the main loop vinfo since otherwise we'll leak that. */
1070 if (loop
->aux
== this)
1074 /* Return an invariant or register for EXPR and emit necessary
1075 computations in the LOOP_VINFO loop preheader. */
1078 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo
, tree expr
)
1080 if (is_gimple_reg (expr
)
1081 || is_gimple_min_invariant (expr
))
1084 if (! loop_vinfo
->ivexpr_map
)
1085 loop_vinfo
->ivexpr_map
= new hash_map
<tree_operand_hash
, tree
>;
1086 tree
&cached
= loop_vinfo
->ivexpr_map
->get_or_insert (expr
);
1089 gimple_seq stmts
= NULL
;
1090 cached
= force_gimple_operand (unshare_expr (expr
),
1091 &stmts
, true, NULL_TREE
);
1094 edge e
= loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo
));
1095 gsi_insert_seq_on_edge_immediate (e
, stmts
);
1101 /* Return true if we can use CMP_TYPE as the comparison type to produce
1102 all masks required to mask LOOP_VINFO. */
1105 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo
, tree cmp_type
)
1107 rgroup_controls
*rgm
;
1109 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo
), i
, rgm
)
1110 if (rgm
->type
!= NULL_TREE
1111 && !direct_internal_fn_supported_p (IFN_WHILE_ULT
,
1112 cmp_type
, rgm
->type
,
1113 OPTIMIZE_FOR_SPEED
))
1118 /* Calculate the maximum number of scalars per iteration for every
1119 rgroup in LOOP_VINFO. */
1122 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo
)
1124 unsigned int res
= 1;
1126 rgroup_controls
*rgm
;
1127 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo
), i
, rgm
)
1128 res
= MAX (res
, rgm
->max_nscalars_per_iter
);
1132 /* Calculate the minimum precision necessary to represent:
1136 as an unsigned integer, where MAX_NITERS is the maximum number of
1137 loop header iterations for the original scalar form of LOOP_VINFO. */
1140 vect_min_prec_for_max_niters (loop_vec_info loop_vinfo
, unsigned int factor
)
1142 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
1144 /* Get the maximum number of iterations that is representable
1145 in the counter type. */
1146 tree ni_type
= TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo
));
1147 widest_int max_ni
= wi::to_widest (TYPE_MAX_VALUE (ni_type
)) + 1;
1149 /* Get a more refined estimate for the number of iterations. */
1150 widest_int max_back_edges
;
1151 if (max_loop_iterations (loop
, &max_back_edges
))
1152 max_ni
= wi::smin (max_ni
, max_back_edges
+ 1);
1154 /* Work out how many bits we need to represent the limit. */
1155 return wi::min_precision (max_ni
* factor
, UNSIGNED
);
1158 /* True if the loop needs peeling or partial vectors when vectorized. */
1161 vect_need_peeling_or_partial_vectors_p (loop_vec_info loop_vinfo
)
1163 unsigned HOST_WIDE_INT const_vf
;
1164 HOST_WIDE_INT max_niter
1165 = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo
));
1167 unsigned th
= LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo
);
1168 if (!th
&& LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo
))
1169 th
= LOOP_VINFO_COST_MODEL_THRESHOLD (LOOP_VINFO_ORIG_LOOP_INFO
1172 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
)
1173 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
) >= 0)
1175 /* Work out the (constant) number of iterations that need to be
1176 peeled for reasons other than niters. */
1177 unsigned int peel_niter
= LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
);
1178 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo
))
1180 if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo
) - peel_niter
,
1181 LOOP_VINFO_VECT_FACTOR (loop_vinfo
)))
1184 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
)
1185 /* ??? When peeling for gaps but not alignment, we could
1186 try to check whether the (variable) niters is known to be
1187 VF * N + 1. That's something of a niche case though. */
1188 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo
)
1189 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo
).is_constant (&const_vf
)
1190 || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo
))
1191 < (unsigned) exact_log2 (const_vf
))
1192 /* In case of versioning, check if the maximum number of
1193 iterations is greater than th. If they are identical,
1194 the epilogue is unnecessary. */
1195 && (!LOOP_REQUIRES_VERSIONING (loop_vinfo
)
1196 || ((unsigned HOST_WIDE_INT
) max_niter
1197 > (th
/ const_vf
) * const_vf
))))
1203 /* Each statement in LOOP_VINFO can be masked where necessary. Check
1204 whether we can actually generate the masks required. Return true if so,
1205 storing the type of the scalar IV in LOOP_VINFO_RGROUP_COMPARE_TYPE. */
1208 vect_verify_full_masking (loop_vec_info loop_vinfo
)
1210 unsigned int min_ni_width
;
1211 unsigned int max_nscalars_per_iter
1212 = vect_get_max_nscalars_per_iter (loop_vinfo
);
1214 /* Use a normal loop if there are no statements that need masking.
1215 This only happens in rare degenerate cases: it means that the loop
1216 has no loads, no stores, and no live-out values. */
1217 if (LOOP_VINFO_MASKS (loop_vinfo
).is_empty ())
1220 /* Work out how many bits we need to represent the limit. */
1222 = vect_min_prec_for_max_niters (loop_vinfo
, max_nscalars_per_iter
);
1224 /* Find a scalar mode for which WHILE_ULT is supported. */
1225 opt_scalar_int_mode cmp_mode_iter
;
1226 tree cmp_type
= NULL_TREE
;
1227 tree iv_type
= NULL_TREE
;
1228 widest_int iv_limit
= vect_iv_limit_for_partial_vectors (loop_vinfo
);
1229 unsigned int iv_precision
= UINT_MAX
;
1232 iv_precision
= wi::min_precision (iv_limit
* max_nscalars_per_iter
,
1235 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter
, MODE_INT
)
1237 unsigned int cmp_bits
= GET_MODE_BITSIZE (cmp_mode_iter
.require ());
1238 if (cmp_bits
>= min_ni_width
1239 && targetm
.scalar_mode_supported_p (cmp_mode_iter
.require ()))
1241 tree this_type
= build_nonstandard_integer_type (cmp_bits
, true);
1243 && can_produce_all_loop_masks_p (loop_vinfo
, this_type
))
1245 /* Although we could stop as soon as we find a valid mode,
1246 there are at least two reasons why that's not always the
1249 - An IV that's Pmode or wider is more likely to be reusable
1250 in address calculations than an IV that's narrower than
1253 - Doing the comparison in IV_PRECISION or wider allows
1254 a natural 0-based IV, whereas using a narrower comparison
1255 type requires mitigations against wrap-around.
1257 Conversely, if the IV limit is variable, doing the comparison
1258 in a wider type than the original type can introduce
1259 unnecessary extensions, so picking the widest valid mode
1260 is not always a good choice either.
1262 Here we prefer the first IV type that's Pmode or wider,
1263 and the first comparison type that's IV_PRECISION or wider.
1264 (The comparison type must be no wider than the IV type,
1265 to avoid extensions in the vector loop.)
1267 ??? We might want to try continuing beyond Pmode for ILP32
1268 targets if CMP_BITS < IV_PRECISION. */
1269 iv_type
= this_type
;
1270 if (!cmp_type
|| iv_precision
> TYPE_PRECISION (cmp_type
))
1271 cmp_type
= this_type
;
1272 if (cmp_bits
>= GET_MODE_BITSIZE (Pmode
))
1281 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo
) = cmp_type
;
1282 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo
) = iv_type
;
1286 /* Check whether we can use vector access with length based on precison
1287 comparison. So far, to keep it simple, we only allow the case that the
1288 precision of the target supported length is larger than the precision
1289 required by loop niters. */
1292 vect_verify_loop_lens (loop_vec_info loop_vinfo
)
1294 if (LOOP_VINFO_LENS (loop_vinfo
).is_empty ())
1297 machine_mode len_load_mode
= get_len_load_store_mode
1298 (loop_vinfo
->vector_mode
, true).require ();
1299 machine_mode len_store_mode
= get_len_load_store_mode
1300 (loop_vinfo
->vector_mode
, false).require ();
1302 signed char partial_load_bias
= internal_len_load_store_bias
1303 (IFN_LEN_LOAD
, len_load_mode
);
1305 signed char partial_store_bias
= internal_len_load_store_bias
1306 (IFN_LEN_STORE
, len_store_mode
);
1308 gcc_assert (partial_load_bias
== partial_store_bias
);
1310 if (partial_load_bias
== VECT_PARTIAL_BIAS_UNSUPPORTED
)
1313 /* If the backend requires a bias of -1 for LEN_LOAD, we must not emit
1314 len_loads with a length of zero. In order to avoid that we prohibit
1315 more than one loop length here. */
1316 if (partial_load_bias
== -1
1317 && LOOP_VINFO_LENS (loop_vinfo
).length () > 1)
1320 LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo
) = partial_load_bias
;
1322 unsigned int max_nitems_per_iter
= 1;
1324 rgroup_controls
*rgl
;
1325 /* Find the maximum number of items per iteration for every rgroup. */
1326 FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo
), i
, rgl
)
1328 unsigned nitems_per_iter
= rgl
->max_nscalars_per_iter
* rgl
->factor
;
1329 max_nitems_per_iter
= MAX (max_nitems_per_iter
, nitems_per_iter
);
1332 /* Work out how many bits we need to represent the length limit. */
1333 unsigned int min_ni_prec
1334 = vect_min_prec_for_max_niters (loop_vinfo
, max_nitems_per_iter
);
1336 /* Now use the maximum of below precisions for one suitable IV type:
1337 - the IV's natural precision
1338 - the precision needed to hold: the maximum number of scalar
1339 iterations multiplied by the scale factor (min_ni_prec above)
1340 - the Pmode precision
1342 If min_ni_prec is less than the precision of the current niters,
1343 we perfer to still use the niters type. Prefer to use Pmode and
1344 wider IV to avoid narrow conversions. */
1346 unsigned int ni_prec
1347 = TYPE_PRECISION (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo
)));
1348 min_ni_prec
= MAX (min_ni_prec
, ni_prec
);
1349 min_ni_prec
= MAX (min_ni_prec
, GET_MODE_BITSIZE (Pmode
));
1351 tree iv_type
= NULL_TREE
;
1352 opt_scalar_int_mode tmode_iter
;
1353 FOR_EACH_MODE_IN_CLASS (tmode_iter
, MODE_INT
)
1355 scalar_mode tmode
= tmode_iter
.require ();
1356 unsigned int tbits
= GET_MODE_BITSIZE (tmode
);
1358 /* ??? Do we really want to construct one IV whose precision exceeds
1360 if (tbits
> BITS_PER_WORD
)
1363 /* Find the first available standard integral type. */
1364 if (tbits
>= min_ni_prec
&& targetm
.scalar_mode_supported_p (tmode
))
1366 iv_type
= build_nonstandard_integer_type (tbits
, true);
1373 if (dump_enabled_p ())
1374 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1375 "can't vectorize with length-based partial vectors"
1376 " because there is no suitable iv type.\n");
1380 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo
) = iv_type
;
1381 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo
) = iv_type
;
1386 /* Calculate the cost of one scalar iteration of the loop. */
1388 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo
)
1390 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
1391 basic_block
*bbs
= LOOP_VINFO_BBS (loop_vinfo
);
1392 int nbbs
= loop
->num_nodes
, factor
;
1393 int innerloop_iters
, i
;
1395 DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");
1397 /* Gather costs for statements in the scalar loop. */
1400 innerloop_iters
= 1;
1402 innerloop_iters
= LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo
);
1404 for (i
= 0; i
< nbbs
; i
++)
1406 gimple_stmt_iterator si
;
1407 basic_block bb
= bbs
[i
];
1409 if (bb
->loop_father
== loop
->inner
)
1410 factor
= innerloop_iters
;
1414 for (si
= gsi_start_bb (bb
); !gsi_end_p (si
); gsi_next (&si
))
1416 gimple
*stmt
= gsi_stmt (si
);
1417 stmt_vec_info stmt_info
= loop_vinfo
->lookup_stmt (stmt
);
1419 if (!is_gimple_assign (stmt
) && !is_gimple_call (stmt
))
1422 /* Skip stmts that are not vectorized inside the loop. */
1423 stmt_vec_info vstmt_info
= vect_stmt_to_vectorize (stmt_info
);
1424 if (!STMT_VINFO_RELEVANT_P (vstmt_info
)
1425 && (!STMT_VINFO_LIVE_P (vstmt_info
)
1426 || !VECTORIZABLE_CYCLE_DEF
1427 (STMT_VINFO_DEF_TYPE (vstmt_info
))))
1430 vect_cost_for_stmt kind
;
1431 if (STMT_VINFO_DATA_REF (stmt_info
))
1433 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info
)))
1436 kind
= scalar_store
;
1438 else if (vect_nop_conversion_p (stmt_info
))
1443 /* We are using vect_prologue here to avoid scaling twice
1444 by the inner loop factor. */
1445 record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo
),
1446 factor
, kind
, stmt_info
, 0, vect_prologue
);
1450 /* Now accumulate cost. */
1451 loop_vinfo
->scalar_costs
= init_cost (loop_vinfo
, true);
1452 add_stmt_costs (loop_vinfo
->scalar_costs
,
1453 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo
));
1454 loop_vinfo
->scalar_costs
->finish_cost (nullptr);
1458 /* Function vect_analyze_loop_form.
1460 Verify that certain CFG restrictions hold, including:
1461 - the loop has a pre-header
1462 - the loop has a single entry and exit
1463 - the loop exit condition is simple enough
1464 - the number of iterations can be analyzed, i.e, a countable loop. The
1465 niter could be analyzed under some assumptions. */
1468 vect_analyze_loop_form (class loop
*loop
, vect_loop_form_info
*info
)
1470 DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1472 /* Different restrictions apply when we are considering an inner-most loop,
1473 vs. an outer (nested) loop.
1474 (FORNOW. May want to relax some of these restrictions in the future). */
1476 info
->inner_loop_cond
= NULL
;
1479 /* Inner-most loop. We currently require that the number of BBs is
1480 exactly 2 (the header and latch). Vectorizable inner-most loops
1491 if (loop
->num_nodes
!= 2)
1492 return opt_result::failure_at (vect_location
,
1494 " control flow in loop.\n");
1496 if (empty_block_p (loop
->header
))
1497 return opt_result::failure_at (vect_location
,
1498 "not vectorized: empty loop.\n");
1502 class loop
*innerloop
= loop
->inner
;
1505 /* Nested loop. We currently require that the loop is doubly-nested,
1506 contains a single inner loop, and the number of BBs is exactly 5.
1507 Vectorizable outer-loops look like this:
1519 The inner-loop has the properties expected of inner-most loops
1520 as described above. */
1522 if ((loop
->inner
)->inner
|| (loop
->inner
)->next
)
1523 return opt_result::failure_at (vect_location
,
1525 " multiple nested loops.\n");
1527 if (loop
->num_nodes
!= 5)
1528 return opt_result::failure_at (vect_location
,
1530 " control flow in loop.\n");
1532 entryedge
= loop_preheader_edge (innerloop
);
1533 if (entryedge
->src
!= loop
->header
1534 || !single_exit (innerloop
)
1535 || single_exit (innerloop
)->dest
!= EDGE_PRED (loop
->latch
, 0)->src
)
1536 return opt_result::failure_at (vect_location
,
1538 " unsupported outerloop form.\n");
1540 /* Analyze the inner-loop. */
1541 vect_loop_form_info inner
;
1542 opt_result res
= vect_analyze_loop_form (loop
->inner
, &inner
);
1545 if (dump_enabled_p ())
1546 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1547 "not vectorized: Bad inner loop.\n");
1551 /* Don't support analyzing niter under assumptions for inner
1553 if (!integer_onep (inner
.assumptions
))
1554 return opt_result::failure_at (vect_location
,
1555 "not vectorized: Bad inner loop.\n");
1557 if (!expr_invariant_in_loop_p (loop
, inner
.number_of_iterations
))
1558 return opt_result::failure_at (vect_location
,
1559 "not vectorized: inner-loop count not"
1562 if (dump_enabled_p ())
1563 dump_printf_loc (MSG_NOTE
, vect_location
,
1564 "Considering outer-loop vectorization.\n");
1565 info
->inner_loop_cond
= inner
.loop_cond
;
1568 if (!single_exit (loop
))
1569 return opt_result::failure_at (vect_location
,
1570 "not vectorized: multiple exits.\n");
1571 if (EDGE_COUNT (loop
->header
->preds
) != 2)
1572 return opt_result::failure_at (vect_location
,
1574 " too many incoming edges.\n");
1576 /* We assume that the loop exit condition is at the end of the loop. i.e,
1577 that the loop is represented as a do-while (with a proper if-guard
1578 before the loop if needed), where the loop header contains all the
1579 executable statements, and the latch is empty. */
1580 if (!empty_block_p (loop
->latch
)
1581 || !gimple_seq_empty_p (phi_nodes (loop
->latch
)))
1582 return opt_result::failure_at (vect_location
,
1583 "not vectorized: latch block not empty.\n");
1585 /* Make sure the exit is not abnormal. */
1586 edge e
= single_exit (loop
);
1587 if (e
->flags
& EDGE_ABNORMAL
)
1588 return opt_result::failure_at (vect_location
,
1590 " abnormal loop exit edge.\n");
1593 = vect_get_loop_niters (loop
, &info
->assumptions
,
1594 &info
->number_of_iterations
,
1595 &info
->number_of_iterationsm1
);
1596 if (!info
->loop_cond
)
1597 return opt_result::failure_at
1599 "not vectorized: complicated exit condition.\n");
1601 if (integer_zerop (info
->assumptions
)
1602 || !info
->number_of_iterations
1603 || chrec_contains_undetermined (info
->number_of_iterations
))
1604 return opt_result::failure_at
1606 "not vectorized: number of iterations cannot be computed.\n");
1608 if (integer_zerop (info
->number_of_iterations
))
1609 return opt_result::failure_at
1611 "not vectorized: number of iterations = 0.\n");
1613 if (!(tree_fits_shwi_p (info
->number_of_iterations
)
1614 && tree_to_shwi (info
->number_of_iterations
) > 0))
1616 if (dump_enabled_p ())
1618 dump_printf_loc (MSG_NOTE
, vect_location
,
1619 "Symbolic number of iterations is ");
1620 dump_generic_expr (MSG_NOTE
, TDF_DETAILS
, info
->number_of_iterations
);
1621 dump_printf (MSG_NOTE
, "\n");
1625 return opt_result::success ();
1628 /* Create a loop_vec_info for LOOP with SHARED and the
1629 vect_analyze_loop_form result. */
1632 vect_create_loop_vinfo (class loop
*loop
, vec_info_shared
*shared
,
1633 const vect_loop_form_info
*info
,
1634 loop_vec_info main_loop_info
)
1636 loop_vec_info loop_vinfo
= new _loop_vec_info (loop
, shared
);
1637 LOOP_VINFO_NITERSM1 (loop_vinfo
) = info
->number_of_iterationsm1
;
1638 LOOP_VINFO_NITERS (loop_vinfo
) = info
->number_of_iterations
;
1639 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo
) = info
->number_of_iterations
;
1640 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo
) = main_loop_info
;
1641 /* Also record the assumptions for versioning. */
1642 if (!integer_onep (info
->assumptions
) && !main_loop_info
)
1643 LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo
) = info
->assumptions
;
1645 stmt_vec_info loop_cond_info
= loop_vinfo
->lookup_stmt (info
->loop_cond
);
1646 STMT_VINFO_TYPE (loop_cond_info
) = loop_exit_ctrl_vec_info_type
;
1647 if (info
->inner_loop_cond
)
1649 stmt_vec_info inner_loop_cond_info
1650 = loop_vinfo
->lookup_stmt (info
->inner_loop_cond
);
1651 STMT_VINFO_TYPE (inner_loop_cond_info
) = loop_exit_ctrl_vec_info_type
;
1652 /* If we have an estimate on the number of iterations of the inner
1653 loop use that to limit the scale for costing, otherwise use
1654 --param vect-inner-loop-cost-factor literally. */
1656 if (estimated_stmt_executions (loop
->inner
, &nit
))
1657 LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo
)
1658 = wi::smin (nit
, param_vect_inner_loop_cost_factor
).to_uhwi ();
1666 /* Scan the loop stmts and dependent on whether there are any (non-)SLP
1667 statements update the vectorization factor. */
1670 vect_update_vf_for_slp (loop_vec_info loop_vinfo
)
1672 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
1673 basic_block
*bbs
= LOOP_VINFO_BBS (loop_vinfo
);
1674 int nbbs
= loop
->num_nodes
;
1675 poly_uint64 vectorization_factor
;
1678 DUMP_VECT_SCOPE ("vect_update_vf_for_slp");
1680 vectorization_factor
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
1681 gcc_assert (known_ne (vectorization_factor
, 0U));
1683 /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1684 vectorization factor of the loop is the unrolling factor required by
1685 the SLP instances. If that unrolling factor is 1, we say, that we
1686 perform pure SLP on loop - cross iteration parallelism is not
1688 bool only_slp_in_loop
= true;
1689 for (i
= 0; i
< nbbs
; i
++)
1691 basic_block bb
= bbs
[i
];
1692 for (gphi_iterator si
= gsi_start_phis (bb
); !gsi_end_p (si
);
1695 stmt_vec_info stmt_info
= loop_vinfo
->lookup_stmt (si
.phi ());
1698 if ((STMT_VINFO_RELEVANT_P (stmt_info
)
1699 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info
)))
1700 && !PURE_SLP_STMT (stmt_info
))
1701 /* STMT needs both SLP and loop-based vectorization. */
1702 only_slp_in_loop
= false;
1704 for (gimple_stmt_iterator si
= gsi_start_bb (bb
); !gsi_end_p (si
);
1707 if (is_gimple_debug (gsi_stmt (si
)))
1709 stmt_vec_info stmt_info
= loop_vinfo
->lookup_stmt (gsi_stmt (si
));
1710 stmt_info
= vect_stmt_to_vectorize (stmt_info
);
1711 if ((STMT_VINFO_RELEVANT_P (stmt_info
)
1712 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info
)))
1713 && !PURE_SLP_STMT (stmt_info
))
1714 /* STMT needs both SLP and loop-based vectorization. */
1715 only_slp_in_loop
= false;
1719 if (only_slp_in_loop
)
1721 if (dump_enabled_p ())
1722 dump_printf_loc (MSG_NOTE
, vect_location
,
1723 "Loop contains only SLP stmts\n");
1724 vectorization_factor
= LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo
);
1728 if (dump_enabled_p ())
1729 dump_printf_loc (MSG_NOTE
, vect_location
,
1730 "Loop contains SLP and non-SLP stmts\n");
1731 /* Both the vectorization factor and unroll factor have the form
1732 GET_MODE_SIZE (loop_vinfo->vector_mode) * X for some rational X,
1733 so they must have a common multiple. */
1734 vectorization_factor
1735 = force_common_multiple (vectorization_factor
,
1736 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo
));
1739 LOOP_VINFO_VECT_FACTOR (loop_vinfo
) = vectorization_factor
;
1740 if (dump_enabled_p ())
1742 dump_printf_loc (MSG_NOTE
, vect_location
,
1743 "Updating vectorization factor to ");
1744 dump_dec (MSG_NOTE
, vectorization_factor
);
1745 dump_printf (MSG_NOTE
, ".\n");
1749 /* Return true if STMT_INFO describes a double reduction phi and if
1750 the other phi in the reduction is also relevant for vectorization.
1751 This rejects cases such as:
1754 x_1 = PHI <x_3(outer2), ...>;
1762 x_3 = PHI <x_2(inner)>;
1764 if nothing in x_2 or elsewhere makes x_1 relevant. */
1767 vect_active_double_reduction_p (stmt_vec_info stmt_info
)
1769 if (STMT_VINFO_DEF_TYPE (stmt_info
) != vect_double_reduction_def
)
1772 return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info
));
1775 /* Function vect_analyze_loop_operations.
1777 Scan the loop stmts and make sure they are all vectorizable. */
1780 vect_analyze_loop_operations (loop_vec_info loop_vinfo
)
1782 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
1783 basic_block
*bbs
= LOOP_VINFO_BBS (loop_vinfo
);
1784 int nbbs
= loop
->num_nodes
;
1786 stmt_vec_info stmt_info
;
1787 bool need_to_vectorize
= false;
1790 DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
1792 auto_vec
<stmt_info_for_cost
> cost_vec
;
1794 for (i
= 0; i
< nbbs
; i
++)
1796 basic_block bb
= bbs
[i
];
1798 for (gphi_iterator si
= gsi_start_phis (bb
); !gsi_end_p (si
);
1801 gphi
*phi
= si
.phi ();
1804 stmt_info
= loop_vinfo
->lookup_stmt (phi
);
1805 if (dump_enabled_p ())
1806 dump_printf_loc (MSG_NOTE
, vect_location
, "examining phi: %G",
1808 if (virtual_operand_p (gimple_phi_result (phi
)))
1811 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1812 (i.e., a phi in the tail of the outer-loop). */
1813 if (! is_loop_header_bb_p (bb
))
1815 /* FORNOW: we currently don't support the case that these phis
1816 are not used in the outerloop (unless it is double reduction,
1817 i.e., this phi is vect_reduction_def), cause this case
1818 requires to actually do something here. */
1819 if (STMT_VINFO_LIVE_P (stmt_info
)
1820 && !vect_active_double_reduction_p (stmt_info
))
1821 return opt_result::failure_at (phi
,
1822 "Unsupported loop-closed phi"
1823 " in outer-loop.\n");
1825 /* If PHI is used in the outer loop, we check that its operand
1826 is defined in the inner loop. */
1827 if (STMT_VINFO_RELEVANT_P (stmt_info
))
1831 if (gimple_phi_num_args (phi
) != 1)
1832 return opt_result::failure_at (phi
, "unsupported phi");
1834 phi_op
= PHI_ARG_DEF (phi
, 0);
1835 stmt_vec_info op_def_info
= loop_vinfo
->lookup_def (phi_op
);
1837 return opt_result::failure_at (phi
, "unsupported phi\n");
1839 if (STMT_VINFO_RELEVANT (op_def_info
) != vect_used_in_outer
1840 && (STMT_VINFO_RELEVANT (op_def_info
)
1841 != vect_used_in_outer_by_reduction
))
1842 return opt_result::failure_at (phi
, "unsupported phi\n");
1844 if ((STMT_VINFO_DEF_TYPE (stmt_info
) == vect_internal_def
1845 || (STMT_VINFO_DEF_TYPE (stmt_info
)
1846 == vect_double_reduction_def
))
1847 && !vectorizable_lc_phi (loop_vinfo
,
1848 stmt_info
, NULL
, NULL
))
1849 return opt_result::failure_at (phi
, "unsupported phi\n");
1855 gcc_assert (stmt_info
);
1857 if ((STMT_VINFO_RELEVANT (stmt_info
) == vect_used_in_scope
1858 || STMT_VINFO_LIVE_P (stmt_info
))
1859 && STMT_VINFO_DEF_TYPE (stmt_info
) != vect_induction_def
1860 && STMT_VINFO_DEF_TYPE (stmt_info
) != vect_first_order_recurrence
)
1861 /* A scalar-dependence cycle that we don't support. */
1862 return opt_result::failure_at (phi
,
1864 " scalar dependence cycle.\n");
1866 if (STMT_VINFO_RELEVANT_P (stmt_info
))
1868 need_to_vectorize
= true;
1869 if (STMT_VINFO_DEF_TYPE (stmt_info
) == vect_induction_def
1870 && ! PURE_SLP_STMT (stmt_info
))
1871 ok
= vectorizable_induction (loop_vinfo
,
1872 stmt_info
, NULL
, NULL
,
1874 else if ((STMT_VINFO_DEF_TYPE (stmt_info
) == vect_reduction_def
1875 || (STMT_VINFO_DEF_TYPE (stmt_info
)
1876 == vect_double_reduction_def
)
1877 || STMT_VINFO_DEF_TYPE (stmt_info
) == vect_nested_cycle
)
1878 && ! PURE_SLP_STMT (stmt_info
))
1879 ok
= vectorizable_reduction (loop_vinfo
,
1880 stmt_info
, NULL
, NULL
, &cost_vec
);
1881 else if ((STMT_VINFO_DEF_TYPE (stmt_info
)
1882 == vect_first_order_recurrence
)
1883 && ! PURE_SLP_STMT (stmt_info
))
1884 ok
= vectorizable_recurr (loop_vinfo
, stmt_info
, NULL
, NULL
,
1888 /* SLP PHIs are tested by vect_slp_analyze_node_operations. */
1890 && STMT_VINFO_LIVE_P (stmt_info
)
1891 && !PURE_SLP_STMT (stmt_info
))
1892 ok
= vectorizable_live_operation (loop_vinfo
,
1893 stmt_info
, NULL
, NULL
, NULL
,
1894 -1, false, &cost_vec
);
1897 return opt_result::failure_at (phi
,
1898 "not vectorized: relevant phi not "
1900 static_cast <gimple
*> (phi
));
1903 for (gimple_stmt_iterator si
= gsi_start_bb (bb
); !gsi_end_p (si
);
1906 gimple
*stmt
= gsi_stmt (si
);
1907 if (!gimple_clobber_p (stmt
)
1908 && !is_gimple_debug (stmt
))
1911 = vect_analyze_stmt (loop_vinfo
,
1912 loop_vinfo
->lookup_stmt (stmt
),
1914 NULL
, NULL
, &cost_vec
);
1921 add_stmt_costs (loop_vinfo
->vector_costs
, &cost_vec
);
1923 /* All operations in the loop are either irrelevant (deal with loop
1924 control, or dead), or only used outside the loop and can be moved
1925 out of the loop (e.g. invariants, inductions). The loop can be
1926 optimized away by scalar optimizations. We're better off not
1927 touching this loop. */
1928 if (!need_to_vectorize
)
1930 if (dump_enabled_p ())
1931 dump_printf_loc (MSG_NOTE
, vect_location
,
1932 "All the computation can be taken out of the loop.\n");
1933 return opt_result::failure_at
1935 "not vectorized: redundant loop. no profit to vectorize.\n");
1938 return opt_result::success ();
1941 /* Return true if we know that the iteration count is smaller than the
1942 vectorization factor. Return false if it isn't, or if we can't be sure
1946 vect_known_niters_smaller_than_vf (loop_vec_info loop_vinfo
)
1948 unsigned int assumed_vf
= vect_vf_for_cost (loop_vinfo
);
1950 HOST_WIDE_INT max_niter
;
1951 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
))
1952 max_niter
= LOOP_VINFO_INT_NITERS (loop_vinfo
);
1954 max_niter
= max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo
));
1956 if (max_niter
!= -1 && (unsigned HOST_WIDE_INT
) max_niter
< assumed_vf
)
1962 /* Analyze the cost of the loop described by LOOP_VINFO. Decide if it
1963 is worthwhile to vectorize. Return 1 if definitely yes, 0 if
1964 definitely no, or -1 if it's worth retrying. */
1967 vect_analyze_loop_costing (loop_vec_info loop_vinfo
,
1968 unsigned *suggested_unroll_factor
)
1970 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
1971 unsigned int assumed_vf
= vect_vf_for_cost (loop_vinfo
);
1973 /* Only loops that can handle partially-populated vectors can have iteration
1974 counts less than the vectorization factor. */
1975 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo
))
1977 if (vect_known_niters_smaller_than_vf (loop_vinfo
))
1979 if (dump_enabled_p ())
1980 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1981 "not vectorized: iteration count smaller than "
1982 "vectorization factor.\n");
1987 /* If using the "very cheap" model. reject cases in which we'd keep
1988 a copy of the scalar code (even if we might be able to vectorize it). */
1989 if (loop_cost_model (loop
) == VECT_COST_MODEL_VERY_CHEAP
1990 && (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
)
1991 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo
)
1992 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo
)))
1994 if (dump_enabled_p ())
1995 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1996 "some scalar iterations would need to be peeled\n");
2000 int min_profitable_iters
, min_profitable_estimate
;
2001 vect_estimate_min_profitable_iters (loop_vinfo
, &min_profitable_iters
,
2002 &min_profitable_estimate
,
2003 suggested_unroll_factor
);
2005 if (min_profitable_iters
< 0)
2007 if (dump_enabled_p ())
2008 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2009 "not vectorized: vectorization not profitable.\n");
2010 if (dump_enabled_p ())
2011 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2012 "not vectorized: vector version will never be "
2017 int min_scalar_loop_bound
= (param_min_vect_loop_bound
2020 /* Use the cost model only if it is more conservative than user specified
2022 unsigned int th
= (unsigned) MAX (min_scalar_loop_bound
,
2023 min_profitable_iters
);
2025 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo
) = th
;
2027 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
)
2028 && LOOP_VINFO_INT_NITERS (loop_vinfo
) < th
)
2030 if (dump_enabled_p ())
2031 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2032 "not vectorized: vectorization not profitable.\n");
2033 if (dump_enabled_p ())
2034 dump_printf_loc (MSG_NOTE
, vect_location
,
2035 "not vectorized: iteration count smaller than user "
2036 "specified loop bound parameter or minimum profitable "
2037 "iterations (whichever is more conservative).\n");
2041 /* The static profitablity threshold min_profitable_estimate includes
2042 the cost of having to check at runtime whether the scalar loop
2043 should be used instead. If it turns out that we don't need or want
2044 such a check, the threshold we should use for the static estimate
2045 is simply the point at which the vector loop becomes more profitable
2046 than the scalar loop. */
2047 if (min_profitable_estimate
> min_profitable_iters
2048 && !LOOP_REQUIRES_VERSIONING (loop_vinfo
)
2049 && !LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo
)
2050 && !LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
)
2051 && !vect_apply_runtime_profitability_check_p (loop_vinfo
))
2053 if (dump_enabled_p ())
2054 dump_printf_loc (MSG_NOTE
, vect_location
, "no need for a runtime"
2055 " choice between the scalar and vector loops\n");
2056 min_profitable_estimate
= min_profitable_iters
;
2059 /* If the vector loop needs multiple iterations to be beneficial then
2060 things are probably too close to call, and the conservative thing
2061 would be to stick with the scalar code. */
2062 if (loop_cost_model (loop
) == VECT_COST_MODEL_VERY_CHEAP
2063 && min_profitable_estimate
> (int) vect_vf_for_cost (loop_vinfo
))
2065 if (dump_enabled_p ())
2066 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2067 "one iteration of the vector loop would be"
2068 " more expensive than the equivalent number of"
2069 " iterations of the scalar loop\n");
2073 HOST_WIDE_INT estimated_niter
;
2075 /* If we are vectorizing an epilogue then we know the maximum number of
2076 scalar iterations it will cover is at least one lower than the
2077 vectorization factor of the main loop. */
2078 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo
))
2080 = vect_vf_for_cost (LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo
)) - 1;
2083 estimated_niter
= estimated_stmt_executions_int (loop
);
2084 if (estimated_niter
== -1)
2085 estimated_niter
= likely_max_stmt_executions_int (loop
);
2087 if (estimated_niter
!= -1
2088 && ((unsigned HOST_WIDE_INT
) estimated_niter
2089 < MAX (th
, (unsigned) min_profitable_estimate
)))
2091 if (dump_enabled_p ())
2092 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2093 "not vectorized: estimated iteration count too "
2095 if (dump_enabled_p ())
2096 dump_printf_loc (MSG_NOTE
, vect_location
,
2097 "not vectorized: estimated iteration count smaller "
2098 "than specified loop bound parameter or minimum "
2099 "profitable iterations (whichever is more "
2100 "conservative).\n");
2108 vect_get_datarefs_in_loop (loop_p loop
, basic_block
*bbs
,
2109 vec
<data_reference_p
> *datarefs
,
2110 unsigned int *n_stmts
)
2113 for (unsigned i
= 0; i
< loop
->num_nodes
; i
++)
2114 for (gimple_stmt_iterator gsi
= gsi_start_bb (bbs
[i
]);
2115 !gsi_end_p (gsi
); gsi_next (&gsi
))
2117 gimple
*stmt
= gsi_stmt (gsi
);
2118 if (is_gimple_debug (stmt
))
2121 opt_result res
= vect_find_stmt_data_reference (loop
, stmt
, datarefs
,
2125 if (is_gimple_call (stmt
) && loop
->safelen
)
2127 tree fndecl
= gimple_call_fndecl (stmt
), op
;
2128 if (fndecl
== NULL_TREE
2129 && gimple_call_internal_p (stmt
, IFN_MASK_CALL
))
2131 fndecl
= gimple_call_arg (stmt
, 0);
2132 gcc_checking_assert (TREE_CODE (fndecl
) == ADDR_EXPR
);
2133 fndecl
= TREE_OPERAND (fndecl
, 0);
2134 gcc_checking_assert (TREE_CODE (fndecl
) == FUNCTION_DECL
);
2136 if (fndecl
!= NULL_TREE
)
2138 cgraph_node
*node
= cgraph_node::get (fndecl
);
2139 if (node
!= NULL
&& node
->simd_clones
!= NULL
)
2141 unsigned int j
, n
= gimple_call_num_args (stmt
);
2142 for (j
= 0; j
< n
; j
++)
2144 op
= gimple_call_arg (stmt
, j
);
2146 || (REFERENCE_CLASS_P (op
)
2147 && get_base_address (op
)))
2150 op
= gimple_call_lhs (stmt
);
2151 /* Ignore #pragma omp declare simd functions
2152 if they don't have data references in the
2153 call stmt itself. */
2157 || (REFERENCE_CLASS_P (op
)
2158 && get_base_address (op
)))))
2165 /* If dependence analysis will give up due to the limit on the
2166 number of datarefs stop here and fail fatally. */
2167 if (datarefs
->length ()
2168 > (unsigned)param_loop_max_datarefs_for_datadeps
)
2169 return opt_result::failure_at (stmt
, "exceeded param "
2170 "loop-max-datarefs-for-datadeps\n");
2172 return opt_result::success ();
2175 /* Look for SLP-only access groups and turn each individual access into its own
2178 vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo
)
2181 struct data_reference
*dr
;
2183 DUMP_VECT_SCOPE ("vect_dissolve_slp_only_groups");
2185 vec
<data_reference_p
> datarefs
= LOOP_VINFO_DATAREFS (loop_vinfo
);
2186 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
2188 gcc_assert (DR_REF (dr
));
2189 stmt_vec_info stmt_info
= loop_vinfo
->lookup_stmt (DR_STMT (dr
));
2191 /* Check if the load is a part of an interleaving chain. */
2192 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
))
2194 stmt_vec_info first_element
= DR_GROUP_FIRST_ELEMENT (stmt_info
);
2195 dr_vec_info
*dr_info
= STMT_VINFO_DR_INFO (first_element
);
2196 unsigned int group_size
= DR_GROUP_SIZE (first_element
);
2198 /* Check if SLP-only groups. */
2199 if (!STMT_SLP_TYPE (stmt_info
)
2200 && STMT_VINFO_SLP_VECT_ONLY (first_element
))
2202 /* Dissolve the group. */
2203 STMT_VINFO_SLP_VECT_ONLY (first_element
) = false;
2205 stmt_vec_info vinfo
= first_element
;
2208 stmt_vec_info next
= DR_GROUP_NEXT_ELEMENT (vinfo
);
2209 DR_GROUP_FIRST_ELEMENT (vinfo
) = vinfo
;
2210 DR_GROUP_NEXT_ELEMENT (vinfo
) = NULL
;
2211 DR_GROUP_SIZE (vinfo
) = 1;
2212 if (STMT_VINFO_STRIDED_P (first_element
))
2213 DR_GROUP_GAP (vinfo
) = 0;
2215 DR_GROUP_GAP (vinfo
) = group_size
- 1;
2216 /* Duplicate and adjust alignment info, it needs to
2217 be present on each group leader, see dr_misalignment. */
2218 if (vinfo
!= first_element
)
2220 dr_vec_info
*dr_info2
= STMT_VINFO_DR_INFO (vinfo
);
2221 dr_info2
->target_alignment
= dr_info
->target_alignment
;
2222 int misalignment
= dr_info
->misalignment
;
2223 if (misalignment
!= DR_MISALIGNMENT_UNKNOWN
)
2226 = (TREE_INT_CST_LOW (DR_INIT (dr_info2
->dr
))
2227 - TREE_INT_CST_LOW (DR_INIT (dr_info
->dr
)));
2228 unsigned HOST_WIDE_INT align_c
2229 = dr_info
->target_alignment
.to_constant ();
2230 misalignment
= (misalignment
+ diff
) % align_c
;
2232 dr_info2
->misalignment
= misalignment
;
2241 /* Determine if operating on full vectors for LOOP_VINFO might leave
2242 some scalar iterations still to do. If so, decide how we should
2243 handle those scalar iterations. The possibilities are:
2245 (1) Make LOOP_VINFO operate on partial vectors instead of full vectors.
2248 LOOP_VINFO_USING_PARTIAL_VECTORS_P == true
2249 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2250 LOOP_VINFO_PEELING_FOR_NITER == false
2252 (2) Make LOOP_VINFO operate on full vectors and use an epilogue loop
2253 to handle the remaining scalar iterations. In this case:
2255 LOOP_VINFO_USING_PARTIAL_VECTORS_P == false
2256 LOOP_VINFO_PEELING_FOR_NITER == true
2258 There are two choices:
2260 (2a) Consider vectorizing the epilogue loop at the same VF as the
2261 main loop, but using partial vectors instead of full vectors.
2264 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == true
2266 (2b) Consider vectorizing the epilogue loop at lower VFs only.
2269 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2271 When FOR_EPILOGUE_P is true, make this determination based on the
2272 assumption that LOOP_VINFO is an epilogue loop, otherwise make it
2273 based on the assumption that LOOP_VINFO is the main loop. The caller
2274 has made sure that the number of iterations is set appropriately for
2275 this value of FOR_EPILOGUE_P. */
2278 vect_determine_partial_vectors_and_peeling (loop_vec_info loop_vinfo
,
2279 bool for_epilogue_p
)
2281 /* Determine whether there would be any scalar iterations left over. */
2282 bool need_peeling_or_partial_vectors_p
2283 = vect_need_peeling_or_partial_vectors_p (loop_vinfo
);
2285 /* Decide whether to vectorize the loop with partial vectors. */
2286 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo
) = false;
2287 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo
) = false;
2288 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo
)
2289 && need_peeling_or_partial_vectors_p
)
2291 /* For partial-vector-usage=1, try to push the handling of partial
2292 vectors to the epilogue, with the main loop continuing to operate
2295 If we are unrolling we also do not want to use partial vectors. This
2296 is to avoid the overhead of generating multiple masks and also to
2297 avoid having to execute entire iterations of FALSE masked instructions
2298 when dealing with one or less full iterations.
2300 ??? We could then end up failing to use partial vectors if we
2301 decide to peel iterations into a prologue, and if the main loop
2302 then ends up processing fewer than VF iterations. */
2303 if ((param_vect_partial_vector_usage
== 1
2304 || loop_vinfo
->suggested_unroll_factor
> 1)
2305 && !LOOP_VINFO_EPILOGUE_P (loop_vinfo
)
2306 && !vect_known_niters_smaller_than_vf (loop_vinfo
))
2307 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo
) = true;
2309 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo
) = true;
2312 if (dump_enabled_p ())
2314 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo
))
2315 dump_printf_loc (MSG_NOTE
, vect_location
,
2316 "operating on partial vectors%s.\n",
2317 for_epilogue_p
? " for epilogue loop" : "");
2319 dump_printf_loc (MSG_NOTE
, vect_location
,
2320 "operating only on full vectors%s.\n",
2321 for_epilogue_p
? " for epilogue loop" : "");
2326 loop_vec_info orig_loop_vinfo
= LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo
);
2327 gcc_assert (orig_loop_vinfo
);
2328 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo
))
2329 gcc_assert (known_lt (LOOP_VINFO_VECT_FACTOR (loop_vinfo
),
2330 LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo
)));
2333 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
)
2334 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo
))
2336 /* Check that the loop processes at least one full vector. */
2337 poly_uint64 vf
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
2338 tree scalar_niters
= LOOP_VINFO_NITERS (loop_vinfo
);
2339 if (known_lt (wi::to_widest (scalar_niters
), vf
))
2340 return opt_result::failure_at (vect_location
,
2341 "loop does not have enough iterations"
2342 " to support vectorization.\n");
2344 /* If we need to peel an extra epilogue iteration to handle data
2345 accesses with gaps, check that there are enough scalar iterations
2348 The check above is redundant with this one when peeling for gaps,
2349 but the distinction is useful for diagnostics. */
2350 tree scalar_nitersm1
= LOOP_VINFO_NITERSM1 (loop_vinfo
);
2351 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo
)
2352 && known_lt (wi::to_widest (scalar_nitersm1
), vf
))
2353 return opt_result::failure_at (vect_location
,
2354 "loop does not have enough iterations"
2355 " to support peeling for gaps.\n");
2358 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo
)
2359 = (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo
)
2360 && need_peeling_or_partial_vectors_p
);
2362 return opt_result::success ();
2365 /* Function vect_analyze_loop_2.
2367 Apply a set of analyses on LOOP specified by LOOP_VINFO, the different
2368 analyses will record information in some members of LOOP_VINFO. FATAL
2369 indicates if some analysis meets fatal error. If one non-NULL pointer
2370 SUGGESTED_UNROLL_FACTOR is provided, it's intent to be filled with one
2371 worked out suggested unroll factor, while one NULL pointer shows it's
2372 going to apply the suggested unroll factor. SLP_DONE_FOR_SUGGESTED_UF
2373 is to hold the slp decision when the suggested unroll factor is worked
2376 vect_analyze_loop_2 (loop_vec_info loop_vinfo
, bool &fatal
,
2377 unsigned *suggested_unroll_factor
,
2378 bool& slp_done_for_suggested_uf
)
2380 opt_result ok
= opt_result::success ();
2382 unsigned int max_vf
= MAX_VECTORIZATION_FACTOR
;
2383 poly_uint64 min_vf
= 2;
2384 loop_vec_info orig_loop_vinfo
= NULL
;
2386 /* If we are dealing with an epilogue then orig_loop_vinfo points to the
2387 loop_vec_info of the first vectorized loop. */
2388 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo
))
2389 orig_loop_vinfo
= LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo
);
2391 orig_loop_vinfo
= loop_vinfo
;
2392 gcc_assert (orig_loop_vinfo
);
2394 /* The first group of checks is independent of the vector size. */
2397 if (LOOP_VINFO_SIMD_IF_COND (loop_vinfo
)
2398 && integer_zerop (LOOP_VINFO_SIMD_IF_COND (loop_vinfo
)))
2399 return opt_result::failure_at (vect_location
,
2400 "not vectorized: simd if(0)\n");
2402 /* Find all data references in the loop (which correspond to vdefs/vuses)
2403 and analyze their evolution in the loop. */
2405 loop_p loop
= LOOP_VINFO_LOOP (loop_vinfo
);
2407 /* Gather the data references and count stmts in the loop. */
2408 if (!LOOP_VINFO_DATAREFS (loop_vinfo
).exists ())
2411 = vect_get_datarefs_in_loop (loop
, LOOP_VINFO_BBS (loop_vinfo
),
2412 &LOOP_VINFO_DATAREFS (loop_vinfo
),
2413 &LOOP_VINFO_N_STMTS (loop_vinfo
));
2416 if (dump_enabled_p ())
2417 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2418 "not vectorized: loop contains function "
2419 "calls or data references that cannot "
2423 loop_vinfo
->shared
->save_datarefs ();
2426 loop_vinfo
->shared
->check_datarefs ();
2428 /* Analyze the data references and also adjust the minimal
2429 vectorization factor according to the loads and stores. */
2431 ok
= vect_analyze_data_refs (loop_vinfo
, &min_vf
, &fatal
);
2434 if (dump_enabled_p ())
2435 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2436 "bad data references.\n");
2440 /* Check if we are applying unroll factor now. */
2441 bool applying_suggested_uf
= loop_vinfo
->suggested_unroll_factor
> 1;
2442 gcc_assert (!applying_suggested_uf
|| !suggested_unroll_factor
);
2444 /* If the slp decision is false when suggested unroll factor is worked
2445 out, and we are applying suggested unroll factor, we can simply skip
2446 all slp related analyses this time. */
2447 bool slp
= !applying_suggested_uf
|| slp_done_for_suggested_uf
;
2449 /* Classify all cross-iteration scalar data-flow cycles.
2450 Cross-iteration cycles caused by virtual phis are analyzed separately. */
2451 vect_analyze_scalar_cycles (loop_vinfo
, slp
);
2453 vect_pattern_recog (loop_vinfo
);
2455 vect_fixup_scalar_cycles_with_patterns (loop_vinfo
);
2457 /* Analyze the access patterns of the data-refs in the loop (consecutive,
2458 complex, etc.). FORNOW: Only handle consecutive access pattern. */
2460 ok
= vect_analyze_data_ref_accesses (loop_vinfo
, NULL
);
2463 if (dump_enabled_p ())
2464 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2465 "bad data access.\n");
2469 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
2471 ok
= vect_mark_stmts_to_be_vectorized (loop_vinfo
, &fatal
);
2474 if (dump_enabled_p ())
2475 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2476 "unexpected pattern.\n");
2480 /* While the rest of the analysis below depends on it in some way. */
2483 /* Analyze data dependences between the data-refs in the loop
2484 and adjust the maximum vectorization factor according to
2486 FORNOW: fail at the first data dependence that we encounter. */
2488 ok
= vect_analyze_data_ref_dependences (loop_vinfo
, &max_vf
);
2491 if (dump_enabled_p ())
2492 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2493 "bad data dependence.\n");
2496 if (max_vf
!= MAX_VECTORIZATION_FACTOR
2497 && maybe_lt (max_vf
, min_vf
))
2498 return opt_result::failure_at (vect_location
, "bad data dependence.\n");
2499 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo
) = max_vf
;
2501 ok
= vect_determine_vectorization_factor (loop_vinfo
);
2504 if (dump_enabled_p ())
2505 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2506 "can't determine vectorization factor.\n");
2509 if (max_vf
!= MAX_VECTORIZATION_FACTOR
2510 && maybe_lt (max_vf
, LOOP_VINFO_VECT_FACTOR (loop_vinfo
)))
2511 return opt_result::failure_at (vect_location
, "bad data dependence.\n");
2513 /* Compute the scalar iteration cost. */
2514 vect_compute_single_scalar_iteration_cost (loop_vinfo
);
2516 poly_uint64 saved_vectorization_factor
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
2520 /* Check the SLP opportunities in the loop, analyze and build
2522 ok
= vect_analyze_slp (loop_vinfo
, LOOP_VINFO_N_STMTS (loop_vinfo
));
2526 /* If there are any SLP instances mark them as pure_slp. */
2527 slp
= vect_make_slp_decision (loop_vinfo
);
2530 /* Find stmts that need to be both vectorized and SLPed. */
2531 vect_detect_hybrid_slp (loop_vinfo
);
2533 /* Update the vectorization factor based on the SLP decision. */
2534 vect_update_vf_for_slp (loop_vinfo
);
2536 /* Optimize the SLP graph with the vectorization factor fixed. */
2537 vect_optimize_slp (loop_vinfo
);
2539 /* Gather the loads reachable from the SLP graph entries. */
2540 vect_gather_slp_loads (loop_vinfo
);
2544 bool saved_can_use_partial_vectors_p
2545 = LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo
);
2547 /* We don't expect to have to roll back to anything other than an empty
2549 gcc_assert (LOOP_VINFO_MASKS (loop_vinfo
).is_empty ());
2551 /* This is the point where we can re-start analysis with SLP forced off. */
2554 /* Apply the suggested unrolling factor, this was determined by the backend
2555 during finish_cost the first time we ran the analyzis for this
2557 if (applying_suggested_uf
)
2558 LOOP_VINFO_VECT_FACTOR (loop_vinfo
) *= loop_vinfo
->suggested_unroll_factor
;
2560 /* Now the vectorization factor is final. */
2561 poly_uint64 vectorization_factor
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
2562 gcc_assert (known_ne (vectorization_factor
, 0U));
2564 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
) && dump_enabled_p ())
2566 dump_printf_loc (MSG_NOTE
, vect_location
,
2567 "vectorization_factor = ");
2568 dump_dec (MSG_NOTE
, vectorization_factor
);
2569 dump_printf (MSG_NOTE
, ", niters = %wd\n",
2570 LOOP_VINFO_INT_NITERS (loop_vinfo
));
2573 loop_vinfo
->vector_costs
= init_cost (loop_vinfo
, false);
2575 /* Analyze the alignment of the data-refs in the loop.
2576 Fail if a data reference is found that cannot be vectorized. */
2578 ok
= vect_analyze_data_refs_alignment (loop_vinfo
);
2581 if (dump_enabled_p ())
2582 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2583 "bad data alignment.\n");
2587 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2588 It is important to call pruning after vect_analyze_data_ref_accesses,
2589 since we use grouping information gathered by interleaving analysis. */
2590 ok
= vect_prune_runtime_alias_test_list (loop_vinfo
);
2594 /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2595 vectorization, since we do not want to add extra peeling or
2596 add versioning for alignment. */
2597 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo
))
2598 /* This pass will decide on using loop versioning and/or loop peeling in
2599 order to enhance the alignment of data references in the loop. */
2600 ok
= vect_enhance_data_refs_alignment (loop_vinfo
);
2606 /* Analyze operations in the SLP instances. Note this may
2607 remove unsupported SLP instances which makes the above
2608 SLP kind detection invalid. */
2609 unsigned old_size
= LOOP_VINFO_SLP_INSTANCES (loop_vinfo
).length ();
2610 vect_slp_analyze_operations (loop_vinfo
);
2611 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo
).length () != old_size
)
2613 ok
= opt_result::failure_at (vect_location
,
2614 "unsupported SLP instances\n");
2618 /* Check whether any load in ALL SLP instances is possibly permuted. */
2619 slp_tree load_node
, slp_root
;
2621 slp_instance instance
;
2622 bool can_use_lanes
= true;
2623 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo
), x
, instance
)
2625 slp_root
= SLP_INSTANCE_TREE (instance
);
2626 int group_size
= SLP_TREE_LANES (slp_root
);
2627 tree vectype
= SLP_TREE_VECTYPE (slp_root
);
2628 bool loads_permuted
= false;
2629 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance
), i
, load_node
)
2631 if (!SLP_TREE_LOAD_PERMUTATION (load_node
).exists ())
2634 stmt_vec_info load_info
;
2635 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node
), j
, load_info
)
2636 if (SLP_TREE_LOAD_PERMUTATION (load_node
)[j
] != j
)
2638 loads_permuted
= true;
2643 /* If the loads and stores can be handled with load/store-lane
2644 instructions record it and move on to the next instance. */
2646 && SLP_INSTANCE_KIND (instance
) == slp_inst_kind_store
2647 && vect_store_lanes_supported (vectype
, group_size
, false))
2649 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance
), i
, load_node
)
2651 stmt_vec_info stmt_vinfo
= DR_GROUP_FIRST_ELEMENT
2652 (SLP_TREE_SCALAR_STMTS (load_node
)[0]);
2653 /* Use SLP for strided accesses (or if we can't
2655 if (STMT_VINFO_STRIDED_P (stmt_vinfo
)
2656 || ! vect_load_lanes_supported
2657 (STMT_VINFO_VECTYPE (stmt_vinfo
),
2658 DR_GROUP_SIZE (stmt_vinfo
), false))
2663 = can_use_lanes
&& i
== SLP_INSTANCE_LOADS (instance
).length ();
2665 if (can_use_lanes
&& dump_enabled_p ())
2666 dump_printf_loc (MSG_NOTE
, vect_location
,
2667 "SLP instance %p can use load/store-lanes\n",
2672 can_use_lanes
= false;
2677 /* If all SLP instances can use load/store-lanes abort SLP and try again
2678 with SLP disabled. */
2681 ok
= opt_result::failure_at (vect_location
,
2682 "Built SLP cancelled: can use "
2683 "load/store-lanes\n");
2684 if (dump_enabled_p ())
2685 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2686 "Built SLP cancelled: all SLP instances support "
2687 "load/store-lanes\n");
2692 /* Dissolve SLP-only groups. */
2693 vect_dissolve_slp_only_groups (loop_vinfo
);
2695 /* Scan all the remaining operations in the loop that are not subject
2696 to SLP and make sure they are vectorizable. */
2697 ok
= vect_analyze_loop_operations (loop_vinfo
);
2700 if (dump_enabled_p ())
2701 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2702 "bad operation or unsupported loop bound.\n");
2706 /* For now, we don't expect to mix both masking and length approaches for one
2707 loop, disable it if both are recorded. */
2708 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo
)
2709 && !LOOP_VINFO_MASKS (loop_vinfo
).is_empty ()
2710 && !LOOP_VINFO_LENS (loop_vinfo
).is_empty ())
2712 if (dump_enabled_p ())
2713 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2714 "can't vectorize a loop with partial vectors"
2715 " because we don't expect to mix different"
2716 " approaches with partial vectors for the"
2718 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo
) = false;
2721 /* If we still have the option of using partial vectors,
2722 check whether we can generate the necessary loop controls. */
2723 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo
)
2724 && !vect_verify_full_masking (loop_vinfo
)
2725 && !vect_verify_loop_lens (loop_vinfo
))
2726 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo
) = false;
2728 /* If we're vectorizing an epilogue loop, the vectorized loop either needs
2729 to be able to handle fewer than VF scalars, or needs to have a lower VF
2730 than the main loop. */
2731 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo
)
2732 && !LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo
)
2733 && maybe_ge (LOOP_VINFO_VECT_FACTOR (loop_vinfo
),
2734 LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo
)))
2735 return opt_result::failure_at (vect_location
,
2736 "Vectorization factor too high for"
2737 " epilogue loop.\n");
2739 /* Decide whether this loop_vinfo should use partial vectors or peeling,
2740 assuming that the loop will be used as a main loop. We will redo
2741 this analysis later if we instead decide to use the loop as an
2743 ok
= vect_determine_partial_vectors_and_peeling (loop_vinfo
, false);
2747 /* Check the costings of the loop make vectorizing worthwhile. */
2748 res
= vect_analyze_loop_costing (loop_vinfo
, suggested_unroll_factor
);
2751 ok
= opt_result::failure_at (vect_location
,
2752 "Loop costings may not be worthwhile.\n");
2756 return opt_result::failure_at (vect_location
,
2757 "Loop costings not worthwhile.\n");
2759 /* If an epilogue loop is required make sure we can create one. */
2760 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo
)
2761 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo
))
2763 if (dump_enabled_p ())
2764 dump_printf_loc (MSG_NOTE
, vect_location
, "epilog loop required\n");
2765 if (!vect_can_advance_ivs_p (loop_vinfo
)
2766 || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo
),
2767 single_exit (LOOP_VINFO_LOOP
2770 ok
= opt_result::failure_at (vect_location
,
2771 "not vectorized: can't create required "
2777 /* During peeling, we need to check if number of loop iterations is
2778 enough for both peeled prolog loop and vector loop. This check
2779 can be merged along with threshold check of loop versioning, so
2780 increase threshold for this case if necessary.
2782 If we are analyzing an epilogue we still want to check what its
2783 versioning threshold would be. If we decide to vectorize the epilogues we
2784 will want to use the lowest versioning threshold of all epilogues and main
2785 loop. This will enable us to enter a vectorized epilogue even when
2786 versioning the loop. We can't simply check whether the epilogue requires
2787 versioning though since we may have skipped some versioning checks when
2788 analyzing the epilogue. For instance, checks for alias versioning will be
2789 skipped when dealing with epilogues as we assume we already checked them
2790 for the main loop. So instead we always check the 'orig_loop_vinfo'. */
2791 if (LOOP_REQUIRES_VERSIONING (orig_loop_vinfo
))
2793 poly_uint64 niters_th
= 0;
2794 unsigned int th
= LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo
);
2796 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo
))
2798 /* Niters for peeled prolog loop. */
2799 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
) < 0)
2801 dr_vec_info
*dr_info
= LOOP_VINFO_UNALIGNED_DR (loop_vinfo
);
2802 tree vectype
= STMT_VINFO_VECTYPE (dr_info
->stmt
);
2803 niters_th
+= TYPE_VECTOR_SUBPARTS (vectype
) - 1;
2806 niters_th
+= LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
);
2809 /* Niters for at least one iteration of vectorized loop. */
2810 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo
))
2811 niters_th
+= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
2812 /* One additional iteration because of peeling for gap. */
2813 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo
))
2816 /* Use the same condition as vect_transform_loop to decide when to use
2817 the cost to determine a versioning threshold. */
2818 if (vect_apply_runtime_profitability_check_p (loop_vinfo
)
2819 && ordered_p (th
, niters_th
))
2820 niters_th
= ordered_max (poly_uint64 (th
), niters_th
);
2822 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo
) = niters_th
;
2825 gcc_assert (known_eq (vectorization_factor
,
2826 LOOP_VINFO_VECT_FACTOR (loop_vinfo
)));
2828 slp_done_for_suggested_uf
= slp
;
2830 /* Ok to vectorize! */
2831 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo
) = 1;
2832 return opt_result::success ();
2835 /* Ensure that "ok" is false (with an opt_problem if dumping is enabled). */
2838 /* Try again with SLP forced off but if we didn't do any SLP there is
2839 no point in re-trying. */
2843 /* If the slp decision is true when suggested unroll factor is worked
2844 out, and we are applying suggested unroll factor, we don't need to
2846 if (applying_suggested_uf
&& slp_done_for_suggested_uf
)
2849 /* If there are reduction chains re-trying will fail anyway. */
2850 if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo
).is_empty ())
2853 /* Likewise if the grouped loads or stores in the SLP cannot be handled
2854 via interleaving or lane instructions. */
2855 slp_instance instance
;
2858 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo
), i
, instance
)
2860 stmt_vec_info vinfo
;
2861 vinfo
= SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance
))[0];
2862 if (! STMT_VINFO_GROUPED_ACCESS (vinfo
))
2864 vinfo
= DR_GROUP_FIRST_ELEMENT (vinfo
);
2865 unsigned int size
= DR_GROUP_SIZE (vinfo
);
2866 tree vectype
= STMT_VINFO_VECTYPE (vinfo
);
2867 if (! vect_store_lanes_supported (vectype
, size
, false)
2868 && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype
), 1U)
2869 && ! vect_grouped_store_supported (vectype
, size
))
2870 return opt_result::failure_at (vinfo
->stmt
,
2871 "unsupported grouped store\n");
2872 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance
), j
, node
)
2874 vinfo
= SLP_TREE_SCALAR_STMTS (node
)[0];
2875 vinfo
= DR_GROUP_FIRST_ELEMENT (vinfo
);
2876 bool single_element_p
= !DR_GROUP_NEXT_ELEMENT (vinfo
);
2877 size
= DR_GROUP_SIZE (vinfo
);
2878 vectype
= STMT_VINFO_VECTYPE (vinfo
);
2879 if (! vect_load_lanes_supported (vectype
, size
, false)
2880 && ! vect_grouped_load_supported (vectype
, single_element_p
,
2882 return opt_result::failure_at (vinfo
->stmt
,
2883 "unsupported grouped load\n");
2887 if (dump_enabled_p ())
2888 dump_printf_loc (MSG_NOTE
, vect_location
,
2889 "re-trying with SLP disabled\n");
2891 /* Roll back state appropriately. No SLP this time. */
2893 /* Restore vectorization factor as it were without SLP. */
2894 LOOP_VINFO_VECT_FACTOR (loop_vinfo
) = saved_vectorization_factor
;
2895 /* Free the SLP instances. */
2896 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo
), j
, instance
)
2897 vect_free_slp_instance (instance
);
2898 LOOP_VINFO_SLP_INSTANCES (loop_vinfo
).release ();
2899 /* Reset SLP type to loop_vect on all stmts. */
2900 for (i
= 0; i
< LOOP_VINFO_LOOP (loop_vinfo
)->num_nodes
; ++i
)
2902 basic_block bb
= LOOP_VINFO_BBS (loop_vinfo
)[i
];
2903 for (gimple_stmt_iterator si
= gsi_start_phis (bb
);
2904 !gsi_end_p (si
); gsi_next (&si
))
2906 stmt_vec_info stmt_info
= loop_vinfo
->lookup_stmt (gsi_stmt (si
));
2907 STMT_SLP_TYPE (stmt_info
) = loop_vect
;
2908 if (STMT_VINFO_DEF_TYPE (stmt_info
) == vect_reduction_def
2909 || STMT_VINFO_DEF_TYPE (stmt_info
) == vect_double_reduction_def
)
2911 /* vectorizable_reduction adjusts reduction stmt def-types,
2912 restore them to that of the PHI. */
2913 STMT_VINFO_DEF_TYPE (STMT_VINFO_REDUC_DEF (stmt_info
))
2914 = STMT_VINFO_DEF_TYPE (stmt_info
);
2915 STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize
2916 (STMT_VINFO_REDUC_DEF (stmt_info
)))
2917 = STMT_VINFO_DEF_TYPE (stmt_info
);
2920 for (gimple_stmt_iterator si
= gsi_start_bb (bb
);
2921 !gsi_end_p (si
); gsi_next (&si
))
2923 if (is_gimple_debug (gsi_stmt (si
)))
2925 stmt_vec_info stmt_info
= loop_vinfo
->lookup_stmt (gsi_stmt (si
));
2926 STMT_SLP_TYPE (stmt_info
) = loop_vect
;
2927 if (STMT_VINFO_IN_PATTERN_P (stmt_info
))
2929 stmt_vec_info pattern_stmt_info
2930 = STMT_VINFO_RELATED_STMT (stmt_info
);
2931 if (STMT_VINFO_SLP_VECT_ONLY_PATTERN (pattern_stmt_info
))
2932 STMT_VINFO_IN_PATTERN_P (stmt_info
) = false;
2934 gimple
*pattern_def_seq
= STMT_VINFO_PATTERN_DEF_SEQ (stmt_info
);
2935 STMT_SLP_TYPE (pattern_stmt_info
) = loop_vect
;
2936 for (gimple_stmt_iterator pi
= gsi_start (pattern_def_seq
);
2937 !gsi_end_p (pi
); gsi_next (&pi
))
2938 STMT_SLP_TYPE (loop_vinfo
->lookup_stmt (gsi_stmt (pi
)))
2943 /* Free optimized alias test DDRS. */
2944 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo
).truncate (0);
2945 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo
).release ();
2946 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo
).release ();
2947 /* Reset target cost data. */
2948 delete loop_vinfo
->vector_costs
;
2949 loop_vinfo
->vector_costs
= nullptr;
2950 /* Reset accumulated rgroup information. */
2951 release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo
));
2952 release_vec_loop_controls (&LOOP_VINFO_LENS (loop_vinfo
));
2953 /* Reset assorted flags. */
2954 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo
) = false;
2955 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo
) = false;
2956 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo
) = 0;
2957 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo
) = 0;
2958 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo
)
2959 = saved_can_use_partial_vectors_p
;
2964 /* Return true if vectorizing a loop using NEW_LOOP_VINFO appears
2965 to be better than vectorizing it using OLD_LOOP_VINFO. Assume that
2966 OLD_LOOP_VINFO is better unless something specifically indicates
2969 Note that this deliberately isn't a partial order. */
2972 vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo
,
2973 loop_vec_info old_loop_vinfo
)
2975 struct loop
*loop
= LOOP_VINFO_LOOP (new_loop_vinfo
);
2976 gcc_assert (LOOP_VINFO_LOOP (old_loop_vinfo
) == loop
);
2978 poly_int64 new_vf
= LOOP_VINFO_VECT_FACTOR (new_loop_vinfo
);
2979 poly_int64 old_vf
= LOOP_VINFO_VECT_FACTOR (old_loop_vinfo
);
2981 /* Always prefer a VF of loop->simdlen over any other VF. */
2984 bool new_simdlen_p
= known_eq (new_vf
, loop
->simdlen
);
2985 bool old_simdlen_p
= known_eq (old_vf
, loop
->simdlen
);
2986 if (new_simdlen_p
!= old_simdlen_p
)
2987 return new_simdlen_p
;
2990 const auto *old_costs
= old_loop_vinfo
->vector_costs
;
2991 const auto *new_costs
= new_loop_vinfo
->vector_costs
;
2992 if (loop_vec_info main_loop
= LOOP_VINFO_ORIG_LOOP_INFO (old_loop_vinfo
))
2993 return new_costs
->better_epilogue_loop_than_p (old_costs
, main_loop
);
2995 return new_costs
->better_main_loop_than_p (old_costs
);
2998 /* Decide whether to replace OLD_LOOP_VINFO with NEW_LOOP_VINFO. Return
2999 true if we should. */
3002 vect_joust_loop_vinfos (loop_vec_info new_loop_vinfo
,
3003 loop_vec_info old_loop_vinfo
)
3005 if (!vect_better_loop_vinfo_p (new_loop_vinfo
, old_loop_vinfo
))
3008 if (dump_enabled_p ())
3009 dump_printf_loc (MSG_NOTE
, vect_location
,
3010 "***** Preferring vector mode %s to vector mode %s\n",
3011 GET_MODE_NAME (new_loop_vinfo
->vector_mode
),
3012 GET_MODE_NAME (old_loop_vinfo
->vector_mode
));
3016 /* Analyze LOOP with VECTOR_MODES[MODE_I] and as epilogue if MAIN_LOOP_VINFO is
3017 not NULL. Set AUTODETECTED_VECTOR_MODE if VOIDmode and advance
3018 MODE_I to the next mode useful to analyze.
3019 Return the loop_vinfo on success and wrapped null on failure. */
3021 static opt_loop_vec_info
3022 vect_analyze_loop_1 (class loop
*loop
, vec_info_shared
*shared
,
3023 const vect_loop_form_info
*loop_form_info
,
3024 loop_vec_info main_loop_vinfo
,
3025 const vector_modes
&vector_modes
, unsigned &mode_i
,
3026 machine_mode
&autodetected_vector_mode
,
3029 loop_vec_info loop_vinfo
3030 = vect_create_loop_vinfo (loop
, shared
, loop_form_info
, main_loop_vinfo
);
3032 machine_mode vector_mode
= vector_modes
[mode_i
];
3033 loop_vinfo
->vector_mode
= vector_mode
;
3034 unsigned int suggested_unroll_factor
= 1;
3035 bool slp_done_for_suggested_uf
;
3037 /* Run the main analysis. */
3038 opt_result res
= vect_analyze_loop_2 (loop_vinfo
, fatal
,
3039 &suggested_unroll_factor
,
3040 slp_done_for_suggested_uf
);
3041 if (dump_enabled_p ())
3042 dump_printf_loc (MSG_NOTE
, vect_location
,
3043 "***** Analysis %s with vector mode %s\n",
3044 res
? "succeeded" : " failed",
3045 GET_MODE_NAME (loop_vinfo
->vector_mode
));
3047 if (!main_loop_vinfo
&& suggested_unroll_factor
> 1)
3049 if (dump_enabled_p ())
3050 dump_printf_loc (MSG_NOTE
, vect_location
,
3051 "***** Re-trying analysis for unrolling"
3052 " with unroll factor %d and slp %s.\n",
3053 suggested_unroll_factor
,
3054 slp_done_for_suggested_uf
? "on" : "off");
3055 loop_vec_info unroll_vinfo
3056 = vect_create_loop_vinfo (loop
, shared
, loop_form_info
, main_loop_vinfo
);
3057 unroll_vinfo
->vector_mode
= vector_mode
;
3058 unroll_vinfo
->suggested_unroll_factor
= suggested_unroll_factor
;
3059 opt_result new_res
= vect_analyze_loop_2 (unroll_vinfo
, fatal
, NULL
,
3060 slp_done_for_suggested_uf
);
3064 loop_vinfo
= unroll_vinfo
;
3067 delete unroll_vinfo
;
3070 /* Remember the autodetected vector mode. */
3071 if (vector_mode
== VOIDmode
)
3072 autodetected_vector_mode
= loop_vinfo
->vector_mode
;
3074 /* Advance mode_i, first skipping modes that would result in the
3075 same analysis result. */
3076 while (mode_i
+ 1 < vector_modes
.length ()
3077 && vect_chooses_same_modes_p (loop_vinfo
,
3078 vector_modes
[mode_i
+ 1]))
3080 if (dump_enabled_p ())
3081 dump_printf_loc (MSG_NOTE
, vect_location
,
3082 "***** The result for vector mode %s would"
3084 GET_MODE_NAME (vector_modes
[mode_i
+ 1]));
3087 if (mode_i
+ 1 < vector_modes
.length ()
3088 && VECTOR_MODE_P (autodetected_vector_mode
)
3089 && (related_vector_mode (vector_modes
[mode_i
+ 1],
3090 GET_MODE_INNER (autodetected_vector_mode
))
3091 == autodetected_vector_mode
)
3092 && (related_vector_mode (autodetected_vector_mode
,
3093 GET_MODE_INNER (vector_modes
[mode_i
+ 1]))
3094 == vector_modes
[mode_i
+ 1]))
3096 if (dump_enabled_p ())
3097 dump_printf_loc (MSG_NOTE
, vect_location
,
3098 "***** Skipping vector mode %s, which would"
3099 " repeat the analysis for %s\n",
3100 GET_MODE_NAME (vector_modes
[mode_i
+ 1]),
3101 GET_MODE_NAME (autodetected_vector_mode
));
3110 gcc_checking_assert (main_loop_vinfo
== NULL
);
3111 return opt_loop_vec_info::propagate_failure (res
);
3114 return opt_loop_vec_info::success (loop_vinfo
);
3117 /* Function vect_analyze_loop.
3119 Apply a set of analyses on LOOP, and create a loop_vec_info struct
3120 for it. The different analyses will record information in the
3121 loop_vec_info struct. */
3123 vect_analyze_loop (class loop
*loop
, vec_info_shared
*shared
)
3125 DUMP_VECT_SCOPE ("analyze_loop_nest");
3127 if (loop_outer (loop
)
3128 && loop_vec_info_for_loop (loop_outer (loop
))
3129 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop
))))
3130 return opt_loop_vec_info::failure_at (vect_location
,
3131 "outer-loop already vectorized.\n");
3133 if (!find_loop_nest (loop
, &shared
->loop_nest
))
3134 return opt_loop_vec_info::failure_at
3136 "not vectorized: loop nest containing two or more consecutive inner"
3137 " loops cannot be vectorized\n");
3139 /* Analyze the loop form. */
3140 vect_loop_form_info loop_form_info
;
3141 opt_result res
= vect_analyze_loop_form (loop
, &loop_form_info
);
3144 if (dump_enabled_p ())
3145 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3146 "bad loop form.\n");
3147 return opt_loop_vec_info::propagate_failure (res
);
3149 if (!integer_onep (loop_form_info
.assumptions
))
3151 /* We consider to vectorize this loop by versioning it under
3152 some assumptions. In order to do this, we need to clear
3153 existing information computed by scev and niter analyzer. */
3155 free_numbers_of_iterations_estimates (loop
);
3156 /* Also set flag for this loop so that following scev and niter
3157 analysis are done under the assumptions. */
3158 loop_constraint_set (loop
, LOOP_C_FINITE
);
3161 auto_vector_modes vector_modes
;
3162 /* Autodetect first vector size we try. */
3163 vector_modes
.safe_push (VOIDmode
);
3164 unsigned int autovec_flags
3165 = targetm
.vectorize
.autovectorize_vector_modes (&vector_modes
,
3166 loop
->simdlen
!= 0);
3167 bool pick_lowest_cost_p
= ((autovec_flags
& VECT_COMPARE_COSTS
)
3168 && !unlimited_cost_model (loop
));
3169 machine_mode autodetected_vector_mode
= VOIDmode
;
3170 opt_loop_vec_info first_loop_vinfo
= opt_loop_vec_info::success (NULL
);
3171 unsigned int mode_i
= 0;
3172 unsigned HOST_WIDE_INT simdlen
= loop
->simdlen
;
3174 /* Keep track of the VF for each mode. Initialize all to 0 which indicates
3175 a mode has not been analyzed. */
3176 auto_vec
<poly_uint64
, 8> cached_vf_per_mode
;
3177 for (unsigned i
= 0; i
< vector_modes
.length (); ++i
)
3178 cached_vf_per_mode
.safe_push (0);
3180 /* First determine the main loop vectorization mode, either the first
3181 one that works, starting with auto-detecting the vector mode and then
3182 following the targets order of preference, or the one with the
3183 lowest cost if pick_lowest_cost_p. */
3187 unsigned int last_mode_i
= mode_i
;
3188 /* Set cached VF to -1 prior to analysis, which indicates a mode has
3190 cached_vf_per_mode
[last_mode_i
] = -1;
3191 opt_loop_vec_info loop_vinfo
3192 = vect_analyze_loop_1 (loop
, shared
, &loop_form_info
,
3193 NULL
, vector_modes
, mode_i
,
3194 autodetected_vector_mode
, fatal
);
3200 /* Analyzis has been successful so update the VF value. The
3201 VF should always be a multiple of unroll_factor and we want to
3202 capture the original VF here. */
3203 cached_vf_per_mode
[last_mode_i
]
3204 = exact_div (LOOP_VINFO_VECT_FACTOR (loop_vinfo
),
3205 loop_vinfo
->suggested_unroll_factor
);
3206 /* Once we hit the desired simdlen for the first time,
3207 discard any previous attempts. */
3209 && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo
), simdlen
))
3211 delete first_loop_vinfo
;
3212 first_loop_vinfo
= opt_loop_vec_info::success (NULL
);
3215 else if (pick_lowest_cost_p
3217 && vect_joust_loop_vinfos (loop_vinfo
, first_loop_vinfo
))
3219 /* Pick loop_vinfo over first_loop_vinfo. */
3220 delete first_loop_vinfo
;
3221 first_loop_vinfo
= opt_loop_vec_info::success (NULL
);
3223 if (first_loop_vinfo
== NULL
)
3224 first_loop_vinfo
= loop_vinfo
;
3228 loop_vinfo
= opt_loop_vec_info::success (NULL
);
3231 /* Commit to first_loop_vinfo if we have no reason to try
3233 if (!simdlen
&& !pick_lowest_cost_p
)
3236 if (mode_i
== vector_modes
.length ()
3237 || autodetected_vector_mode
== VOIDmode
)
3240 /* Try the next biggest vector size. */
3241 if (dump_enabled_p ())
3242 dump_printf_loc (MSG_NOTE
, vect_location
,
3243 "***** Re-trying analysis with vector mode %s\n",
3244 GET_MODE_NAME (vector_modes
[mode_i
]));
3246 if (!first_loop_vinfo
)
3247 return opt_loop_vec_info::propagate_failure (res
);
3249 if (dump_enabled_p ())
3250 dump_printf_loc (MSG_NOTE
, vect_location
,
3251 "***** Choosing vector mode %s\n",
3252 GET_MODE_NAME (first_loop_vinfo
->vector_mode
));
3254 /* Only vectorize epilogues if PARAM_VECT_EPILOGUES_NOMASK is
3255 enabled, SIMDUID is not set, it is the innermost loop and we have
3256 either already found the loop's SIMDLEN or there was no SIMDLEN to
3258 TODO: Enable epilogue vectorization for loops with SIMDUID set. */
3259 bool vect_epilogues
= (!simdlen
3260 && loop
->inner
== NULL
3261 && param_vect_epilogues_nomask
3262 && LOOP_VINFO_PEELING_FOR_NITER (first_loop_vinfo
)
3264 if (!vect_epilogues
)
3265 return first_loop_vinfo
;
3267 /* Now analyze first_loop_vinfo for epilogue vectorization. */
3268 poly_uint64 lowest_th
= LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo
);
3270 /* For epilogues start the analysis from the first mode. The motivation
3271 behind starting from the beginning comes from cases where the VECTOR_MODES
3272 array may contain length-agnostic and length-specific modes. Their
3273 ordering is not guaranteed, so we could end up picking a mode for the main
3274 loop that is after the epilogue's optimal mode. */
3275 vector_modes
[0] = autodetected_vector_mode
;
3278 bool supports_partial_vectors
=
3279 partial_vectors_supported_p () && param_vect_partial_vector_usage
!= 0;
3280 poly_uint64 first_vinfo_vf
= LOOP_VINFO_VECT_FACTOR (first_loop_vinfo
);
3284 /* If the target does not support partial vectors we can shorten the
3285 number of modes to analyze for the epilogue as we know we can't pick a
3286 mode that would lead to a VF at least as big as the
3288 if (!supports_partial_vectors
3289 && maybe_ge (cached_vf_per_mode
[mode_i
], first_vinfo_vf
))
3292 if (mode_i
== vector_modes
.length ())
3297 if (dump_enabled_p ())
3298 dump_printf_loc (MSG_NOTE
, vect_location
,
3299 "***** Re-trying epilogue analysis with vector "
3300 "mode %s\n", GET_MODE_NAME (vector_modes
[mode_i
]));
3303 opt_loop_vec_info loop_vinfo
3304 = vect_analyze_loop_1 (loop
, shared
, &loop_form_info
,
3306 vector_modes
, mode_i
,
3307 autodetected_vector_mode
, fatal
);
3313 if (pick_lowest_cost_p
)
3315 /* Keep trying to roll back vectorization attempts while the
3316 loop_vec_infos they produced were worse than this one. */
3317 vec
<loop_vec_info
> &vinfos
= first_loop_vinfo
->epilogue_vinfos
;
3318 while (!vinfos
.is_empty ()
3319 && vect_joust_loop_vinfos (loop_vinfo
, vinfos
.last ()))
3321 gcc_assert (vect_epilogues
);
3322 delete vinfos
.pop ();
3325 /* For now only allow one epilogue loop. */
3326 if (first_loop_vinfo
->epilogue_vinfos
.is_empty ())
3328 first_loop_vinfo
->epilogue_vinfos
.safe_push (loop_vinfo
);
3329 poly_uint64 th
= LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo
);
3330 gcc_assert (!LOOP_REQUIRES_VERSIONING (loop_vinfo
)
3331 || maybe_ne (lowest_th
, 0U));
3332 /* Keep track of the known smallest versioning
3334 if (ordered_p (lowest_th
, th
))
3335 lowest_th
= ordered_min (lowest_th
, th
);
3340 loop_vinfo
= opt_loop_vec_info::success (NULL
);
3343 /* For now only allow one epilogue loop, but allow
3344 pick_lowest_cost_p to replace it, so commit to the
3345 first epilogue if we have no reason to try alternatives. */
3346 if (!pick_lowest_cost_p
)
3350 if (mode_i
== vector_modes
.length ())
3355 if (!first_loop_vinfo
->epilogue_vinfos
.is_empty ())
3357 LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo
) = lowest_th
;
3358 if (dump_enabled_p ())
3359 dump_printf_loc (MSG_NOTE
, vect_location
,
3360 "***** Choosing epilogue vector mode %s\n",
3362 (first_loop_vinfo
->epilogue_vinfos
[0]->vector_mode
));
3365 return first_loop_vinfo
;
3368 /* Return true if there is an in-order reduction function for CODE, storing
3369 it in *REDUC_FN if so. */
3372 fold_left_reduction_fn (code_helper code
, internal_fn
*reduc_fn
)
3374 if (code
== PLUS_EXPR
)
3376 *reduc_fn
= IFN_FOLD_LEFT_PLUS
;
3382 /* Function reduction_fn_for_scalar_code
3385 CODE - tree_code of a reduction operations.
3388 REDUC_FN - the corresponding internal function to be used to reduce the
3389 vector of partial results into a single scalar result, or IFN_LAST
3390 if the operation is a supported reduction operation, but does not have
3391 such an internal function.
3393 Return FALSE if CODE currently cannot be vectorized as reduction. */
3396 reduction_fn_for_scalar_code (code_helper code
, internal_fn
*reduc_fn
)
3398 if (code
.is_tree_code ())
3399 switch (tree_code (code
))
3402 *reduc_fn
= IFN_REDUC_MAX
;
3406 *reduc_fn
= IFN_REDUC_MIN
;
3410 *reduc_fn
= IFN_REDUC_PLUS
;
3414 *reduc_fn
= IFN_REDUC_AND
;
3418 *reduc_fn
= IFN_REDUC_IOR
;
3422 *reduc_fn
= IFN_REDUC_XOR
;
3427 *reduc_fn
= IFN_LAST
;
3434 switch (combined_fn (code
))
3437 *reduc_fn
= IFN_REDUC_FMAX
;
3441 *reduc_fn
= IFN_REDUC_FMIN
;
3449 /* If there is a neutral value X such that a reduction would not be affected
3450 by the introduction of additional X elements, return that X, otherwise
3451 return null. CODE is the code of the reduction and SCALAR_TYPE is type
3452 of the scalar elements. If the reduction has just a single initial value
3453 then INITIAL_VALUE is that value, otherwise it is null. */
3456 neutral_op_for_reduction (tree scalar_type
, code_helper code
,
3459 if (code
.is_tree_code ())
3460 switch (tree_code (code
))
3462 case WIDEN_SUM_EXPR
:
3469 return build_zero_cst (scalar_type
);
3472 return build_one_cst (scalar_type
);
3475 return build_all_ones_cst (scalar_type
);
3479 return initial_value
;
3485 switch (combined_fn (code
))
3489 return initial_value
;
3496 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
3497 STMT is printed with a message MSG. */
3500 report_vect_op (dump_flags_t msg_type
, gimple
*stmt
, const char *msg
)
3502 dump_printf_loc (msg_type
, vect_location
, "%s%G", msg
, stmt
);
3505 /* Return true if we need an in-order reduction for operation CODE
3506 on type TYPE. NEED_WRAPPING_INTEGRAL_OVERFLOW is true if integer
3507 overflow must wrap. */
3510 needs_fold_left_reduction_p (tree type
, code_helper code
)
3512 /* CHECKME: check for !flag_finite_math_only too? */
3513 if (SCALAR_FLOAT_TYPE_P (type
))
3515 if (code
.is_tree_code ())
3516 switch (tree_code (code
))
3523 return !flag_associative_math
;
3526 switch (combined_fn (code
))
3533 return !flag_associative_math
;
3537 if (INTEGRAL_TYPE_P (type
))
3538 return (!code
.is_tree_code ()
3539 || !operation_no_trapping_overflow (type
, tree_code (code
)));
3541 if (SAT_FIXED_POINT_TYPE_P (type
))
3547 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG and
3548 has a handled computation expression. Store the main reduction
3549 operation in *CODE. */
3552 check_reduction_path (dump_user_location_t loc
, loop_p loop
, gphi
*phi
,
3553 tree loop_arg
, code_helper
*code
,
3554 vec
<std::pair
<ssa_op_iter
, use_operand_p
> > &path
)
3556 auto_bitmap visited
;
3557 tree lookfor
= PHI_RESULT (phi
);
3559 use_operand_p curr
= op_iter_init_phiuse (&curri
, phi
, SSA_OP_USE
);
3560 while (USE_FROM_PTR (curr
) != loop_arg
)
3561 curr
= op_iter_next_use (&curri
);
3562 curri
.i
= curri
.numops
;
3565 path
.safe_push (std::make_pair (curri
, curr
));
3566 tree use
= USE_FROM_PTR (curr
);
3569 gimple
*def
= SSA_NAME_DEF_STMT (use
);
3570 if (gimple_nop_p (def
)
3571 || ! flow_bb_inside_loop_p (loop
, gimple_bb (def
)))
3576 std::pair
<ssa_op_iter
, use_operand_p
> x
= path
.pop ();
3580 curr
= op_iter_next_use (&curri
);
3581 /* Skip already visited or non-SSA operands (from iterating
3583 while (curr
!= NULL_USE_OPERAND_P
3584 && (TREE_CODE (USE_FROM_PTR (curr
)) != SSA_NAME
3585 || ! bitmap_set_bit (visited
,
3587 (USE_FROM_PTR (curr
)))));
3589 while (curr
== NULL_USE_OPERAND_P
&& ! path
.is_empty ());
3590 if (curr
== NULL_USE_OPERAND_P
)
3595 if (gimple_code (def
) == GIMPLE_PHI
)
3596 curr
= op_iter_init_phiuse (&curri
, as_a
<gphi
*>(def
), SSA_OP_USE
);
3598 curr
= op_iter_init_use (&curri
, def
, SSA_OP_USE
);
3599 while (curr
!= NULL_USE_OPERAND_P
3600 && (TREE_CODE (USE_FROM_PTR (curr
)) != SSA_NAME
3601 || ! bitmap_set_bit (visited
,
3603 (USE_FROM_PTR (curr
)))))
3604 curr
= op_iter_next_use (&curri
);
3605 if (curr
== NULL_USE_OPERAND_P
)
3610 if (dump_file
&& (dump_flags
& TDF_DETAILS
))
3612 dump_printf_loc (MSG_NOTE
, loc
, "reduction path: ");
3614 std::pair
<ssa_op_iter
, use_operand_p
> *x
;
3615 FOR_EACH_VEC_ELT (path
, i
, x
)
3616 dump_printf (MSG_NOTE
, "%T ", USE_FROM_PTR (x
->second
));
3617 dump_printf (MSG_NOTE
, "\n");
3620 /* Check whether the reduction path detected is valid. */
3621 bool fail
= path
.length () == 0;
3625 for (unsigned i
= 1; i
< path
.length (); ++i
)
3627 gimple
*use_stmt
= USE_STMT (path
[i
].second
);
3629 if (!gimple_extract_op (use_stmt
, &op
))
3634 unsigned int opi
= op
.num_ops
;
3635 if (gassign
*assign
= dyn_cast
<gassign
*> (use_stmt
))
3637 /* The following make sure we can compute the operand index
3638 easily plus it mostly disallows chaining via COND_EXPR condition
3640 for (opi
= 0; opi
< op
.num_ops
; ++opi
)
3641 if (gimple_assign_rhs1_ptr (assign
) + opi
== path
[i
].second
->use
)
3644 else if (gcall
*call
= dyn_cast
<gcall
*> (use_stmt
))
3646 for (opi
= 0; opi
< op
.num_ops
; ++opi
)
3647 if (gimple_call_arg_ptr (call
, opi
) == path
[i
].second
->use
)
3650 if (opi
== op
.num_ops
)
3655 op
.code
= canonicalize_code (op
.code
, op
.type
);
3656 if (op
.code
== MINUS_EXPR
)
3658 op
.code
= PLUS_EXPR
;
3659 /* Track whether we negate the reduction value each iteration. */
3660 if (op
.ops
[1] == op
.ops
[opi
])
3663 if (CONVERT_EXPR_CODE_P (op
.code
)
3664 && tree_nop_conversion_p (op
.type
, TREE_TYPE (op
.ops
[0])))
3666 else if (*code
== ERROR_MARK
)
3669 sign
= TYPE_SIGN (op
.type
);
3671 else if (op
.code
!= *code
)
3676 else if ((op
.code
== MIN_EXPR
3677 || op
.code
== MAX_EXPR
)
3678 && sign
!= TYPE_SIGN (op
.type
))
3683 /* Check there's only a single stmt the op is used on. For the
3684 not value-changing tail and the last stmt allow out-of-loop uses.
3685 ??? We could relax this and handle arbitrary live stmts by
3686 forcing a scalar epilogue for example. */
3687 imm_use_iterator imm_iter
;
3688 gimple
*op_use_stmt
;
3690 FOR_EACH_IMM_USE_STMT (op_use_stmt
, imm_iter
, op
.ops
[opi
])
3691 if (!is_gimple_debug (op_use_stmt
)
3692 && (*code
!= ERROR_MARK
3693 || flow_bb_inside_loop_p (loop
, gimple_bb (op_use_stmt
))))
3695 /* We want to allow x + x but not x < 1 ? x : 2. */
3696 if (is_gimple_assign (op_use_stmt
)
3697 && gimple_assign_rhs_code (op_use_stmt
) == COND_EXPR
)
3699 use_operand_p use_p
;
3700 FOR_EACH_IMM_USE_ON_STMT (use_p
, imm_iter
)
3712 return ! fail
&& ! neg
&& *code
!= ERROR_MARK
;
3716 check_reduction_path (dump_user_location_t loc
, loop_p loop
, gphi
*phi
,
3717 tree loop_arg
, enum tree_code code
)
3719 auto_vec
<std::pair
<ssa_op_iter
, use_operand_p
> > path
;
3721 return (check_reduction_path (loc
, loop
, phi
, loop_arg
, &code_
, path
)
3727 /* Function vect_is_simple_reduction
3729 (1) Detect a cross-iteration def-use cycle that represents a simple
3730 reduction computation. We look for the following pattern:
3735 a2 = operation (a3, a1)
3742 a2 = operation (a3, a1)
3745 1. operation is commutative and associative and it is safe to
3746 change the order of the computation
3747 2. no uses for a2 in the loop (a2 is used out of the loop)
3748 3. no uses of a1 in the loop besides the reduction operation
3749 4. no uses of a1 outside the loop.
3751 Conditions 1,4 are tested here.
3752 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
3754 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
3757 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
3761 inner loop (def of a3)
3764 (4) Detect condition expressions, ie:
3765 for (int i = 0; i < N; i++)
3771 static stmt_vec_info
3772 vect_is_simple_reduction (loop_vec_info loop_info
, stmt_vec_info phi_info
,
3773 bool *double_reduc
, bool *reduc_chain_p
, bool slp
)
3775 gphi
*phi
= as_a
<gphi
*> (phi_info
->stmt
);
3776 gimple
*phi_use_stmt
= NULL
;
3777 imm_use_iterator imm_iter
;
3778 use_operand_p use_p
;
3780 *double_reduc
= false;
3781 *reduc_chain_p
= false;
3782 STMT_VINFO_REDUC_TYPE (phi_info
) = TREE_CODE_REDUCTION
;
3784 tree phi_name
= PHI_RESULT (phi
);
3785 /* ??? If there are no uses of the PHI result the inner loop reduction
3786 won't be detected as possibly double-reduction by vectorizable_reduction
3787 because that tries to walk the PHI arg from the preheader edge which
3788 can be constant. See PR60382. */
3789 if (has_zero_uses (phi_name
))
3791 class loop
*loop
= (gimple_bb (phi
))->loop_father
;
3792 unsigned nphi_def_loop_uses
= 0;
3793 FOR_EACH_IMM_USE_FAST (use_p
, imm_iter
, phi_name
)
3795 gimple
*use_stmt
= USE_STMT (use_p
);
3796 if (is_gimple_debug (use_stmt
))
3799 if (!flow_bb_inside_loop_p (loop
, gimple_bb (use_stmt
)))
3801 if (dump_enabled_p ())
3802 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3803 "intermediate value used outside loop.\n");
3808 nphi_def_loop_uses
++;
3809 phi_use_stmt
= use_stmt
;
3812 tree latch_def
= PHI_ARG_DEF_FROM_EDGE (phi
, loop_latch_edge (loop
));
3813 if (TREE_CODE (latch_def
) != SSA_NAME
)
3815 if (dump_enabled_p ())
3816 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3817 "reduction: not ssa_name: %T\n", latch_def
);
3821 stmt_vec_info def_stmt_info
= loop_info
->lookup_def (latch_def
);
3823 || !flow_bb_inside_loop_p (loop
, gimple_bb (def_stmt_info
->stmt
)))
3826 bool nested_in_vect_loop
3827 = flow_loop_nested_p (LOOP_VINFO_LOOP (loop_info
), loop
);
3828 unsigned nlatch_def_loop_uses
= 0;
3829 auto_vec
<gphi
*, 3> lcphis
;
3830 bool inner_loop_of_double_reduc
= false;
3831 FOR_EACH_IMM_USE_FAST (use_p
, imm_iter
, latch_def
)
3833 gimple
*use_stmt
= USE_STMT (use_p
);
3834 if (is_gimple_debug (use_stmt
))
3836 if (flow_bb_inside_loop_p (loop
, gimple_bb (use_stmt
)))
3837 nlatch_def_loop_uses
++;
3840 /* We can have more than one loop-closed PHI. */
3841 lcphis
.safe_push (as_a
<gphi
*> (use_stmt
));
3842 if (nested_in_vect_loop
3843 && (STMT_VINFO_DEF_TYPE (loop_info
->lookup_stmt (use_stmt
))
3844 == vect_double_reduction_def
))
3845 inner_loop_of_double_reduc
= true;
3849 /* If we are vectorizing an inner reduction we are executing that
3850 in the original order only in case we are not dealing with a
3851 double reduction. */
3852 if (nested_in_vect_loop
&& !inner_loop_of_double_reduc
)
3854 if (dump_enabled_p ())
3855 report_vect_op (MSG_NOTE
, def_stmt_info
->stmt
,
3856 "detected nested cycle: ");
3857 return def_stmt_info
;
3860 /* When the inner loop of a double reduction ends up with more than
3861 one loop-closed PHI we have failed to classify alternate such
3862 PHIs as double reduction, leading to wrong code. See PR103237. */
3863 if (inner_loop_of_double_reduc
&& lcphis
.length () != 1)
3865 if (dump_enabled_p ())
3866 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3867 "unhandle double reduction\n");
3871 /* If this isn't a nested cycle or if the nested cycle reduction value
3872 is used ouside of the inner loop we cannot handle uses of the reduction
3874 if (nlatch_def_loop_uses
> 1 || nphi_def_loop_uses
> 1)
3876 if (dump_enabled_p ())
3877 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3878 "reduction used in loop.\n");
3882 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
3883 defined in the inner loop. */
3884 if (gphi
*def_stmt
= dyn_cast
<gphi
*> (def_stmt_info
->stmt
))
3886 tree op1
= PHI_ARG_DEF (def_stmt
, 0);
3887 if (gimple_phi_num_args (def_stmt
) != 1
3888 || TREE_CODE (op1
) != SSA_NAME
)
3890 if (dump_enabled_p ())
3891 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3892 "unsupported phi node definition.\n");
3897 /* Verify there is an inner cycle composed of the PHI phi_use_stmt
3898 and the latch definition op1. */
3899 gimple
*def1
= SSA_NAME_DEF_STMT (op1
);
3900 if (gimple_bb (def1
)
3901 && flow_bb_inside_loop_p (loop
, gimple_bb (def_stmt
))
3903 && flow_bb_inside_loop_p (loop
->inner
, gimple_bb (def1
))
3904 && (is_gimple_assign (def1
) || is_gimple_call (def1
))
3905 && is_a
<gphi
*> (phi_use_stmt
)
3906 && flow_bb_inside_loop_p (loop
->inner
, gimple_bb (phi_use_stmt
))
3907 && (op1
== PHI_ARG_DEF_FROM_EDGE (phi_use_stmt
,
3908 loop_latch_edge (loop
->inner
))))
3910 if (dump_enabled_p ())
3911 report_vect_op (MSG_NOTE
, def_stmt
,
3912 "detected double reduction: ");
3914 *double_reduc
= true;
3915 return def_stmt_info
;
3921 /* Look for the expression computing latch_def from then loop PHI result. */
3922 auto_vec
<std::pair
<ssa_op_iter
, use_operand_p
> > path
;
3924 if (check_reduction_path (vect_location
, loop
, phi
, latch_def
, &code
,
3927 STMT_VINFO_REDUC_CODE (phi_info
) = code
;
3928 if (code
== COND_EXPR
&& !nested_in_vect_loop
)
3929 STMT_VINFO_REDUC_TYPE (phi_info
) = COND_REDUCTION
;
3931 /* Fill in STMT_VINFO_REDUC_IDX and gather stmts for an SLP
3932 reduction chain for which the additional restriction is that
3933 all operations in the chain are the same. */
3934 auto_vec
<stmt_vec_info
, 8> reduc_chain
;
3936 bool is_slp_reduc
= !nested_in_vect_loop
&& code
!= COND_EXPR
;
3937 for (i
= path
.length () - 1; i
>= 1; --i
)
3939 gimple
*stmt
= USE_STMT (path
[i
].second
);
3940 stmt_vec_info stmt_info
= loop_info
->lookup_stmt (stmt
);
3942 if (!gimple_extract_op (stmt
, &op
))
3944 if (gassign
*assign
= dyn_cast
<gassign
*> (stmt
))
3945 STMT_VINFO_REDUC_IDX (stmt_info
)
3946 = path
[i
].second
->use
- gimple_assign_rhs1_ptr (assign
);
3949 gcall
*call
= as_a
<gcall
*> (stmt
);
3950 STMT_VINFO_REDUC_IDX (stmt_info
)
3951 = path
[i
].second
->use
- gimple_call_arg_ptr (call
, 0);
3953 bool leading_conversion
= (CONVERT_EXPR_CODE_P (op
.code
)
3954 && (i
== 1 || i
== path
.length () - 1));
3955 if ((op
.code
!= code
&& !leading_conversion
)
3956 /* We can only handle the final value in epilogue
3957 generation for reduction chains. */
3958 || (i
!= 1 && !has_single_use (gimple_get_lhs (stmt
))))
3959 is_slp_reduc
= false;
3960 /* For reduction chains we support a trailing/leading
3961 conversions. We do not store those in the actual chain. */
3962 if (leading_conversion
)
3964 reduc_chain
.safe_push (stmt_info
);
3966 if (slp
&& is_slp_reduc
&& reduc_chain
.length () > 1)
3968 for (unsigned i
= 0; i
< reduc_chain
.length () - 1; ++i
)
3970 REDUC_GROUP_FIRST_ELEMENT (reduc_chain
[i
]) = reduc_chain
[0];
3971 REDUC_GROUP_NEXT_ELEMENT (reduc_chain
[i
]) = reduc_chain
[i
+1];
3973 REDUC_GROUP_FIRST_ELEMENT (reduc_chain
.last ()) = reduc_chain
[0];
3974 REDUC_GROUP_NEXT_ELEMENT (reduc_chain
.last ()) = NULL
;
3976 /* Save the chain for further analysis in SLP detection. */
3977 LOOP_VINFO_REDUCTION_CHAINS (loop_info
).safe_push (reduc_chain
[0]);
3978 REDUC_GROUP_SIZE (reduc_chain
[0]) = reduc_chain
.length ();
3980 *reduc_chain_p
= true;
3981 if (dump_enabled_p ())
3982 dump_printf_loc (MSG_NOTE
, vect_location
,
3983 "reduction: detected reduction chain\n");
3985 else if (dump_enabled_p ())
3986 dump_printf_loc (MSG_NOTE
, vect_location
,
3987 "reduction: detected reduction\n");
3989 return def_stmt_info
;
3992 if (dump_enabled_p ())
3993 dump_printf_loc (MSG_NOTE
, vect_location
,
3994 "reduction: unknown pattern\n");
3999 /* Estimate the number of peeled epilogue iterations for LOOP_VINFO.
4000 PEEL_ITERS_PROLOGUE is the number of peeled prologue iterations,
4001 or -1 if not known. */
4004 vect_get_peel_iters_epilogue (loop_vec_info loop_vinfo
, int peel_iters_prologue
)
4006 int assumed_vf
= vect_vf_for_cost (loop_vinfo
);
4007 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
) || peel_iters_prologue
== -1)
4009 if (dump_enabled_p ())
4010 dump_printf_loc (MSG_NOTE
, vect_location
,
4011 "cost model: epilogue peel iters set to vf/2 "
4012 "because loop iterations are unknown .\n");
4013 return assumed_vf
/ 2;
4017 int niters
= LOOP_VINFO_INT_NITERS (loop_vinfo
);
4018 peel_iters_prologue
= MIN (niters
, peel_iters_prologue
);
4019 int peel_iters_epilogue
= (niters
- peel_iters_prologue
) % assumed_vf
;
4020 /* If we need to peel for gaps, but no peeling is required, we have to
4021 peel VF iterations. */
4022 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo
) && !peel_iters_epilogue
)
4023 peel_iters_epilogue
= assumed_vf
;
4024 return peel_iters_epilogue
;
4028 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
4030 vect_get_known_peeling_cost (loop_vec_info loop_vinfo
, int peel_iters_prologue
,
4031 int *peel_iters_epilogue
,
4032 stmt_vector_for_cost
*scalar_cost_vec
,
4033 stmt_vector_for_cost
*prologue_cost_vec
,
4034 stmt_vector_for_cost
*epilogue_cost_vec
)
4038 *peel_iters_epilogue
4039 = vect_get_peel_iters_epilogue (loop_vinfo
, peel_iters_prologue
);
4041 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
))
4043 /* If peeled iterations are known but number of scalar loop
4044 iterations are unknown, count a taken branch per peeled loop. */
4045 if (peel_iters_prologue
> 0)
4046 retval
= record_stmt_cost (prologue_cost_vec
, 1, cond_branch_taken
,
4048 if (*peel_iters_epilogue
> 0)
4049 retval
+= record_stmt_cost (epilogue_cost_vec
, 1, cond_branch_taken
,
4053 stmt_info_for_cost
*si
;
4055 if (peel_iters_prologue
)
4056 FOR_EACH_VEC_ELT (*scalar_cost_vec
, j
, si
)
4057 retval
+= record_stmt_cost (prologue_cost_vec
,
4058 si
->count
* peel_iters_prologue
,
4059 si
->kind
, si
->stmt_info
, si
->misalign
,
4061 if (*peel_iters_epilogue
)
4062 FOR_EACH_VEC_ELT (*scalar_cost_vec
, j
, si
)
4063 retval
+= record_stmt_cost (epilogue_cost_vec
,
4064 si
->count
* *peel_iters_epilogue
,
4065 si
->kind
, si
->stmt_info
, si
->misalign
,
4071 /* Function vect_estimate_min_profitable_iters
4073 Return the number of iterations required for the vector version of the
4074 loop to be profitable relative to the cost of the scalar version of the
4077 *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
4078 of iterations for vectorization. -1 value means loop vectorization
4079 is not profitable. This returned value may be used for dynamic
4080 profitability check.
4082 *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
4083 for static check against estimated number of iterations. */
4086 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo
,
4087 int *ret_min_profitable_niters
,
4088 int *ret_min_profitable_estimate
,
4089 unsigned *suggested_unroll_factor
)
4091 int min_profitable_iters
;
4092 int min_profitable_estimate
;
4093 int peel_iters_prologue
;
4094 int peel_iters_epilogue
;
4095 unsigned vec_inside_cost
= 0;
4096 int vec_outside_cost
= 0;
4097 unsigned vec_prologue_cost
= 0;
4098 unsigned vec_epilogue_cost
= 0;
4099 int scalar_single_iter_cost
= 0;
4100 int scalar_outside_cost
= 0;
4101 int assumed_vf
= vect_vf_for_cost (loop_vinfo
);
4102 int npeel
= LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
);
4103 vector_costs
*target_cost_data
= loop_vinfo
->vector_costs
;
4105 /* Cost model disabled. */
4106 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo
)))
4108 if (dump_enabled_p ())
4109 dump_printf_loc (MSG_NOTE
, vect_location
, "cost model disabled.\n");
4110 *ret_min_profitable_niters
= 0;
4111 *ret_min_profitable_estimate
= 0;
4115 /* Requires loop versioning tests to handle misalignment. */
4116 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo
))
4118 /* FIXME: Make cost depend on complexity of individual check. */
4119 unsigned len
= LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo
).length ();
4120 (void) add_stmt_cost (target_cost_data
, len
, scalar_stmt
, vect_prologue
);
4121 if (dump_enabled_p ())
4122 dump_printf (MSG_NOTE
,
4123 "cost model: Adding cost of checks for loop "
4124 "versioning to treat misalignment.\n");
4127 /* Requires loop versioning with alias checks. */
4128 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo
))
4130 /* FIXME: Make cost depend on complexity of individual check. */
4131 unsigned len
= LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo
).length ();
4132 (void) add_stmt_cost (target_cost_data
, len
, scalar_stmt
, vect_prologue
);
4133 len
= LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo
).length ();
4135 /* Count LEN - 1 ANDs and LEN comparisons. */
4136 (void) add_stmt_cost (target_cost_data
, len
* 2 - 1,
4137 scalar_stmt
, vect_prologue
);
4138 len
= LOOP_VINFO_LOWER_BOUNDS (loop_vinfo
).length ();
4141 /* Count LEN - 1 ANDs and LEN comparisons. */
4142 unsigned int nstmts
= len
* 2 - 1;
4143 /* +1 for each bias that needs adding. */
4144 for (unsigned int i
= 0; i
< len
; ++i
)
4145 if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo
)[i
].unsigned_p
)
4147 (void) add_stmt_cost (target_cost_data
, nstmts
,
4148 scalar_stmt
, vect_prologue
);
4150 if (dump_enabled_p ())
4151 dump_printf (MSG_NOTE
,
4152 "cost model: Adding cost of checks for loop "
4153 "versioning aliasing.\n");
4156 /* Requires loop versioning with niter checks. */
4157 if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo
))
4159 /* FIXME: Make cost depend on complexity of individual check. */
4160 (void) add_stmt_cost (target_cost_data
, 1, vector_stmt
,
4161 NULL
, NULL
, NULL_TREE
, 0, vect_prologue
);
4162 if (dump_enabled_p ())
4163 dump_printf (MSG_NOTE
,
4164 "cost model: Adding cost of checks for loop "
4165 "versioning niters.\n");
4168 if (LOOP_REQUIRES_VERSIONING (loop_vinfo
))
4169 (void) add_stmt_cost (target_cost_data
, 1, cond_branch_taken
,
4172 /* Count statements in scalar loop. Using this as scalar cost for a single
4175 TODO: Add outer loop support.
4177 TODO: Consider assigning different costs to different scalar
4180 scalar_single_iter_cost
= loop_vinfo
->scalar_costs
->total_cost ();
4182 /* Add additional cost for the peeled instructions in prologue and epilogue
4183 loop. (For fully-masked loops there will be no peeling.)
4185 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
4186 at compile-time - we assume it's vf/2 (the worst would be vf-1).
4188 TODO: Build an expression that represents peel_iters for prologue and
4189 epilogue to be used in a run-time test. */
4191 bool prologue_need_br_taken_cost
= false;
4192 bool prologue_need_br_not_taken_cost
= false;
4194 /* Calculate peel_iters_prologue. */
4195 if (vect_use_loop_mask_for_alignment_p (loop_vinfo
))
4196 peel_iters_prologue
= 0;
4199 peel_iters_prologue
= assumed_vf
/ 2;
4200 if (dump_enabled_p ())
4201 dump_printf (MSG_NOTE
, "cost model: "
4202 "prologue peel iters set to vf/2.\n");
4204 /* If peeled iterations are unknown, count a taken branch and a not taken
4205 branch per peeled loop. Even if scalar loop iterations are known,
4206 vector iterations are not known since peeled prologue iterations are
4207 not known. Hence guards remain the same. */
4208 prologue_need_br_taken_cost
= true;
4209 prologue_need_br_not_taken_cost
= true;
4213 peel_iters_prologue
= npeel
;
4214 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
) && peel_iters_prologue
> 0)
4215 /* If peeled iterations are known but number of scalar loop
4216 iterations are unknown, count a taken branch per peeled loop. */
4217 prologue_need_br_taken_cost
= true;
4220 bool epilogue_need_br_taken_cost
= false;
4221 bool epilogue_need_br_not_taken_cost
= false;
4223 /* Calculate peel_iters_epilogue. */
4224 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo
))
4225 /* We need to peel exactly one iteration for gaps. */
4226 peel_iters_epilogue
= LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo
) ? 1 : 0;
4229 /* If peeling for alignment is unknown, loop bound of main loop
4231 peel_iters_epilogue
= assumed_vf
/ 2;
4232 if (dump_enabled_p ())
4233 dump_printf (MSG_NOTE
, "cost model: "
4234 "epilogue peel iters set to vf/2 because "
4235 "peeling for alignment is unknown.\n");
4237 /* See the same reason above in peel_iters_prologue calculation. */
4238 epilogue_need_br_taken_cost
= true;
4239 epilogue_need_br_not_taken_cost
= true;
4243 peel_iters_epilogue
= vect_get_peel_iters_epilogue (loop_vinfo
, npeel
);
4244 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
) && peel_iters_epilogue
> 0)
4245 /* If peeled iterations are known but number of scalar loop
4246 iterations are unknown, count a taken branch per peeled loop. */
4247 epilogue_need_br_taken_cost
= true;
4250 stmt_info_for_cost
*si
;
4252 /* Add costs associated with peel_iters_prologue. */
4253 if (peel_iters_prologue
)
4254 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo
), j
, si
)
4256 (void) add_stmt_cost (target_cost_data
,
4257 si
->count
* peel_iters_prologue
, si
->kind
,
4258 si
->stmt_info
, si
->node
, si
->vectype
,
4259 si
->misalign
, vect_prologue
);
4262 /* Add costs associated with peel_iters_epilogue. */
4263 if (peel_iters_epilogue
)
4264 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo
), j
, si
)
4266 (void) add_stmt_cost (target_cost_data
,
4267 si
->count
* peel_iters_epilogue
, si
->kind
,
4268 si
->stmt_info
, si
->node
, si
->vectype
,
4269 si
->misalign
, vect_epilogue
);
4272 /* Add possible cond_branch_taken/cond_branch_not_taken cost. */
4274 if (prologue_need_br_taken_cost
)
4275 (void) add_stmt_cost (target_cost_data
, 1, cond_branch_taken
,
4278 if (prologue_need_br_not_taken_cost
)
4279 (void) add_stmt_cost (target_cost_data
, 1,
4280 cond_branch_not_taken
, vect_prologue
);
4282 if (epilogue_need_br_taken_cost
)
4283 (void) add_stmt_cost (target_cost_data
, 1, cond_branch_taken
,
4286 if (epilogue_need_br_not_taken_cost
)
4287 (void) add_stmt_cost (target_cost_data
, 1,
4288 cond_branch_not_taken
, vect_epilogue
);
4290 /* Take care of special costs for rgroup controls of partial vectors. */
4291 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo
))
4293 /* Calculate how many masks we need to generate. */
4294 unsigned int num_masks
= 0;
4295 rgroup_controls
*rgm
;
4296 unsigned int num_vectors_m1
;
4297 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo
), num_vectors_m1
, rgm
)
4299 num_masks
+= num_vectors_m1
+ 1;
4300 gcc_assert (num_masks
> 0);
4302 /* In the worst case, we need to generate each mask in the prologue
4303 and in the loop body. One of the loop body mask instructions
4304 replaces the comparison in the scalar loop, and since we don't
4305 count the scalar comparison against the scalar body, we shouldn't
4306 count that vector instruction against the vector body either.
4308 Sometimes we can use unpacks instead of generating prologue
4309 masks and sometimes the prologue mask will fold to a constant,
4310 so the actual prologue cost might be smaller. However, it's
4311 simpler and safer to use the worst-case cost; if this ends up
4312 being the tie-breaker between vectorizing or not, then it's
4313 probably better not to vectorize. */
4314 (void) add_stmt_cost (target_cost_data
, num_masks
,
4315 vector_stmt
, NULL
, NULL
, NULL_TREE
, 0,
4317 (void) add_stmt_cost (target_cost_data
, num_masks
- 1,
4318 vector_stmt
, NULL
, NULL
, NULL_TREE
, 0,
4321 else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo
))
4323 /* Referring to the functions vect_set_loop_condition_partial_vectors
4324 and vect_set_loop_controls_directly, we need to generate each
4325 length in the prologue and in the loop body if required. Although
4326 there are some possible optimizations, we consider the worst case
4329 bool niters_known_p
= LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
);
4330 signed char partial_load_store_bias
4331 = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo
);
4333 = (!LOOP_VINFO_EPILOGUE_P (loop_vinfo
)
4334 && !vect_known_niters_smaller_than_vf (loop_vinfo
));
4336 /* Calculate how many statements to be added. */
4337 unsigned int prologue_stmts
= 0;
4338 unsigned int body_stmts
= 0;
4340 rgroup_controls
*rgc
;
4341 unsigned int num_vectors_m1
;
4342 FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo
), num_vectors_m1
, rgc
)
4345 /* May need one SHIFT for nitems_total computation. */
4346 unsigned nitems
= rgc
->max_nscalars_per_iter
* rgc
->factor
;
4347 if (nitems
!= 1 && !niters_known_p
)
4348 prologue_stmts
+= 1;
4350 /* May need one MAX and one MINUS for wrap around. */
4351 if (vect_rgroup_iv_might_wrap_p (loop_vinfo
, rgc
))
4352 prologue_stmts
+= 2;
4354 /* Need one MAX and one MINUS for each batch limit excepting for
4356 prologue_stmts
+= num_vectors_m1
* 2;
4358 unsigned int num_vectors
= num_vectors_m1
+ 1;
4360 /* Need to set up lengths in prologue, only one MIN required
4361 for each since start index is zero. */
4362 prologue_stmts
+= num_vectors
;
4364 /* If we have a non-zero partial load bias, we need one PLUS
4365 to adjust the load length. */
4366 if (partial_load_store_bias
!= 0)
4369 /* Each may need two MINs and one MINUS to update lengths in body
4370 for next iteration. */
4372 body_stmts
+= 3 * num_vectors
;
4375 (void) add_stmt_cost (target_cost_data
, prologue_stmts
,
4376 scalar_stmt
, vect_prologue
);
4377 (void) add_stmt_cost (target_cost_data
, body_stmts
,
4378 scalar_stmt
, vect_body
);
4381 /* FORNOW: The scalar outside cost is incremented in one of the
4384 1. The vectorizer checks for alignment and aliasing and generates
4385 a condition that allows dynamic vectorization. A cost model
4386 check is ANDED with the versioning condition. Hence scalar code
4387 path now has the added cost of the versioning check.
4389 if (cost > th & versioning_check)
4392 Hence run-time scalar is incremented by not-taken branch cost.
4394 2. The vectorizer then checks if a prologue is required. If the
4395 cost model check was not done before during versioning, it has to
4396 be done before the prologue check.
4399 prologue = scalar_iters
4404 if (prologue == num_iters)
4407 Hence the run-time scalar cost is incremented by a taken branch,
4408 plus a not-taken branch, plus a taken branch cost.
4410 3. The vectorizer then checks if an epilogue is required. If the
4411 cost model check was not done before during prologue check, it
4412 has to be done with the epilogue check.
4418 if (prologue == num_iters)
4421 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
4424 Hence the run-time scalar cost should be incremented by 2 taken
4427 TODO: The back end may reorder the BBS's differently and reverse
4428 conditions/branch directions. Change the estimates below to
4429 something more reasonable. */
4431 /* If the number of iterations is known and we do not do versioning, we can
4432 decide whether to vectorize at compile time. Hence the scalar version
4433 do not carry cost model guard costs. */
4434 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
)
4435 || LOOP_REQUIRES_VERSIONING (loop_vinfo
))
4437 /* Cost model check occurs at versioning. */
4438 if (LOOP_REQUIRES_VERSIONING (loop_vinfo
))
4439 scalar_outside_cost
+= vect_get_stmt_cost (cond_branch_not_taken
);
4442 /* Cost model check occurs at prologue generation. */
4443 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
) < 0)
4444 scalar_outside_cost
+= 2 * vect_get_stmt_cost (cond_branch_taken
)
4445 + vect_get_stmt_cost (cond_branch_not_taken
);
4446 /* Cost model check occurs at epilogue generation. */
4448 scalar_outside_cost
+= 2 * vect_get_stmt_cost (cond_branch_taken
);
4452 /* Complete the target-specific cost calculations. */
4453 finish_cost (loop_vinfo
->vector_costs
, loop_vinfo
->scalar_costs
,
4454 &vec_prologue_cost
, &vec_inside_cost
, &vec_epilogue_cost
,
4455 suggested_unroll_factor
);
4457 if (suggested_unroll_factor
&& *suggested_unroll_factor
> 1
4458 && LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo
) != MAX_VECTORIZATION_FACTOR
4459 && !known_le (LOOP_VINFO_VECT_FACTOR (loop_vinfo
) *
4460 *suggested_unroll_factor
,
4461 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo
)))
4463 if (dump_enabled_p ())
4464 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4465 "can't unroll as unrolled vectorization factor larger"
4466 " than maximum vectorization factor: "
4467 HOST_WIDE_INT_PRINT_UNSIGNED
"\n",
4468 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo
));
4469 *suggested_unroll_factor
= 1;
4472 vec_outside_cost
= (int)(vec_prologue_cost
+ vec_epilogue_cost
);
4474 if (dump_enabled_p ())
4476 dump_printf_loc (MSG_NOTE
, vect_location
, "Cost model analysis: \n");
4477 dump_printf (MSG_NOTE
, " Vector inside of loop cost: %d\n",
4479 dump_printf (MSG_NOTE
, " Vector prologue cost: %d\n",
4481 dump_printf (MSG_NOTE
, " Vector epilogue cost: %d\n",
4483 dump_printf (MSG_NOTE
, " Scalar iteration cost: %d\n",
4484 scalar_single_iter_cost
);
4485 dump_printf (MSG_NOTE
, " Scalar outside cost: %d\n",
4486 scalar_outside_cost
);
4487 dump_printf (MSG_NOTE
, " Vector outside cost: %d\n",
4489 dump_printf (MSG_NOTE
, " prologue iterations: %d\n",
4490 peel_iters_prologue
);
4491 dump_printf (MSG_NOTE
, " epilogue iterations: %d\n",
4492 peel_iters_epilogue
);
4495 /* Calculate number of iterations required to make the vector version
4496 profitable, relative to the loop bodies only. The following condition
4498 SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC
4500 SIC = scalar iteration cost, VIC = vector iteration cost,
4501 VOC = vector outside cost, VF = vectorization factor,
4502 NPEEL = prologue iterations + epilogue iterations,
4503 SOC = scalar outside cost for run time cost model check. */
4505 int saving_per_viter
= (scalar_single_iter_cost
* assumed_vf
4507 if (saving_per_viter
<= 0)
4509 if (LOOP_VINFO_LOOP (loop_vinfo
)->force_vectorize
)
4510 warning_at (vect_location
.get_location_t (), OPT_Wopenmp_simd
,
4511 "vectorization did not happen for a simd loop");
4513 if (dump_enabled_p ())
4514 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4515 "cost model: the vector iteration cost = %d "
4516 "divided by the scalar iteration cost = %d "
4517 "is greater or equal to the vectorization factor = %d"
4519 vec_inside_cost
, scalar_single_iter_cost
, assumed_vf
);
4520 *ret_min_profitable_niters
= -1;
4521 *ret_min_profitable_estimate
= -1;
4525 /* ??? The "if" arm is written to handle all cases; see below for what
4526 we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
4527 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo
))
4529 /* Rewriting the condition above in terms of the number of
4530 vector iterations (vniters) rather than the number of
4531 scalar iterations (niters) gives:
4533 SIC * (vniters * VF + NPEEL) + SOC > VIC * vniters + VOC
4535 <==> vniters * (SIC * VF - VIC) > VOC - SIC * NPEEL - SOC
4537 For integer N, X and Y when X > 0:
4539 N * X > Y <==> N >= (Y /[floor] X) + 1. */
4540 int outside_overhead
= (vec_outside_cost
4541 - scalar_single_iter_cost
* peel_iters_prologue
4542 - scalar_single_iter_cost
* peel_iters_epilogue
4543 - scalar_outside_cost
);
4544 /* We're only interested in cases that require at least one
4545 vector iteration. */
4546 int min_vec_niters
= 1;
4547 if (outside_overhead
> 0)
4548 min_vec_niters
= outside_overhead
/ saving_per_viter
+ 1;
4550 if (dump_enabled_p ())
4551 dump_printf (MSG_NOTE
, " Minimum number of vector iterations: %d\n",
4554 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo
))
4556 /* Now that we know the minimum number of vector iterations,
4557 find the minimum niters for which the scalar cost is larger:
4559 SIC * niters > VIC * vniters + VOC - SOC
4561 We know that the minimum niters is no more than
4562 vniters * VF + NPEEL, but it might be (and often is) less
4563 than that if a partial vector iteration is cheaper than the
4564 equivalent scalar code. */
4565 int threshold
= (vec_inside_cost
* min_vec_niters
4567 - scalar_outside_cost
);
4569 min_profitable_iters
= 1;
4571 min_profitable_iters
= threshold
/ scalar_single_iter_cost
+ 1;
4574 /* Convert the number of vector iterations into a number of
4575 scalar iterations. */
4576 min_profitable_iters
= (min_vec_niters
* assumed_vf
4577 + peel_iters_prologue
4578 + peel_iters_epilogue
);
4582 min_profitable_iters
= ((vec_outside_cost
- scalar_outside_cost
)
4584 - vec_inside_cost
* peel_iters_prologue
4585 - vec_inside_cost
* peel_iters_epilogue
);
4586 if (min_profitable_iters
<= 0)
4587 min_profitable_iters
= 0;
4590 min_profitable_iters
/= saving_per_viter
;
4592 if ((scalar_single_iter_cost
* assumed_vf
* min_profitable_iters
)
4593 <= (((int) vec_inside_cost
* min_profitable_iters
)
4594 + (((int) vec_outside_cost
- scalar_outside_cost
)
4596 min_profitable_iters
++;
4600 if (dump_enabled_p ())
4601 dump_printf (MSG_NOTE
,
4602 " Calculated minimum iters for profitability: %d\n",
4603 min_profitable_iters
);
4605 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo
)
4606 && min_profitable_iters
< (assumed_vf
+ peel_iters_prologue
))
4607 /* We want the vectorized loop to execute at least once. */
4608 min_profitable_iters
= assumed_vf
+ peel_iters_prologue
;
4609 else if (min_profitable_iters
< peel_iters_prologue
)
4610 /* For LOOP_VINFO_USING_PARTIAL_VECTORS_P, we need to ensure the
4611 vectorized loop executes at least once. */
4612 min_profitable_iters
= peel_iters_prologue
;
4614 if (dump_enabled_p ())
4615 dump_printf_loc (MSG_NOTE
, vect_location
,
4616 " Runtime profitability threshold = %d\n",
4617 min_profitable_iters
);
4619 *ret_min_profitable_niters
= min_profitable_iters
;
4621 /* Calculate number of iterations required to make the vector version
4622 profitable, relative to the loop bodies only.
4624 Non-vectorized variant is SIC * niters and it must win over vector
4625 variant on the expected loop trip count. The following condition must hold true:
4626 SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC */
4628 if (vec_outside_cost
<= 0)
4629 min_profitable_estimate
= 0;
4630 /* ??? This "else if" arm is written to handle all cases; see below for
4631 what we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
4632 else if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo
))
4634 /* This is a repeat of the code above, but with + SOC rather
4636 int outside_overhead
= (vec_outside_cost
4637 - scalar_single_iter_cost
* peel_iters_prologue
4638 - scalar_single_iter_cost
* peel_iters_epilogue
4639 + scalar_outside_cost
);
4640 int min_vec_niters
= 1;
4641 if (outside_overhead
> 0)
4642 min_vec_niters
= outside_overhead
/ saving_per_viter
+ 1;
4644 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo
))
4646 int threshold
= (vec_inside_cost
* min_vec_niters
4648 + scalar_outside_cost
);
4649 min_profitable_estimate
= threshold
/ scalar_single_iter_cost
+ 1;
4652 min_profitable_estimate
= (min_vec_niters
* assumed_vf
4653 + peel_iters_prologue
4654 + peel_iters_epilogue
);
4658 min_profitable_estimate
= ((vec_outside_cost
+ scalar_outside_cost
)
4660 - vec_inside_cost
* peel_iters_prologue
4661 - vec_inside_cost
* peel_iters_epilogue
)
4662 / ((scalar_single_iter_cost
* assumed_vf
)
4665 min_profitable_estimate
= MAX (min_profitable_estimate
, min_profitable_iters
);
4666 if (dump_enabled_p ())
4667 dump_printf_loc (MSG_NOTE
, vect_location
,
4668 " Static estimate profitability threshold = %d\n",
4669 min_profitable_estimate
);
4671 *ret_min_profitable_estimate
= min_profitable_estimate
;
4674 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
4675 vector elements (not bits) for a vector with NELT elements. */
4677 calc_vec_perm_mask_for_shift (unsigned int offset
, unsigned int nelt
,
4678 vec_perm_builder
*sel
)
4680 /* The encoding is a single stepped pattern. Any wrap-around is handled
4681 by vec_perm_indices. */
4682 sel
->new_vector (nelt
, 1, 3);
4683 for (unsigned int i
= 0; i
< 3; i
++)
4684 sel
->quick_push (i
+ offset
);
4687 /* Checks whether the target supports whole-vector shifts for vectors of mode
4688 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
4689 it supports vec_perm_const with masks for all necessary shift amounts. */
4691 have_whole_vector_shift (machine_mode mode
)
4693 if (optab_handler (vec_shr_optab
, mode
) != CODE_FOR_nothing
)
4696 /* Variable-length vectors should be handled via the optab. */
4698 if (!GET_MODE_NUNITS (mode
).is_constant (&nelt
))
4701 vec_perm_builder sel
;
4702 vec_perm_indices indices
;
4703 for (unsigned int i
= nelt
/ 2; i
>= 1; i
/= 2)
4705 calc_vec_perm_mask_for_shift (i
, nelt
, &sel
);
4706 indices
.new_vector (sel
, 2, nelt
);
4707 if (!can_vec_perm_const_p (mode
, mode
, indices
, false))
4713 /* Return true if (a) STMT_INFO is a DOT_PROD_EXPR reduction whose
4714 multiplication operands have differing signs and (b) we intend
4715 to emulate the operation using a series of signed DOT_PROD_EXPRs.
4716 See vect_emulate_mixed_dot_prod for the actual sequence used. */
4719 vect_is_emulated_mixed_dot_prod (loop_vec_info loop_vinfo
,
4720 stmt_vec_info stmt_info
)
4722 gassign
*assign
= dyn_cast
<gassign
*> (stmt_info
->stmt
);
4723 if (!assign
|| gimple_assign_rhs_code (assign
) != DOT_PROD_EXPR
)
4726 tree rhs1
= gimple_assign_rhs1 (assign
);
4727 tree rhs2
= gimple_assign_rhs2 (assign
);
4728 if (TYPE_SIGN (TREE_TYPE (rhs1
)) == TYPE_SIGN (TREE_TYPE (rhs2
)))
4731 stmt_vec_info reduc_info
= info_for_reduction (loop_vinfo
, stmt_info
);
4732 gcc_assert (reduc_info
->is_reduc_info
);
4733 return !directly_supported_p (DOT_PROD_EXPR
,
4734 STMT_VINFO_REDUC_VECTYPE_IN (reduc_info
),
4735 optab_vector_mixed_sign
);
4738 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
4739 functions. Design better to avoid maintenance issues. */
4741 /* Function vect_model_reduction_cost.
4743 Models cost for a reduction operation, including the vector ops
4744 generated within the strip-mine loop in some cases, the initial
4745 definition before the loop, and the epilogue code that must be generated. */
4748 vect_model_reduction_cost (loop_vec_info loop_vinfo
,
4749 stmt_vec_info stmt_info
, internal_fn reduc_fn
,
4750 vect_reduction_type reduction_type
,
4751 int ncopies
, stmt_vector_for_cost
*cost_vec
)
4753 int prologue_cost
= 0, epilogue_cost
= 0, inside_cost
= 0;
4756 class loop
*loop
= NULL
;
4759 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
4761 /* Condition reductions generate two reductions in the loop. */
4762 if (reduction_type
== COND_REDUCTION
)
4765 vectype
= STMT_VINFO_VECTYPE (stmt_info
);
4766 mode
= TYPE_MODE (vectype
);
4767 stmt_vec_info orig_stmt_info
= vect_orig_stmt (stmt_info
);
4770 if (!gimple_extract_op (orig_stmt_info
->stmt
, &op
))
4773 bool emulated_mixed_dot_prod
4774 = vect_is_emulated_mixed_dot_prod (loop_vinfo
, stmt_info
);
4775 if (reduction_type
== EXTRACT_LAST_REDUCTION
)
4776 /* No extra instructions are needed in the prologue. The loop body
4777 operations are costed in vectorizable_condition. */
4779 else if (reduction_type
== FOLD_LEFT_REDUCTION
)
4781 /* No extra instructions needed in the prologue. */
4784 if (reduc_fn
!= IFN_LAST
)
4785 /* Count one reduction-like operation per vector. */
4786 inside_cost
= record_stmt_cost (cost_vec
, ncopies
, vec_to_scalar
,
4787 stmt_info
, 0, vect_body
);
4790 /* Use NELEMENTS extracts and NELEMENTS scalar ops. */
4791 unsigned int nelements
= ncopies
* vect_nunits_for_cost (vectype
);
4792 inside_cost
= record_stmt_cost (cost_vec
, nelements
,
4793 vec_to_scalar
, stmt_info
, 0,
4795 inside_cost
+= record_stmt_cost (cost_vec
, nelements
,
4796 scalar_stmt
, stmt_info
, 0,
4802 /* Add in the cost of the initial definitions. */
4804 if (reduction_type
== COND_REDUCTION
)
4805 /* For cond reductions we have four vectors: initial index, step,
4806 initial result of the data reduction, initial value of the index
4809 else if (emulated_mixed_dot_prod
)
4810 /* We need the initial reduction value and two invariants:
4811 one that contains the minimum signed value and one that
4812 contains half of its negative. */
4816 prologue_cost
+= record_stmt_cost (cost_vec
, prologue_stmts
,
4817 scalar_to_vec
, stmt_info
, 0,
4821 /* Determine cost of epilogue code.
4823 We have a reduction operator that will reduce the vector in one statement.
4824 Also requires scalar extract. */
4826 if (!loop
|| !nested_in_vect_loop_p (loop
, orig_stmt_info
))
4828 if (reduc_fn
!= IFN_LAST
)
4830 if (reduction_type
== COND_REDUCTION
)
4832 /* An EQ stmt and an COND_EXPR stmt. */
4833 epilogue_cost
+= record_stmt_cost (cost_vec
, 2,
4834 vector_stmt
, stmt_info
, 0,
4836 /* Reduction of the max index and a reduction of the found
4838 epilogue_cost
+= record_stmt_cost (cost_vec
, 2,
4839 vec_to_scalar
, stmt_info
, 0,
4841 /* A broadcast of the max value. */
4842 epilogue_cost
+= record_stmt_cost (cost_vec
, 1,
4843 scalar_to_vec
, stmt_info
, 0,
4848 epilogue_cost
+= record_stmt_cost (cost_vec
, 1, vector_stmt
,
4849 stmt_info
, 0, vect_epilogue
);
4850 epilogue_cost
+= record_stmt_cost (cost_vec
, 1,
4851 vec_to_scalar
, stmt_info
, 0,
4855 else if (reduction_type
== COND_REDUCTION
)
4857 unsigned estimated_nunits
= vect_nunits_for_cost (vectype
);
4858 /* Extraction of scalar elements. */
4859 epilogue_cost
+= record_stmt_cost (cost_vec
,
4860 2 * estimated_nunits
,
4861 vec_to_scalar
, stmt_info
, 0,
4863 /* Scalar max reductions via COND_EXPR / MAX_EXPR. */
4864 epilogue_cost
+= record_stmt_cost (cost_vec
,
4865 2 * estimated_nunits
- 3,
4866 scalar_stmt
, stmt_info
, 0,
4869 else if (reduction_type
== EXTRACT_LAST_REDUCTION
4870 || reduction_type
== FOLD_LEFT_REDUCTION
)
4871 /* No extra instructions need in the epilogue. */
4875 int vec_size_in_bits
= tree_to_uhwi (TYPE_SIZE (vectype
));
4876 tree bitsize
= TYPE_SIZE (op
.type
);
4877 int element_bitsize
= tree_to_uhwi (bitsize
);
4878 int nelements
= vec_size_in_bits
/ element_bitsize
;
4880 if (op
.code
== COND_EXPR
)
4883 /* We have a whole vector shift available. */
4884 if (VECTOR_MODE_P (mode
)
4885 && directly_supported_p (op
.code
, vectype
)
4886 && have_whole_vector_shift (mode
))
4888 /* Final reduction via vector shifts and the reduction operator.
4889 Also requires scalar extract. */
4890 epilogue_cost
+= record_stmt_cost (cost_vec
,
4891 exact_log2 (nelements
) * 2,
4892 vector_stmt
, stmt_info
, 0,
4894 epilogue_cost
+= record_stmt_cost (cost_vec
, 1,
4895 vec_to_scalar
, stmt_info
, 0,
4899 /* Use extracts and reduction op for final reduction. For N
4900 elements, we have N extracts and N-1 reduction ops. */
4901 epilogue_cost
+= record_stmt_cost (cost_vec
,
4902 nelements
+ nelements
- 1,
4903 vector_stmt
, stmt_info
, 0,
4908 if (dump_enabled_p ())
4909 dump_printf (MSG_NOTE
,
4910 "vect_model_reduction_cost: inside_cost = %d, "
4911 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost
,
4912 prologue_cost
, epilogue_cost
);
4915 /* SEQ is a sequence of instructions that initialize the reduction
4916 described by REDUC_INFO. Emit them in the appropriate place. */
4919 vect_emit_reduction_init_stmts (loop_vec_info loop_vinfo
,
4920 stmt_vec_info reduc_info
, gimple
*seq
)
4922 if (reduc_info
->reused_accumulator
)
4924 /* When reusing an accumulator from the main loop, we only need
4925 initialization instructions if the main loop can be skipped.
4926 In that case, emit the initialization instructions at the end
4927 of the guard block that does the skip. */
4928 edge skip_edge
= loop_vinfo
->skip_main_loop_edge
;
4929 gcc_assert (skip_edge
);
4930 gimple_stmt_iterator gsi
= gsi_last_bb (skip_edge
->src
);
4931 gsi_insert_seq_before (&gsi
, seq
, GSI_SAME_STMT
);
4935 /* The normal case: emit the initialization instructions on the
4937 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
4938 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop
), seq
);
4942 /* Function get_initial_def_for_reduction
4945 REDUC_INFO - the info_for_reduction
4946 INIT_VAL - the initial value of the reduction variable
4947 NEUTRAL_OP - a value that has no effect on the reduction, as per
4948 neutral_op_for_reduction
4951 Return a vector variable, initialized according to the operation that
4952 STMT_VINFO performs. This vector will be used as the initial value
4953 of the vector of partial results.
4955 The value we need is a vector in which element 0 has value INIT_VAL
4956 and every other element has value NEUTRAL_OP. */
4959 get_initial_def_for_reduction (loop_vec_info loop_vinfo
,
4960 stmt_vec_info reduc_info
,
4961 tree init_val
, tree neutral_op
)
4963 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
4964 tree scalar_type
= TREE_TYPE (init_val
);
4965 tree vectype
= get_vectype_for_scalar_type (loop_vinfo
, scalar_type
);
4967 gimple_seq stmts
= NULL
;
4969 gcc_assert (vectype
);
4971 gcc_assert (POINTER_TYPE_P (scalar_type
) || INTEGRAL_TYPE_P (scalar_type
)
4972 || SCALAR_FLOAT_TYPE_P (scalar_type
));
4974 gcc_assert (nested_in_vect_loop_p (loop
, reduc_info
)
4975 || loop
== (gimple_bb (reduc_info
->stmt
))->loop_father
);
4977 if (operand_equal_p (init_val
, neutral_op
))
4979 /* If both elements are equal then the vector described above is
4981 neutral_op
= gimple_convert (&stmts
, TREE_TYPE (vectype
), neutral_op
);
4982 init_def
= gimple_build_vector_from_val (&stmts
, vectype
, neutral_op
);
4986 neutral_op
= gimple_convert (&stmts
, TREE_TYPE (vectype
), neutral_op
);
4987 init_val
= gimple_convert (&stmts
, TREE_TYPE (vectype
), init_val
);
4988 if (!TYPE_VECTOR_SUBPARTS (vectype
).is_constant ())
4990 /* Construct a splat of NEUTRAL_OP and insert INIT_VAL into
4992 init_def
= gimple_build_vector_from_val (&stmts
, vectype
,
4994 init_def
= gimple_build (&stmts
, CFN_VEC_SHL_INSERT
,
4995 vectype
, init_def
, init_val
);
4999 /* Build {INIT_VAL, NEUTRAL_OP, NEUTRAL_OP, ...}. */
5000 tree_vector_builder
elts (vectype
, 1, 2);
5001 elts
.quick_push (init_val
);
5002 elts
.quick_push (neutral_op
);
5003 init_def
= gimple_build_vector (&stmts
, &elts
);
5008 vect_emit_reduction_init_stmts (loop_vinfo
, reduc_info
, stmts
);
5012 /* Get at the initial defs for the reduction PHIs for REDUC_INFO,
5013 which performs a reduction involving GROUP_SIZE scalar statements.
5014 NUMBER_OF_VECTORS is the number of vector defs to create. If NEUTRAL_OP
5015 is nonnull, introducing extra elements of that value will not change the
5019 get_initial_defs_for_reduction (loop_vec_info loop_vinfo
,
5020 stmt_vec_info reduc_info
,
5021 vec
<tree
> *vec_oprnds
,
5022 unsigned int number_of_vectors
,
5023 unsigned int group_size
, tree neutral_op
)
5025 vec
<tree
> &initial_values
= reduc_info
->reduc_initial_values
;
5026 unsigned HOST_WIDE_INT nunits
;
5027 unsigned j
, number_of_places_left_in_vector
;
5028 tree vector_type
= STMT_VINFO_VECTYPE (reduc_info
);
5031 gcc_assert (group_size
== initial_values
.length () || neutral_op
);
5033 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
5034 created vectors. It is greater than 1 if unrolling is performed.
5036 For example, we have two scalar operands, s1 and s2 (e.g., group of
5037 strided accesses of size two), while NUNITS is four (i.e., four scalars
5038 of this type can be packed in a vector). The output vector will contain
5039 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
5042 If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
5043 vectors containing the operands.
5045 For example, NUNITS is four as before, and the group size is 8
5046 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
5047 {s5, s6, s7, s8}. */
5049 if (!TYPE_VECTOR_SUBPARTS (vector_type
).is_constant (&nunits
))
5050 nunits
= group_size
;
5052 number_of_places_left_in_vector
= nunits
;
5053 bool constant_p
= true;
5054 tree_vector_builder
elts (vector_type
, nunits
, 1);
5055 elts
.quick_grow (nunits
);
5056 gimple_seq ctor_seq
= NULL
;
5057 for (j
= 0; j
< nunits
* number_of_vectors
; ++j
)
5062 /* Get the def before the loop. In reduction chain we have only
5063 one initial value. Else we have as many as PHIs in the group. */
5064 if (i
>= initial_values
.length () || (j
> i
&& neutral_op
))
5067 op
= initial_values
[i
];
5069 /* Create 'vect_ = {op0,op1,...,opn}'. */
5070 number_of_places_left_in_vector
--;
5071 elts
[nunits
- number_of_places_left_in_vector
- 1] = op
;
5072 if (!CONSTANT_CLASS_P (op
))
5075 if (number_of_places_left_in_vector
== 0)
5078 if (constant_p
&& !neutral_op
5079 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type
), nunits
)
5080 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type
), nunits
))
5081 /* Build the vector directly from ELTS. */
5082 init
= gimple_build_vector (&ctor_seq
, &elts
);
5083 else if (neutral_op
)
5085 /* Build a vector of the neutral value and shift the
5086 other elements into place. */
5087 init
= gimple_build_vector_from_val (&ctor_seq
, vector_type
,
5090 while (k
> 0 && elts
[k
- 1] == neutral_op
)
5095 init
= gimple_build (&ctor_seq
, CFN_VEC_SHL_INSERT
,
5096 vector_type
, init
, elts
[k
]);
5101 /* First time round, duplicate ELTS to fill the
5102 required number of vectors. */
5103 duplicate_and_interleave (loop_vinfo
, &ctor_seq
, vector_type
,
5104 elts
, number_of_vectors
, *vec_oprnds
);
5107 vec_oprnds
->quick_push (init
);
5109 number_of_places_left_in_vector
= nunits
;
5110 elts
.new_vector (vector_type
, nunits
, 1);
5111 elts
.quick_grow (nunits
);
5115 if (ctor_seq
!= NULL
)
5116 vect_emit_reduction_init_stmts (loop_vinfo
, reduc_info
, ctor_seq
);
5119 /* For a statement STMT_INFO taking part in a reduction operation return
5120 the stmt_vec_info the meta information is stored on. */
5123 info_for_reduction (vec_info
*vinfo
, stmt_vec_info stmt_info
)
5125 stmt_info
= vect_orig_stmt (stmt_info
);
5126 gcc_assert (STMT_VINFO_REDUC_DEF (stmt_info
));
5127 if (!is_a
<gphi
*> (stmt_info
->stmt
)
5128 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info
)))
5129 stmt_info
= STMT_VINFO_REDUC_DEF (stmt_info
);
5130 gphi
*phi
= as_a
<gphi
*> (stmt_info
->stmt
);
5131 if (STMT_VINFO_DEF_TYPE (stmt_info
) == vect_double_reduction_def
)
5133 if (gimple_phi_num_args (phi
) == 1)
5134 stmt_info
= STMT_VINFO_REDUC_DEF (stmt_info
);
5136 else if (STMT_VINFO_DEF_TYPE (stmt_info
) == vect_nested_cycle
)
5138 stmt_vec_info info
= vinfo
->lookup_def (vect_phi_initial_value (phi
));
5139 if (info
&& STMT_VINFO_DEF_TYPE (info
) == vect_double_reduction_def
)
5145 /* See if LOOP_VINFO is an epilogue loop whose main loop had a reduction that
5146 REDUC_INFO can build on. Adjust REDUC_INFO and return true if so, otherwise
/* NOTE(review): garbled extraction -- statements are split across lines and
   several lines (early returns, the "if (!accumulator" test at original
   line 5193, closing braces) appear to be missing.  Comments below cover
   only what is visible.  */
5150 vect_find_reusable_accumulator (loop_vec_info loop_vinfo
,
5151 stmt_vec_info reduc_info
)
/* Only an epilogue loop (one with an original main-loop vinfo) can reuse
   a main-loop accumulator.  */
5153 loop_vec_info main_loop_vinfo
= LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo
);
5154 if (!main_loop_vinfo
)
/* Reuse is only attempted for plain tree-code reductions.  */
5157 if (STMT_VINFO_REDUC_TYPE (reduc_info
) != TREE_CODE_REDUCTION
)
5160 unsigned int num_phis
= reduc_info
->reduc_initial_values
.length ();
5161 auto_vec
<tree
, 16> main_loop_results (num_phis
);
5162 auto_vec
<tree
, 16> initial_values (num_phis
);
/* When the epilogue can be entered from the main loop, each incoming
   reduction value is a PHI merging the main-loop result with the value
   from an earlier guard block; split those two contributions apart.  */
5163 if (edge main_loop_edge
= loop_vinfo
->main_loop_edge
)
5165 /* The epilogue loop can be entered either from the main loop or
5166 from an earlier guard block. */
5167 edge skip_edge
= loop_vinfo
->skip_main_loop_edge
;
5168 for (tree incoming_value
: reduc_info
->reduc_initial_values
)
5172 INCOMING_VALUE = phi<MAIN_LOOP_RESULT(main loop),
5173 INITIAL_VALUE(guard block)>. */
5174 gcc_assert (TREE_CODE (incoming_value
) == SSA_NAME
);
5176 gphi
*phi
= as_a
<gphi
*> (SSA_NAME_DEF_STMT (incoming_value
));
5177 gcc_assert (gimple_bb (phi
) == main_loop_edge
->dest
);
5179 tree from_main_loop
= PHI_ARG_DEF_FROM_EDGE (phi
, main_loop_edge
);
5180 tree from_skip
= PHI_ARG_DEF_FROM_EDGE (phi
, skip_edge
);
5182 main_loop_results
.quick_push (from_main_loop
);
5183 initial_values
.quick_push (from_skip
);
5187 /* The main loop dominates the epilogue loop. */
5188 main_loop_results
.splice (reduc_info
->reduc_initial_values
);
5190 /* See if the main loop has the kind of accumulator we need. */
5191 vect_reusable_accumulator
*accumulator
5192 = main_loop_vinfo
->reusable_accumulators
.get (main_loop_results
[0]);
/* Reject the accumulator unless its scalar results match the epilogue's
   main-loop results element for element.  (The leading "!accumulator"
   test seems to have been dropped by the extraction -- TODO confirm.)  */
5194 || num_phis
!= accumulator
->reduc_info
->reduc_scalar_results
.length ()
5195 || !std::equal (main_loop_results
.begin (), main_loop_results
.end (),
5196 accumulator
->reduc_info
->reduc_scalar_results
.begin ()))
5199 /* Handle the case where we can reduce wider vectors to narrower ones. */
5200 tree vectype
= STMT_VINFO_VECTYPE (reduc_info
);
5201 tree old_vectype
= TREE_TYPE (accumulator
->reduc_input
);
5202 unsigned HOST_WIDE_INT m
;
/* The main loop's accumulator width must be a constant multiple of the
   epilogue's vector width for step-wise halving to work.  */
5203 if (!constant_multiple_p (TYPE_VECTOR_SUBPARTS (old_vectype
),
5204 TYPE_VECTOR_SUBPARTS (vectype
), &m
))
5206 /* Check the intermediate vector types and operations are available. */
5207 tree prev_vectype
= old_vectype
;
5208 poly_uint64 intermediate_nunits
= TYPE_VECTOR_SUBPARTS (old_vectype
);
/* Halve the element count each step; every intermediate type must exist,
   support the reduction code, and be extractable from its predecessor.  */
5209 while (known_gt (intermediate_nunits
, TYPE_VECTOR_SUBPARTS (vectype
)))
5211 intermediate_nunits
= exact_div (intermediate_nunits
, 2);
5212 tree intermediate_vectype
= get_related_vectype_for_scalar_type
5213 (TYPE_MODE (vectype
), TREE_TYPE (vectype
), intermediate_nunits
);
5214 if (!intermediate_vectype
5215 || !directly_supported_p (STMT_VINFO_REDUC_CODE (reduc_info
),
5216 intermediate_vectype
)
5217 || !can_vec_extract (TYPE_MODE (prev_vectype
),
5218 TYPE_MODE (intermediate_vectype
)))
5220 prev_vectype
= intermediate_vectype
;
5223 /* Non-SLP reductions might apply an adjustment after the reduction
5224 operation, in order to simplify the initialization of the accumulator.
5225 If the epilogue loop carries on from where the main loop left off,
5226 it should apply the same adjustment to the final reduction result.
5228 If the epilogue loop can also be entered directly (rather than via
5229 the main loop), we need to be able to handle that case in the same way,
5230 with the same adjustment. (In principle we could add a PHI node
5231 to select the correct adjustment, but in practice that shouldn't be
5233 tree main_adjustment
5234 = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (accumulator
->reduc_info
);
5235 if (loop_vinfo
->main_loop_edge
&& main_adjustment
)
5237 gcc_assert (num_phis
== 1);
5238 tree initial_value
= initial_values
[0];
5239 /* Check that we can use INITIAL_VALUE as the adjustment and
5240 initialize the accumulator with a neutral value instead. */
5241 if (!operand_equal_p (initial_value
, main_adjustment
))
5243 code_helper code
= STMT_VINFO_REDUC_CODE (reduc_info
);
5244 initial_values
[0] = neutral_op_for_reduction (TREE_TYPE (initial_value
),
5245 code
, initial_value
);
/* Commit: record the carried-over adjustment, replace the epilogue's
   initial values with the guard-block values, and remember which
   accumulator is being reused.  */
5247 STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info
) = main_adjustment
;
5248 reduc_info
->reduc_initial_values
.truncate (0);
5249 reduc_info
->reduc_initial_values
.splice (initial_values
);
5250 reduc_info
->reused_accumulator
= accumulator
;
5254 /* Reduce the vector VEC_DEF down to VECTYPE with reduction operation
5255 CODE emitting stmts before GSI. Returns a vector def of VECTYPE. */
/* NOTE(review): garbled extraction -- statements are split across lines
   and some lines (the GSI/SEQ parameter list tail, bitsize_zero operands,
   the final return) appear to be missing.  Comments below describe only
   the visible logic.  */
5258 vect_create_partial_epilog (tree vec_def
, tree vectype
, code_helper code
,
/* Element counts of the input vector and the target vector; both must be
   compile-time constants here (.to_constant).  */
5261 unsigned nunits
= TYPE_VECTOR_SUBPARTS (TREE_TYPE (vec_def
)).to_constant ();
5262 unsigned nunits1
= TYPE_VECTOR_SUBPARTS (vectype
).to_constant ();
5263 tree stype
= TREE_TYPE (vectype
);
5264 tree new_temp
= vec_def
;
/* Repeatedly halve the vector: split NEW_TEMP into low/high halves and
   combine them with CODE until the element count reaches NUNITS1.  */
5265 while (nunits
> nunits1
)
5268 tree vectype1
= get_related_vectype_for_scalar_type (TYPE_MODE (vectype
),
5270 unsigned int bitsize
= tree_to_uhwi (TYPE_SIZE (vectype1
));
5272 /* The target has to make sure we support lowpart/highpart
5273 extraction, either via direct vector extract or through
5274 an integer mode punning. */
5276 gimple
*epilog_stmt
;
/* Fast path: the target supports extracting a VECTYPE1-sized sub-vector
   directly (vec_extract as a conversion optab).  */
5277 if (convert_optab_handler (vec_extract_optab
,
5278 TYPE_MODE (TREE_TYPE (new_temp
)),
5279 TYPE_MODE (vectype1
))
5280 != CODE_FOR_nothing
)
5282 /* Extract sub-vectors directly once vec_extract becomes
5283 a conversion optab. */
/* DST1 = low half of NEW_TEMP via BIT_FIELD_REF.  */
5284 dst1
= make_ssa_name (vectype1
);
5286 = gimple_build_assign (dst1
, BIT_FIELD_REF
,
5287 build3 (BIT_FIELD_REF
, vectype1
,
5288 new_temp
, TYPE_SIZE (vectype1
),
5290 gimple_seq_add_stmt_without_update (seq
, epilog_stmt
);
/* DST2 = high half of NEW_TEMP (offset BITSIZE).  */
5291 dst2
= make_ssa_name (vectype1
);
5293 = gimple_build_assign (dst2
, BIT_FIELD_REF
,
5294 build3 (BIT_FIELD_REF
, vectype1
,
5295 new_temp
, TYPE_SIZE (vectype1
),
5296 bitsize_int (bitsize
)));
5297 gimple_seq_add_stmt_without_update (seq
, epilog_stmt
);
/* Fallback: view-convert to a 2-element integer vector, extract each
   integer half, then view-convert the halves back to VECTYPE1.  */
5301 /* Extract via punning to appropriately sized integer mode
5303 tree eltype
= build_nonstandard_integer_type (bitsize
, 1);
5304 tree etype
= build_vector_type (eltype
, 2);
5305 gcc_assert (convert_optab_handler (vec_extract_optab
,
5308 != CODE_FOR_nothing
);
/* TEM = NEW_TEMP reinterpreted as a 2-element integer vector.  */
5309 tree tem
= make_ssa_name (etype
);
5310 epilog_stmt
= gimple_build_assign (tem
, VIEW_CONVERT_EXPR
,
5311 build1 (VIEW_CONVERT_EXPR
,
5313 gimple_seq_add_stmt_without_update (seq
, epilog_stmt
);
/* Low integer half -> DST1 (as VECTYPE1).  */
5315 tem
= make_ssa_name (eltype
);
5317 = gimple_build_assign (tem
, BIT_FIELD_REF
,
5318 build3 (BIT_FIELD_REF
, eltype
,
5319 new_temp
, TYPE_SIZE (eltype
),
5321 gimple_seq_add_stmt_without_update (seq
, epilog_stmt
);
5322 dst1
= make_ssa_name (vectype1
);
5323 epilog_stmt
= gimple_build_assign (dst1
, VIEW_CONVERT_EXPR
,
5324 build1 (VIEW_CONVERT_EXPR
,
5326 gimple_seq_add_stmt_without_update (seq
, epilog_stmt
);
/* High integer half (offset BITSIZE) -> DST2 (as VECTYPE1).  */
5327 tem
= make_ssa_name (eltype
);
5329 = gimple_build_assign (tem
, BIT_FIELD_REF
,
5330 build3 (BIT_FIELD_REF
, eltype
,
5331 new_temp
, TYPE_SIZE (eltype
),
5332 bitsize_int (bitsize
)));
5333 gimple_seq_add_stmt_without_update (seq
, epilog_stmt
);
5334 dst2
= make_ssa_name (vectype1
);
5335 epilog_stmt
= gimple_build_assign (dst2
, VIEW_CONVERT_EXPR
,
5336 build1 (VIEW_CONVERT_EXPR
,
5338 gimple_seq_add_stmt_without_update (seq
, epilog_stmt
);
/* Combine the two halves with the reduction operation; NEW_TEMP now
   holds the half-width partial result for the next iteration.  */
5341 new_temp
= gimple_build (seq
, code
, vectype1
, dst1
, dst2
);
5347 /* Function vect_create_epilog_for_reduction
5349 Create code at the loop-epilog to finalize the result of a reduction
5352 STMT_INFO is the scalar reduction stmt that is being vectorized.
5353 SLP_NODE is an SLP node containing a group of reduction statements. The
5354 first one in this group is STMT_INFO.
5355 SLP_NODE_INSTANCE is the SLP node instance containing SLP_NODE
5356 REDUC_INDEX says which rhs operand of the STMT_INFO is the reduction phi
5360 1. Completes the reduction def-use cycles.
5361 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
5362 by calling the function specified by REDUC_FN if available, or by
5363 other means (whole-vector shifts or a scalar loop).
5364 The function also creates a new phi node at the loop exit to preserve
5365 loop-closed form, as illustrated below.
5367 The flow at the entry to this function:
5370 vec_def = phi <vec_init, null> # REDUCTION_PHI
5371 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
5372 s_loop = scalar_stmt # (scalar) STMT_INFO
5374 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5378 The above is transformed by this function into:
5381 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
5382 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
5383 s_loop = scalar_stmt # (scalar) STMT_INFO
5385 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5386 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5387 v_out2 = reduce <v_out1>
5388 s_out3 = extract_field <v_out2, 0>
5389 s_out4 = adjust_result <s_out3>
5395 vect_create_epilog_for_reduction (loop_vec_info loop_vinfo
,
5396 stmt_vec_info stmt_info
,
5398 slp_instance slp_node_instance
)
5400 stmt_vec_info reduc_info
= info_for_reduction (loop_vinfo
, stmt_info
);
5401 gcc_assert (reduc_info
->is_reduc_info
);
5402 /* For double reductions we need to get at the inner loop reduction
5403 stmt which has the meta info attached. Our stmt_info is that of the
5404 loop-closed PHI of the inner loop which we remember as
5405 def for the reduction PHI generation. */
5406 bool double_reduc
= false;
5407 stmt_vec_info rdef_info
= stmt_info
;
5408 if (STMT_VINFO_DEF_TYPE (stmt_info
) == vect_double_reduction_def
)
5410 gcc_assert (!slp_node
);
5411 double_reduc
= true;
5412 stmt_info
= loop_vinfo
->lookup_def (gimple_phi_arg_def
5413 (stmt_info
->stmt
, 0));
5414 stmt_info
= vect_stmt_to_vectorize (stmt_info
);
5416 gphi
*reduc_def_stmt
5417 = as_a
<gphi
*> (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info
))->stmt
);
5418 code_helper code
= STMT_VINFO_REDUC_CODE (reduc_info
);
5419 internal_fn reduc_fn
= STMT_VINFO_REDUC_FN (reduc_info
);
5422 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
), *outer_loop
= NULL
;
5423 basic_block exit_bb
;
5426 gimple
*new_phi
= NULL
, *phi
;
5427 gimple_stmt_iterator exit_gsi
;
5428 tree new_temp
= NULL_TREE
, new_name
, new_scalar_dest
;
5429 gimple
*epilog_stmt
= NULL
;
5433 tree orig_name
, scalar_result
;
5434 imm_use_iterator imm_iter
, phi_imm_iter
;
5435 use_operand_p use_p
, phi_use_p
;
5437 auto_vec
<tree
> reduc_inputs
;
5439 vec
<tree
> &scalar_results
= reduc_info
->reduc_scalar_results
;
5440 unsigned int group_size
= 1, k
;
5441 auto_vec
<gimple
*> phis
;
5442 /* SLP reduction without reduction chain, e.g.,
5446 b2 = operation (b1) */
5447 bool slp_reduc
= (slp_node
&& !REDUC_GROUP_FIRST_ELEMENT (stmt_info
));
5448 bool direct_slp_reduc
;
5449 tree induction_index
= NULL_TREE
;
5452 group_size
= SLP_TREE_LANES (slp_node
);
5454 if (nested_in_vect_loop_p (loop
, stmt_info
))
5458 gcc_assert (!slp_node
&& double_reduc
);
5461 vectype
= STMT_VINFO_REDUC_VECTYPE (reduc_info
);
5462 gcc_assert (vectype
);
5463 mode
= TYPE_MODE (vectype
);
5465 tree induc_val
= NULL_TREE
;
5466 tree adjustment_def
= NULL
;
5471 /* Optimize: for induction condition reduction, if we can't use zero
5472 for induc_val, use initial_def. */
5473 if (STMT_VINFO_REDUC_TYPE (reduc_info
) == INTEGER_INDUC_COND_REDUCTION
)
5474 induc_val
= STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info
);
5475 else if (double_reduc
)
5478 adjustment_def
= STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info
);
5481 stmt_vec_info single_live_out_stmt
[] = { stmt_info
};
5482 array_slice
<const stmt_vec_info
> live_out_stmts
= single_live_out_stmt
;
5484 /* All statements produce live-out values. */
5485 live_out_stmts
= SLP_TREE_SCALAR_STMTS (slp_node
);
5488 /* The last statement in the reduction chain produces the live-out
5489 value. Note SLP optimization can shuffle scalar stmts to
5490 optimize permutations so we have to search for the last stmt. */
5491 for (k
= 0; k
< group_size
; ++k
)
5492 if (!REDUC_GROUP_NEXT_ELEMENT (SLP_TREE_SCALAR_STMTS (slp_node
)[k
]))
5494 single_live_out_stmt
[0] = SLP_TREE_SCALAR_STMTS (slp_node
)[k
];
5503 vec_num
= SLP_TREE_VEC_STMTS (slp_node_instance
->reduc_phis
).length ();
5508 stmt_vec_info reduc_info
= loop_vinfo
->lookup_stmt (reduc_def_stmt
);
5510 ncopies
= STMT_VINFO_VEC_STMTS (reduc_info
).length ();
5513 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
5514 which is updated with the current index of the loop for every match of
5515 the original loop's cond_expr (VEC_STMT). This results in a vector
5516 containing the last time the condition passed for that vector lane.
5517 The first match will be a 1 to allow 0 to be used for non-matching
5518 indexes. If there are no matches at all then the vector will be all
5521 PR92772: This algorithm is broken for architectures that support
5522 masked vectors, but do not provide fold_extract_last. */
5523 if (STMT_VINFO_REDUC_TYPE (reduc_info
) == COND_REDUCTION
)
5525 auto_vec
<std::pair
<tree
, bool>, 2> ccompares
;
5526 stmt_vec_info cond_info
= STMT_VINFO_REDUC_DEF (reduc_info
);
5527 cond_info
= vect_stmt_to_vectorize (cond_info
);
5528 while (cond_info
!= reduc_info
)
5530 if (gimple_assign_rhs_code (cond_info
->stmt
) == COND_EXPR
)
5532 gimple
*vec_stmt
= STMT_VINFO_VEC_STMTS (cond_info
)[0];
5533 gcc_assert (gimple_assign_rhs_code (vec_stmt
) == VEC_COND_EXPR
);
5535 (std::make_pair (unshare_expr (gimple_assign_rhs1 (vec_stmt
)),
5536 STMT_VINFO_REDUC_IDX (cond_info
) == 2));
5539 = loop_vinfo
->lookup_def (gimple_op (cond_info
->stmt
,
5540 1 + STMT_VINFO_REDUC_IDX
5542 cond_info
= vect_stmt_to_vectorize (cond_info
);
5544 gcc_assert (ccompares
.length () != 0);
5546 tree indx_before_incr
, indx_after_incr
;
5547 poly_uint64 nunits_out
= TYPE_VECTOR_SUBPARTS (vectype
);
5548 int scalar_precision
5549 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype
)));
5550 tree cr_index_scalar_type
= make_unsigned_type (scalar_precision
);
5551 tree cr_index_vector_type
= get_related_vectype_for_scalar_type
5552 (TYPE_MODE (vectype
), cr_index_scalar_type
,
5553 TYPE_VECTOR_SUBPARTS (vectype
));
5555 /* First we create a simple vector induction variable which starts
5556 with the values {1,2,3,...} (SERIES_VECT) and increments by the
5557 vector size (STEP). */
5559 /* Create a {1,2,3,...} vector. */
5560 tree series_vect
= build_index_vector (cr_index_vector_type
, 1, 1);
5562 /* Create a vector of the step value. */
5563 tree step
= build_int_cst (cr_index_scalar_type
, nunits_out
);
5564 tree vec_step
= build_vector_from_val (cr_index_vector_type
, step
);
5566 /* Create an induction variable. */
5567 gimple_stmt_iterator incr_gsi
;
5569 standard_iv_increment_position (loop
, &incr_gsi
, &insert_after
);
5570 create_iv (series_vect
, vec_step
, NULL_TREE
, loop
, &incr_gsi
,
5571 insert_after
, &indx_before_incr
, &indx_after_incr
);
5573 /* Next create a new phi node vector (NEW_PHI_TREE) which starts
5574 filled with zeros (VEC_ZERO). */
5576 /* Create a vector of 0s. */
5577 tree zero
= build_zero_cst (cr_index_scalar_type
);
5578 tree vec_zero
= build_vector_from_val (cr_index_vector_type
, zero
);
5580 /* Create a vector phi node. */
5581 tree new_phi_tree
= make_ssa_name (cr_index_vector_type
);
5582 new_phi
= create_phi_node (new_phi_tree
, loop
->header
);
5583 add_phi_arg (as_a
<gphi
*> (new_phi
), vec_zero
,
5584 loop_preheader_edge (loop
), UNKNOWN_LOCATION
);
5586 /* Now take the condition from the loops original cond_exprs
5587 and produce a new cond_exprs (INDEX_COND_EXPR) which for
5588 every match uses values from the induction variable
5589 (INDEX_BEFORE_INCR) otherwise uses values from the phi node
5591 Finally, we update the phi (NEW_PHI_TREE) to take the value of
5592 the new cond_expr (INDEX_COND_EXPR). */
5593 gimple_seq stmts
= NULL
;
5594 for (int i
= ccompares
.length () - 1; i
!= -1; --i
)
5596 tree ccompare
= ccompares
[i
].first
;
5597 if (ccompares
[i
].second
)
5598 new_phi_tree
= gimple_build (&stmts
, VEC_COND_EXPR
,
5599 cr_index_vector_type
,
5601 indx_before_incr
, new_phi_tree
);
5603 new_phi_tree
= gimple_build (&stmts
, VEC_COND_EXPR
,
5604 cr_index_vector_type
,
5606 new_phi_tree
, indx_before_incr
);
5608 gsi_insert_seq_before (&incr_gsi
, stmts
, GSI_SAME_STMT
);
5610 /* Update the phi with the vec cond. */
5611 induction_index
= new_phi_tree
;
5612 add_phi_arg (as_a
<gphi
*> (new_phi
), induction_index
,
5613 loop_latch_edge (loop
), UNKNOWN_LOCATION
);
5616 /* 2. Create epilog code.
5617 The reduction epilog code operates across the elements of the vector
5618 of partial results computed by the vectorized loop.
5619 The reduction epilog code consists of:
5621 step 1: compute the scalar result in a vector (v_out2)
5622 step 2: extract the scalar result (s_out3) from the vector (v_out2)
5623 step 3: adjust the scalar result (s_out3) if needed.
5625 Step 1 can be accomplished using one the following three schemes:
5626 (scheme 1) using reduc_fn, if available.
5627 (scheme 2) using whole-vector shifts, if available.
5628 (scheme 3) using a scalar loop. In this case steps 1+2 above are
5631 The overall epilog code looks like this:
5633 s_out0 = phi <s_loop> # original EXIT_PHI
5634 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5635 v_out2 = reduce <v_out1> # step 1
5636 s_out3 = extract_field <v_out2, 0> # step 2
5637 s_out4 = adjust_result <s_out3> # step 3
5639 (step 3 is optional, and steps 1 and 2 may be combined).
5640 Lastly, the uses of s_out0 are replaced by s_out4. */
5643 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
5644 v_out1 = phi <VECT_DEF>
5645 Store them in NEW_PHIS. */
5648 exit_bb
= single_exit (loop
)->dest
;
5649 exit_gsi
= gsi_after_labels (exit_bb
);
5650 reduc_inputs
.create (slp_node
? vec_num
: ncopies
);
5651 for (unsigned i
= 0; i
< vec_num
; i
++)
5653 gimple_seq stmts
= NULL
;
5655 def
= vect_get_slp_vect_def (slp_node
, i
);
5657 def
= gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info
)[0]);
5658 for (j
= 0; j
< ncopies
; j
++)
5660 tree new_def
= copy_ssa_name (def
);
5661 phi
= create_phi_node (new_def
, exit_bb
);
5663 def
= gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info
)[j
]);
5664 SET_PHI_ARG_DEF (phi
, single_exit (loop
)->dest_idx
, def
);
5665 new_def
= gimple_convert (&stmts
, vectype
, new_def
);
5666 reduc_inputs
.quick_push (new_def
);
5668 gsi_insert_seq_before (&exit_gsi
, stmts
, GSI_SAME_STMT
);
5671 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
5672 (i.e. when reduc_fn is not available) and in the final adjustment
5673 code (if needed). Also get the original scalar reduction variable as
5674 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
5675 represents a reduction pattern), the tree-code and scalar-def are
5676 taken from the original stmt that the pattern-stmt (STMT) replaces.
5677 Otherwise (it is a regular reduction) - the tree-code and scalar-def
5678 are taken from STMT. */
5680 stmt_vec_info orig_stmt_info
= vect_orig_stmt (stmt_info
);
5681 if (orig_stmt_info
!= stmt_info
)
5683 /* Reduction pattern */
5684 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info
));
5685 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info
) == stmt_info
);
5688 scalar_dest
= gimple_get_lhs (orig_stmt_info
->stmt
);
5689 scalar_type
= TREE_TYPE (scalar_dest
);
5690 scalar_results
.truncate (0);
5691 scalar_results
.reserve_exact (group_size
);
5692 new_scalar_dest
= vect_create_destination_var (scalar_dest
, NULL
);
5693 bitsize
= TYPE_SIZE (scalar_type
);
5695 /* True if we should implement SLP_REDUC using native reduction operations
5696 instead of scalar operations. */
5697 direct_slp_reduc
= (reduc_fn
!= IFN_LAST
5699 && !TYPE_VECTOR_SUBPARTS (vectype
).is_constant ());
5701 /* In case of reduction chain, e.g.,
5704 a3 = operation (a2),
5706 we may end up with more than one vector result. Here we reduce them
5709 The same is true for a SLP reduction, e.g.,
5713 b2 = operation (a2),
5715 where we can end up with more than one vector as well. We can
5716 easily accumulate vectors when the number of vector elements is
5717 a multiple of the SLP group size.
5719 The same is true if we couldn't use a single defuse cycle. */
5720 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info
)
5723 && constant_multiple_p (TYPE_VECTOR_SUBPARTS (vectype
), group_size
))
5726 gimple_seq stmts
= NULL
;
5727 tree single_input
= reduc_inputs
[0];
5728 for (k
= 1; k
< reduc_inputs
.length (); k
++)
5729 single_input
= gimple_build (&stmts
, code
, vectype
,
5730 single_input
, reduc_inputs
[k
]);
5731 gsi_insert_seq_before (&exit_gsi
, stmts
, GSI_SAME_STMT
);
5733 reduc_inputs
.truncate (0);
5734 reduc_inputs
.safe_push (single_input
);
5737 tree orig_reduc_input
= reduc_inputs
[0];
5739 /* If this loop is an epilogue loop that can be skipped after the
5740 main loop, we can only share a reduction operation between the
5741 main loop and the epilogue if we put it at the target of the
5744 We can still reuse accumulators if this check fails. Doing so has
5745 the minor(?) benefit of making the epilogue loop's scalar result
5746 independent of the main loop's scalar result. */
5747 bool unify_with_main_loop_p
= false;
5748 if (reduc_info
->reused_accumulator
5749 && loop_vinfo
->skip_this_loop_edge
5750 && single_succ_p (exit_bb
)
5751 && single_succ (exit_bb
) == loop_vinfo
->skip_this_loop_edge
->dest
)
5753 unify_with_main_loop_p
= true;
5755 basic_block reduc_block
= loop_vinfo
->skip_this_loop_edge
->dest
;
5756 reduc_inputs
[0] = make_ssa_name (vectype
);
5757 gphi
*new_phi
= create_phi_node (reduc_inputs
[0], reduc_block
);
5758 add_phi_arg (new_phi
, orig_reduc_input
, single_succ_edge (exit_bb
),
5760 add_phi_arg (new_phi
, reduc_info
->reused_accumulator
->reduc_input
,
5761 loop_vinfo
->skip_this_loop_edge
, UNKNOWN_LOCATION
);
5762 exit_gsi
= gsi_after_labels (reduc_block
);
5765 /* Shouldn't be used beyond this point. */
5768 if (STMT_VINFO_REDUC_TYPE (reduc_info
) == COND_REDUCTION
5769 && reduc_fn
!= IFN_LAST
)
5771 /* For condition reductions, we have a vector (REDUC_INPUTS 0) containing
5772 various data values where the condition matched and another vector
5773 (INDUCTION_INDEX) containing all the indexes of those matches. We
5774 need to extract the last matching index (which will be the index with
5775 highest value) and use this to index into the data vector.
5776 For the case where there were no matches, the data vector will contain
5777 all default values and the index vector will be all zeros. */
5779 /* Get various versions of the type of the vector of indexes. */
5780 tree index_vec_type
= TREE_TYPE (induction_index
);
5781 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type
));
5782 tree index_scalar_type
= TREE_TYPE (index_vec_type
);
5783 tree index_vec_cmp_type
= truth_type_for (index_vec_type
);
5785 /* Get an unsigned integer version of the type of the data vector. */
5786 int scalar_precision
5787 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type
));
5788 tree scalar_type_unsigned
= make_unsigned_type (scalar_precision
);
5789 tree vectype_unsigned
= get_same_sized_vectype (scalar_type_unsigned
,
5792 /* First we need to create a vector (ZERO_VEC) of zeros and another
5793 vector (MAX_INDEX_VEC) filled with the last matching index, which we
5794 can create using a MAX reduction and then expanding.
5795 In the case where the loop never made any matches, the max index will
5798 /* Vector of {0, 0, 0,...}. */
5799 tree zero_vec
= build_zero_cst (vectype
);
5801 /* Find maximum value from the vector of found indexes. */
5802 tree max_index
= make_ssa_name (index_scalar_type
);
5803 gcall
*max_index_stmt
= gimple_build_call_internal (IFN_REDUC_MAX
,
5804 1, induction_index
);
5805 gimple_call_set_lhs (max_index_stmt
, max_index
);
5806 gsi_insert_before (&exit_gsi
, max_index_stmt
, GSI_SAME_STMT
);
5808 /* Vector of {max_index, max_index, max_index,...}. */
5809 tree max_index_vec
= make_ssa_name (index_vec_type
);
5810 tree max_index_vec_rhs
= build_vector_from_val (index_vec_type
,
5812 gimple
*max_index_vec_stmt
= gimple_build_assign (max_index_vec
,
5814 gsi_insert_before (&exit_gsi
, max_index_vec_stmt
, GSI_SAME_STMT
);
5816 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
5817 with the vector (INDUCTION_INDEX) of found indexes, choosing values
5818 from the data vector (REDUC_INPUTS 0) for matches, 0 (ZERO_VEC)
5819 otherwise. Only one value should match, resulting in a vector
5820 (VEC_COND) with one data value and the rest zeros.
5821 In the case where the loop never made any matches, every index will
5822 match, resulting in a vector with all data values (which will all be
5823 the default value). */
5825 /* Compare the max index vector to the vector of found indexes to find
5826 the position of the max value. */
5827 tree vec_compare
= make_ssa_name (index_vec_cmp_type
);
5828 gimple
*vec_compare_stmt
= gimple_build_assign (vec_compare
, EQ_EXPR
,
5831 gsi_insert_before (&exit_gsi
, vec_compare_stmt
, GSI_SAME_STMT
);
5833 /* Use the compare to choose either values from the data vector or
5835 tree vec_cond
= make_ssa_name (vectype
);
5836 gimple
*vec_cond_stmt
= gimple_build_assign (vec_cond
, VEC_COND_EXPR
,
5840 gsi_insert_before (&exit_gsi
, vec_cond_stmt
, GSI_SAME_STMT
);
5842 /* Finally we need to extract the data value from the vector (VEC_COND)
5843 into a scalar (MATCHED_DATA_REDUC). Logically we want to do a OR
5844 reduction, but because this doesn't exist, we can use a MAX reduction
5845 instead. The data value might be signed or a float so we need to cast
5847 In the case where the loop never made any matches, the data values are
5848 all identical, and so will reduce down correctly. */
5850 /* Make the matched data values unsigned. */
5851 tree vec_cond_cast
= make_ssa_name (vectype_unsigned
);
5852 tree vec_cond_cast_rhs
= build1 (VIEW_CONVERT_EXPR
, vectype_unsigned
,
5854 gimple
*vec_cond_cast_stmt
= gimple_build_assign (vec_cond_cast
,
5857 gsi_insert_before (&exit_gsi
, vec_cond_cast_stmt
, GSI_SAME_STMT
);
5859 /* Reduce down to a scalar value. */
5860 tree data_reduc
= make_ssa_name (scalar_type_unsigned
);
5861 gcall
*data_reduc_stmt
= gimple_build_call_internal (IFN_REDUC_MAX
,
5863 gimple_call_set_lhs (data_reduc_stmt
, data_reduc
);
5864 gsi_insert_before (&exit_gsi
, data_reduc_stmt
, GSI_SAME_STMT
);
5866 /* Convert the reduced value back to the result type and set as the
5868 gimple_seq stmts
= NULL
;
5869 new_temp
= gimple_build (&stmts
, VIEW_CONVERT_EXPR
, scalar_type
,
5871 gsi_insert_seq_before (&exit_gsi
, stmts
, GSI_SAME_STMT
);
5872 scalar_results
.safe_push (new_temp
);
5874 else if (STMT_VINFO_REDUC_TYPE (reduc_info
) == COND_REDUCTION
5875 && reduc_fn
== IFN_LAST
)
5877 /* Condition reduction without supported IFN_REDUC_MAX. Generate
5879 idx_val = induction_index[0];
5880 val = data_reduc[0];
5881 for (idx = 0, val = init, i = 0; i < nelts; ++i)
5882 if (induction_index[i] > idx_val)
5883 val = data_reduc[i], idx_val = induction_index[i];
5886 tree data_eltype
= TREE_TYPE (vectype
);
5887 tree idx_eltype
= TREE_TYPE (TREE_TYPE (induction_index
));
5888 unsigned HOST_WIDE_INT el_size
= tree_to_uhwi (TYPE_SIZE (idx_eltype
));
5889 poly_uint64 nunits
= TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index
));
5890 /* Enforced by vectorizable_reduction, which ensures we have target
5891 support before allowing a conditional reduction on variable-length
5893 unsigned HOST_WIDE_INT v_size
= el_size
* nunits
.to_constant ();
5894 tree idx_val
= NULL_TREE
, val
= NULL_TREE
;
5895 for (unsigned HOST_WIDE_INT off
= 0; off
< v_size
; off
+= el_size
)
5897 tree old_idx_val
= idx_val
;
5899 idx_val
= make_ssa_name (idx_eltype
);
5900 epilog_stmt
= gimple_build_assign (idx_val
, BIT_FIELD_REF
,
5901 build3 (BIT_FIELD_REF
, idx_eltype
,
5903 bitsize_int (el_size
),
5904 bitsize_int (off
)));
5905 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
5906 val
= make_ssa_name (data_eltype
);
5907 epilog_stmt
= gimple_build_assign (val
, BIT_FIELD_REF
,
5908 build3 (BIT_FIELD_REF
,
5911 bitsize_int (el_size
),
5912 bitsize_int (off
)));
5913 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
5916 tree new_idx_val
= idx_val
;
5917 if (off
!= v_size
- el_size
)
5919 new_idx_val
= make_ssa_name (idx_eltype
);
5920 epilog_stmt
= gimple_build_assign (new_idx_val
,
5923 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
5925 tree cond
= make_ssa_name (boolean_type_node
);
5926 epilog_stmt
= gimple_build_assign (cond
, GT_EXPR
,
5927 idx_val
, old_idx_val
);
5928 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
5929 tree new_val
= make_ssa_name (data_eltype
);
5930 epilog_stmt
= gimple_build_assign (new_val
, COND_EXPR
,
5931 cond
, val
, old_val
);
5932 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
5933 idx_val
= new_idx_val
;
5937 /* Convert the reduced value back to the result type and set as the
5939 gimple_seq stmts
= NULL
;
5940 val
= gimple_convert (&stmts
, scalar_type
, val
);
5941 gsi_insert_seq_before (&exit_gsi
, stmts
, GSI_SAME_STMT
);
5942 scalar_results
.safe_push (val
);
5945 /* 2.3 Create the reduction code, using one of the three schemes described
5946 above. In SLP we simply need to extract all the elements from the
5947 vector (without reducing them), so we use scalar shifts. */
5948 else if (reduc_fn
!= IFN_LAST
&& !slp_reduc
)
5954 v_out2 = reduc_expr <v_out1> */
5956 if (dump_enabled_p ())
5957 dump_printf_loc (MSG_NOTE
, vect_location
,
5958 "Reduce using direct vector reduction.\n");
5960 gimple_seq stmts
= NULL
;
5961 vec_elem_type
= TREE_TYPE (vectype
);
5962 new_temp
= gimple_build (&stmts
, as_combined_fn (reduc_fn
),
5963 vec_elem_type
, reduc_inputs
[0]);
5964 new_temp
= gimple_convert (&stmts
, scalar_type
, new_temp
);
5965 gsi_insert_seq_before (&exit_gsi
, stmts
, GSI_SAME_STMT
);
5967 if ((STMT_VINFO_REDUC_TYPE (reduc_info
) == INTEGER_INDUC_COND_REDUCTION
)
5970 /* Earlier we set the initial value to be a vector if induc_val
5971 values. Check the result and if it is induc_val then replace
5972 with the original initial value, unless induc_val is
5973 the same as initial_def already. */
5974 tree zcompare
= make_ssa_name (boolean_type_node
);
5975 epilog_stmt
= gimple_build_assign (zcompare
, EQ_EXPR
,
5976 new_temp
, induc_val
);
5977 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
5978 tree initial_def
= reduc_info
->reduc_initial_values
[0];
5979 tmp
= make_ssa_name (new_scalar_dest
);
5980 epilog_stmt
= gimple_build_assign (tmp
, COND_EXPR
, zcompare
,
5981 initial_def
, new_temp
);
5982 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
5986 scalar_results
.safe_push (new_temp
);
5988 else if (direct_slp_reduc
)
5990 /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
5991 with the elements for other SLP statements replaced with the
5992 neutral value. We can then do a normal reduction on each vector. */
5994 /* Enforced by vectorizable_reduction. */
5995 gcc_assert (reduc_inputs
.length () == 1);
5996 gcc_assert (pow2p_hwi (group_size
));
5998 gimple_seq seq
= NULL
;
6000 /* Build a vector {0, 1, 2, ...}, with the same number of elements
6001 and the same element size as VECTYPE. */
6002 tree index
= build_index_vector (vectype
, 0, 1);
6003 tree index_type
= TREE_TYPE (index
);
6004 tree index_elt_type
= TREE_TYPE (index_type
);
6005 tree mask_type
= truth_type_for (index_type
);
6007 /* Create a vector that, for each element, identifies which of
6008 the REDUC_GROUP_SIZE results should use it. */
6009 tree index_mask
= build_int_cst (index_elt_type
, group_size
- 1);
6010 index
= gimple_build (&seq
, BIT_AND_EXPR
, index_type
, index
,
6011 build_vector_from_val (index_type
, index_mask
));
6013 /* Get a neutral vector value. This is simply a splat of the neutral
6014 scalar value if we have one, otherwise the initial scalar value
6015 is itself a neutral value. */
6016 tree vector_identity
= NULL_TREE
;
6017 tree neutral_op
= NULL_TREE
;
6020 tree initial_value
= NULL_TREE
;
6021 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info
))
6022 initial_value
= reduc_info
->reduc_initial_values
[0];
6023 neutral_op
= neutral_op_for_reduction (TREE_TYPE (vectype
), code
,
6027 vector_identity
= gimple_build_vector_from_val (&seq
, vectype
,
6029 for (unsigned int i
= 0; i
< group_size
; ++i
)
6031 /* If there's no univeral neutral value, we can use the
6032 initial scalar value from the original PHI. This is used
6033 for MIN and MAX reduction, for example. */
6036 tree scalar_value
= reduc_info
->reduc_initial_values
[i
];
6037 scalar_value
= gimple_convert (&seq
, TREE_TYPE (vectype
),
6039 vector_identity
= gimple_build_vector_from_val (&seq
, vectype
,
6043 /* Calculate the equivalent of:
6045 sel[j] = (index[j] == i);
6047 which selects the elements of REDUC_INPUTS[0] that should
6048 be included in the result. */
6049 tree compare_val
= build_int_cst (index_elt_type
, i
);
6050 compare_val
= build_vector_from_val (index_type
, compare_val
);
6051 tree sel
= gimple_build (&seq
, EQ_EXPR
, mask_type
,
6052 index
, compare_val
);
6054 /* Calculate the equivalent of:
6056 vec = seq ? reduc_inputs[0] : vector_identity;
6058 VEC is now suitable for a full vector reduction. */
6059 tree vec
= gimple_build (&seq
, VEC_COND_EXPR
, vectype
,
6060 sel
, reduc_inputs
[0], vector_identity
);
6062 /* Do the reduction and convert it to the appropriate type. */
6063 tree scalar
= gimple_build (&seq
, as_combined_fn (reduc_fn
),
6064 TREE_TYPE (vectype
), vec
);
6065 scalar
= gimple_convert (&seq
, scalar_type
, scalar
);
6066 scalar_results
.safe_push (scalar
);
6068 gsi_insert_seq_before (&exit_gsi
, seq
, GSI_SAME_STMT
);
6072 bool reduce_with_shift
;
6075 gcc_assert (slp_reduc
|| reduc_inputs
.length () == 1);
6077 /* See if the target wants to do the final (shift) reduction
6078 in a vector mode of smaller size and first reduce upper/lower
6079 halves against each other. */
6080 enum machine_mode mode1
= mode
;
6081 tree stype
= TREE_TYPE (vectype
);
6082 unsigned nunits
= TYPE_VECTOR_SUBPARTS (vectype
).to_constant ();
6083 unsigned nunits1
= nunits
;
6084 if ((mode1
= targetm
.vectorize
.split_reduction (mode
)) != mode
6085 && reduc_inputs
.length () == 1)
6087 nunits1
= GET_MODE_NUNITS (mode1
).to_constant ();
6088 /* For SLP reductions we have to make sure lanes match up, but
6089 since we're doing individual element final reduction reducing
6090 vector width here is even more important.
6091 ??? We can also separate lanes with permutes, for the common
6092 case of power-of-two group-size odd/even extracts would work. */
6093 if (slp_reduc
&& nunits
!= nunits1
)
6095 nunits1
= least_common_multiple (nunits1
, group_size
);
6096 gcc_assert (exact_log2 (nunits1
) != -1 && nunits1
<= nunits
);
6100 && (mode1
= targetm
.vectorize
.split_reduction (mode
)) != mode
)
6101 nunits1
= GET_MODE_NUNITS (mode1
).to_constant ();
6103 tree vectype1
= get_related_vectype_for_scalar_type (TYPE_MODE (vectype
),
6105 reduce_with_shift
= have_whole_vector_shift (mode1
);
6106 if (!VECTOR_MODE_P (mode1
)
6107 || !directly_supported_p (code
, vectype1
))
6108 reduce_with_shift
= false;
6110 /* First reduce the vector to the desired vector size we should
6111 do shift reduction on by combining upper and lower halves. */
6112 gimple_seq stmts
= NULL
;
6113 new_temp
= vect_create_partial_epilog (reduc_inputs
[0], vectype1
,
6115 gsi_insert_seq_before (&exit_gsi
, stmts
, GSI_SAME_STMT
);
6116 reduc_inputs
[0] = new_temp
;
6118 if (reduce_with_shift
&& !slp_reduc
)
6120 int element_bitsize
= tree_to_uhwi (bitsize
);
6121 /* Enforced by vectorizable_reduction, which disallows SLP reductions
6122 for variable-length vectors and also requires direct target support
6123 for loop reductions. */
6124 int vec_size_in_bits
= tree_to_uhwi (TYPE_SIZE (vectype1
));
6125 int nelements
= vec_size_in_bits
/ element_bitsize
;
6126 vec_perm_builder sel
;
6127 vec_perm_indices indices
;
6131 tree zero_vec
= build_zero_cst (vectype1
);
6133 for (offset = nelements/2; offset >= 1; offset/=2)
6135 Create: va' = vec_shift <va, offset>
6136 Create: va = vop <va, va'>
6141 if (dump_enabled_p ())
6142 dump_printf_loc (MSG_NOTE
, vect_location
,
6143 "Reduce using vector shifts\n");
6145 gimple_seq stmts
= NULL
;
6146 new_temp
= gimple_convert (&stmts
, vectype1
, new_temp
);
6147 for (elt_offset
= nelements
/ 2;
6151 calc_vec_perm_mask_for_shift (elt_offset
, nelements
, &sel
);
6152 indices
.new_vector (sel
, 2, nelements
);
6153 tree mask
= vect_gen_perm_mask_any (vectype1
, indices
);
6154 new_name
= gimple_build (&stmts
, VEC_PERM_EXPR
, vectype1
,
6155 new_temp
, zero_vec
, mask
);
6156 new_temp
= gimple_build (&stmts
, code
,
6157 vectype1
, new_name
, new_temp
);
6159 gsi_insert_seq_before (&exit_gsi
, stmts
, GSI_SAME_STMT
);
6161 /* 2.4 Extract the final scalar result. Create:
6162 s_out3 = extract_field <v_out2, bitpos> */
6164 if (dump_enabled_p ())
6165 dump_printf_loc (MSG_NOTE
, vect_location
,
6166 "extract scalar result\n");
6168 rhs
= build3 (BIT_FIELD_REF
, scalar_type
, new_temp
,
6169 bitsize
, bitsize_zero_node
);
6170 epilog_stmt
= gimple_build_assign (new_scalar_dest
, rhs
);
6171 new_temp
= make_ssa_name (new_scalar_dest
, epilog_stmt
);
6172 gimple_assign_set_lhs (epilog_stmt
, new_temp
);
6173 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
6174 scalar_results
.safe_push (new_temp
);
6179 s = extract_field <v_out2, 0>
6180 for (offset = element_size;
6181 offset < vector_size;
6182 offset += element_size;)
6184 Create: s' = extract_field <v_out2, offset>
6185 Create: s = op <s, s'> // For non SLP cases
6188 if (dump_enabled_p ())
6189 dump_printf_loc (MSG_NOTE
, vect_location
,
6190 "Reduce using scalar code.\n");
6192 int vec_size_in_bits
= tree_to_uhwi (TYPE_SIZE (vectype1
));
6193 int element_bitsize
= tree_to_uhwi (bitsize
);
6194 tree compute_type
= TREE_TYPE (vectype
);
6195 gimple_seq stmts
= NULL
;
6196 FOR_EACH_VEC_ELT (reduc_inputs
, i
, vec_temp
)
6199 new_temp
= gimple_build (&stmts
, BIT_FIELD_REF
, compute_type
,
6200 vec_temp
, bitsize
, bitsize_zero_node
);
6202 /* In SLP we don't need to apply reduction operation, so we just
6203 collect s' values in SCALAR_RESULTS. */
6205 scalar_results
.safe_push (new_temp
);
6207 for (bit_offset
= element_bitsize
;
6208 bit_offset
< vec_size_in_bits
;
6209 bit_offset
+= element_bitsize
)
6211 tree bitpos
= bitsize_int (bit_offset
);
6212 new_name
= gimple_build (&stmts
, BIT_FIELD_REF
,
6213 compute_type
, vec_temp
,
6217 /* In SLP we don't need to apply reduction operation, so
6218 we just collect s' values in SCALAR_RESULTS. */
6219 new_temp
= new_name
;
6220 scalar_results
.safe_push (new_name
);
6223 new_temp
= gimple_build (&stmts
, code
, compute_type
,
6224 new_name
, new_temp
);
6228 /* The only case where we need to reduce scalar results in SLP, is
6229 unrolling. If the size of SCALAR_RESULTS is greater than
6230 REDUC_GROUP_SIZE, we reduce them combining elements modulo
6231 REDUC_GROUP_SIZE. */
6234 tree res
, first_res
, new_res
;
6236 /* Reduce multiple scalar results in case of SLP unrolling. */
6237 for (j
= group_size
; scalar_results
.iterate (j
, &res
);
6240 first_res
= scalar_results
[j
% group_size
];
6241 new_res
= gimple_build (&stmts
, code
, compute_type
,
6243 scalar_results
[j
% group_size
] = new_res
;
6245 scalar_results
.truncate (group_size
);
6246 for (k
= 0; k
< group_size
; k
++)
6247 scalar_results
[k
] = gimple_convert (&stmts
, scalar_type
,
6252 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
6253 new_temp
= gimple_convert (&stmts
, scalar_type
, new_temp
);
6254 scalar_results
.safe_push (new_temp
);
6257 gsi_insert_seq_before (&exit_gsi
, stmts
, GSI_SAME_STMT
);
6260 if ((STMT_VINFO_REDUC_TYPE (reduc_info
) == INTEGER_INDUC_COND_REDUCTION
)
6263 /* Earlier we set the initial value to be a vector if induc_val
6264 values. Check the result and if it is induc_val then replace
6265 with the original initial value, unless induc_val is
6266 the same as initial_def already. */
6267 tree zcompare
= make_ssa_name (boolean_type_node
);
6268 epilog_stmt
= gimple_build_assign (zcompare
, EQ_EXPR
, new_temp
,
6270 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
6271 tree initial_def
= reduc_info
->reduc_initial_values
[0];
6272 tree tmp
= make_ssa_name (new_scalar_dest
);
6273 epilog_stmt
= gimple_build_assign (tmp
, COND_EXPR
, zcompare
,
6274 initial_def
, new_temp
);
6275 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
6276 scalar_results
[0] = tmp
;
6280 /* 2.5 Adjust the final result by the initial value of the reduction
6281 variable. (When such adjustment is not needed, then
6282 'adjustment_def' is zero). For example, if code is PLUS we create:
6283 new_temp = loop_exit_def + adjustment_def */
6287 gcc_assert (!slp_reduc
);
6288 gimple_seq stmts
= NULL
;
6291 gcc_assert (VECTOR_TYPE_P (TREE_TYPE (adjustment_def
)));
6292 adjustment_def
= gimple_convert (&stmts
, vectype
, adjustment_def
);
6293 new_temp
= gimple_build (&stmts
, code
, vectype
,
6294 reduc_inputs
[0], adjustment_def
);
6298 new_temp
= scalar_results
[0];
6299 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def
)) != VECTOR_TYPE
);
6300 adjustment_def
= gimple_convert (&stmts
, scalar_type
, adjustment_def
);
6301 new_temp
= gimple_build (&stmts
, code
, scalar_type
,
6302 new_temp
, adjustment_def
);
6305 epilog_stmt
= gimple_seq_last_stmt (stmts
);
6306 gsi_insert_seq_before (&exit_gsi
, stmts
, GSI_SAME_STMT
);
6307 scalar_results
[0] = new_temp
;
6310 /* Record this operation if it could be reused by the epilogue loop. */
6311 if (STMT_VINFO_REDUC_TYPE (reduc_info
) == TREE_CODE_REDUCTION
6312 && reduc_inputs
.length () == 1)
6313 loop_vinfo
->reusable_accumulators
.put (scalar_results
[0],
6314 { orig_reduc_input
, reduc_info
});
6319 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
6320 phis with new adjusted scalar results, i.e., replace use <s_out0>
6325 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
6326 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
6327 v_out2 = reduce <v_out1>
6328 s_out3 = extract_field <v_out2, 0>
6329 s_out4 = adjust_result <s_out3>
6336 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
6337 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
6338 v_out2 = reduce <v_out1>
6339 s_out3 = extract_field <v_out2, 0>
6340 s_out4 = adjust_result <s_out3>
6344 gcc_assert (live_out_stmts
.size () == scalar_results
.length ());
6345 for (k
= 0; k
< live_out_stmts
.size (); k
++)
6347 stmt_vec_info scalar_stmt_info
= vect_orig_stmt (live_out_stmts
[k
]);
6348 scalar_dest
= gimple_get_lhs (scalar_stmt_info
->stmt
);
6351 /* Find the loop-closed-use at the loop exit of the original scalar
6352 result. (The reduction result is expected to have two immediate uses,
6353 one at the latch block, and one at the loop exit). For double
6354 reductions we are looking for exit phis of the outer loop. */
6355 FOR_EACH_IMM_USE_FAST (use_p
, imm_iter
, scalar_dest
)
6357 if (!flow_bb_inside_loop_p (loop
, gimple_bb (USE_STMT (use_p
))))
6359 if (!is_gimple_debug (USE_STMT (use_p
)))
6360 phis
.safe_push (USE_STMT (use_p
));
6364 if (double_reduc
&& gimple_code (USE_STMT (use_p
)) == GIMPLE_PHI
)
6366 tree phi_res
= PHI_RESULT (USE_STMT (use_p
));
6368 FOR_EACH_IMM_USE_FAST (phi_use_p
, phi_imm_iter
, phi_res
)
6370 if (!flow_bb_inside_loop_p (loop
,
6371 gimple_bb (USE_STMT (phi_use_p
)))
6372 && !is_gimple_debug (USE_STMT (phi_use_p
)))
6373 phis
.safe_push (USE_STMT (phi_use_p
));
6379 FOR_EACH_VEC_ELT (phis
, i
, exit_phi
)
6381 /* Replace the uses: */
6382 orig_name
= PHI_RESULT (exit_phi
);
6384 /* Look for a single use at the target of the skip edge. */
6385 if (unify_with_main_loop_p
)
6387 use_operand_p use_p
;
6389 if (!single_imm_use (orig_name
, &use_p
, &user
))
6391 orig_name
= gimple_get_lhs (user
);
6394 scalar_result
= scalar_results
[k
];
6395 FOR_EACH_IMM_USE_STMT (use_stmt
, imm_iter
, orig_name
)
6397 FOR_EACH_IMM_USE_ON_STMT (use_p
, imm_iter
)
6398 SET_USE (use_p
, scalar_result
);
6399 update_stmt (use_stmt
);
6407 /* Return a vector of type VECTYPE that is equal to the vector select
6408 operation "MASK ? VEC : IDENTITY". Insert the select statements
6412 merge_with_identity (gimple_stmt_iterator
*gsi
, tree mask
, tree vectype
,
6413 tree vec
, tree identity
)
6415 tree cond
= make_temp_ssa_name (vectype
, NULL
, "cond");
6416 gimple
*new_stmt
= gimple_build_assign (cond
, VEC_COND_EXPR
,
6417 mask
, vec
, identity
);
6418 gsi_insert_before (gsi
, new_stmt
, GSI_SAME_STMT
);
6422 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
6423 order, starting with LHS. Insert the extraction statements before GSI and
6424 associate the new scalar SSA names with variable SCALAR_DEST.
6425 Return the SSA name for the result. */
6428 vect_expand_fold_left (gimple_stmt_iterator
*gsi
, tree scalar_dest
,
6429 tree_code code
, tree lhs
, tree vector_rhs
)
6431 tree vectype
= TREE_TYPE (vector_rhs
);
6432 tree scalar_type
= TREE_TYPE (vectype
);
6433 tree bitsize
= TYPE_SIZE (scalar_type
);
6434 unsigned HOST_WIDE_INT vec_size_in_bits
= tree_to_uhwi (TYPE_SIZE (vectype
));
6435 unsigned HOST_WIDE_INT element_bitsize
= tree_to_uhwi (bitsize
);
6437 for (unsigned HOST_WIDE_INT bit_offset
= 0;
6438 bit_offset
< vec_size_in_bits
;
6439 bit_offset
+= element_bitsize
)
6441 tree bitpos
= bitsize_int (bit_offset
);
6442 tree rhs
= build3 (BIT_FIELD_REF
, scalar_type
, vector_rhs
,
6445 gassign
*stmt
= gimple_build_assign (scalar_dest
, rhs
);
6446 rhs
= make_ssa_name (scalar_dest
, stmt
);
6447 gimple_assign_set_lhs (stmt
, rhs
);
6448 gsi_insert_before (gsi
, stmt
, GSI_SAME_STMT
);
6450 stmt
= gimple_build_assign (scalar_dest
, code
, lhs
, rhs
);
6451 tree new_name
= make_ssa_name (scalar_dest
, stmt
);
6452 gimple_assign_set_lhs (stmt
, new_name
);
6453 gsi_insert_before (gsi
, stmt
, GSI_SAME_STMT
);
6459 /* Get a masked internal function equivalent to REDUC_FN. VECTYPE_IN is the
6460 type of the vector input. */
6463 get_masked_reduction_fn (internal_fn reduc_fn
, tree vectype_in
)
6465 internal_fn mask_reduc_fn
;
6469 case IFN_FOLD_LEFT_PLUS
:
6470 mask_reduc_fn
= IFN_MASK_FOLD_LEFT_PLUS
;
6477 if (direct_internal_fn_supported_p (mask_reduc_fn
, vectype_in
,
6478 OPTIMIZE_FOR_SPEED
))
6479 return mask_reduc_fn
;
6483 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT_INFO is the
6484 statement that sets the live-out value. REDUC_DEF_STMT is the phi
6485 statement. CODE is the operation performed by STMT_INFO and OPS are
6486 its scalar operands. REDUC_INDEX is the index of the operand in
6487 OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that
6488 implements in-order reduction, or IFN_LAST if we should open-code it.
6489 VECTYPE_IN is the type of the vector input. MASKS specifies the masks
6490 that should be used to control the operation in a fully-masked loop. */
6493 vectorize_fold_left_reduction (loop_vec_info loop_vinfo
,
6494 stmt_vec_info stmt_info
,
6495 gimple_stmt_iterator
*gsi
,
6496 gimple
**vec_stmt
, slp_tree slp_node
,
6497 gimple
*reduc_def_stmt
,
6498 tree_code code
, internal_fn reduc_fn
,
6499 tree ops
[3], tree vectype_in
,
6500 int reduc_index
, vec_loop_masks
*masks
)
6502 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
6503 tree vectype_out
= STMT_VINFO_VECTYPE (stmt_info
);
6504 internal_fn mask_reduc_fn
= get_masked_reduction_fn (reduc_fn
, vectype_in
);
6510 ncopies
= vect_get_num_copies (loop_vinfo
, vectype_in
);
6512 gcc_assert (!nested_in_vect_loop_p (loop
, stmt_info
));
6513 gcc_assert (ncopies
== 1);
6514 gcc_assert (TREE_CODE_LENGTH (code
) == binary_op
);
6517 gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out
),
6518 TYPE_VECTOR_SUBPARTS (vectype_in
)));
6520 tree op0
= ops
[1 - reduc_index
];
6523 stmt_vec_info scalar_dest_def_info
;
6524 auto_vec
<tree
> vec_oprnds0
;
6527 auto_vec
<vec
<tree
> > vec_defs (2);
6528 vect_get_slp_defs (loop_vinfo
, slp_node
, &vec_defs
);
6529 vec_oprnds0
.safe_splice (vec_defs
[1 - reduc_index
]);
6530 vec_defs
[0].release ();
6531 vec_defs
[1].release ();
6532 group_size
= SLP_TREE_SCALAR_STMTS (slp_node
).length ();
6533 scalar_dest_def_info
= SLP_TREE_SCALAR_STMTS (slp_node
)[group_size
- 1];
6537 vect_get_vec_defs_for_operand (loop_vinfo
, stmt_info
, 1,
6539 scalar_dest_def_info
= stmt_info
;
6542 tree scalar_dest
= gimple_assign_lhs (scalar_dest_def_info
->stmt
);
6543 tree scalar_type
= TREE_TYPE (scalar_dest
);
6544 tree reduc_var
= gimple_phi_result (reduc_def_stmt
);
6546 int vec_num
= vec_oprnds0
.length ();
6547 gcc_assert (vec_num
== 1 || slp_node
);
6548 tree vec_elem_type
= TREE_TYPE (vectype_out
);
6549 gcc_checking_assert (useless_type_conversion_p (scalar_type
, vec_elem_type
));
6551 tree vector_identity
= NULL_TREE
;
6552 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo
))
6553 vector_identity
= build_zero_cst (vectype_out
);
6555 tree scalar_dest_var
= vect_create_destination_var (scalar_dest
, NULL
);
6558 FOR_EACH_VEC_ELT (vec_oprnds0
, i
, def0
)
6561 tree mask
= NULL_TREE
;
6562 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo
))
6563 mask
= vect_get_loop_mask (gsi
, masks
, vec_num
, vectype_in
, i
);
6565 /* Handle MINUS by adding the negative. */
6566 if (reduc_fn
!= IFN_LAST
&& code
== MINUS_EXPR
)
6568 tree negated
= make_ssa_name (vectype_out
);
6569 new_stmt
= gimple_build_assign (negated
, NEGATE_EXPR
, def0
);
6570 gsi_insert_before (gsi
, new_stmt
, GSI_SAME_STMT
);
6574 if (mask
&& mask_reduc_fn
== IFN_LAST
)
6575 def0
= merge_with_identity (gsi
, mask
, vectype_out
, def0
,
6578 /* On the first iteration the input is simply the scalar phi
6579 result, and for subsequent iterations it is the output of
6580 the preceding operation. */
6581 if (reduc_fn
!= IFN_LAST
|| (mask
&& mask_reduc_fn
!= IFN_LAST
))
6583 if (mask
&& mask_reduc_fn
!= IFN_LAST
)
6584 new_stmt
= gimple_build_call_internal (mask_reduc_fn
, 3, reduc_var
,
6587 new_stmt
= gimple_build_call_internal (reduc_fn
, 2, reduc_var
,
6589 /* For chained SLP reductions the output of the previous reduction
6590 operation serves as the input of the next. For the final statement
6591 the output cannot be a temporary - we reuse the original
6592 scalar destination of the last statement. */
6593 if (i
!= vec_num
- 1)
6595 gimple_set_lhs (new_stmt
, scalar_dest_var
);
6596 reduc_var
= make_ssa_name (scalar_dest_var
, new_stmt
);
6597 gimple_set_lhs (new_stmt
, reduc_var
);
6602 reduc_var
= vect_expand_fold_left (gsi
, scalar_dest_var
, code
,
6604 new_stmt
= SSA_NAME_DEF_STMT (reduc_var
);
6605 /* Remove the statement, so that we can use the same code paths
6606 as for statements that we've just created. */
6607 gimple_stmt_iterator tmp_gsi
= gsi_for_stmt (new_stmt
);
6608 gsi_remove (&tmp_gsi
, true);
6611 if (i
== vec_num
- 1)
6613 gimple_set_lhs (new_stmt
, scalar_dest
);
6614 vect_finish_replace_stmt (loop_vinfo
,
6615 scalar_dest_def_info
,
6619 vect_finish_stmt_generation (loop_vinfo
,
6620 scalar_dest_def_info
,
6624 SLP_TREE_VEC_STMTS (slp_node
).quick_push (new_stmt
);
6627 STMT_VINFO_VEC_STMTS (stmt_info
).safe_push (new_stmt
);
6628 *vec_stmt
= new_stmt
;
6635 /* Function is_nonwrapping_integer_induction.
6637 Check if STMT_VINO (which is part of loop LOOP) both increments and
6638 does not cause overflow. */
6641 is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo
, class loop
*loop
)
6643 gphi
*phi
= as_a
<gphi
*> (stmt_vinfo
->stmt
);
6644 tree base
= STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo
);
6645 tree step
= STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo
);
6646 tree lhs_type
= TREE_TYPE (gimple_phi_result (phi
));
6647 widest_int ni
, max_loop_value
, lhs_max
;
6648 wi::overflow_type overflow
= wi::OVF_NONE
;
6650 /* Make sure the loop is integer based. */
6651 if (TREE_CODE (base
) != INTEGER_CST
6652 || TREE_CODE (step
) != INTEGER_CST
)
6655 /* Check that the max size of the loop will not wrap. */
6657 if (TYPE_OVERFLOW_UNDEFINED (lhs_type
))
6660 if (! max_stmt_executions (loop
, &ni
))
6663 max_loop_value
= wi::mul (wi::to_widest (step
), ni
, TYPE_SIGN (lhs_type
),
6668 max_loop_value
= wi::add (wi::to_widest (base
), max_loop_value
,
6669 TYPE_SIGN (lhs_type
), &overflow
);
6673 return (wi::min_precision (max_loop_value
, TYPE_SIGN (lhs_type
))
6674 <= TYPE_PRECISION (lhs_type
));
6677 /* Check if masking can be supported by inserting a conditional expression.
6678 CODE is the code for the operation. COND_FN is the conditional internal
6679 function, if it exists. VECTYPE_IN is the type of the vector input. */
6681 use_mask_by_cond_expr_p (code_helper code
, internal_fn cond_fn
,
6684 if (cond_fn
!= IFN_LAST
6685 && direct_internal_fn_supported_p (cond_fn
, vectype_in
,
6686 OPTIMIZE_FOR_SPEED
))
6689 if (code
.is_tree_code ())
6690 switch (tree_code (code
))
6702 /* Insert a conditional expression to enable masked vectorization. CODE is the
6703 code for the operation. VOP is the array of operands. MASK is the loop
6704 mask. GSI is a statement iterator used to place the new conditional
6707 build_vect_cond_expr (code_helper code
, tree vop
[3], tree mask
,
6708 gimple_stmt_iterator
*gsi
)
6710 switch (tree_code (code
))
6714 tree vectype
= TREE_TYPE (vop
[1]);
6715 tree zero
= build_zero_cst (vectype
);
6716 tree masked_op1
= make_temp_ssa_name (vectype
, NULL
, "masked_op1");
6717 gassign
*select
= gimple_build_assign (masked_op1
, VEC_COND_EXPR
,
6718 mask
, vop
[1], zero
);
6719 gsi_insert_before (gsi
, select
, GSI_SAME_STMT
);
6720 vop
[1] = masked_op1
;
6726 tree vectype
= TREE_TYPE (vop
[1]);
6727 tree masked_op1
= make_temp_ssa_name (vectype
, NULL
, "masked_op1");
6728 gassign
*select
= gimple_build_assign (masked_op1
, VEC_COND_EXPR
,
6729 mask
, vop
[1], vop
[0]);
6730 gsi_insert_before (gsi
, select
, GSI_SAME_STMT
);
6731 vop
[1] = masked_op1
;
6740 /* Function vectorizable_reduction.
6742 Check if STMT_INFO performs a reduction operation that can be vectorized.
6743 If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
6744 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
6745 Return true if STMT_INFO is vectorizable in this way.
6747 This function also handles reduction idioms (patterns) that have been
6748 recognized in advance during vect_pattern_recog. In this case, STMT_INFO
6749 may be of this form:
6750 X = pattern_expr (arg0, arg1, ..., X)
6751 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
6752 sequence that had been detected and replaced by the pattern-stmt
6755 This function also handles reduction of condition expressions, for example:
6756 for (int i = 0; i < N; i++)
6759 This is handled by vectorising the loop and creating an additional vector
6760 containing the loop indexes for which "a[i] < value" was true. In the
6761 function epilogue this is reduced to a single max value and then used to
6762 index into the vector of results.
6764 In some cases of reduction patterns, the type of the reduction variable X is
6765 different than the type of the other arguments of STMT_INFO.
6766 In such cases, the vectype that is used when transforming STMT_INFO into
6767 a vector stmt is different than the vectype that is used to determine the
6768 vectorization factor, because it consists of a different number of elements
6769 than the actual number of elements that are being operated upon in parallel.
6771 For example, consider an accumulation of shorts into an int accumulator.
6772 On some targets it's possible to vectorize this pattern operating on 8
6773 shorts at a time (hence, the vectype for purposes of determining the
6774 vectorization factor should be V8HI); on the other hand, the vectype that
6775 is used to create the vector form is actually V4SI (the type of the result).
6777 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
6778 indicates what is the actual level of parallelism (V8HI in the example), so
6779 that the right vectorization factor would be derived. This vectype
6780 corresponds to the type of arguments to the reduction stmt, and should *NOT*
6781 be used to create the vectorized stmt. The right vectype for the vectorized
6782 stmt is obtained from the type of the result X:
6783 get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
6785 This means that, contrary to "regular" reductions (or "regular" stmts in
6786 general), the following equation:
6787 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
6788 does *NOT* necessarily hold for reduction patterns. */
6791 vectorizable_reduction (loop_vec_info loop_vinfo
,
6792 stmt_vec_info stmt_info
, slp_tree slp_node
,
6793 slp_instance slp_node_instance
,
6794 stmt_vector_for_cost
*cost_vec
)
6796 tree vectype_in
= NULL_TREE
;
6797 tree vectype_op
[3] = { NULL_TREE
, NULL_TREE
, NULL_TREE
};
6798 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
6799 enum vect_def_type cond_reduc_dt
= vect_unknown_def_type
;
6800 stmt_vec_info cond_stmt_vinfo
= NULL
;
6803 bool single_defuse_cycle
= false;
6804 bool nested_cycle
= false;
6805 bool double_reduc
= false;
6807 tree cr_index_scalar_type
= NULL_TREE
, cr_index_vector_type
= NULL_TREE
;
6808 tree cond_reduc_val
= NULL_TREE
;
6810 /* Make sure it was already recognized as a reduction computation. */
6811 if (STMT_VINFO_DEF_TYPE (stmt_info
) != vect_reduction_def
6812 && STMT_VINFO_DEF_TYPE (stmt_info
) != vect_double_reduction_def
6813 && STMT_VINFO_DEF_TYPE (stmt_info
) != vect_nested_cycle
)
6816 /* The stmt we store reduction analysis meta on. */
6817 stmt_vec_info reduc_info
= info_for_reduction (loop_vinfo
, stmt_info
);
6818 reduc_info
->is_reduc_info
= true;
6820 if (STMT_VINFO_DEF_TYPE (stmt_info
) == vect_nested_cycle
)
6822 if (is_a
<gphi
*> (stmt_info
->stmt
))
6826 /* We eventually need to set a vector type on invariant
6830 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node
), j
, child
)
6831 if (!vect_maybe_update_slp_op_vectype
6832 (child
, SLP_TREE_VECTYPE (slp_node
)))
6834 if (dump_enabled_p ())
6835 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6836 "incompatible vector types for "
6841 /* Analysis for double-reduction is done on the outer
6842 loop PHI, nested cycles have no further restrictions. */
6843 STMT_VINFO_TYPE (stmt_info
) = cycle_phi_info_type
;
6846 STMT_VINFO_TYPE (stmt_info
) = reduc_vec_info_type
;
6850 stmt_vec_info orig_stmt_of_analysis
= stmt_info
;
6851 stmt_vec_info phi_info
= stmt_info
;
6852 if (!is_a
<gphi
*> (stmt_info
->stmt
))
6854 STMT_VINFO_TYPE (stmt_info
) = reduc_vec_info_type
;
6859 slp_node_instance
->reduc_phis
= slp_node
;
6860 /* ??? We're leaving slp_node to point to the PHIs, we only
6861 need it to get at the number of vector stmts which wasn't
6862 yet initialized for the instance root. */
6864 if (STMT_VINFO_DEF_TYPE (stmt_info
) == vect_double_reduction_def
)
6866 use_operand_p use_p
;
6868 bool res
= single_imm_use (gimple_phi_result (stmt_info
->stmt
),
6871 phi_info
= loop_vinfo
->lookup_stmt (use_stmt
);
6874 /* PHIs should not participate in patterns. */
6875 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info
));
6876 gphi
*reduc_def_phi
= as_a
<gphi
*> (phi_info
->stmt
);
6878 /* Verify following REDUC_IDX from the latch def leads us back to the PHI
6879 and compute the reduction chain length. Discover the real
6880 reduction operation stmt on the way (stmt_info and slp_for_stmt_info). */
6882 = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi
,
6884 (gimple_bb (reduc_def_phi
)->loop_father
));
6885 unsigned reduc_chain_length
= 0;
6886 bool only_slp_reduc_chain
= true;
6888 slp_tree slp_for_stmt_info
= slp_node
? slp_node_instance
->root
: NULL
;
6889 while (reduc_def
!= PHI_RESULT (reduc_def_phi
))
6891 stmt_vec_info def
= loop_vinfo
->lookup_def (reduc_def
);
6892 stmt_vec_info vdef
= vect_stmt_to_vectorize (def
);
6893 if (STMT_VINFO_REDUC_IDX (vdef
) == -1)
6895 if (dump_enabled_p ())
6896 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6897 "reduction chain broken by patterns.\n");
6900 if (!REDUC_GROUP_FIRST_ELEMENT (vdef
))
6901 only_slp_reduc_chain
= false;
6902 /* For epilogue generation live members of the chain need
6903 to point back to the PHI via their original stmt for
6904 info_for_reduction to work. For SLP we need to look at
6905 all lanes here - even though we only will vectorize from
6906 the SLP node with live lane zero the other live lanes also
6907 need to be identified as part of a reduction to be able
6908 to skip code generation for them. */
6909 if (slp_for_stmt_info
)
6911 for (auto s
: SLP_TREE_SCALAR_STMTS (slp_for_stmt_info
))
6912 if (STMT_VINFO_LIVE_P (s
))
6913 STMT_VINFO_REDUC_DEF (vect_orig_stmt (s
)) = phi_info
;
6915 else if (STMT_VINFO_LIVE_P (vdef
))
6916 STMT_VINFO_REDUC_DEF (def
) = phi_info
;
6918 if (!gimple_extract_op (vdef
->stmt
, &op
))
6920 if (dump_enabled_p ())
6921 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6922 "reduction chain includes unsupported"
6923 " statement type.\n");
6926 if (CONVERT_EXPR_CODE_P (op
.code
))
6928 if (!tree_nop_conversion_p (op
.type
, TREE_TYPE (op
.ops
[0])))
6930 if (dump_enabled_p ())
6931 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6932 "conversion in the reduction chain.\n");
6936 else if (!stmt_info
)
6937 /* First non-conversion stmt. */
6939 reduc_def
= op
.ops
[STMT_VINFO_REDUC_IDX (vdef
)];
6940 reduc_chain_length
++;
6941 if (!stmt_info
&& slp_node
)
6942 slp_for_stmt_info
= SLP_TREE_CHILDREN (slp_for_stmt_info
)[0];
6944 /* PHIs should not participate in patterns. */
6945 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info
));
6947 if (nested_in_vect_loop_p (loop
, stmt_info
))
6950 nested_cycle
= true;
6953 /* STMT_VINFO_REDUC_DEF doesn't point to the first but the last
6955 if (slp_node
&& REDUC_GROUP_FIRST_ELEMENT (stmt_info
))
6957 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (stmt_info
));
6958 stmt_info
= REDUC_GROUP_FIRST_ELEMENT (stmt_info
);
6960 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info
))
6961 gcc_assert (slp_node
6962 && REDUC_GROUP_FIRST_ELEMENT (stmt_info
) == stmt_info
);
6964 /* 1. Is vectorizable reduction? */
6965 /* Not supportable if the reduction variable is used in the loop, unless
6966 it's a reduction chain. */
6967 if (STMT_VINFO_RELEVANT (stmt_info
) > vect_used_in_outer
6968 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info
))
6971 /* Reductions that are not used even in an enclosing outer-loop,
6972 are expected to be "live" (used out of the loop). */
6973 if (STMT_VINFO_RELEVANT (stmt_info
) == vect_unused_in_scope
6974 && !STMT_VINFO_LIVE_P (stmt_info
))
6977 /* 2. Has this been recognized as a reduction pattern?
6979 Check if STMT represents a pattern that has been recognized
6980 in earlier analysis stages. For stmts that represent a pattern,
6981 the STMT_VINFO_RELATED_STMT field records the last stmt in
6982 the original sequence that constitutes the pattern. */
6984 stmt_vec_info orig_stmt_info
= STMT_VINFO_RELATED_STMT (stmt_info
);
6987 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info
));
6988 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info
));
6991 /* 3. Check the operands of the operation. The first operands are defined
6992 inside the loop body. The last operand is the reduction variable,
6993 which is defined by the loop-header-phi. */
6995 tree vectype_out
= STMT_VINFO_VECTYPE (stmt_info
);
6996 STMT_VINFO_REDUC_VECTYPE (reduc_info
) = vectype_out
;
6998 if (!gimple_extract_op (stmt_info
->stmt
, &op
))
7000 bool lane_reduc_code_p
= (op
.code
== DOT_PROD_EXPR
7001 || op
.code
== WIDEN_SUM_EXPR
7002 || op
.code
== SAD_EXPR
);
7004 if (!POINTER_TYPE_P (op
.type
) && !INTEGRAL_TYPE_P (op
.type
)
7005 && !SCALAR_FLOAT_TYPE_P (op
.type
))
7008 /* Do not try to vectorize bit-precision reductions. */
7009 if (!type_has_mode_precision_p (op
.type
))
7012 /* For lane-reducing ops we're reducing the number of reduction PHIs
7013 which means the only use of that may be in the lane-reducing operation. */
7014 if (lane_reduc_code_p
7015 && reduc_chain_length
!= 1
7016 && !only_slp_reduc_chain
)
7018 if (dump_enabled_p ())
7019 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7020 "lane-reducing reduction with extra stmts.\n");
7024 /* All uses but the last are expected to be defined in the loop.
7025 The last use is the reduction variable. In case of nested cycle this
7026 assumption is not true: we use reduc_index to record the index of the
7027 reduction variable. */
7028 slp_tree
*slp_op
= XALLOCAVEC (slp_tree
, op
.num_ops
);
7029 /* We need to skip an extra operand for COND_EXPRs with embedded
7031 unsigned opno_adjust
= 0;
7032 if (op
.code
== COND_EXPR
&& COMPARISON_CLASS_P (op
.ops
[0]))
7034 for (i
= 0; i
< (int) op
.num_ops
; i
++)
7036 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
7037 if (i
== 0 && op
.code
== COND_EXPR
)
7040 stmt_vec_info def_stmt_info
;
7041 enum vect_def_type dt
;
7042 if (!vect_is_simple_use (loop_vinfo
, stmt_info
, slp_for_stmt_info
,
7043 i
+ opno_adjust
, &op
.ops
[i
], &slp_op
[i
], &dt
,
7044 &vectype_op
[i
], &def_stmt_info
))
7046 if (dump_enabled_p ())
7047 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7048 "use not simple.\n");
7051 if (i
== STMT_VINFO_REDUC_IDX (stmt_info
))
7054 /* There should be only one cycle def in the stmt, the one
7055 leading to reduc_def. */
7056 if (VECTORIZABLE_CYCLE_DEF (dt
))
7061 = get_vectype_for_scalar_type (loop_vinfo
,
7062 TREE_TYPE (op
.ops
[i
]), slp_op
[i
]);
7064 /* To properly compute ncopies we are interested in the widest
7065 non-reduction input type in case we're looking at a widening
7066 accumulation that we later handle in vect_transform_reduction. */
7067 if (lane_reduc_code_p
7070 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in
)))
7071 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_op
[i
]))))))
7072 vectype_in
= vectype_op
[i
];
7074 if (op
.code
== COND_EXPR
)
7076 /* Record how the non-reduction-def value of COND_EXPR is defined. */
7077 if (dt
== vect_constant_def
)
7080 cond_reduc_val
= op
.ops
[i
];
7082 if (dt
== vect_induction_def
7084 && is_nonwrapping_integer_induction (def_stmt_info
, loop
))
7087 cond_stmt_vinfo
= def_stmt_info
;
7092 vectype_in
= STMT_VINFO_VECTYPE (phi_info
);
7093 STMT_VINFO_REDUC_VECTYPE_IN (reduc_info
) = vectype_in
;
7095 enum vect_reduction_type v_reduc_type
= STMT_VINFO_REDUC_TYPE (phi_info
);
7096 STMT_VINFO_REDUC_TYPE (reduc_info
) = v_reduc_type
;
7097 /* If we have a condition reduction, see if we can simplify it further. */
7098 if (v_reduc_type
== COND_REDUCTION
)
7103 /* When the condition uses the reduction value in the condition, fail. */
7104 if (STMT_VINFO_REDUC_IDX (stmt_info
) == 0)
7106 if (dump_enabled_p ())
7107 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7108 "condition depends on previous iteration\n");
7112 if (reduc_chain_length
== 1
7113 && direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST
,
7114 vectype_in
, OPTIMIZE_FOR_SPEED
))
7116 if (dump_enabled_p ())
7117 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7118 "optimizing condition reduction with"
7119 " FOLD_EXTRACT_LAST.\n");
7120 STMT_VINFO_REDUC_TYPE (reduc_info
) = EXTRACT_LAST_REDUCTION
;
7122 else if (cond_reduc_dt
== vect_induction_def
)
7125 = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo
);
7126 tree step
= STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo
);
7128 gcc_assert (TREE_CODE (base
) == INTEGER_CST
7129 && TREE_CODE (step
) == INTEGER_CST
);
7130 cond_reduc_val
= NULL_TREE
;
7131 enum tree_code cond_reduc_op_code
= ERROR_MARK
;
7132 tree res
= PHI_RESULT (STMT_VINFO_STMT (cond_stmt_vinfo
));
7133 if (!types_compatible_p (TREE_TYPE (res
), TREE_TYPE (base
)))
7135 /* Find a suitable value, for MAX_EXPR below base, for MIN_EXPR
7136 above base; punt if base is the minimum value of the type for
7137 MAX_EXPR or maximum value of the type for MIN_EXPR for now. */
7138 else if (tree_int_cst_sgn (step
) == -1)
7140 cond_reduc_op_code
= MIN_EXPR
;
7141 if (tree_int_cst_sgn (base
) == -1)
7142 cond_reduc_val
= build_int_cst (TREE_TYPE (base
), 0);
7143 else if (tree_int_cst_lt (base
,
7144 TYPE_MAX_VALUE (TREE_TYPE (base
))))
7146 = int_const_binop (PLUS_EXPR
, base
, integer_one_node
);
7150 cond_reduc_op_code
= MAX_EXPR
;
7151 if (tree_int_cst_sgn (base
) == 1)
7152 cond_reduc_val
= build_int_cst (TREE_TYPE (base
), 0);
7153 else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base
)),
7156 = int_const_binop (MINUS_EXPR
, base
, integer_one_node
);
7160 if (dump_enabled_p ())
7161 dump_printf_loc (MSG_NOTE
, vect_location
,
7162 "condition expression based on "
7163 "integer induction.\n");
7164 STMT_VINFO_REDUC_CODE (reduc_info
) = cond_reduc_op_code
;
7165 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info
)
7167 STMT_VINFO_REDUC_TYPE (reduc_info
) = INTEGER_INDUC_COND_REDUCTION
;
7170 else if (cond_reduc_dt
== vect_constant_def
)
7172 enum vect_def_type cond_initial_dt
;
7173 tree cond_initial_val
= vect_phi_initial_value (reduc_def_phi
);
7174 vect_is_simple_use (cond_initial_val
, loop_vinfo
, &cond_initial_dt
);
7175 if (cond_initial_dt
== vect_constant_def
7176 && types_compatible_p (TREE_TYPE (cond_initial_val
),
7177 TREE_TYPE (cond_reduc_val
)))
7179 tree e
= fold_binary (LE_EXPR
, boolean_type_node
,
7180 cond_initial_val
, cond_reduc_val
);
7181 if (e
&& (integer_onep (e
) || integer_zerop (e
)))
7183 if (dump_enabled_p ())
7184 dump_printf_loc (MSG_NOTE
, vect_location
,
7185 "condition expression based on "
7186 "compile time constant.\n");
7187 /* Record reduction code at analysis stage. */
7188 STMT_VINFO_REDUC_CODE (reduc_info
)
7189 = integer_onep (e
) ? MAX_EXPR
: MIN_EXPR
;
7190 STMT_VINFO_REDUC_TYPE (reduc_info
) = CONST_COND_REDUCTION
;
7196 if (STMT_VINFO_LIVE_P (phi_info
))
7202 ncopies
= vect_get_num_copies (loop_vinfo
, vectype_in
);
7204 gcc_assert (ncopies
>= 1);
7206 poly_uint64 nunits_out
= TYPE_VECTOR_SUBPARTS (vectype_out
);
7210 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info
)
7211 == vect_double_reduction_def
);
7212 double_reduc
= true;
7215 /* 4.2. Check support for the epilog operation.
7217 If STMT represents a reduction pattern, then the type of the
7218 reduction variable may be different than the type of the rest
7219 of the arguments. For example, consider the case of accumulation
7220 of shorts into an int accumulator; The original code:
7221 S1: int_a = (int) short_a;
7222 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
7225 STMT: int_acc = widen_sum <short_a, int_acc>
7228 1. The tree-code that is used to create the vector operation in the
7229 epilog code (that reduces the partial results) is not the
7230 tree-code of STMT, but is rather the tree-code of the original
7231 stmt from the pattern that STMT is replacing. I.e, in the example
7232 above we want to use 'widen_sum' in the loop, but 'plus' in the
7234 2. The type (mode) we use to check available target support
7235 for the vector operation to be created in the *epilog*, is
7236 determined by the type of the reduction variable (in the example
7237 above we'd check this: optab_handler (plus_optab, vect_int_mode])).
7238 However the type (mode) we use to check available target support
7239 for the vector operation to be created *inside the loop*, is
7240 determined by the type of the other arguments to STMT (in the
7241 example we'd check this: optab_handler (widen_sum_optab,
7244 This is contrary to "regular" reductions, in which the types of all
7245 the arguments are the same as the type of the reduction variable.
7246 For "regular" reductions we can therefore use the same vector type
7247 (and also the same tree-code) when generating the epilog code and
7248 when generating the code inside the loop. */
7250 code_helper orig_code
= STMT_VINFO_REDUC_CODE (phi_info
);
7251 STMT_VINFO_REDUC_CODE (reduc_info
) = orig_code
;
7253 vect_reduction_type reduction_type
= STMT_VINFO_REDUC_TYPE (reduc_info
);
7254 if (reduction_type
== TREE_CODE_REDUCTION
)
7256 /* Check whether it's ok to change the order of the computation.
7257 Generally, when vectorizing a reduction we change the order of the
7258 computation. This may change the behavior of the program in some
7259 cases, so we need to check that this is ok. One exception is when
7260 vectorizing an outer-loop: the inner-loop is executed sequentially,
7261 and therefore vectorizing reductions in the inner-loop during
7262 outer-loop vectorization is safe. Likewise when we are vectorizing
7263 a series of reductions using SLP and the VF is one the reductions
7264 are performed in scalar order. */
7266 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info
)
7267 && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo
), 1u))
7269 else if (needs_fold_left_reduction_p (op
.type
, orig_code
))
7271 /* When vectorizing a reduction chain w/o SLP the reduction PHI
7272 is not directy used in stmt. */
7273 if (!only_slp_reduc_chain
7274 && reduc_chain_length
!= 1)
7276 if (dump_enabled_p ())
7277 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7278 "in-order reduction chain without SLP.\n");
7281 STMT_VINFO_REDUC_TYPE (reduc_info
)
7282 = reduction_type
= FOLD_LEFT_REDUCTION
;
7284 else if (!commutative_binary_op_p (orig_code
, op
.type
)
7285 || !associative_binary_op_p (orig_code
, op
.type
))
7287 if (dump_enabled_p ())
7288 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7289 "reduction: not commutative/associative");
7294 if ((double_reduc
|| reduction_type
!= TREE_CODE_REDUCTION
)
7297 if (dump_enabled_p ())
7298 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7299 "multiple types in double reduction or condition "
7300 "reduction or fold-left reduction.\n");
7304 internal_fn reduc_fn
= IFN_LAST
;
7305 if (reduction_type
== TREE_CODE_REDUCTION
7306 || reduction_type
== FOLD_LEFT_REDUCTION
7307 || reduction_type
== INTEGER_INDUC_COND_REDUCTION
7308 || reduction_type
== CONST_COND_REDUCTION
)
7310 if (reduction_type
== FOLD_LEFT_REDUCTION
7311 ? fold_left_reduction_fn (orig_code
, &reduc_fn
)
7312 : reduction_fn_for_scalar_code (orig_code
, &reduc_fn
))
7314 if (reduc_fn
!= IFN_LAST
7315 && !direct_internal_fn_supported_p (reduc_fn
, vectype_out
,
7316 OPTIMIZE_FOR_SPEED
))
7318 if (dump_enabled_p ())
7319 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7320 "reduc op not supported by target.\n");
7322 reduc_fn
= IFN_LAST
;
7327 if (!nested_cycle
|| double_reduc
)
7329 if (dump_enabled_p ())
7330 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7331 "no reduc code for scalar code.\n");
7337 else if (reduction_type
== COND_REDUCTION
)
7339 int scalar_precision
7340 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (op
.type
));
7341 cr_index_scalar_type
= make_unsigned_type (scalar_precision
);
7342 cr_index_vector_type
= get_same_sized_vectype (cr_index_scalar_type
,
7345 if (direct_internal_fn_supported_p (IFN_REDUC_MAX
, cr_index_vector_type
,
7346 OPTIMIZE_FOR_SPEED
))
7347 reduc_fn
= IFN_REDUC_MAX
;
7349 STMT_VINFO_REDUC_FN (reduc_info
) = reduc_fn
;
7351 if (reduction_type
!= EXTRACT_LAST_REDUCTION
7352 && (!nested_cycle
|| double_reduc
)
7353 && reduc_fn
== IFN_LAST
7354 && !nunits_out
.is_constant ())
7356 if (dump_enabled_p ())
7357 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7358 "missing target support for reduction on"
7359 " variable-length vectors.\n");
7363 /* For SLP reductions, see if there is a neutral value we can use. */
7364 tree neutral_op
= NULL_TREE
;
7367 tree initial_value
= NULL_TREE
;
7368 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info
) != NULL
)
7369 initial_value
= vect_phi_initial_value (reduc_def_phi
);
7370 neutral_op
= neutral_op_for_reduction (TREE_TYPE (vectype_out
),
7371 orig_code
, initial_value
);
7374 if (double_reduc
&& reduction_type
== FOLD_LEFT_REDUCTION
)
7376 /* We can't support in-order reductions of code such as this:
7378 for (int i = 0; i < n1; ++i)
7379 for (int j = 0; j < n2; ++j)
7382 since GCC effectively transforms the loop when vectorizing:
7384 for (int i = 0; i < n1 / VF; ++i)
7385 for (int j = 0; j < n2; ++j)
7386 for (int k = 0; k < VF; ++k)
7389 which is a reassociation of the original operation. */
7390 if (dump_enabled_p ())
7391 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7392 "in-order double reduction not supported.\n");
7397 if (reduction_type
== FOLD_LEFT_REDUCTION
7399 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info
))
7401 /* We cannot use in-order reductions in this case because there is
7402 an implicit reassociation of the operations involved. */
7403 if (dump_enabled_p ())
7404 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7405 "in-order unchained SLP reductions not supported.\n");
7409 /* For double reductions, and for SLP reductions with a neutral value,
7410 we construct a variable-length initial vector by loading a vector
7411 full of the neutral value and then shift-and-inserting the start
7412 values into the low-numbered elements. */
7413 if ((double_reduc
|| neutral_op
)
7414 && !nunits_out
.is_constant ()
7415 && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT
,
7416 vectype_out
, OPTIMIZE_FOR_SPEED
))
7418 if (dump_enabled_p ())
7419 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7420 "reduction on variable-length vectors requires"
7421 " target support for a vector-shift-and-insert"
7426 /* Check extra constraints for variable-length unchained SLP reductions. */
7428 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info
)
7429 && !nunits_out
.is_constant ())
7431 /* We checked above that we could build the initial vector when
7432 there's a neutral element value. Check here for the case in
7433 which each SLP statement has its own initial value and in which
7434 that value needs to be repeated for every instance of the
7435 statement within the initial vector. */
7436 unsigned int group_size
= SLP_TREE_LANES (slp_node
);
7438 && !can_duplicate_and_interleave_p (loop_vinfo
, group_size
,
7439 TREE_TYPE (vectype_out
)))
7441 if (dump_enabled_p ())
7442 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7443 "unsupported form of SLP reduction for"
7444 " variable-length vectors: cannot build"
7445 " initial vector.\n");
7448 /* The epilogue code relies on the number of elements being a multiple
7449 of the group size. The duplicate-and-interleave approach to setting
7450 up the initial vector does too. */
7451 if (!multiple_p (nunits_out
, group_size
))
7453 if (dump_enabled_p ())
7454 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7455 "unsupported form of SLP reduction for"
7456 " variable-length vectors: the vector size"
7457 " is not a multiple of the number of results.\n");
7462 if (reduction_type
== COND_REDUCTION
)
7466 if (! max_loop_iterations (loop
, &ni
))
7468 if (dump_enabled_p ())
7469 dump_printf_loc (MSG_NOTE
, vect_location
,
7470 "loop count not known, cannot create cond "
7474 /* Convert backedges to iterations. */
7477 /* The additional index will be the same type as the condition. Check
7478 that the loop can fit into this less one (because we'll use up the
7479 zero slot for when there are no matches). */
7480 tree max_index
= TYPE_MAX_VALUE (cr_index_scalar_type
);
7481 if (wi::geu_p (ni
, wi::to_widest (max_index
)))
7483 if (dump_enabled_p ())
7484 dump_printf_loc (MSG_NOTE
, vect_location
,
7485 "loop size is greater than data size.\n");
7490 /* In case the vectorization factor (VF) is bigger than the number
7491 of elements that we can fit in a vectype (nunits), we have to generate
7492 more than one vector stmt - i.e - we need to "unroll" the
7493 vector stmt by a factor VF/nunits. For more details see documentation
7494 in vectorizable_operation. */
7496 /* If the reduction is used in an outer loop we need to generate
7497 VF intermediate results, like so (e.g. for ncopies=2):
7502 (i.e. we generate VF results in 2 registers).
7503 In this case we have a separate def-use cycle for each copy, and therefore
7504 for each copy we get the vector def for the reduction variable from the
7505 respective phi node created for this copy.
7507 Otherwise (the reduction is unused in the loop nest), we can combine
7508 together intermediate results, like so (e.g. for ncopies=2):
7512 (i.e. we generate VF/2 results in a single register).
7513 In this case for each copy we get the vector def for the reduction variable
7514 from the vectorized reduction operation generated in the previous iteration.
7516 This only works when we see both the reduction PHI and its only consumer
7517 in vectorizable_reduction and there are no intermediate stmts
7518 participating. When unrolling we want each unrolled iteration to have its
7519 own reduction accumulator since one of the main goals of unrolling a
7520 reduction is to reduce the aggregate loop-carried latency. */
7522 && (STMT_VINFO_RELEVANT (stmt_info
) <= vect_used_only_live
)
7523 && reduc_chain_length
== 1
7524 && loop_vinfo
->suggested_unroll_factor
== 1)
7525 single_defuse_cycle
= true;
7527 if (single_defuse_cycle
|| lane_reduc_code_p
)
7529 gcc_assert (op
.code
!= COND_EXPR
);
7531 /* 4. Supportable by target? */
7534 /* 4.1. check support for the operation in the loop
7536 This isn't necessary for the lane reduction codes, since they
7537 can only be produced by pattern matching, and it's up to the
7538 pattern matcher to test for support. The main reason for
7539 specifically skipping this step is to avoid rechecking whether
7540 mixed-sign dot-products can be implemented using signed
7542 machine_mode vec_mode
= TYPE_MODE (vectype_in
);
7543 if (!lane_reduc_code_p
7544 && !directly_supported_p (op
.code
, vectype_in
, optab_vector
))
7546 if (dump_enabled_p ())
7547 dump_printf (MSG_NOTE
, "op not supported by target.\n");
7548 if (maybe_ne (GET_MODE_SIZE (vec_mode
), UNITS_PER_WORD
)
7549 || !vect_can_vectorize_without_simd_p (op
.code
))
7552 if (dump_enabled_p ())
7553 dump_printf (MSG_NOTE
, "proceeding using word mode.\n");
7556 if (vect_emulated_vector_p (vectype_in
)
7557 && !vect_can_vectorize_without_simd_p (op
.code
))
7559 if (dump_enabled_p ())
7560 dump_printf (MSG_NOTE
, "using word mode not possible.\n");
7564 /* lane-reducing operations have to go through vect_transform_reduction.
7565 For the other cases try without the single cycle optimization. */
7568 if (lane_reduc_code_p
)
7571 single_defuse_cycle
= false;
7574 STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info
) = single_defuse_cycle
;
7576 /* If the reduction stmt is one of the patterns that have lane
7577 reduction embedded we cannot handle the case of ! single_defuse_cycle. */
7578 if ((ncopies
> 1 && ! single_defuse_cycle
)
7579 && lane_reduc_code_p
)
7581 if (dump_enabled_p ())
7582 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7583 "multi def-use cycle not possible for lane-reducing "
7584 "reduction operation\n");
7589 && !(!single_defuse_cycle
7590 && !lane_reduc_code_p
7591 && reduction_type
!= FOLD_LEFT_REDUCTION
))
7592 for (i
= 0; i
< (int) op
.num_ops
; i
++)
7593 if (!vect_maybe_update_slp_op_vectype (slp_op
[i
], vectype_op
[i
]))
7595 if (dump_enabled_p ())
7596 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7597 "incompatible vector types for invariants\n");
7602 vec_num
= SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node
);
7606 vect_model_reduction_cost (loop_vinfo
, stmt_info
, reduc_fn
,
7607 reduction_type
, ncopies
, cost_vec
);
7608 /* Cost the reduction op inside the loop if transformed via
7609 vect_transform_reduction. Otherwise this is costed by the
7610 separate vectorizable_* routines. */
7611 if (single_defuse_cycle
|| lane_reduc_code_p
)
7614 if (vect_is_emulated_mixed_dot_prod (loop_vinfo
, stmt_info
))
7615 /* Three dot-products and a subtraction. */
7617 record_stmt_cost (cost_vec
, ncopies
* factor
, vector_stmt
,
7618 stmt_info
, 0, vect_body
);
7621 if (dump_enabled_p ()
7622 && reduction_type
== FOLD_LEFT_REDUCTION
)
7623 dump_printf_loc (MSG_NOTE
, vect_location
,
7624 "using an in-order (fold-left) reduction.\n");
7625 STMT_VINFO_TYPE (orig_stmt_of_analysis
) = cycle_phi_info_type
;
7626 /* All but single defuse-cycle optimized, lane-reducing and fold-left
7627 reductions go through their own vectorizable_* routines. */
7628 if (!single_defuse_cycle
7629 && !lane_reduc_code_p
7630 && reduction_type
!= FOLD_LEFT_REDUCTION
)
7633 = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info
));
7634 if (slp_node
&& REDUC_GROUP_FIRST_ELEMENT (tem
))
7636 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (tem
));
7637 tem
= REDUC_GROUP_FIRST_ELEMENT (tem
);
7639 STMT_VINFO_DEF_TYPE (vect_orig_stmt (tem
)) = vect_internal_def
;
7640 STMT_VINFO_DEF_TYPE (tem
) = vect_internal_def
;
7642 else if (loop_vinfo
&& LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo
))
7644 vec_loop_masks
*masks
= &LOOP_VINFO_MASKS (loop_vinfo
);
7645 internal_fn cond_fn
= get_conditional_internal_fn (op
.code
, op
.type
);
7647 if (reduction_type
!= FOLD_LEFT_REDUCTION
7648 && !use_mask_by_cond_expr_p (op
.code
, cond_fn
, vectype_in
)
7649 && (cond_fn
== IFN_LAST
7650 || !direct_internal_fn_supported_p (cond_fn
, vectype_in
,
7651 OPTIMIZE_FOR_SPEED
)))
7653 if (dump_enabled_p ())
7654 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7655 "can't operate on partial vectors because"
7656 " no conditional operation is available.\n");
7657 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo
) = false;
7659 else if (reduction_type
== FOLD_LEFT_REDUCTION
7660 && reduc_fn
== IFN_LAST
7661 && !expand_vec_cond_expr_p (vectype_in
,
7662 truth_type_for (vectype_in
),
7665 if (dump_enabled_p ())
7666 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7667 "can't operate on partial vectors because"
7668 " no conditional operation is available.\n");
7669 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo
) = false;
7672 vect_record_loop_mask (loop_vinfo
, masks
, ncopies
* vec_num
,
7678 /* STMT_INFO is a dot-product reduction whose multiplication operands
7679 have different signs. Emit a sequence to emulate the operation
7680 using a series of signed DOT_PROD_EXPRs and return the last
7681 statement generated. VEC_DEST is the result of the vector operation
7682 and VOP lists its inputs. */
7685 vect_emulate_mixed_dot_prod (loop_vec_info loop_vinfo
, stmt_vec_info stmt_info
,
7686 gimple_stmt_iterator
*gsi
, tree vec_dest
,
7689 tree wide_vectype
= signed_type_for (TREE_TYPE (vec_dest
));
7690 tree narrow_vectype
= signed_type_for (TREE_TYPE (vop
[0]));
7691 tree narrow_elttype
= TREE_TYPE (narrow_vectype
);
7694 /* Make VOP[0] the unsigned operand VOP[1] the signed operand. */
7695 if (!TYPE_UNSIGNED (TREE_TYPE (vop
[0])))
7696 std::swap (vop
[0], vop
[1]);
7698 /* Convert all inputs to signed types. */
7699 for (int i
= 0; i
< 3; ++i
)
7700 if (TYPE_UNSIGNED (TREE_TYPE (vop
[i
])))
7702 tree tmp
= make_ssa_name (signed_type_for (TREE_TYPE (vop
[i
])));
7703 new_stmt
= gimple_build_assign (tmp
, NOP_EXPR
, vop
[i
]);
7704 vect_finish_stmt_generation (loop_vinfo
, stmt_info
, new_stmt
, gsi
);
7708 /* In the comments below we assume 8-bit inputs for simplicity,
7709 but the approach works for any full integer type. */
7711 /* Create a vector of -128. */
7712 tree min_narrow_elttype
= TYPE_MIN_VALUE (narrow_elttype
);
7713 tree min_narrow
= build_vector_from_val (narrow_vectype
,
7714 min_narrow_elttype
);
7716 /* Create a vector of 64. */
7717 auto half_wi
= wi::lrshift (wi::to_wide (min_narrow_elttype
), 1);
7718 tree half_narrow
= wide_int_to_tree (narrow_elttype
, half_wi
);
7719 half_narrow
= build_vector_from_val (narrow_vectype
, half_narrow
);
7721 /* Emit: SUB_RES = VOP[0] - 128. */
7722 tree sub_res
= make_ssa_name (narrow_vectype
);
7723 new_stmt
= gimple_build_assign (sub_res
, PLUS_EXPR
, vop
[0], min_narrow
);
7724 vect_finish_stmt_generation (loop_vinfo
, stmt_info
, new_stmt
, gsi
);
7728 STAGE1 = DOT_PROD_EXPR <VOP[1], 64, VOP[2]>;
7729 STAGE2 = DOT_PROD_EXPR <VOP[1], 64, STAGE1>;
7730 STAGE3 = DOT_PROD_EXPR <SUB_RES, -128, STAGE2>;
7732 on the basis that x * y == (x - 128) * y + 64 * y + 64 * y
7733 Doing the two 64 * y steps first allows more time to compute x. */
7734 tree stage1
= make_ssa_name (wide_vectype
);
7735 new_stmt
= gimple_build_assign (stage1
, DOT_PROD_EXPR
,
7736 vop
[1], half_narrow
, vop
[2]);
7737 vect_finish_stmt_generation (loop_vinfo
, stmt_info
, new_stmt
, gsi
);
7739 tree stage2
= make_ssa_name (wide_vectype
);
7740 new_stmt
= gimple_build_assign (stage2
, DOT_PROD_EXPR
,
7741 vop
[1], half_narrow
, stage1
);
7742 vect_finish_stmt_generation (loop_vinfo
, stmt_info
, new_stmt
, gsi
);
7744 tree stage3
= make_ssa_name (wide_vectype
);
7745 new_stmt
= gimple_build_assign (stage3
, DOT_PROD_EXPR
,
7746 sub_res
, vop
[1], stage2
);
7747 vect_finish_stmt_generation (loop_vinfo
, stmt_info
, new_stmt
, gsi
);
7749 /* Convert STAGE3 to the reduction type. */
7750 return gimple_build_assign (vec_dest
, CONVERT_EXPR
, stage3
);
7753 /* Transform the definition stmt STMT_INFO of a reduction PHI backedge
7757 vect_transform_reduction (loop_vec_info loop_vinfo
,
7758 stmt_vec_info stmt_info
, gimple_stmt_iterator
*gsi
,
7759 gimple
**vec_stmt
, slp_tree slp_node
)
7761 tree vectype_out
= STMT_VINFO_VECTYPE (stmt_info
);
7762 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
7767 stmt_vec_info reduc_info
= info_for_reduction (loop_vinfo
, stmt_info
);
7768 gcc_assert (reduc_info
->is_reduc_info
);
7770 if (nested_in_vect_loop_p (loop
, stmt_info
))
7773 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info
) == vect_double_reduction_def
);
7777 if (!gimple_extract_op (stmt_info
->stmt
, &op
))
7780 /* All uses but the last are expected to be defined in the loop.
7781 The last use is the reduction variable. In case of nested cycle this
7782 assumption is not true: we use reduc_index to record the index of the
7783 reduction variable. */
7784 stmt_vec_info phi_info
= STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info
));
7785 gphi
*reduc_def_phi
= as_a
<gphi
*> (phi_info
->stmt
);
7786 int reduc_index
= STMT_VINFO_REDUC_IDX (stmt_info
);
7787 tree vectype_in
= STMT_VINFO_REDUC_VECTYPE_IN (reduc_info
);
7792 vec_num
= SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node
);
7796 ncopies
= vect_get_num_copies (loop_vinfo
, vectype_in
);
7800 code_helper code
= canonicalize_code (op
.code
, op
.type
);
7801 internal_fn cond_fn
= get_conditional_internal_fn (code
, op
.type
);
7802 vec_loop_masks
*masks
= &LOOP_VINFO_MASKS (loop_vinfo
);
7803 bool mask_by_cond_expr
= use_mask_by_cond_expr_p (code
, cond_fn
, vectype_in
);
7806 tree new_temp
= NULL_TREE
;
7807 auto_vec
<tree
> vec_oprnds0
;
7808 auto_vec
<tree
> vec_oprnds1
;
7809 auto_vec
<tree
> vec_oprnds2
;
7812 if (dump_enabled_p ())
7813 dump_printf_loc (MSG_NOTE
, vect_location
, "transform reduction.\n");
7815 /* FORNOW: Multiple types are not supported for condition. */
7816 if (code
== COND_EXPR
)
7817 gcc_assert (ncopies
== 1);
7819 bool masked_loop_p
= LOOP_VINFO_FULLY_MASKED_P (loop_vinfo
);
7821 vect_reduction_type reduction_type
= STMT_VINFO_REDUC_TYPE (reduc_info
);
7822 if (reduction_type
== FOLD_LEFT_REDUCTION
)
7824 internal_fn reduc_fn
= STMT_VINFO_REDUC_FN (reduc_info
);
7825 gcc_assert (code
.is_tree_code ());
7826 return vectorize_fold_left_reduction
7827 (loop_vinfo
, stmt_info
, gsi
, vec_stmt
, slp_node
, reduc_def_phi
,
7828 tree_code (code
), reduc_fn
, op
.ops
, vectype_in
, reduc_index
, masks
);
7831 bool single_defuse_cycle
= STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info
);
7832 gcc_assert (single_defuse_cycle
7833 || code
== DOT_PROD_EXPR
7834 || code
== WIDEN_SUM_EXPR
7835 || code
== SAD_EXPR
);
7837 /* Create the destination vector */
7838 tree scalar_dest
= gimple_get_lhs (stmt_info
->stmt
);
7839 tree vec_dest
= vect_create_destination_var (scalar_dest
, vectype_out
);
7841 vect_get_vec_defs (loop_vinfo
, stmt_info
, slp_node
, ncopies
,
7842 single_defuse_cycle
&& reduc_index
== 0
7843 ? NULL_TREE
: op
.ops
[0], &vec_oprnds0
,
7844 single_defuse_cycle
&& reduc_index
== 1
7845 ? NULL_TREE
: op
.ops
[1], &vec_oprnds1
,
7847 && !(single_defuse_cycle
&& reduc_index
== 2)
7848 ? op
.ops
[2] : NULL_TREE
, &vec_oprnds2
);
7849 if (single_defuse_cycle
)
7851 gcc_assert (!slp_node
);
7852 vect_get_vec_defs_for_operand (loop_vinfo
, stmt_info
, 1,
7853 op
.ops
[reduc_index
],
7854 reduc_index
== 0 ? &vec_oprnds0
7855 : (reduc_index
== 1 ? &vec_oprnds1
7859 bool emulated_mixed_dot_prod
7860 = vect_is_emulated_mixed_dot_prod (loop_vinfo
, stmt_info
);
7861 FOR_EACH_VEC_ELT (vec_oprnds0
, i
, def0
)
7864 tree vop
[3] = { def0
, vec_oprnds1
[i
], NULL_TREE
};
7865 if (masked_loop_p
&& !mask_by_cond_expr
)
7867 /* No conditional ifns have been defined for dot-product yet. */
7868 gcc_assert (code
!= DOT_PROD_EXPR
);
7870 /* Make sure that the reduction accumulator is vop[0]. */
7871 if (reduc_index
== 1)
7873 gcc_assert (commutative_binary_op_p (code
, op
.type
));
7874 std::swap (vop
[0], vop
[1]);
7876 tree mask
= vect_get_loop_mask (gsi
, masks
, vec_num
* ncopies
,
7878 gcall
*call
= gimple_build_call_internal (cond_fn
, 4, mask
,
7879 vop
[0], vop
[1], vop
[0]);
7880 new_temp
= make_ssa_name (vec_dest
, call
);
7881 gimple_call_set_lhs (call
, new_temp
);
7882 gimple_call_set_nothrow (call
, true);
7883 vect_finish_stmt_generation (loop_vinfo
, stmt_info
, call
, gsi
);
7888 if (op
.num_ops
== 3)
7889 vop
[2] = vec_oprnds2
[i
];
7891 if (masked_loop_p
&& mask_by_cond_expr
)
7893 tree mask
= vect_get_loop_mask (gsi
, masks
, vec_num
* ncopies
,
7895 build_vect_cond_expr (code
, vop
, mask
, gsi
);
7898 if (emulated_mixed_dot_prod
)
7899 new_stmt
= vect_emulate_mixed_dot_prod (loop_vinfo
, stmt_info
, gsi
,
7901 else if (code
.is_internal_fn ())
7902 new_stmt
= gimple_build_call_internal (internal_fn (code
),
7904 vop
[0], vop
[1], vop
[2]);
7906 new_stmt
= gimple_build_assign (vec_dest
, tree_code (op
.code
),
7907 vop
[0], vop
[1], vop
[2]);
7908 new_temp
= make_ssa_name (vec_dest
, new_stmt
);
7909 gimple_set_lhs (new_stmt
, new_temp
);
7910 vect_finish_stmt_generation (loop_vinfo
, stmt_info
, new_stmt
, gsi
);
7914 SLP_TREE_VEC_STMTS (slp_node
).quick_push (new_stmt
);
7915 else if (single_defuse_cycle
7918 if (reduc_index
== 0)
7919 vec_oprnds0
.safe_push (gimple_get_lhs (new_stmt
));
7920 else if (reduc_index
== 1)
7921 vec_oprnds1
.safe_push (gimple_get_lhs (new_stmt
));
7922 else if (reduc_index
== 2)
7923 vec_oprnds2
.safe_push (gimple_get_lhs (new_stmt
));
7926 STMT_VINFO_VEC_STMTS (stmt_info
).safe_push (new_stmt
);
7930 *vec_stmt
= STMT_VINFO_VEC_STMTS (stmt_info
)[0];
7935 /* Transform phase of a cycle PHI. */
7938 vect_transform_cycle_phi (loop_vec_info loop_vinfo
,
7939 stmt_vec_info stmt_info
, gimple
**vec_stmt
,
7940 slp_tree slp_node
, slp_instance slp_node_instance
)
7942 tree vectype_out
= STMT_VINFO_VECTYPE (stmt_info
);
7943 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
7947 bool nested_cycle
= false;
7950 if (nested_in_vect_loop_p (loop
, stmt_info
))
7953 nested_cycle
= true;
7956 stmt_vec_info reduc_stmt_info
= STMT_VINFO_REDUC_DEF (stmt_info
);
7957 reduc_stmt_info
= vect_stmt_to_vectorize (reduc_stmt_info
);
7958 stmt_vec_info reduc_info
= info_for_reduction (loop_vinfo
, stmt_info
);
7959 gcc_assert (reduc_info
->is_reduc_info
);
7961 if (STMT_VINFO_REDUC_TYPE (reduc_info
) == EXTRACT_LAST_REDUCTION
7962 || STMT_VINFO_REDUC_TYPE (reduc_info
) == FOLD_LEFT_REDUCTION
)
7963 /* Leave the scalar phi in place. */
7966 tree vectype_in
= STMT_VINFO_REDUC_VECTYPE_IN (reduc_info
);
7967 /* For a nested cycle we do not fill the above. */
7969 vectype_in
= STMT_VINFO_VECTYPE (stmt_info
);
7970 gcc_assert (vectype_in
);
7974 /* The size vect_schedule_slp_instance computes is off for us. */
7975 vec_num
= vect_get_num_vectors (LOOP_VINFO_VECT_FACTOR (loop_vinfo
)
7976 * SLP_TREE_LANES (slp_node
), vectype_in
);
7982 ncopies
= vect_get_num_copies (loop_vinfo
, vectype_in
);
7985 /* Check whether we should use a single PHI node and accumulate
7986 vectors to one before the backedge. */
7987 if (STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info
))
7990 /* Create the destination vector */
7991 gphi
*phi
= as_a
<gphi
*> (stmt_info
->stmt
);
7992 tree vec_dest
= vect_create_destination_var (gimple_phi_result (phi
),
7995 /* Get the loop-entry arguments. */
7996 tree vec_initial_def
= NULL_TREE
;
7997 auto_vec
<tree
> vec_initial_defs
;
8000 vec_initial_defs
.reserve (vec_num
);
8003 unsigned phi_idx
= loop_preheader_edge (loop
)->dest_idx
;
8004 vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node
)[phi_idx
],
8009 gcc_assert (slp_node
== slp_node_instance
->reduc_phis
);
8010 vec
<tree
> &initial_values
= reduc_info
->reduc_initial_values
;
8011 vec
<stmt_vec_info
> &stmts
= SLP_TREE_SCALAR_STMTS (slp_node
);
8013 unsigned int num_phis
= stmts
.length ();
8014 if (REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info
))
8016 initial_values
.reserve (num_phis
);
8017 for (unsigned int i
= 0; i
< num_phis
; ++i
)
8019 gphi
*this_phi
= as_a
<gphi
*> (stmts
[i
]->stmt
);
8020 initial_values
.quick_push (vect_phi_initial_value (this_phi
));
8023 vect_find_reusable_accumulator (loop_vinfo
, reduc_info
);
8024 if (!initial_values
.is_empty ())
8027 = (num_phis
== 1 ? initial_values
[0] : NULL_TREE
);
8028 code_helper code
= STMT_VINFO_REDUC_CODE (reduc_info
);
8030 = neutral_op_for_reduction (TREE_TYPE (vectype_out
),
8031 code
, initial_value
);
8032 get_initial_defs_for_reduction (loop_vinfo
, reduc_info
,
8033 &vec_initial_defs
, vec_num
,
8034 stmts
.length (), neutral_op
);
8040 /* Get at the scalar def before the loop, that defines the initial
8041 value of the reduction variable. */
8042 tree initial_def
= vect_phi_initial_value (phi
);
8043 reduc_info
->reduc_initial_values
.safe_push (initial_def
);
8044 /* Optimize: if initial_def is for REDUC_MAX smaller than the base
8045 and we can't use zero for induc_val, use initial_def. Similarly
8046 for REDUC_MIN and initial_def larger than the base. */
8047 if (STMT_VINFO_REDUC_TYPE (reduc_info
) == INTEGER_INDUC_COND_REDUCTION
)
8049 tree induc_val
= STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info
);
8050 if (TREE_CODE (initial_def
) == INTEGER_CST
8051 && !integer_zerop (induc_val
)
8052 && ((STMT_VINFO_REDUC_CODE (reduc_info
) == MAX_EXPR
8053 && tree_int_cst_lt (initial_def
, induc_val
))
8054 || (STMT_VINFO_REDUC_CODE (reduc_info
) == MIN_EXPR
8055 && tree_int_cst_lt (induc_val
, initial_def
))))
8057 induc_val
= initial_def
;
8058 /* Communicate we used the initial_def to epilouge
8060 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info
) = NULL_TREE
;
8062 vec_initial_def
= build_vector_from_val (vectype_out
, induc_val
);
8064 else if (nested_cycle
)
8066 /* Do not use an adjustment def as that case is not supported
8067 correctly if ncopies is not one. */
8068 vect_get_vec_defs_for_operand (loop_vinfo
, reduc_stmt_info
,
8069 ncopies
, initial_def
,
8072 else if (STMT_VINFO_REDUC_TYPE (reduc_info
) == CONST_COND_REDUCTION
8073 || STMT_VINFO_REDUC_TYPE (reduc_info
) == COND_REDUCTION
)
8074 /* Fill the initial vector with the initial scalar value. */
8076 = get_initial_def_for_reduction (loop_vinfo
, reduc_stmt_info
,
8077 initial_def
, initial_def
);
8081 vect_find_reusable_accumulator (loop_vinfo
, reduc_info
);
8082 if (!reduc_info
->reduc_initial_values
.is_empty ())
8084 initial_def
= reduc_info
->reduc_initial_values
[0];
8085 code_helper code
= STMT_VINFO_REDUC_CODE (reduc_info
);
8087 = neutral_op_for_reduction (TREE_TYPE (initial_def
),
8089 gcc_assert (neutral_op
);
8090 /* Try to simplify the vector initialization by applying an
8091 adjustment after the reduction has been performed. */
8092 if (!reduc_info
->reused_accumulator
8093 && STMT_VINFO_DEF_TYPE (stmt_info
) == vect_reduction_def
8094 && !operand_equal_p (neutral_op
, initial_def
))
8096 STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info
)
8098 initial_def
= neutral_op
;
8101 = get_initial_def_for_reduction (loop_vinfo
, reduc_info
,
8102 initial_def
, neutral_op
);
8107 if (vec_initial_def
)
8109 vec_initial_defs
.create (ncopies
);
8110 for (i
= 0; i
< ncopies
; ++i
)
8111 vec_initial_defs
.quick_push (vec_initial_def
);
8114 if (auto *accumulator
= reduc_info
->reused_accumulator
)
8116 tree def
= accumulator
->reduc_input
;
8117 if (!useless_type_conversion_p (vectype_out
, TREE_TYPE (def
)))
8119 unsigned int nreduc
;
8120 bool res
= constant_multiple_p (TYPE_VECTOR_SUBPARTS
8122 TYPE_VECTOR_SUBPARTS (vectype_out
),
8125 gimple_seq stmts
= NULL
;
8126 /* Reduce the single vector to a smaller one. */
8129 /* Perform the reduction in the appropriate type. */
8130 tree rvectype
= vectype_out
;
8131 if (!useless_type_conversion_p (TREE_TYPE (vectype_out
),
8132 TREE_TYPE (TREE_TYPE (def
))))
8133 rvectype
= build_vector_type (TREE_TYPE (TREE_TYPE (def
)),
8134 TYPE_VECTOR_SUBPARTS
8136 def
= vect_create_partial_epilog (def
, rvectype
,
8137 STMT_VINFO_REDUC_CODE
8141 /* The epilogue loop might use a different vector mode, like
8143 if (TYPE_MODE (vectype_out
) != TYPE_MODE (TREE_TYPE (def
)))
8145 tree reduc_type
= build_vector_type_for_mode
8146 (TREE_TYPE (TREE_TYPE (def
)), TYPE_MODE (vectype_out
));
8147 def
= gimple_convert (&stmts
, reduc_type
, def
);
8149 /* Adjust the input so we pick up the partially reduced value
8150 for the skip edge in vect_create_epilog_for_reduction. */
8151 accumulator
->reduc_input
= def
;
8152 /* And the reduction could be carried out using a different sign. */
8153 if (!useless_type_conversion_p (vectype_out
, TREE_TYPE (def
)))
8154 def
= gimple_convert (&stmts
, vectype_out
, def
);
8155 if (loop_vinfo
->main_loop_edge
)
8157 /* While we'd like to insert on the edge this will split
8158 blocks and disturb bookkeeping, we also will eventually
8159 need this on the skip edge. Rely on sinking to
8160 fixup optimal placement and insert in the pred. */
8161 gimple_stmt_iterator gsi
8162 = gsi_last_bb (loop_vinfo
->main_loop_edge
->src
);
8163 /* Insert before a cond that eventually skips the
8165 if (!gsi_end_p (gsi
) && stmt_ends_bb_p (gsi_stmt (gsi
)))
8167 gsi_insert_seq_after (&gsi
, stmts
, GSI_CONTINUE_LINKING
);
8170 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop
),
8173 if (loop_vinfo
->main_loop_edge
)
8175 = vect_get_main_loop_result (loop_vinfo
, def
,
8176 vec_initial_defs
[0]);
8178 vec_initial_defs
.safe_push (def
);
8181 /* Generate the reduction PHIs upfront. */
8182 for (i
= 0; i
< vec_num
; i
++)
8184 tree vec_init_def
= vec_initial_defs
[i
];
8185 for (j
= 0; j
< ncopies
; j
++)
8187 /* Create the reduction-phi that defines the reduction
8189 gphi
*new_phi
= create_phi_node (vec_dest
, loop
->header
);
8191 /* Set the loop-entry arg of the reduction-phi. */
8192 if (j
!= 0 && nested_cycle
)
8193 vec_init_def
= vec_initial_defs
[j
];
8194 add_phi_arg (new_phi
, vec_init_def
, loop_preheader_edge (loop
),
8197 /* The loop-latch arg is set in epilogue processing. */
8200 SLP_TREE_VEC_STMTS (slp_node
).quick_push (new_phi
);
8204 *vec_stmt
= new_phi
;
8205 STMT_VINFO_VEC_STMTS (stmt_info
).safe_push (new_phi
);
8213 /* Vectorizes LC PHIs. */
8216 vectorizable_lc_phi (loop_vec_info loop_vinfo
,
8217 stmt_vec_info stmt_info
, gimple
**vec_stmt
,
8221 || !is_a
<gphi
*> (stmt_info
->stmt
)
8222 || gimple_phi_num_args (stmt_info
->stmt
) != 1)
8225 if (STMT_VINFO_DEF_TYPE (stmt_info
) != vect_internal_def
8226 && STMT_VINFO_DEF_TYPE (stmt_info
) != vect_double_reduction_def
)
8229 if (!vec_stmt
) /* transformation not required. */
8231 /* Deal with copies from externs or constants that disguise as
8232 loop-closed PHI nodes (PR97886). */
8234 && !vect_maybe_update_slp_op_vectype (SLP_TREE_CHILDREN (slp_node
)[0],
8235 SLP_TREE_VECTYPE (slp_node
)))
8237 if (dump_enabled_p ())
8238 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
8239 "incompatible vector types for invariants\n");
8242 STMT_VINFO_TYPE (stmt_info
) = lc_phi_info_type
;
8246 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
8247 tree scalar_dest
= gimple_phi_result (stmt_info
->stmt
);
8248 basic_block bb
= gimple_bb (stmt_info
->stmt
);
8249 edge e
= single_pred_edge (bb
);
8250 tree vec_dest
= vect_create_destination_var (scalar_dest
, vectype
);
8251 auto_vec
<tree
> vec_oprnds
;
8252 vect_get_vec_defs (loop_vinfo
, stmt_info
, slp_node
,
8253 !slp_node
? vect_get_num_copies (loop_vinfo
, vectype
) : 1,
8254 gimple_phi_arg_def (stmt_info
->stmt
, 0), &vec_oprnds
);
8255 for (unsigned i
= 0; i
< vec_oprnds
.length (); i
++)
8257 /* Create the vectorized LC PHI node. */
8258 gphi
*new_phi
= create_phi_node (vec_dest
, bb
);
8259 add_phi_arg (new_phi
, vec_oprnds
[i
], e
, UNKNOWN_LOCATION
);
8261 SLP_TREE_VEC_STMTS (slp_node
).quick_push (new_phi
);
8263 STMT_VINFO_VEC_STMTS (stmt_info
).safe_push (new_phi
);
8266 *vec_stmt
= STMT_VINFO_VEC_STMTS (stmt_info
)[0];
8271 /* Vectorizes PHIs. */
8274 vectorizable_phi (vec_info
*,
8275 stmt_vec_info stmt_info
, gimple
**vec_stmt
,
8276 slp_tree slp_node
, stmt_vector_for_cost
*cost_vec
)
8278 if (!is_a
<gphi
*> (stmt_info
->stmt
) || !slp_node
)
8281 if (STMT_VINFO_DEF_TYPE (stmt_info
) != vect_internal_def
)
8284 tree vectype
= SLP_TREE_VECTYPE (slp_node
);
8286 if (!vec_stmt
) /* transformation not required. */
8290 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node
), i
, child
)
8293 if (dump_enabled_p ())
8294 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
8295 "PHI node with unvectorized backedge def\n");
8298 else if (!vect_maybe_update_slp_op_vectype (child
, vectype
))
8300 if (dump_enabled_p ())
8301 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
8302 "incompatible vector types for invariants\n");
8305 else if (SLP_TREE_DEF_TYPE (child
) == vect_internal_def
8306 && !useless_type_conversion_p (vectype
,
8307 SLP_TREE_VECTYPE (child
)))
8309 /* With bools we can have mask and non-mask precision vectors
8310 or different non-mask precisions. while pattern recog is
8311 supposed to guarantee consistency here bugs in it can cause
8312 mismatches (PR103489 and PR103800 for example).
8313 Deal with them here instead of ICEing later. */
8314 if (dump_enabled_p ())
8315 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
8316 "incompatible vector type setup from "
8317 "bool pattern detection\n");
8321 /* For single-argument PHIs assume coalescing which means zero cost
8322 for the scalar and the vector PHIs. This avoids artificially
8323 favoring the vector path (but may pessimize it in some cases). */
8324 if (gimple_phi_num_args (as_a
<gphi
*> (stmt_info
->stmt
)) > 1)
8325 record_stmt_cost (cost_vec
, SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node
),
8326 vector_stmt
, stmt_info
, vectype
, 0, vect_body
);
8327 STMT_VINFO_TYPE (stmt_info
) = phi_info_type
;
8331 tree scalar_dest
= gimple_phi_result (stmt_info
->stmt
);
8332 basic_block bb
= gimple_bb (stmt_info
->stmt
);
8333 tree vec_dest
= vect_create_destination_var (scalar_dest
, vectype
);
8334 auto_vec
<gphi
*> new_phis
;
8335 for (unsigned i
= 0; i
< gimple_phi_num_args (stmt_info
->stmt
); ++i
)
8337 slp_tree child
= SLP_TREE_CHILDREN (slp_node
)[i
];
8339 /* Skip not yet vectorized defs. */
8340 if (SLP_TREE_DEF_TYPE (child
) == vect_internal_def
8341 && SLP_TREE_VEC_STMTS (child
).is_empty ())
8344 auto_vec
<tree
> vec_oprnds
;
8345 vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node
)[i
], &vec_oprnds
);
8346 if (!new_phis
.exists ())
8348 new_phis
.create (vec_oprnds
.length ());
8349 for (unsigned j
= 0; j
< vec_oprnds
.length (); j
++)
8351 /* Create the vectorized LC PHI node. */
8352 new_phis
.quick_push (create_phi_node (vec_dest
, bb
));
8353 SLP_TREE_VEC_STMTS (slp_node
).quick_push (new_phis
[j
]);
8356 edge e
= gimple_phi_arg_edge (as_a
<gphi
*> (stmt_info
->stmt
), i
);
8357 for (unsigned j
= 0; j
< vec_oprnds
.length (); j
++)
8358 add_phi_arg (new_phis
[j
], vec_oprnds
[j
], e
, UNKNOWN_LOCATION
);
8360 /* We should have at least one already vectorized child. */
8361 gcc_assert (new_phis
.exists ());
8366 /* Vectorizes first order recurrences. An overview of the transformation
8367 is described below. Suppose we have the following loop.
8370 for (int i = 0; i < n; ++i)
8376 There is a first-order recurrence on 'a'. For this loop, the scalar IR
8377 looks (simplified) like:
8383 i = PHI <0(scalar.preheader), i+1(scalar.body)>
8384 _2 = PHI <(init(scalar.preheader), <_1(scalar.body)>
8387 if (i < n) goto scalar.body
8389 In this example, _2 is a recurrence because it's value depends on the
8390 previous iteration. We vectorize this as (VF = 4)
8393 vect_init = vect_cst(..., ..., ..., 0)
8396 i = PHI <0(vector.preheader), i+4(vector.body)>
8397 vect_1 = PHI <vect_init(vector.preheader), v2(vector.body)>
8398 vect_2 = a[i, i+1, i+2, i+3];
8399 vect_3 = vec_perm (vect_1, vect_2, { 3, 4, 5, 6 })
8400 b[i, i+1, i+2, i+3] = vect_2 - vect_3
8401 if (..) goto vector.body
8403 In this function, vectorizable_recurr, we code generate both the
8404 vector PHI node and the permute since those together compute the
8405 vectorized value of the scalar PHI. We do not yet have the
8406 backedge value to fill in there nor into the vec_perm. Those
8407 are filled in maybe_set_vectorized_backedge_value and
8410 TODO: Since the scalar loop does not have a use of the recurrence
8411 outside of the loop the natural way to implement peeling via
8412 vectorizing the live value doesn't work. For now peeling of loops
8413 with a recurrence is not implemented. For SLP the supported cases
8414 are restricted to those requiring a single vector recurrence PHI. */
8417 vectorizable_recurr (loop_vec_info loop_vinfo
, stmt_vec_info stmt_info
,
8418 gimple
**vec_stmt
, slp_tree slp_node
,
8419 stmt_vector_for_cost
*cost_vec
)
8421 if (!loop_vinfo
|| !is_a
<gphi
*> (stmt_info
->stmt
))
8424 gphi
*phi
= as_a
<gphi
*> (stmt_info
->stmt
);
8426 /* So far we only support first-order recurrence auto-vectorization. */
8427 if (STMT_VINFO_DEF_TYPE (stmt_info
) != vect_first_order_recurrence
)
8430 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
8433 ncopies
= SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node
);
8435 ncopies
= vect_get_num_copies (loop_vinfo
, vectype
);
8436 poly_int64 nunits
= TYPE_VECTOR_SUBPARTS (vectype
);
8437 unsigned dist
= slp_node
? SLP_TREE_LANES (slp_node
) : 1;
8438 /* We need to be able to make progress with a single vector. */
8439 if (maybe_gt (dist
* 2, nunits
))
8441 if (dump_enabled_p ())
8442 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
8443 "first order recurrence exceeds half of "
8448 /* First-order recurrence autovectorization needs to handle permutation
8449 with indices = [nunits-1, nunits, nunits+1, ...]. */
8450 vec_perm_builder
sel (nunits
, 1, 3);
8451 for (int i
= 0; i
< 3; ++i
)
8452 sel
.quick_push (nunits
- dist
+ i
);
8453 vec_perm_indices
indices (sel
, 2, nunits
);
8455 if (!vec_stmt
) /* transformation not required. */
8457 if (!can_vec_perm_const_p (TYPE_MODE (vectype
), TYPE_MODE (vectype
),
8463 /* We eventually need to set a vector type on invariant
8467 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node
), j
, child
)
8468 if (!vect_maybe_update_slp_op_vectype
8469 (child
, SLP_TREE_VECTYPE (slp_node
)))
8471 if (dump_enabled_p ())
8472 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
8473 "incompatible vector types for "
8478 /* The recurrence costs the initialization vector and one permute
8480 unsigned prologue_cost
= record_stmt_cost (cost_vec
, 1, scalar_to_vec
,
8481 stmt_info
, 0, vect_prologue
);
8482 unsigned inside_cost
= record_stmt_cost (cost_vec
, ncopies
, vector_stmt
,
8483 stmt_info
, 0, vect_body
);
8484 if (dump_enabled_p ())
8485 dump_printf_loc (MSG_NOTE
, vect_location
,
8486 "vectorizable_recurr: inside_cost = %d, "
8487 "prologue_cost = %d .\n", inside_cost
,
8490 STMT_VINFO_TYPE (stmt_info
) = recurr_info_type
;
8494 edge pe
= loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo
));
8495 basic_block bb
= gimple_bb (phi
);
8496 tree preheader
= PHI_ARG_DEF_FROM_EDGE (phi
, pe
);
8497 if (!useless_type_conversion_p (TREE_TYPE (vectype
), TREE_TYPE (preheader
)))
8499 gimple_seq stmts
= NULL
;
8500 preheader
= gimple_convert (&stmts
, TREE_TYPE (vectype
), preheader
);
8501 gsi_insert_seq_on_edge_immediate (pe
, stmts
);
8503 tree vec_init
= build_vector_from_val (vectype
, preheader
);
8504 vec_init
= vect_init_vector (loop_vinfo
, stmt_info
, vec_init
, vectype
, NULL
);
8506 /* Create the vectorized first-order PHI node. */
8507 tree vec_dest
= vect_get_new_vect_var (vectype
,
8508 vect_simple_var
, "vec_recur_");
8509 gphi
*new_phi
= create_phi_node (vec_dest
, bb
);
8510 add_phi_arg (new_phi
, vec_init
, pe
, UNKNOWN_LOCATION
);
8512 /* Insert shuffles the first-order recurrence autovectorization.
8513 result = VEC_PERM <vec_recur, vect_1, index[nunits-1, nunits, ...]>. */
8514 tree perm
= vect_gen_perm_mask_checked (vectype
, indices
);
8516 /* Insert the required permute after the latch definition. The
8517 second and later operands are tentative and will be updated when we have
8518 vectorized the latch definition. */
8519 edge le
= loop_latch_edge (LOOP_VINFO_LOOP (loop_vinfo
));
8520 gimple
*latch_def
= SSA_NAME_DEF_STMT (PHI_ARG_DEF_FROM_EDGE (phi
, le
));
8521 gimple_stmt_iterator gsi2
= gsi_for_stmt (latch_def
);
8524 for (unsigned i
= 0; i
< ncopies
; ++i
)
8526 vec_dest
= make_ssa_name (vectype
);
8528 = gimple_build_assign (vec_dest
, VEC_PERM_EXPR
,
8529 i
== 0 ? gimple_phi_result (new_phi
) : NULL
,
8531 vect_finish_stmt_generation (loop_vinfo
, stmt_info
, vperm
, &gsi2
);
8534 SLP_TREE_VEC_STMTS (slp_node
).quick_push (vperm
);
8536 STMT_VINFO_VEC_STMTS (stmt_info
).safe_push (vperm
);
8540 *vec_stmt
= STMT_VINFO_VEC_STMTS (stmt_info
)[0];
8544 /* Return true if VECTYPE represents a vector that requires lowering
8545 by the vector lowering pass. */
8548 vect_emulated_vector_p (tree vectype
)
8550 return (!VECTOR_MODE_P (TYPE_MODE (vectype
))
8551 && (!VECTOR_BOOLEAN_TYPE_P (vectype
)
8552 || TYPE_PRECISION (TREE_TYPE (vectype
)) != 1));
8555 /* Return true if we can emulate CODE on an integer mode representation
8559 vect_can_vectorize_without_simd_p (tree_code code
)
8577 /* Likewise, but taking a code_helper. */
8580 vect_can_vectorize_without_simd_p (code_helper code
)
8582 return (code
.is_tree_code ()
8583 && vect_can_vectorize_without_simd_p (tree_code (code
)));
8586 /* Create vector init for vectorized iv. */
8588 vect_create_nonlinear_iv_init (gimple_seq
* stmts
, tree init_expr
,
8589 tree step_expr
, poly_uint64 nunits
,
8591 enum vect_induction_op_type induction_type
)
8593 unsigned HOST_WIDE_INT const_nunits
;
8594 tree vec_shift
, vec_init
, new_name
;
8596 tree itype
= TREE_TYPE (vectype
);
8598 /* iv_loop is the loop to be vectorized. Create:
8599 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr). */
8600 new_name
= gimple_convert (stmts
, itype
, init_expr
);
8601 switch (induction_type
)
8603 case vect_step_op_shr
:
8604 case vect_step_op_shl
:
8605 /* Build the Initial value from shift_expr. */
8606 vec_init
= gimple_build_vector_from_val (stmts
,
8609 vec_shift
= gimple_build (stmts
, VEC_SERIES_EXPR
, vectype
,
8610 build_zero_cst (itype
), step_expr
);
8611 vec_init
= gimple_build (stmts
,
8612 (induction_type
== vect_step_op_shr
8613 ? RSHIFT_EXPR
: LSHIFT_EXPR
),
8614 vectype
, vec_init
, vec_shift
);
8617 case vect_step_op_neg
:
8619 vec_init
= gimple_build_vector_from_val (stmts
,
8622 tree vec_neg
= gimple_build (stmts
, NEGATE_EXPR
,
8624 /* The encoding has 2 interleaved stepped patterns. */
8625 vec_perm_builder
sel (nunits
, 2, 3);
8627 for (i
= 0; i
< 3; i
++)
8630 sel
[2 * i
+ 1] = i
+ nunits
;
8632 vec_perm_indices
indices (sel
, 2, nunits
);
8633 /* Don't use vect_gen_perm_mask_checked since can_vec_perm_const_p may
8634 fail when vec_init is const vector. In that situation vec_perm is not
8637 = vect_gen_perm_mask_any (vectype
, indices
);
8638 vec_init
= gimple_build (stmts
, VEC_PERM_EXPR
,
8645 case vect_step_op_mul
:
8647 /* Use unsigned mult to avoid UD integer overflow. */
8648 gcc_assert (nunits
.is_constant (&const_nunits
));
8649 tree utype
= unsigned_type_for (itype
);
8650 tree uvectype
= build_vector_type (utype
,
8651 TYPE_VECTOR_SUBPARTS (vectype
));
8652 new_name
= gimple_convert (stmts
, utype
, new_name
);
8653 vec_init
= gimple_build_vector_from_val (stmts
,
8656 tree_vector_builder
elts (uvectype
, const_nunits
, 1);
8657 tree elt_step
= build_one_cst (utype
);
8659 elts
.quick_push (elt_step
);
8660 for (i
= 1; i
< const_nunits
; i
++)
8662 /* Create: new_name_i = new_name + step_expr. */
8663 elt_step
= gimple_build (stmts
, MULT_EXPR
,
8664 utype
, elt_step
, step_expr
);
8665 elts
.quick_push (elt_step
);
8667 /* Create a vector from [new_name_0, new_name_1, ...,
8668 new_name_nunits-1]. */
8669 tree vec_mul
= gimple_build_vector (stmts
, &elts
);
8670 vec_init
= gimple_build (stmts
, MULT_EXPR
, uvectype
,
8672 vec_init
= gimple_convert (stmts
, vectype
, vec_init
);
8683 /* Peel init_expr by skip_niter for induction_type. */
8685 vect_peel_nonlinear_iv_init (gimple_seq
* stmts
, tree init_expr
,
8686 tree skip_niters
, tree step_expr
,
8687 enum vect_induction_op_type induction_type
)
8689 gcc_assert (TREE_CODE (skip_niters
) == INTEGER_CST
);
8690 tree type
= TREE_TYPE (init_expr
);
8691 unsigned prec
= TYPE_PRECISION (type
);
8692 switch (induction_type
)
8694 case vect_step_op_neg
:
8695 if (TREE_INT_CST_LOW (skip_niters
) % 2)
8696 init_expr
= gimple_build (stmts
, NEGATE_EXPR
, type
, init_expr
);
8697 /* else no change. */
8700 case vect_step_op_shr
:
8701 case vect_step_op_shl
:
8702 skip_niters
= gimple_convert (stmts
, type
, skip_niters
);
8703 step_expr
= gimple_build (stmts
, MULT_EXPR
, type
, step_expr
, skip_niters
);
8704 /* When shift mount >= precision, need to avoid UD.
8705 In the original loop, there's no UD, and according to semantic,
8706 init_expr should be 0 for lshr, ashl, and >>= (prec - 1) for ashr. */
8707 if (!tree_fits_uhwi_p (step_expr
)
8708 || tree_to_uhwi (step_expr
) >= prec
)
8710 if (induction_type
== vect_step_op_shl
8711 || TYPE_UNSIGNED (type
))
8712 init_expr
= build_zero_cst (type
);
8714 init_expr
= gimple_build (stmts
, RSHIFT_EXPR
, type
,
8716 wide_int_to_tree (type
, prec
- 1));
8719 init_expr
= gimple_build (stmts
, (induction_type
== vect_step_op_shr
8720 ? RSHIFT_EXPR
: LSHIFT_EXPR
),
8721 type
, init_expr
, step_expr
);
8724 case vect_step_op_mul
:
8726 tree utype
= unsigned_type_for (type
);
8727 init_expr
= gimple_convert (stmts
, utype
, init_expr
);
8728 unsigned skipn
= TREE_INT_CST_LOW (skip_niters
);
8729 wide_int begin
= wi::to_wide (step_expr
);
8730 for (unsigned i
= 0; i
!= skipn
- 1; i
++)
8731 begin
= wi::mul (begin
, wi::to_wide (step_expr
));
8732 tree mult_expr
= wide_int_to_tree (utype
, begin
);
8733 init_expr
= gimple_build (stmts
, MULT_EXPR
, utype
, init_expr
, mult_expr
);
8734 init_expr
= gimple_convert (stmts
, type
, init_expr
);
8745 /* Create vector step for vectorized iv. */
8747 vect_create_nonlinear_iv_step (gimple_seq
* stmts
, tree step_expr
,
8749 enum vect_induction_op_type induction_type
)
8751 tree expr
= build_int_cst (TREE_TYPE (step_expr
), vf
);
8752 tree new_name
= NULL
;
8753 /* Step should be pow (step, vf) for mult induction. */
8754 if (induction_type
== vect_step_op_mul
)
8756 gcc_assert (vf
.is_constant ());
8757 wide_int begin
= wi::to_wide (step_expr
);
8759 for (unsigned i
= 0; i
!= vf
.to_constant () - 1; i
++)
8760 begin
= wi::mul (begin
, wi::to_wide (step_expr
));
8762 new_name
= wide_int_to_tree (TREE_TYPE (step_expr
), begin
);
8764 else if (induction_type
== vect_step_op_neg
)
8768 new_name
= gimple_build (stmts
, MULT_EXPR
, TREE_TYPE (step_expr
),
8774 vect_create_nonlinear_iv_vec_step (loop_vec_info loop_vinfo
,
8775 stmt_vec_info stmt_info
,
8776 tree new_name
, tree vectype
,
8777 enum vect_induction_op_type induction_type
)
8779 /* No step is needed for neg induction. */
8780 if (induction_type
== vect_step_op_neg
)
8783 tree t
= unshare_expr (new_name
);
8784 gcc_assert (CONSTANT_CLASS_P (new_name
)
8785 || TREE_CODE (new_name
) == SSA_NAME
);
8786 tree new_vec
= build_vector_from_val (vectype
, t
);
8787 tree vec_step
= vect_init_vector (loop_vinfo
, stmt_info
,
8788 new_vec
, vectype
, NULL
);
8792 /* Update vectorized iv with vect_step, induc_def is init. */
8794 vect_update_nonlinear_iv (gimple_seq
* stmts
, tree vectype
,
8795 tree induc_def
, tree vec_step
,
8796 enum vect_induction_op_type induction_type
)
8798 tree vec_def
= induc_def
;
8799 switch (induction_type
)
8801 case vect_step_op_mul
:
8803 /* Use unsigned mult to avoid UD integer overflow. */
8805 = build_vector_type (unsigned_type_for (TREE_TYPE (vectype
)),
8806 TYPE_VECTOR_SUBPARTS (vectype
));
8807 vec_def
= gimple_convert (stmts
, uvectype
, vec_def
);
8808 vec_step
= gimple_convert (stmts
, uvectype
, vec_step
);
8809 vec_def
= gimple_build (stmts
, MULT_EXPR
, uvectype
,
8811 vec_def
= gimple_convert (stmts
, vectype
, vec_def
);
8815 case vect_step_op_shr
:
8816 vec_def
= gimple_build (stmts
, RSHIFT_EXPR
, vectype
,
8820 case vect_step_op_shl
:
8821 vec_def
= gimple_build (stmts
, LSHIFT_EXPR
, vectype
,
8824 case vect_step_op_neg
:
8825 vec_def
= induc_def
;
8836 /* Function vectorizable_induction
8838 Check if STMT_INFO performs an nonlinear induction computation that can be
8839 vectorized. If VEC_STMT is also passed, vectorize the induction PHI: create
8840 a vectorized phi to replace it, put it in VEC_STMT, and add it to the same
8842 Return true if STMT_INFO is vectorizable in this way. */
8845 vectorizable_nonlinear_induction (loop_vec_info loop_vinfo
,
8846 stmt_vec_info stmt_info
,
8847 gimple
**vec_stmt
, slp_tree slp_node
,
8848 stmt_vector_for_cost
*cost_vec
)
8850 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
8852 bool nested_in_vect_loop
= false;
8853 class loop
*iv_loop
;
8855 edge pe
= loop_preheader_edge (loop
);
8857 tree vec_init
, vec_step
;
8860 gphi
*induction_phi
;
8861 tree induc_def
, vec_dest
;
8862 tree init_expr
, step_expr
;
8864 poly_uint64 vf
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
8866 gimple_stmt_iterator si
;
8868 gphi
*phi
= dyn_cast
<gphi
*> (stmt_info
->stmt
);
8870 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
8871 poly_uint64 nunits
= TYPE_VECTOR_SUBPARTS (vectype
);
8872 enum vect_induction_op_type induction_type
8873 = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info
);
8875 gcc_assert (induction_type
> vect_step_op_add
);
8880 ncopies
= vect_get_num_copies (loop_vinfo
, vectype
);
8881 gcc_assert (ncopies
>= 1);
8883 /* FORNOW. Only handle nonlinear induction in the same loop. */
8884 if (nested_in_vect_loop_p (loop
, stmt_info
))
8886 if (dump_enabled_p ())
8887 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
8888 "nonlinear induction in nested loop.\n");
8893 gcc_assert (iv_loop
== (gimple_bb (phi
))->loop_father
);
8895 /* TODO: Support slp for nonlinear iv. There should be separate vector iv
8896 update for each iv and a permutation to generate wanted vector iv. */
8899 if (dump_enabled_p ())
8900 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
8901 "SLP induction not supported for nonlinear"
8906 if (!INTEGRAL_TYPE_P (TREE_TYPE (vectype
)))
8908 if (dump_enabled_p ())
8909 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
8910 "floating point nonlinear induction vectorization"
8911 " not supported.\n");
8915 step_expr
= STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info
);
8916 init_expr
= vect_phi_initial_value (phi
);
8917 gcc_assert (step_expr
!= NULL_TREE
&& init_expr
!= NULL
8918 && TREE_CODE (step_expr
) == INTEGER_CST
);
8919 /* step_expr should be aligned with init_expr,
8920 .i.e. uint64 a >> 1, step is int, but vector<uint64> shift is used. */
8921 step_expr
= fold_convert (TREE_TYPE (vectype
), step_expr
);
8923 if (TREE_CODE (init_expr
) == INTEGER_CST
)
8924 init_expr
= fold_convert (TREE_TYPE (vectype
), init_expr
);
8926 gcc_assert (tree_nop_conversion_p (TREE_TYPE (vectype
),
8927 TREE_TYPE (init_expr
)));
8929 switch (induction_type
)
8931 case vect_step_op_neg
:
8932 if (TREE_CODE (init_expr
) != INTEGER_CST
8933 && TREE_CODE (init_expr
) != REAL_CST
)
8935 /* Check for backend support of NEGATE_EXPR and vec_perm. */
8936 if (!directly_supported_p (NEGATE_EXPR
, vectype
))
8939 /* The encoding has 2 interleaved stepped patterns. */
8940 vec_perm_builder
sel (nunits
, 2, 3);
8941 machine_mode mode
= TYPE_MODE (vectype
);
8943 for (i
= 0; i
< 3; i
++)
8946 sel
[i
* 2 + 1] = i
+ nunits
;
8948 vec_perm_indices
indices (sel
, 2, nunits
);
8949 if (!can_vec_perm_const_p (mode
, mode
, indices
))
8954 case vect_step_op_mul
:
8956 /* Check for backend support of MULT_EXPR. */
8957 if (!directly_supported_p (MULT_EXPR
, vectype
))
8960 /* ?? How to construct vector step for variable number vector.
8961 [ 1, step, pow (step, 2), pow (step, 4), .. ]. */
8962 if (!vf
.is_constant ())
8967 case vect_step_op_shr
:
8968 /* Check for backend support of RSHIFT_EXPR. */
8969 if (!directly_supported_p (RSHIFT_EXPR
, vectype
, optab_vector
))
8972 /* Don't shift more than type precision to avoid UD. */
8973 if (!tree_fits_uhwi_p (step_expr
)
8974 || maybe_ge (nunits
* tree_to_uhwi (step_expr
),
8975 TYPE_PRECISION (TREE_TYPE (init_expr
))))
8979 case vect_step_op_shl
:
8980 /* Check for backend support of RSHIFT_EXPR. */
8981 if (!directly_supported_p (LSHIFT_EXPR
, vectype
, optab_vector
))
8984 /* Don't shift more than type precision to avoid UD. */
8985 if (!tree_fits_uhwi_p (step_expr
)
8986 || maybe_ge (nunits
* tree_to_uhwi (step_expr
),
8987 TYPE_PRECISION (TREE_TYPE (init_expr
))))
8996 if (!vec_stmt
) /* transformation not required. */
8998 unsigned inside_cost
= 0, prologue_cost
= 0;
8999 /* loop cost for vec_loop. Neg induction doesn't have any
9001 inside_cost
= record_stmt_cost (cost_vec
, ncopies
, vector_stmt
,
9002 stmt_info
, 0, vect_body
);
9004 /* loop cost for vec_loop. Neg induction doesn't have any
9006 if (induction_type
== vect_step_op_neg
)
9009 /* prologue cost for vec_init and vec_step. */
9010 prologue_cost
= record_stmt_cost (cost_vec
, 2, scalar_to_vec
,
9011 stmt_info
, 0, vect_prologue
);
9013 if (dump_enabled_p ())
9014 dump_printf_loc (MSG_NOTE
, vect_location
,
9015 "vect_model_induction_cost: inside_cost = %d, "
9016 "prologue_cost = %d. \n", inside_cost
,
9019 STMT_VINFO_TYPE (stmt_info
) = induc_vec_info_type
;
9020 DUMP_VECT_SCOPE ("vectorizable_nonlinear_induction");
9026 /* Compute a vector variable, initialized with the first VF values of
9027 the induction variable. E.g., for an iv with IV_PHI='X' and
9028 evolution S, for a vector of 4 units, we want to compute:
9029 [X, X + S, X + 2*S, X + 3*S]. */
9031 if (dump_enabled_p ())
9032 dump_printf_loc (MSG_NOTE
, vect_location
, "transform induction phi.\n");
9034 pe
= loop_preheader_edge (iv_loop
);
9035 /* Find the first insertion point in the BB. */
9036 basic_block bb
= gimple_bb (phi
);
9037 si
= gsi_after_labels (bb
);
9039 gimple_seq stmts
= NULL
;
9041 niters_skip
= LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo
);
9042 /* If we are using the loop mask to "peel" for alignment then we need
9043 to adjust the start value here. */
9044 if (niters_skip
!= NULL_TREE
)
9045 init_expr
= vect_peel_nonlinear_iv_init (&stmts
, init_expr
, niters_skip
,
9046 step_expr
, induction_type
);
9048 vec_init
= vect_create_nonlinear_iv_init (&stmts
, init_expr
,
9049 step_expr
, nunits
, vectype
,
9053 new_bb
= gsi_insert_seq_on_edge_immediate (pe
, stmts
);
9054 gcc_assert (!new_bb
);
9058 new_name
= vect_create_nonlinear_iv_step (&stmts
, step_expr
,
9059 vf
, induction_type
);
9062 new_bb
= gsi_insert_seq_on_edge_immediate (pe
, stmts
);
9063 gcc_assert (!new_bb
);
9066 vec_step
= vect_create_nonlinear_iv_vec_step (loop_vinfo
, stmt_info
,
9069 /* Create the following def-use cycle:
9074 vec_iv = PHI <vec_init, vec_loop>
9078 vec_loop = vec_iv + vec_step; */
9080 /* Create the induction-phi that defines the induction-operand. */
9081 vec_dest
= vect_get_new_vect_var (vectype
, vect_simple_var
, "vec_iv_");
9082 induction_phi
= create_phi_node (vec_dest
, iv_loop
->header
);
9083 induc_def
= PHI_RESULT (induction_phi
);
9085 /* Create the iv update inside the loop. */
9087 vec_def
= vect_update_nonlinear_iv (&stmts
, vectype
,
9088 induc_def
, vec_step
,
9091 gsi_insert_seq_before (&si
, stmts
, GSI_SAME_STMT
);
9092 new_stmt
= SSA_NAME_DEF_STMT (vec_def
);
9094 /* Set the arguments of the phi node: */
9095 add_phi_arg (induction_phi
, vec_init
, pe
, UNKNOWN_LOCATION
);
9096 add_phi_arg (induction_phi
, vec_def
, loop_latch_edge (iv_loop
),
9099 STMT_VINFO_VEC_STMTS (stmt_info
).safe_push (induction_phi
);
9100 *vec_stmt
= induction_phi
;
9102 /* In case that vectorization factor (VF) is bigger than the number
9103 of elements that we can fit in a vectype (nunits), we have to generate
9104 more than one vector stmt - i.e - we need to "unroll" the
9105 vector stmt by a factor VF/nunits. For more details see documentation
9106 in vectorizable_operation. */
9111 /* FORNOW. This restriction should be relaxed. */
9112 gcc_assert (!nested_in_vect_loop
);
9114 new_name
= vect_create_nonlinear_iv_step (&stmts
, step_expr
,
9115 nunits
, induction_type
);
9117 vec_step
= vect_create_nonlinear_iv_vec_step (loop_vinfo
, stmt_info
,
9120 vec_def
= induc_def
;
9121 for (i
= 1; i
< ncopies
; i
++)
9123 /* vec_i = vec_prev + vec_step. */
9125 vec_def
= vect_update_nonlinear_iv (&stmts
, vectype
,
9128 gsi_insert_seq_before (&si
, stmts
, GSI_SAME_STMT
);
9129 new_stmt
= SSA_NAME_DEF_STMT (vec_def
);
9130 STMT_VINFO_VEC_STMTS (stmt_info
).safe_push (new_stmt
);
9134 if (dump_enabled_p ())
9135 dump_printf_loc (MSG_NOTE
, vect_location
,
9136 "transform induction: created def-use cycle: %G%G",
9137 (gimple
*) induction_phi
, SSA_NAME_DEF_STMT (vec_def
));
/* NOTE(review): this chunk is a garbled extraction of GCC's
   tree-vect-loop.cc.  Original physical lines were split into fragments,
   the original line numbers (9142, 9144, ...) were fused into the text,
   and gaps in that numbering (e.g. 9154, 9176-9181, 9199) show that
   whole lines -- braces, declarations, trailing call arguments -- were
   dropped.  Code bytes are preserved exactly as found; only comments
   were added.  Reconcile against the upstream file before compiling.  */
9142 /* Function vectorizable_induction
9144 Check if STMT_INFO performs an induction computation that can be vectorized.
9145 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
9146 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
9147 Return true if STMT_INFO is vectorizable in this way. */
9150 vectorizable_induction (loop_vec_info loop_vinfo
,
9151 stmt_vec_info stmt_info
,
9152 gimple
**vec_stmt
, slp_tree slp_node
,
9153 stmt_vector_for_cost
*cost_vec
)
/* Locals: the loop being vectorized, the (possibly inner) loop that
   carries the IV, its preheader edge, and scratch trees for the
   generated init/step vectors.  */
9155 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
9157 bool nested_in_vect_loop
= false;
9158 class loop
*iv_loop
;
9160 edge pe
= loop_preheader_edge (loop
);
9162 tree new_vec
, vec_init
, vec_step
, t
;
9165 gphi
*induction_phi
;
9166 tree induc_def
, vec_dest
;
9167 tree init_expr
, step_expr
;
9168 poly_uint64 vf
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
9171 gimple_stmt_iterator si
;
9172 enum vect_induction_op_type induction_type
9173 = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info
);
9175 gphi
*phi
= dyn_cast
<gphi
*> (stmt_info
->stmt
);
/* Early-out checks: only relevant PHIs recognized as inductions are
   handled here; non-linear (non step-op-add) inductions are delegated
   to vectorizable_nonlinear_induction.  */
9179 if (!STMT_VINFO_RELEVANT_P (stmt_info
))
9182 /* Make sure it was recognized as induction computation. */
9183 if (STMT_VINFO_DEF_TYPE (stmt_info
) != vect_induction_def
)
9186 /* Handle nonlinear induction in a separate place. */
9187 if (induction_type
!= vect_step_op_add
)
9188 return vectorizable_nonlinear_induction (loop_vinfo
, stmt_info
,
9189 vec_stmt
, slp_node
, cost_vec
);
9191 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
9192 poly_uint64 nunits
= TYPE_VECTOR_SUBPARTS (vectype
);
9197 ncopies
= vect_get_num_copies (loop_vinfo
, vectype
);
9198 gcc_assert (ncopies
>= 1);
/* Nested-loop case: an inner-loop induction is only supported when the
   latch value is also used inside the inner loop (any exit-PHI use must
   be non-relevant/non-live, checked below).  */
9200 /* FORNOW. These restrictions should be relaxed. */
9201 if (nested_in_vect_loop_p (loop
, stmt_info
))
9203 imm_use_iterator imm_iter
;
9204 use_operand_p use_p
;
9211 if (dump_enabled_p ())
9212 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
9213 "multiple types in nested loop.\n");
9218 latch_e
= loop_latch_edge (loop
->inner
);
9219 loop_arg
= PHI_ARG_DEF_FROM_EDGE (phi
, latch_e
);
9220 FOR_EACH_IMM_USE_FAST (use_p
, imm_iter
, loop_arg
)
9222 gimple
*use_stmt
= USE_STMT (use_p
);
9223 if (is_gimple_debug (use_stmt
))
9226 if (!flow_bb_inside_loop_p (loop
->inner
, gimple_bb (use_stmt
)))
9228 exit_phi
= use_stmt
;
9234 stmt_vec_info exit_phi_vinfo
= loop_vinfo
->lookup_stmt (exit_phi
);
9235 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo
)
9236 && !STMT_VINFO_LIVE_P (exit_phi_vinfo
)))
9238 if (dump_enabled_p ())
9239 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
9240 "inner-loop induction only used outside "
9241 "of the outer vectorized loop.\n");
9246 nested_in_vect_loop
= true;
9247 iv_loop
= loop
->inner
;
9251 gcc_assert (iv_loop
== (gimple_bb (phi
))->loop_father
);
/* Variable-length vectors are not supported by the SLP path: the step
   value is built element-by-element below, which needs constant nunits.  */
9253 if (slp_node
&& !nunits
.is_constant ())
9255 /* The current SLP code creates the step value element-by-element. */
9256 if (dump_enabled_p ())
9257 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
9258 "SLP induction not supported for variable-length"
9263 if (FLOAT_TYPE_P (vectype
) && !param_vect_induction_float
)
9265 if (dump_enabled_p ())
9266 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
9267 "floating point induction vectorization disabled\n");
9271 step_expr
= STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info
);
9272 gcc_assert (step_expr
!= NULL_TREE
);
/* The IV arithmetic is done in step_vectype (same element size as the
   step's scalar type) and converted to vectype where needed.  */
9273 tree step_vectype
= get_same_sized_vectype (TREE_TYPE (step_expr
), vectype
);
9275 /* Check for backend support of PLUS/MINUS_EXPR. */
9276 if (!directly_supported_p (PLUS_EXPR
, step_vectype
)
9277 || !directly_supported_p (MINUS_EXPR
, step_vectype
))
/* Analysis phase (vec_stmt == NULL): record inside/prologue costs, set
   the stmt type, and return without generating code.  */
9280 if (!vec_stmt
) /* transformation not required. */
9282 unsigned inside_cost
= 0, prologue_cost
= 0;
9285 /* We eventually need to set a vector type on invariant
9289 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node
), j
, child
)
9290 if (!vect_maybe_update_slp_op_vectype
9291 (child
, SLP_TREE_VECTYPE (slp_node
)))
9293 if (dump_enabled_p ())
9294 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
9295 "incompatible vector types for "
9299 /* loop cost for vec_loop. */
9301 = record_stmt_cost (cost_vec
,
9302 SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node
),
9303 vector_stmt
, stmt_info
, 0, vect_body
);
9304 /* prologue cost for vec_init (if not nested) and step. */
9305 prologue_cost
= record_stmt_cost (cost_vec
, 1 + !nested_in_vect_loop
,
9307 stmt_info
, 0, vect_prologue
);
9309 else /* if (!slp_node) */
9311 /* loop cost for vec_loop. */
9312 inside_cost
= record_stmt_cost (cost_vec
, ncopies
, vector_stmt
,
9313 stmt_info
, 0, vect_body
);
9314 /* prologue cost for vec_init and vec_step. */
9315 prologue_cost
= record_stmt_cost (cost_vec
, 2, scalar_to_vec
,
9316 stmt_info
, 0, vect_prologue
);
9318 if (dump_enabled_p ())
9319 dump_printf_loc (MSG_NOTE
, vect_location
,
9320 "vect_model_induction_cost: inside_cost = %d, "
9321 "prologue_cost = %d .\n", inside_cost
,
9324 STMT_VINFO_TYPE (stmt_info
) = induc_vec_info_type
;
9325 DUMP_VECT_SCOPE ("vectorizable_induction");
/* Transform phase starts here.  */
9331 /* Compute a vector variable, initialized with the first VF values of
9332 the induction variable. E.g., for an iv with IV_PHI='X' and
9333 evolution S, for a vector of 4 units, we want to compute:
9334 [X, X + S, X + 2*S, X + 3*S]. */
9336 if (dump_enabled_p ())
9337 dump_printf_loc (MSG_NOTE
, vect_location
, "transform induction phi.\n");
9339 pe
= loop_preheader_edge (iv_loop
);
9340 /* Find the first insertion point in the BB. */
9341 basic_block bb
= gimple_bb (phi
);
9342 si
= gsi_after_labels (bb
);
/* SLP path: build nivs distinct IV chains covering the whole group.  */
9344 /* For SLP induction we have to generate several IVs as for example
9345 with group size 3 we need
9346 [i0, i1, i2, i0 + S0] [i1 + S1, i2 + S2, i0 + 2*S0, i1 + 2*S1]
9347 [i2 + 2*S2, i0 + 3*S0, i1 + 3*S1, i2 + 3*S2]. */
9350 /* Enforced above. */
9351 unsigned int const_nunits
= nunits
.to_constant ();
9353 /* The initial values are vectorized, but any lanes > group_size
9356 = SLP_TREE_CHILDREN (slp_node
)[pe
->dest_idx
];
9358 /* Gather steps. Since we do not vectorize inductions as
9359 cycles we have to reconstruct the step from SCEV data. */
9360 unsigned group_size
= SLP_TREE_LANES (slp_node
);
9361 tree
*steps
= XALLOCAVEC (tree
, group_size
);
9362 tree
*inits
= XALLOCAVEC (tree
, group_size
);
9363 stmt_vec_info phi_info
;
9364 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (slp_node
), i
, phi_info
)
9366 steps
[i
] = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info
);
9368 inits
[i
] = gimple_phi_arg_def (as_a
<gphi
*> (phi_info
->stmt
),
9372 /* Now generate the IVs. */
9373 unsigned nvects
= SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node
);
9374 gcc_assert ((const_nunits
* nvects
) % group_size
== 0);
9376 if (nested_in_vect_loop
)
9380 /* Compute the number of distinct IVs we need. First reduce
9381 group_size if it is a multiple of const_nunits so we get
9382 one IV for a group_size of 4 but const_nunits 2. */
9383 unsigned group_sizep
= group_size
;
9384 if (group_sizep
% const_nunits
== 0)
9385 group_sizep
= group_sizep
/ const_nunits
;
9386 nivs
= least_common_multiple (group_sizep
,
9387 const_nunits
) / const_nunits
;
9389 tree stept
= TREE_TYPE (step_vectype
);
/* lupdate_mul: splat of the per-vector-iteration step multiplier,
   only needed when not nested in an outer vectorized loop.  */
9390 tree lupdate_mul
= NULL_TREE
;
9391 if (!nested_in_vect_loop
)
9393 /* The number of iterations covered in one vector iteration. */
9394 unsigned lup_mul
= (nvects
* const_nunits
) / group_size
;
9396 = build_vector_from_val (step_vectype
,
9397 SCALAR_FLOAT_TYPE_P (stept
)
9398 ? build_real_from_wide (stept
, lup_mul
,
9400 : build_int_cstu (stept
, lup_mul
));
/* peel_mul: when masking skips NITERS iterations for alignment, bias
   the start values by that many steps.  */
9402 tree peel_mul
= NULL_TREE
;
9403 gimple_seq init_stmts
= NULL
;
9404 if (LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo
))
9406 if (SCALAR_FLOAT_TYPE_P (stept
))
9407 peel_mul
= gimple_build (&init_stmts
, FLOAT_EXPR
, stept
,
9408 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo
));
9410 peel_mul
= gimple_convert (&init_stmts
, stept
,
9411 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo
));
9412 peel_mul
= gimple_build_vector_from_val (&init_stmts
,
9413 step_vectype
, peel_mul
);
/* Generate the nivs leading IV chains (PHI + latch update each).  */
9416 auto_vec
<tree
> vec_steps
;
9417 for (ivn
= 0; ivn
< nivs
; ++ivn
)
9419 tree_vector_builder
step_elts (step_vectype
, const_nunits
, 1);
9420 tree_vector_builder
init_elts (vectype
, const_nunits
, 1);
9421 tree_vector_builder
mul_elts (step_vectype
, const_nunits
, 1);
9422 for (unsigned eltn
= 0; eltn
< const_nunits
; ++eltn
)
9424 /* The scalar steps of the IVs. */
9425 tree elt
= steps
[(ivn
*const_nunits
+ eltn
) % group_size
];
9426 elt
= gimple_convert (&init_stmts
, TREE_TYPE (step_vectype
), elt
);
9427 step_elts
.quick_push (elt
);
9430 /* The scalar inits of the IVs if not vectorized. */
9431 elt
= inits
[(ivn
*const_nunits
+ eltn
) % group_size
];
9432 if (!useless_type_conversion_p (TREE_TYPE (vectype
),
9434 elt
= gimple_build (&init_stmts
, VIEW_CONVERT_EXPR
,
9435 TREE_TYPE (vectype
), elt
);
9436 init_elts
.quick_push (elt
);
9438 /* The number of steps to add to the initial values. */
9439 unsigned mul_elt
= (ivn
*const_nunits
+ eltn
) / group_size
;
9440 mul_elts
.quick_push (SCALAR_FLOAT_TYPE_P (stept
)
9441 ? build_real_from_wide (stept
,
9443 : build_int_cstu (stept
, mul_elt
));
9445 vec_step
= gimple_build_vector (&init_stmts
, &step_elts
);
9446 vec_steps
.safe_push (vec_step
);
9447 tree step_mul
= gimple_build_vector (&init_stmts
, &mul_elts
);
9449 step_mul
= gimple_build (&init_stmts
, PLUS_EXPR
, step_vectype
,
9450 step_mul
, peel_mul
);
9452 vec_init
= gimple_build_vector (&init_stmts
, &init_elts
);
9454 /* Create the induction-phi that defines the induction-operand. */
9455 vec_dest
= vect_get_new_vect_var (vectype
, vect_simple_var
,
9457 induction_phi
= create_phi_node (vec_dest
, iv_loop
->header
);
9458 induc_def
= PHI_RESULT (induction_phi
);
9460 /* Create the iv update inside the loop */
9463 up
= gimple_build (&init_stmts
, MULT_EXPR
, step_vectype
,
9464 vec_step
, lupdate_mul
);
9465 gimple_seq stmts
= NULL
;
9466 vec_def
= gimple_convert (&stmts
, step_vectype
, induc_def
);
9467 vec_def
= gimple_build (&stmts
,
9468 PLUS_EXPR
, step_vectype
, vec_def
, up
);
9469 vec_def
= gimple_convert (&stmts
, vectype
, vec_def
);
9470 gsi_insert_seq_before (&si
, stmts
, GSI_SAME_STMT
);
9471 add_phi_arg (induction_phi
, vec_def
, loop_latch_edge (iv_loop
),
/* Preheader value: take the vectorized SLP init and, unless nested,
   bias it by step_mul steps.  */
9475 vec_init
= vect_get_slp_vect_def (init_node
, ivn
);
9476 if (!nested_in_vect_loop
9477 && !integer_zerop (step_mul
))
9479 vec_def
= gimple_convert (&init_stmts
, step_vectype
, vec_init
);
9480 up
= gimple_build (&init_stmts
, MULT_EXPR
, step_vectype
,
9481 vec_step
, step_mul
);
9482 vec_def
= gimple_build (&init_stmts
, PLUS_EXPR
, step_vectype
,
9484 vec_init
= gimple_convert (&init_stmts
, vectype
, vec_def
);
9487 /* Set the arguments of the phi node: */
9488 add_phi_arg (induction_phi
, vec_init
, pe
, UNKNOWN_LOCATION
);
9490 SLP_TREE_VEC_STMTS (slp_node
).quick_push (induction_phi
);
9492 if (!nested_in_vect_loop
)
9494 /* Fill up to the number of vectors we need for the whole group. */
9495 nivs
= least_common_multiple (group_size
,
9496 const_nunits
) / const_nunits
;
9497 vec_steps
.reserve (nivs
-ivn
);
9498 for (; ivn
< nivs
; ++ivn
)
9500 SLP_TREE_VEC_STMTS (slp_node
)
9501 .quick_push (SLP_TREE_VEC_STMTS (slp_node
)[0]);
9502 vec_steps
.quick_push (vec_steps
[0]);
/* Remaining vector stmts reuse earlier IVs plus a multiple of the
   step, instead of creating new PHIs.  */
9506 /* Re-use IVs when we can. We are generating further vector
9507 stmts by adding VF' * stride to the IVs generated above. */
9511 = least_common_multiple (group_size
, const_nunits
) / group_size
;
9513 = build_vector_from_val (step_vectype
,
9514 SCALAR_FLOAT_TYPE_P (stept
)
9515 ? build_real_from_wide (stept
,
9517 : build_int_cstu (stept
, vfp
));
9518 for (; ivn
< nvects
; ++ivn
)
9520 gimple
*iv
= SLP_TREE_VEC_STMTS (slp_node
)[ivn
- nivs
];
9521 tree def
= gimple_get_lhs (iv
);
9523 vec_steps
[ivn
- nivs
]
9524 = gimple_build (&init_stmts
, MULT_EXPR
, step_vectype
,
9525 vec_steps
[ivn
- nivs
], lupdate_mul
);
9526 gimple_seq stmts
= NULL
;
9527 def
= gimple_convert (&stmts
, step_vectype
, def
);
9528 def
= gimple_build (&stmts
, PLUS_EXPR
, step_vectype
,
9529 def
, vec_steps
[ivn
% nivs
]);
9530 def
= gimple_convert (&stmts
, vectype
, def
);
9531 if (gimple_code (iv
) == GIMPLE_PHI
)
9532 gsi_insert_seq_before (&si
, stmts
, GSI_SAME_STMT
);
9535 gimple_stmt_iterator tgsi
= gsi_for_stmt (iv
);
9536 gsi_insert_seq_after (&tgsi
, stmts
, GSI_CONTINUE_LINKING
);
9538 SLP_TREE_VEC_STMTS (slp_node
)
9539 .quick_push (SSA_NAME_DEF_STMT (def
));
/* All preheader computations go onto the preheader edge; the edge must
   not need splitting (asserted).  */
9543 new_bb
= gsi_insert_seq_on_edge_immediate (pe
, init_stmts
);
9544 gcc_assert (!new_bb
);
/* Non-SLP path: single IV chain.  Compute the scalar initial value,
   optionally adjusted for mask-based peeling.  */
9549 init_expr
= vect_phi_initial_value (phi
);
9551 gimple_seq stmts
= NULL
;
9552 if (!nested_in_vect_loop
)
9554 /* Convert the initial value to the IV update type. */
9555 tree new_type
= TREE_TYPE (step_expr
);
9556 init_expr
= gimple_convert (&stmts
, new_type
, init_expr
);
9558 /* If we are using the loop mask to "peel" for alignment then we need
9559 to adjust the start value here. */
9560 tree skip_niters
= LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo
);
9561 if (skip_niters
!= NULL_TREE
)
9563 if (FLOAT_TYPE_P (vectype
))
9564 skip_niters
= gimple_build (&stmts
, FLOAT_EXPR
, new_type
,
9567 skip_niters
= gimple_convert (&stmts
, new_type
, skip_niters
);
9568 tree skip_step
= gimple_build (&stmts
, MULT_EXPR
, new_type
,
9569 skip_niters
, step_expr
);
9570 init_expr
= gimple_build (&stmts
, MINUS_EXPR
, new_type
,
9571 init_expr
, skip_step
);
9577 new_bb
= gsi_insert_seq_on_edge_immediate (pe
, stmts
);
9578 gcc_assert (!new_bb
);
9581 /* Create the vector that holds the initial_value of the induction. */
9582 if (nested_in_vect_loop
)
9584 /* iv_loop is nested in the loop to be vectorized. init_expr had already
9585 been created during vectorization of previous stmts. We obtain it
9586 from the STMT_VINFO_VEC_STMT of the defining stmt. */
9587 auto_vec
<tree
> vec_inits
;
9588 vect_get_vec_defs_for_operand (loop_vinfo
, stmt_info
, 1,
9589 init_expr
, &vec_inits
);
9590 vec_init
= vec_inits
[0];
9591 /* If the initial value is not of proper type, convert it. */
9592 if (!useless_type_conversion_p (vectype
, TREE_TYPE (vec_init
)))
9595 = gimple_build_assign (vect_get_new_ssa_name (vectype
,
9599 build1 (VIEW_CONVERT_EXPR
, vectype
,
9601 vec_init
= gimple_assign_lhs (new_stmt
);
9602 new_bb
= gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop
),
9604 gcc_assert (!new_bb
);
9609 /* iv_loop is the loop to be vectorized. Create:
9610 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
9612 new_name
= gimple_convert (&stmts
, TREE_TYPE (step_expr
), init_expr
);
/* Three strategies for building vec_init: element-wise for constant
   nunits, VEC_SERIES_EXPR for variable-length integer IVs, and the
   base + index*step form (requires -fassociative-math) for floats.  */
9614 unsigned HOST_WIDE_INT const_nunits
;
9615 if (nunits
.is_constant (&const_nunits
))
9617 tree_vector_builder
elts (step_vectype
, const_nunits
, 1);
9618 elts
.quick_push (new_name
);
9619 for (i
= 1; i
< const_nunits
; i
++)
9621 /* Create: new_name_i = new_name + step_expr */
9622 new_name
= gimple_build (&stmts
, PLUS_EXPR
, TREE_TYPE (new_name
),
9623 new_name
, step_expr
);
9624 elts
.quick_push (new_name
);
9626 /* Create a vector from [new_name_0, new_name_1, ...,
9627 new_name_nunits-1] */
9628 vec_init
= gimple_build_vector (&stmts
, &elts
);
9630 else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr
)))
9631 /* Build the initial value directly from a VEC_SERIES_EXPR. */
9632 vec_init
= gimple_build (&stmts
, VEC_SERIES_EXPR
, step_vectype
,
9633 new_name
, step_expr
);
9637 [base, base, base, ...]
9638 + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */
9639 gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr
)));
9640 gcc_assert (flag_associative_math
);
9641 tree index
= build_index_vector (step_vectype
, 0, 1);
9642 tree base_vec
= gimple_build_vector_from_val (&stmts
, step_vectype
,
9644 tree step_vec
= gimple_build_vector_from_val (&stmts
, step_vectype
,
9646 vec_init
= gimple_build (&stmts
, FLOAT_EXPR
, step_vectype
, index
);
9647 vec_init
= gimple_build (&stmts
, MULT_EXPR
, step_vectype
,
9648 vec_init
, step_vec
);
9649 vec_init
= gimple_build (&stmts
, PLUS_EXPR
, step_vectype
,
9650 vec_init
, base_vec
);
9652 vec_init
= gimple_convert (&stmts
, vectype
, vec_init
);
9656 new_bb
= gsi_insert_seq_on_edge_immediate (pe
, stmts
);
9657 gcc_assert (!new_bb
);
9662 /* Create the vector that holds the step of the induction. */
9663 if (nested_in_vect_loop
)
9664 /* iv_loop is nested in the loop to be vectorized. Generate:
9665 vec_step = [S, S, S, S] */
9666 new_name
= step_expr
;
9669 /* iv_loop is the loop to be vectorized. Generate:
9670 vec_step = [VF*S, VF*S, VF*S, VF*S] */
9671 gimple_seq seq
= NULL
;
9672 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr
)))
9674 expr
= build_int_cst (integer_type_node
, vf
);
9675 expr
= gimple_build (&seq
, FLOAT_EXPR
, TREE_TYPE (step_expr
), expr
);
9678 expr
= build_int_cst (TREE_TYPE (step_expr
), vf
);
9679 new_name
= gimple_build (&seq
, MULT_EXPR
, TREE_TYPE (step_expr
),
9683 new_bb
= gsi_insert_seq_on_edge_immediate (pe
, seq
);
9684 gcc_assert (!new_bb
);
9688 t
= unshare_expr (new_name
);
9689 gcc_assert (CONSTANT_CLASS_P (new_name
)
9690 || TREE_CODE (new_name
) == SSA_NAME
);
9691 new_vec
= build_vector_from_val (step_vectype
, t
);
9692 vec_step
= vect_init_vector (loop_vinfo
, stmt_info
,
9693 new_vec
, step_vectype
, NULL
);
9696 /* Create the following def-use cycle:
9701 vec_iv = PHI <vec_init, vec_loop>
9705 vec_loop = vec_iv + vec_step; */
9707 /* Create the induction-phi that defines the induction-operand. */
9708 vec_dest
= vect_get_new_vect_var (vectype
, vect_simple_var
, "vec_iv_");
9709 induction_phi
= create_phi_node (vec_dest
, iv_loop
->header
);
9710 induc_def
= PHI_RESULT (induction_phi
);
9712 /* Create the iv update inside the loop */
9714 vec_def
= gimple_convert (&stmts
, step_vectype
, induc_def
);
9715 vec_def
= gimple_build (&stmts
, PLUS_EXPR
, step_vectype
, vec_def
, vec_step
);
9716 vec_def
= gimple_convert (&stmts
, vectype
, vec_def
);
9717 gsi_insert_seq_before (&si
, stmts
, GSI_SAME_STMT
);
9718 new_stmt
= SSA_NAME_DEF_STMT (vec_def
);
9720 /* Set the arguments of the phi node: */
9721 add_phi_arg (induction_phi
, vec_init
, pe
, UNKNOWN_LOCATION
);
9722 add_phi_arg (induction_phi
, vec_def
, loop_latch_edge (iv_loop
),
9725 STMT_VINFO_VEC_STMTS (stmt_info
).safe_push (induction_phi
);
9726 *vec_stmt
= induction_phi
;
/* Unrolled copies: each additional copy adds vec_step (scaled by nunits
   rather than VF) to the previous copy's def.  */
9728 /* In case that vectorization factor (VF) is bigger than the number
9729 of elements that we can fit in a vectype (nunits), we have to generate
9730 more than one vector stmt - i.e - we need to "unroll" the
9731 vector stmt by a factor VF/nunits. For more details see documentation
9732 in vectorizable_operation. */
9736 gimple_seq seq
= NULL
;
9737 /* FORNOW. This restriction should be relaxed. */
9738 gcc_assert (!nested_in_vect_loop
);
9740 /* Create the vector that holds the step of the induction. */
9741 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr
)))
9743 expr
= build_int_cst (integer_type_node
, nunits
);
9744 expr
= gimple_build (&seq
, FLOAT_EXPR
, TREE_TYPE (step_expr
), expr
);
9747 expr
= build_int_cst (TREE_TYPE (step_expr
), nunits
);
9748 new_name
= gimple_build (&seq
, MULT_EXPR
, TREE_TYPE (step_expr
),
9752 new_bb
= gsi_insert_seq_on_edge_immediate (pe
, seq
);
9753 gcc_assert (!new_bb
);
9756 t
= unshare_expr (new_name
);
9757 gcc_assert (CONSTANT_CLASS_P (new_name
)
9758 || TREE_CODE (new_name
) == SSA_NAME
);
9759 new_vec
= build_vector_from_val (step_vectype
, t
);
9760 vec_step
= vect_init_vector (loop_vinfo
, stmt_info
,
9761 new_vec
, step_vectype
, NULL
);
9763 vec_def
= induc_def
;
9764 for (i
= 1; i
< ncopies
; i
++)
9766 /* vec_i = vec_prev + vec_step */
9767 gimple_seq stmts
= NULL
;
9768 vec_def
= gimple_convert (&stmts
, step_vectype
, vec_def
);
9769 vec_def
= gimple_build (&stmts
,
9770 PLUS_EXPR
, step_vectype
, vec_def
, vec_step
);
9771 vec_def
= gimple_convert (&stmts
, vectype
, vec_def
);
9773 gsi_insert_seq_before (&si
, stmts
, GSI_SAME_STMT
);
9774 new_stmt
= SSA_NAME_DEF_STMT (vec_def
);
9775 STMT_VINFO_VEC_STMTS (stmt_info
).safe_push (new_stmt
);
9779 if (dump_enabled_p ())
9780 dump_printf_loc (MSG_NOTE
, vect_location
,
9781 "transform induction: created def-use cycle: %G%G",
9782 (gimple
*) induction_phi
, SSA_NAME_DEF_STMT (vec_def
));
/* NOTE(review): garbled extraction -- original lines were fragmented,
   original line numbers (9787, ...) fused into the text, and gaps in
   that numbering show dropped lines (braces, declarations, trailing
   arguments).  Code bytes kept exactly as found; comments only added.
   Reconcile against upstream tree-vect-loop.cc before compiling.  */
9787 /* Function vectorizable_live_operation.
9789 STMT_INFO computes a value that is used outside the loop. Check if
9790 it can be supported. */
9793 vectorizable_live_operation (vec_info
*vinfo
,
9794 stmt_vec_info stmt_info
,
9795 gimple_stmt_iterator
*gsi
,
9796 slp_tree slp_node
, slp_instance slp_node_instance
,
9797 int slp_index
, bool vec_stmt_p
,
9798 stmt_vector_for_cost
*cost_vec
)
9800 loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
);
9801 imm_use_iterator imm_iter
;
9802 tree lhs
, lhs_type
, bitsize
;
9803 tree vectype
= (slp_node
9804 ? SLP_TREE_VECTYPE (slp_node
)
9805 : STMT_VINFO_VECTYPE (stmt_info
));
9806 poly_uint64 nunits
= TYPE_VECTOR_SUBPARTS (vectype
);
9809 auto_vec
<tree
> vec_oprnds
;
9811 poly_uint64 vec_index
= 0;
9813 gcc_assert (STMT_VINFO_LIVE_P (stmt_info
));
/* Reductions: delegate the live-out handling to the reduction epilogue
   code; only fold-left / extract-last reductions are filtered here.  */
9815 /* If a stmt of a reduction is live, vectorize it via
9816 vect_create_epilog_for_reduction. vectorizable_reduction assessed
9817 validity so just trigger the transform here. */
9818 if (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info
)))
9824 /* For reduction chains the meta-info is attached to
9825 the group leader. */
9826 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info
))
9827 stmt_info
= REDUC_GROUP_FIRST_ELEMENT (stmt_info
);
9828 /* For SLP reductions we vectorize the epilogue for
9829 all involved stmts together. */
9830 else if (slp_index
!= 0)
9833 stmt_vec_info reduc_info
= info_for_reduction (loop_vinfo
, stmt_info
);
9834 gcc_assert (reduc_info
->is_reduc_info
);
9835 if (STMT_VINFO_REDUC_TYPE (reduc_info
) == FOLD_LEFT_REDUCTION
9836 || STMT_VINFO_REDUC_TYPE (reduc_info
) == EXTRACT_LAST_REDUCTION
)
9838 vect_create_epilog_for_reduction (loop_vinfo
, stmt_info
, slp_node
,
9843 /* If STMT is not relevant and it is a simple assignment and its inputs are
9844 invariant then it can remain in place, unvectorized. The original last
9845 scalar value that it computes will be used. */
9846 if (!STMT_VINFO_RELEVANT_P (stmt_info
))
9848 gcc_assert (is_simple_and_all_uses_invariant (stmt_info
, loop_vinfo
));
9849 if (dump_enabled_p ())
9850 dump_printf_loc (MSG_NOTE
, vect_location
,
9851 "statement is simple and uses invariant. Leaving in "
9859 ncopies
= vect_get_num_copies (loop_vinfo
, vectype
);
/* SLP: locate which vector statement and which lane holds the last
   occurrence of the live scalar.  */
9863 gcc_assert (slp_index
>= 0);
9865 /* Get the last occurrence of the scalar index from the concatenation of
9866 all the slp vectors. Calculate which slp vector it is and the index
9868 int num_scalar
= SLP_TREE_LANES (slp_node
);
9869 int num_vec
= SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node
);
9870 poly_uint64 pos
= (num_vec
* nunits
) - num_scalar
+ slp_index
;
9872 /* Calculate which vector contains the result, and which lane of
9873 that vector we need. */
9874 if (!can_div_trunc_p (pos
, nunits
, &vec_entry
, &vec_index
))
9876 if (dump_enabled_p ())
9877 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
9878 "Cannot determine which vector holds the"
9879 " final result.\n");
/* Analysis phase: decide whether partial vectors remain usable (needs
   target EXTRACT_LAST support; disabled for SLP-live and ncopies > 1)
   and record the vec_to_scalar extraction cost.  */
9886 /* No transformation required. */
9887 if (loop_vinfo
&& LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo
))
9889 if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST
, vectype
,
9890 OPTIMIZE_FOR_SPEED
))
9892 if (dump_enabled_p ())
9893 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
9894 "can't operate on partial vectors "
9895 "because the target doesn't support extract "
9896 "last reduction.\n");
9897 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo
) = false;
9901 if (dump_enabled_p ())
9902 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
9903 "can't operate on partial vectors "
9904 "because an SLP statement is live after "
9906 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo
) = false;
9908 else if (ncopies
> 1)
9910 if (dump_enabled_p ())
9911 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
9912 "can't operate on partial vectors "
9913 "because ncopies is greater than 1.\n");
9914 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo
) = false;
9918 gcc_assert (ncopies
== 1 && !slp_node
);
9919 vect_record_loop_mask (loop_vinfo
,
9920 &LOOP_VINFO_MASKS (loop_vinfo
),
9924 /* ??? Enable for loop costing as well. */
9926 record_stmt_cost (cost_vec
, 1, vec_to_scalar
, stmt_info
, NULL_TREE
,
/* Transform phase: extract the live lane from the vectorized def and
   rewire out-of-loop uses of the scalar lhs to it.  */
9931 /* Use the lhs of the original scalar statement. */
9932 gimple
*stmt
= vect_orig_stmt (stmt_info
)->stmt
;
9933 if (dump_enabled_p ())
9934 dump_printf_loc (MSG_NOTE
, vect_location
, "extracting lane for live "
9937 lhs
= gimple_get_lhs (stmt
);
9938 lhs_type
= TREE_TYPE (lhs
);
9940 bitsize
= vector_element_bits_tree (vectype
);
9942 /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
9943 tree vec_lhs
, bitstart
;
9947 gcc_assert (!loop_vinfo
|| !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo
));
9949 /* Get the correct slp vectorized stmt. */
9950 vec_stmt
= SLP_TREE_VEC_STMTS (slp_node
)[vec_entry
];
9951 vec_lhs
= gimple_get_lhs (vec_stmt
);
9953 /* Get entry to use. */
9954 bitstart
= bitsize_int (vec_index
);
9955 bitstart
= int_const_binop (MULT_EXPR
, bitsize
, bitstart
);
9959 /* For multiple copies, get the last copy. */
9960 vec_stmt
= STMT_VINFO_VEC_STMTS (stmt_info
).last ();
9961 vec_lhs
= gimple_get_lhs (vec_stmt
);
9963 /* Get the last lane in the vector. */
9964 bitstart
= int_const_binop (MULT_EXPR
, bitsize
, bitsize_int (nunits
- 1));
/* Loop case: materialize a loop-closed PHI for vec_lhs in the (single)
   exit block, then extract via EXTRACT_LAST (fully-masked loops) or a
   BIT_FIELD_REF, and redirect out-of-loop uses.  */
9969 /* Ensure the VEC_LHS for lane extraction stmts satisfy loop-closed PHI
9970 requirement, insert one phi node for it. It looks like:
9977 # vec_lhs' = PHI <vec_lhs>
9978 new_tree = lane_extract <vec_lhs', ...>;
9981 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
9982 basic_block exit_bb
= single_exit (loop
)->dest
;
9983 gcc_assert (single_pred_p (exit_bb
));
9985 tree vec_lhs_phi
= copy_ssa_name (vec_lhs
);
9986 gimple
*phi
= create_phi_node (vec_lhs_phi
, exit_bb
);
9987 SET_PHI_ARG_DEF (phi
, single_exit (loop
)->dest_idx
, vec_lhs
);
9989 gimple_seq stmts
= NULL
;
9991 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo
))
9995 SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
9997 where VEC_LHS is the vectorized live-out result and MASK is
9998 the loop mask for the final iteration. */
9999 gcc_assert (ncopies
== 1 && !slp_node
);
10000 tree scalar_type
= TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info
));
10001 tree mask
= vect_get_loop_mask (gsi
, &LOOP_VINFO_MASKS (loop_vinfo
),
10003 tree scalar_res
= gimple_build (&stmts
, CFN_EXTRACT_LAST
, scalar_type
,
10004 mask
, vec_lhs_phi
);
10006 /* Convert the extracted vector element to the scalar type. */
10007 new_tree
= gimple_convert (&stmts
, lhs_type
, scalar_res
);
10011 tree bftype
= TREE_TYPE (vectype
);
10012 if (VECTOR_BOOLEAN_TYPE_P (vectype
))
10013 bftype
= build_nonstandard_integer_type (tree_to_uhwi (bitsize
), 1);
10014 new_tree
= build3 (BIT_FIELD_REF
, bftype
,
10015 vec_lhs_phi
, bitsize
, bitstart
);
10016 new_tree
= force_gimple_operand (fold_convert (lhs_type
, new_tree
),
10017 &stmts
, true, NULL_TREE
);
10022 gimple_stmt_iterator exit_gsi
= gsi_after_labels (exit_bb
);
10023 gsi_insert_seq_before (&exit_gsi
, stmts
, GSI_SAME_STMT
);
10025 /* Remove existing phi from lhs and create one copy from new_tree. */
10026 tree lhs_phi
= NULL_TREE
;
10027 gimple_stmt_iterator gsi
;
10028 for (gsi
= gsi_start_phis (exit_bb
);
10029 !gsi_end_p (gsi
); gsi_next (&gsi
))
10031 gimple
*phi
= gsi_stmt (gsi
);
10032 if ((gimple_phi_arg_def (phi
, 0) == lhs
))
10034 remove_phi_node (&gsi
, false);
10035 lhs_phi
= gimple_phi_result (phi
);
10036 gimple
*copy
= gimple_build_assign (lhs_phi
, new_tree
);
10037 gsi_insert_before (&exit_gsi
, copy
, GSI_SAME_STMT
);
10043 /* Replace use of lhs with newly computed result. If the use stmt is a
10044 single arg PHI, just replace all uses of PHI result. It's necessary
10045 because lcssa PHI defining lhs may be before newly inserted stmt. */
10046 use_operand_p use_p
;
10047 FOR_EACH_IMM_USE_STMT (use_stmt
, imm_iter
, lhs
)
10048 if (!flow_bb_inside_loop_p (loop
, gimple_bb (use_stmt
))
10049 && !is_gimple_debug (use_stmt
))
10051 if (gimple_code (use_stmt
) == GIMPLE_PHI
10052 && gimple_phi_num_args (use_stmt
) == 1)
10054 replace_uses_by (gimple_phi_result (use_stmt
), new_tree
);
10058 FOR_EACH_IMM_USE_ON_STMT (use_p
, imm_iter
)
10059 SET_USE (use_p
, new_tree
);
10061 update_stmt (use_stmt
);
/* Basic-block (SLP) case: insert the lane extraction next to the
   vectorized stmt and rewrite non-vectorized uses.  */
10066 /* For basic-block vectorization simply insert the lane-extraction. */
10067 tree bftype
= TREE_TYPE (vectype
);
10068 if (VECTOR_BOOLEAN_TYPE_P (vectype
))
10069 bftype
= build_nonstandard_integer_type (tree_to_uhwi (bitsize
), 1);
10070 tree new_tree
= build3 (BIT_FIELD_REF
, bftype
,
10071 vec_lhs
, bitsize
, bitstart
);
10072 gimple_seq stmts
= NULL
;
10073 new_tree
= force_gimple_operand (fold_convert (lhs_type
, new_tree
),
10074 &stmts
, true, NULL_TREE
);
10075 if (TREE_CODE (new_tree
) == SSA_NAME
10076 && SSA_NAME_OCCURS_IN_ABNORMAL_PHI (lhs
))
10077 SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_tree
) = 1;
10078 if (is_a
<gphi
*> (vec_stmt
))
10080 gimple_stmt_iterator si
= gsi_after_labels (gimple_bb (vec_stmt
));
10081 gsi_insert_seq_before (&si
, stmts
, GSI_SAME_STMT
);
10085 gimple_stmt_iterator si
= gsi_for_stmt (vec_stmt
);
10086 gsi_insert_seq_after (&si
, stmts
, GSI_SAME_STMT
);
10089 /* Replace use of lhs with newly computed result. If the use stmt is a
10090 single arg PHI, just replace all uses of PHI result. It's necessary
10091 because lcssa PHI defining lhs may be before newly inserted stmt. */
10092 use_operand_p use_p
;
10093 stmt_vec_info use_stmt_info
;
10094 FOR_EACH_IMM_USE_STMT (use_stmt
, imm_iter
, lhs
)
10095 if (!is_gimple_debug (use_stmt
)
10096 && (!(use_stmt_info
= vinfo
->lookup_stmt (use_stmt
))
10097 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info
))))
10099 /* ??? This can happen when the live lane ends up being
10100 used in a vector construction code-generated by an
10101 external SLP node (and code-generation for that already
10102 happened). See gcc.dg/vect/bb-slp-47.c.
10103 Doing this is what would happen if that vector CTOR
10104 were not code-generated yet so it is not too bad.
10105 ??? In fact we'd likely want to avoid this situation
10106 in the first place. */
10107 if (TREE_CODE (new_tree
) == SSA_NAME
10108 && !SSA_NAME_IS_DEFAULT_DEF (new_tree
)
10109 && gimple_code (use_stmt
) != GIMPLE_PHI
10110 && !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (new_tree
),
10113 enum tree_code code
= gimple_assign_rhs_code (use_stmt
);
10114 gcc_assert (code
== CONSTRUCTOR
10115 || code
== VIEW_CONVERT_EXPR
10116 || CONVERT_EXPR_CODE_P (code
));
10117 if (dump_enabled_p ())
10118 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
10119 "Using original scalar computation for "
10120 "live lane because use preceeds vector "
10124 /* ??? It can also happen that we end up pulling a def into
10125 a loop where replacing out-of-loop uses would require
10126 a new LC SSA PHI node. Retain the original scalar in
10127 those cases as well. PR98064. */
10128 if (TREE_CODE (new_tree
) == SSA_NAME
10129 && !SSA_NAME_IS_DEFAULT_DEF (new_tree
)
10130 && (gimple_bb (use_stmt
)->loop_father
10131 != gimple_bb (vec_stmt
)->loop_father
)
10132 && !flow_loop_nested_p (gimple_bb (vec_stmt
)->loop_father
,
10133 gimple_bb (use_stmt
)->loop_father
))
10135 if (dump_enabled_p ())
10136 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
10137 "Using original scalar computation for "
10138 "live lane because there is an out-of-loop "
10139 "definition for it\n");
10142 FOR_EACH_IMM_USE_ON_STMT (use_p
, imm_iter
)
10143 SET_USE (use_p
, new_tree
);
10144 update_stmt (use_stmt
);
/* Kill any debug uses outside LOOP of SSA names defined in STMT_INFO.  */

static void
vect_loop_kill_debug_uses (class loop *loop, stmt_vec_info stmt_info)
{
  ssa_op_iter op_iter;
  imm_use_iterator imm_iter;
  def_operand_p def_p;
  gimple *ustmt;

  /* Walk every SSA definition of the (possibly PHI) statement and
     inspect each of its immediate uses.  */
  FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt_info->stmt, op_iter, SSA_OP_DEF)
    {
      FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
	{
	  basic_block bb;

	  /* Only debug uses are of interest here; a non-debug use
	     outside the loop would have made the def live.  */
	  if (!is_gimple_debug (ustmt))
	    continue;

	  bb = gimple_bb (ustmt);

	  if (!flow_bb_inside_loop_p (loop, bb))
	    {
	      if (gimple_debug_bind_p (ustmt))
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_NOTE, vect_location,
				     "killing debug use\n");

		  /* Drop the bound value so the debug stmt no longer
		     references the scalar def that vectorization will
		     remove.  */
		  gimple_debug_bind_reset_value (ustmt);
		  update_stmt (ustmt);
		}
	      else
		gcc_unreachable ();
	    }
	}
    }
}
/* Given loop represented by LOOP_VINFO, return true if computation of
   LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
   otherwise.  */

static bool
loop_niters_no_overflow (loop_vec_info loop_vinfo)
{
  /* Constant case.  */
  if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
    {
      tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
      tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);

      gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
      gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
      /* NITERSM1 + 1 overflowed iff NITERS wrapped below NITERSM1.  */
      if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
	return true;
    }

  widest_int max;
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  /* Check the upper bound of loop niters.  If the maximum latch count
     is strictly below the type's maximum, adding 1 cannot wrap.  */
  if (get_max_loop_iterations (loop, &max))
    {
      tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
      signop sgn = TYPE_SIGN (type);
      widest_int type_max = widest_int::from (wi::max_value (type), sgn);
      if (max < type_max)
	return true;
    }
  return false;
}
10223 /* Return a mask type with half the number of elements as OLD_TYPE,
10224 given that it should have mode NEW_MODE. */
10227 vect_halve_mask_nunits (tree old_type
, machine_mode new_mode
)
10229 poly_uint64 nunits
= exact_div (TYPE_VECTOR_SUBPARTS (old_type
), 2);
10230 return build_truth_vector_type_for_mode (nunits
, new_mode
);
10233 /* Return a mask type with twice as many elements as OLD_TYPE,
10234 given that it should have mode NEW_MODE. */
10237 vect_double_mask_nunits (tree old_type
, machine_mode new_mode
)
10239 poly_uint64 nunits
= TYPE_VECTOR_SUBPARTS (old_type
) * 2;
10240 return build_truth_vector_type_for_mode (nunits
, new_mode
);
/* Record that a fully-masked version of LOOP_VINFO would need MASKS to
   contain a sequence of NVECTORS masks that each control a vector of type
   VECTYPE.  If SCALAR_MASK is nonnull, the fully-masked loop would AND
   these vector masks with the vector version of SCALAR_MASK.  */

void
vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
		       unsigned int nvectors, tree vectype, tree scalar_mask)
{
  gcc_assert (nvectors != 0);
  /* Masks are grouped by NVECTORS; make sure the rgroup exists.  */
  if (masks->length () < nvectors)
    masks->safe_grow_cleared (nvectors, true);
  rgroup_controls *rgm = &(*masks)[nvectors - 1];
  /* The number of scalars per iteration and the number of vectors are
     both compile-time constants.  */
  unsigned int nscalars_per_iter
    = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
		 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();

  if (scalar_mask)
    {
      /* Remember the scalar condition so later statements that test the
	 same condition can share the masked form.  */
      scalar_cond_masked_key cond (scalar_mask, nvectors);
      loop_vinfo->scalar_cond_masked_set.add (cond);
    }

  /* Keep the widest requirement seen so far for this rgroup.  */
  if (rgm->max_nscalars_per_iter < nscalars_per_iter)
    {
      rgm->max_nscalars_per_iter = nscalars_per_iter;
      rgm->type = truth_type_for (vectype);
      rgm->factor = 1;
    }
}
/* Given a complete set of masks MASKS, extract mask number INDEX
   for an rgroup that operates on NVECTORS vectors of type VECTYPE,
   where 0 <= INDEX < NVECTORS.  Insert any set-up statements before GSI.

   See the comment above vec_loop_masks for more details about the mask
   arrangement.  */

tree
vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
		    unsigned int nvectors, tree vectype, unsigned int index)
{
  rgroup_controls *rgm = &(*masks)[nvectors - 1];
  tree mask_type = rgm->type;

  /* Populate the rgroup's mask array, if this is the first time we've
     used it.  */
  if (rgm->controls.is_empty ())
    {
      rgm->controls.safe_grow_cleared (nvectors, true);
      for (unsigned int i = 0; i < nvectors; ++i)
	{
	  tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
	  /* Provide a dummy definition until the real one is available.  */
	  SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
	  rgm->controls[i] = mask;
	}
    }

  tree mask = rgm->controls[index];
  if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
		TYPE_VECTOR_SUBPARTS (vectype)))
    {
      /* A loop mask for data type X can be reused for data type Y
	 if X has N times more elements than Y and if Y's elements
	 are N times bigger than X's.  In this case each sequence
	 of N elements in the loop mask will be all-zero or all-one.
	 We can then view-convert the mask so that each sequence of
	 N elements is replaced by a single element.  */
      gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
			      TYPE_VECTOR_SUBPARTS (vectype)));
      gimple_seq seq = NULL;
      mask_type = truth_type_for (vectype);
      mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
      if (seq)
	gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
    }
  return mask;
}
/* Record that LOOP_VINFO would need LENS to contain a sequence of NVECTORS
   lengths for controlling an operation on VECTYPE.  The operation splits
   each element of VECTYPE into FACTOR separate subelements, measuring the
   length as a number of these subelements.  */

void
vect_record_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
		      unsigned int nvectors, tree vectype, unsigned int factor)
{
  gcc_assert (nvectors != 0);
  /* Lengths are grouped by NVECTORS; make sure the rgroup exists.  */
  if (lens->length () < nvectors)
    lens->safe_grow_cleared (nvectors, true);
  rgroup_controls *rgl = &(*lens)[nvectors - 1];

  /* The number of scalars per iteration, scalar occupied bytes and
     the number of vectors are both compile-time constants.  */
  unsigned int nscalars_per_iter
    = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
		 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();

  if (rgl->max_nscalars_per_iter < nscalars_per_iter)
    {
      /* For now, we only support cases in which all loads and stores fall back
	 to VnQI or none do.  */
      gcc_assert (!rgl->max_nscalars_per_iter
		  || (rgl->factor == 1 && factor == 1)
		  || (rgl->max_nscalars_per_iter * rgl->factor
		      == nscalars_per_iter * factor));
      rgl->max_nscalars_per_iter = nscalars_per_iter;
      rgl->type = vectype;
      rgl->factor = factor;
    }
}
/* Given a complete set of length LENS, extract length number INDEX for an
   rgroup that operates on NVECTORS vectors, where 0 <= INDEX < NVECTORS.  */

tree
vect_get_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
		   unsigned int nvectors, unsigned int index)
{
  rgroup_controls *rgl = &(*lens)[nvectors - 1];
  /* A nonzero target bias means the final length is pre-adjusted.  */
  bool use_bias_adjusted_len =
    LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) != 0;

  /* Populate the rgroup's len array, if this is the first time we've
     used it.  */
  if (rgl->controls.is_empty ())
    {
      rgl->controls.safe_grow_cleared (nvectors, true);
      for (unsigned int i = 0; i < nvectors; ++i)
	{
	  tree len_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
	  gcc_assert (len_type != NULL_TREE);

	  tree len = make_temp_ssa_name (len_type, NULL, "loop_len");

	  /* Provide a dummy definition until the real one is available.  */
	  SSA_NAME_DEF_STMT (len) = gimple_build_nop ();
	  rgl->controls[i] = len;

	  if (use_bias_adjusted_len)
	    {
	      /* The bias-adjusted variant is only supported for a single
		 control.  */
	      gcc_assert (i == 0);
	      tree adjusted_len =
		make_temp_ssa_name (len_type, NULL, "adjusted_loop_len");
	      SSA_NAME_DEF_STMT (adjusted_len) = gimple_build_nop ();
	      rgl->bias_adjusted_ctrl = adjusted_len;
	    }
	}
    }

  if (use_bias_adjusted_len)
    return rgl->bias_adjusted_ctrl;
  else
    return rgl->controls[index];
}
/* Scale profiling counters by estimation for LOOP which is vectorized
   by factor VF.  */

static void
scale_profile_for_vect_loop (class loop *loop, unsigned vf)
{
  edge preheader = loop_preheader_edge (loop);
  /* Reduce loop iterations by the vectorization factor.  */
  gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
  profile_count freq_h = loop->header->count, freq_e = preheader->count ();

  if (freq_h.nonzero_p ())
    {
      profile_probability p;

      /* Avoid dropping loop body profile counter to 0 because of zero count
	 in loop's preheader.  */
      if (!(freq_e == profile_count::zero ()))
	freq_e = freq_e.force_nonzero ();
      /* Scale the body so the header executes entry-count * (niter + 1)
	 times.  */
      p = (freq_e * (new_est_niter + 1)).probability_in (freq_h);
      scale_loop_frequencies (loop, p);
    }

  edge exit_e = single_exit (loop);
  /* The loop now exits once every (new_est_niter + 1) header runs.  */
  exit_e->probability = profile_probability::always () / (new_est_niter + 1);

  edge exit_l = single_pred_edge (loop->latch);
  profile_probability prob = exit_l->probability;
  exit_l->probability = exit_e->probability.invert ();
  /* Rescale the latch block to match its updated incoming probability.  */
  if (prob.initialized_p () && exit_l->probability.initialized_p ())
    scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
}
/* For a vectorized stmt DEF_STMT_INFO adjust all vectorized PHI
   latch edge values originally defined by it.  */

static void
maybe_set_vectorized_backedge_value (loop_vec_info loop_vinfo,
				     stmt_vec_info def_stmt_info)
{
  tree def = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
  if (!def || TREE_CODE (def) != SSA_NAME)
    return;
  stmt_vec_info phi_info;
  imm_use_iterator iter;
  use_operand_p use_p;
  /* Look for loop-header PHIs that take DEF on their latch edge.  */
  FOR_EACH_IMM_USE_FAST (use_p, iter, def)
    {
      gphi *phi = dyn_cast <gphi *> (USE_STMT (use_p));
      if (!phi)
	continue;
      if (!(gimple_bb (phi)->loop_father->header == gimple_bb (phi)
	    && (phi_info = loop_vinfo->lookup_stmt (phi))
	    && STMT_VINFO_RELEVANT_P (phi_info)))
	continue;
      loop_p loop = gimple_bb (phi)->loop_father;
      edge e = loop_latch_edge (loop);
      if (PHI_ARG_DEF_FROM_EDGE (phi, e) != def)
	continue;

      if (VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (phi_info))
	  && STMT_VINFO_REDUC_TYPE (phi_info) != FOLD_LEFT_REDUCTION
	  && STMT_VINFO_REDUC_TYPE (phi_info) != EXTRACT_LAST_REDUCTION)
	{
	  /* Simple cycle: fill in the latch argument of each vectorized
	     PHI copy from the corresponding vectorized latch def.  */
	  vec<gimple *> &phi_defs = STMT_VINFO_VEC_STMTS (phi_info);
	  vec<gimple *> &latch_defs = STMT_VINFO_VEC_STMTS (def_stmt_info);
	  gcc_assert (phi_defs.length () == latch_defs.length ());
	  for (unsigned i = 0; i < phi_defs.length (); ++i)
	    add_phi_arg (as_a <gphi *> (phi_defs[i]),
			 gimple_get_lhs (latch_defs[i]), e,
			 gimple_phi_arg_location (phi, e->dest_idx));
	}
      else if (STMT_VINFO_DEF_TYPE (phi_info) == vect_first_order_recurrence)
	{
	  /* For first order recurrences we have to update both uses of
	     the latch definition, the one in the PHI node and the one
	     in the generated VEC_PERM_EXPR.  */
	  vec<gimple *> &phi_defs = STMT_VINFO_VEC_STMTS (phi_info);
	  vec<gimple *> &latch_defs = STMT_VINFO_VEC_STMTS (def_stmt_info);
	  gcc_assert (phi_defs.length () == latch_defs.length ());
	  tree phidef = gimple_assign_rhs1 (phi_defs[0]);
	  gphi *vphi = as_a <gphi *> (SSA_NAME_DEF_STMT (phidef));
	  for (unsigned i = 0; i < phi_defs.length (); ++i)
	    {
	      gassign *perm = as_a <gassign *> (phi_defs[i]);
	      /* The first permute's rhs1 comes from the vectorized PHI;
		 later ones chain onto the previous latch def.  */
	      if (i > 0)
		gimple_assign_set_rhs1 (perm, gimple_get_lhs (latch_defs[i-1]));
	      gimple_assign_set_rhs2 (perm, gimple_get_lhs (latch_defs[i]));
	      update_stmt (perm);
	    }
	  /* The vectorized PHI itself is fed by the last latch def.  */
	  add_phi_arg (vphi, gimple_get_lhs (latch_defs.last ()), e,
		       gimple_phi_arg_location (phi, e->dest_idx));
	}
    }
}
/* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI.
   When vectorizing STMT_INFO as a store, set *SEEN_STORE to its
   stmt_vec_info.  Return true if the statement was handled by loop
   vectorization.  */

static bool
vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
			  gimple_stmt_iterator *gsi, stmt_vec_info *seen_store)
{
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "------>vectorizing statement: %G", stmt_info->stmt);

  /* Debug uses of dying scalars must be reset before the scalar stmt
     disappears.  */
  if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
    vect_loop_kill_debug_uses (loop, stmt_info);

  if (!STMT_VINFO_RELEVANT_P (stmt_info)
      && !STMT_VINFO_LIVE_P (stmt_info))
    return false;

  if (STMT_VINFO_VECTYPE (stmt_info))
    {
      poly_uint64 nunits
	= TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
      if (!STMT_SLP_TYPE (stmt_info)
	  && maybe_ne (nunits, vf)
	  && dump_enabled_p ())
	/* For SLP VF is set according to unrolling factor, and not
	   to vector size, hence for SLP this print is not valid.  */
	dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
    }

  /* Pure SLP statements have already been vectorized.  We still need
     to apply loop vectorization to hybrid SLP statements.  */
  if (PURE_SLP_STMT (stmt_info))
    return false;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");

  if (vect_transform_stmt (loop_vinfo, stmt_info, gsi, NULL, NULL))
    *seen_store = stmt_info;

  return true;
}
10547 /* Helper function to pass to simplify_replace_tree to enable replacing tree's
10548 in the hash_map with its corresponding values. */
10551 find_in_mapping (tree t
, void *context
)
10553 hash_map
<tree
,tree
>* mapping
= (hash_map
<tree
, tree
>*) context
;
10555 tree
*value
= mapping
->get (t
);
10556 return value
? *value
: t
;
/* Update EPILOGUE's loop_vec_info.  EPILOGUE was constructed as a copy of the
   original loop that has now been vectorized.

   The inits of the data_references need to be advanced with the number of
   iterations of the main loop.  This has been computed in vect_do_peeling and
   is stored in parameter ADVANCE.  We first restore the data_references
   initial offset with the values recored in ORIG_DRS_INIT.

   Since the loop_vec_info of this EPILOGUE was constructed for the original
   loop, its stmt_vec_infos all point to the original statements.  These need
   to be updated to point to their corresponding copies as well as the SSA_NAMES
   in their PATTERN_DEF_SEQs and RELATED_STMTs.

   The data_reference's connections also need to be updated.  Their
   corresponding dr_vec_info need to be reconnected to the EPILOGUE's
   stmt_vec_infos, their statements need to point to their corresponding copy,
   if they are gather loads or scatter stores then their reference needs to be
   updated to point to its corresponding copy and finally we set
   'base_misaligned' to false as we have already peeled for alignment in the
   prologue of the main loop.  */

static void
update_epilogue_loop_vinfo (class loop *epilogue, tree advance)
{
  loop_vec_info epilogue_vinfo = loop_vec_info_for_loop (epilogue);
  auto_vec<gimple *> stmt_worklist;
  hash_map<tree,tree> mapping;
  gimple *orig_stmt, *new_stmt;
  gimple_stmt_iterator epilogue_gsi;
  gphi_iterator epilogue_phi_gsi;
  stmt_vec_info stmt_vinfo = NULL, related_vinfo;
  basic_block *epilogue_bbs = get_loop_body (epilogue);
  unsigned i;

  /* Switch the loop_vec_info over to the epilogue's basic blocks.  */
  free (LOOP_VINFO_BBS (epilogue_vinfo));
  LOOP_VINFO_BBS (epilogue_vinfo) = epilogue_bbs;

  /* Advance data_reference's with the number of iterations of the previous
     loop and its prologue.  */
  vect_update_inits_of_drs (epilogue_vinfo, advance, PLUS_EXPR);

  /* The EPILOGUE loop is a copy of the original loop so they share the same
     gimple UIDs.  In this loop we update the loop_vec_info of the EPILOGUE to
     point to the copied statements.  We also create a mapping of all LHS' in
     the original loop and all the LHS' in the EPILOGUE and create worklists to
     update the STMT_VINFO_PATTERN_DEF_SEQs and STMT_VINFO_RELATED_STMTs.  */
  for (unsigned i = 0; i < epilogue->num_nodes; ++i)
    {
      for (epilogue_phi_gsi = gsi_start_phis (epilogue_bbs[i]);
	   !gsi_end_p (epilogue_phi_gsi); gsi_next (&epilogue_phi_gsi))
	{
	  new_stmt = epilogue_phi_gsi.phi ();

	  gcc_assert (gimple_uid (new_stmt) > 0);
	  stmt_vinfo
	    = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];

	  orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
	  STMT_VINFO_STMT (stmt_vinfo) = new_stmt;

	  mapping.put (gimple_phi_result (orig_stmt),
		       gimple_phi_result (new_stmt));
	  /* PHI nodes can not have patterns or related statements.  */
	  gcc_assert (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo) == NULL
		      && STMT_VINFO_RELATED_STMT (stmt_vinfo) == NULL);
	}

      for (epilogue_gsi = gsi_start_bb (epilogue_bbs[i]);
	   !gsi_end_p (epilogue_gsi); gsi_next (&epilogue_gsi))
	{
	  new_stmt = gsi_stmt (epilogue_gsi);
	  if (is_gimple_debug (new_stmt))
	    continue;

	  gcc_assert (gimple_uid (new_stmt) > 0);
	  stmt_vinfo
	    = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];

	  orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
	  STMT_VINFO_STMT (stmt_vinfo) = new_stmt;

	  if (tree old_lhs = gimple_get_lhs (orig_stmt))
	    mapping.put (old_lhs, gimple_get_lhs (new_stmt));

	  if (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo))
	    {
	      /* Queue the whole pattern def sequence for SSA-name
		 replacement below.  */
	      gimple_seq seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo);
	      for (gimple_stmt_iterator gsi = gsi_start (seq);
		   !gsi_end_p (gsi); gsi_next (&gsi))
		stmt_worklist.safe_push (gsi_stmt (gsi));
	    }

	  related_vinfo = STMT_VINFO_RELATED_STMT (stmt_vinfo);
	  if (related_vinfo != NULL && related_vinfo != stmt_vinfo)
	    {
	      gimple *stmt = STMT_VINFO_STMT (related_vinfo);
	      stmt_worklist.safe_push (stmt);
	      /* Set BB such that the assert in
		 'get_initial_def_for_reduction' is able to determine that
		 the BB of the related stmt is inside this loop.  */
	      gimple_set_bb (stmt,
			     gimple_bb (new_stmt));
	      related_vinfo = STMT_VINFO_RELATED_STMT (related_vinfo);
	      gcc_assert (related_vinfo == NULL
			  || related_vinfo == stmt_vinfo);
	    }
	}
    }

  /* The PATTERN_DEF_SEQs and RELATED_STMTs in the epilogue were constructed
     using the original main loop and thus need to be updated to refer to the
     cloned variables used in the epilogue.  */
  for (unsigned i = 0; i < stmt_worklist.length (); ++i)
    {
      gimple *stmt = stmt_worklist[i];
      tree *new_op;

      for (unsigned j = 1; j < gimple_num_ops (stmt); ++j)
	{
	  tree op = gimple_op (stmt, j);
	  if ((new_op = mapping.get(op)))
	    gimple_set_op (stmt, j, *new_op);
	  else
	    {
	      /* PR92429: The last argument of simplify_replace_tree disables
		 folding when replacing arguments.  This is required as
		 otherwise you might end up with different statements than the
		 ones analyzed in vect_loop_analyze, leading to different
		 vectorization.  */
	      op = simplify_replace_tree (op, NULL_TREE, NULL_TREE,
					  &find_in_mapping, &mapping, false);
	      gimple_set_op (stmt, j, op);
	    }
	}
    }

  struct data_reference *dr;
  vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (epilogue_vinfo);
  FOR_EACH_VEC_ELT (datarefs, i, dr)
    {
      orig_stmt = DR_STMT (dr);
      gcc_assert (gimple_uid (orig_stmt) > 0);
      stmt_vinfo = epilogue_vinfo->stmt_vec_infos[gimple_uid (orig_stmt) - 1];
      /* Data references for gather loads and scatter stores do not use the
	 updated offset we set using ADVANCE.  Instead we have to make sure the
	 reference in the data references point to the corresponding copy of
	 the original in the epilogue.  */
      if (STMT_VINFO_MEMORY_ACCESS_TYPE (vect_stmt_to_vectorize (stmt_vinfo))
	  == VMAT_GATHER_SCATTER)
	{
	  DR_REF (dr)
	    = simplify_replace_tree (DR_REF (dr), NULL_TREE, NULL_TREE,
				     &find_in_mapping, &mapping);
	  DR_BASE_ADDRESS (dr)
	    = simplify_replace_tree (DR_BASE_ADDRESS (dr), NULL_TREE, NULL_TREE,
				     &find_in_mapping, &mapping);
	}
      DR_STMT (dr) = STMT_VINFO_STMT (stmt_vinfo);
      stmt_vinfo->dr_aux.stmt = stmt_vinfo;
      /* The vector size of the epilogue is smaller than that of the main loop
	 so the alignment is either the same or lower. This means the dr will
	 thus by definition be aligned.  */
      STMT_VINFO_DR_INFO (stmt_vinfo)->base_misaligned = false;
    }

  /* Refresh the shared data-reference copy so later analysis of the
     epilogue sees the updated statements.  */
  epilogue_vinfo->shared->datarefs_copy.release ();
  epilogue_vinfo->shared->save_datarefs ();
}
10729 /* Function vect_transform_loop.
10731 The analysis phase has determined that the loop is vectorizable.
10732 Vectorize the loop - created vectorized stmts to replace the scalar
10733 stmts in the loop, and update the loop exit condition.
10734 Returns scalar epilogue loop if any. */
10737 vect_transform_loop (loop_vec_info loop_vinfo
, gimple
*loop_vectorized_call
)
10739 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
10740 class loop
*epilogue
= NULL
;
10741 basic_block
*bbs
= LOOP_VINFO_BBS (loop_vinfo
);
10742 int nbbs
= loop
->num_nodes
;
10744 tree niters_vector
= NULL_TREE
;
10745 tree step_vector
= NULL_TREE
;
10746 tree niters_vector_mult_vf
= NULL_TREE
;
10747 poly_uint64 vf
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
10748 unsigned int lowest_vf
= constant_lower_bound (vf
);
10750 bool check_profitability
= false;
10753 DUMP_VECT_SCOPE ("vec_transform_loop");
10755 loop_vinfo
->shared
->check_datarefs ();
10757 /* Use the more conservative vectorization threshold. If the number
10758 of iterations is constant assume the cost check has been performed
10759 by our caller. If the threshold makes all loops profitable that
10760 run at least the (estimated) vectorization factor number of times
10761 checking is pointless, too. */
10762 th
= LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo
);
10763 if (vect_apply_runtime_profitability_check_p (loop_vinfo
))
10765 if (dump_enabled_p ())
10766 dump_printf_loc (MSG_NOTE
, vect_location
,
10767 "Profitability threshold is %d loop iterations.\n",
10769 check_profitability
= true;
10772 /* Make sure there exists a single-predecessor exit bb. Do this before
10774 edge e
= single_exit (loop
);
10775 if (! single_pred_p (e
->dest
))
10777 split_loop_exit_edge (e
, true);
10778 if (dump_enabled_p ())
10779 dump_printf (MSG_NOTE
, "split exit edge\n");
10782 /* Version the loop first, if required, so the profitability check
10785 if (LOOP_REQUIRES_VERSIONING (loop_vinfo
))
10788 = vect_loop_versioning (loop_vinfo
, loop_vectorized_call
);
10789 sloop
->force_vectorize
= false;
10790 check_profitability
= false;
10793 /* Make sure there exists a single-predecessor exit bb also on the
10794 scalar loop copy. Do this after versioning but before peeling
10795 so CFG structure is fine for both scalar and if-converted loop
10796 to make slpeel_duplicate_current_defs_from_edges face matched
10797 loop closed PHI nodes on the exit. */
10798 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo
))
10800 e
= single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo
));
10801 if (! single_pred_p (e
->dest
))
10803 split_loop_exit_edge (e
, true);
10804 if (dump_enabled_p ())
10805 dump_printf (MSG_NOTE
, "split exit edge of scalar loop\n");
10809 tree niters
= vect_build_loop_niters (loop_vinfo
);
10810 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo
) = niters
;
10811 tree nitersm1
= unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo
));
10812 bool niters_no_overflow
= loop_niters_no_overflow (loop_vinfo
);
10814 drs_init_vec orig_drs_init
;
10816 epilogue
= vect_do_peeling (loop_vinfo
, niters
, nitersm1
, &niters_vector
,
10817 &step_vector
, &niters_vector_mult_vf
, th
,
10818 check_profitability
, niters_no_overflow
,
10821 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo
)
10822 && LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo
).initialized_p ())
10823 scale_loop_frequencies (LOOP_VINFO_SCALAR_LOOP (loop_vinfo
),
10824 LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo
));
10826 if (niters_vector
== NULL_TREE
)
10828 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
)
10829 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo
)
10830 && known_eq (lowest_vf
, vf
))
10833 = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo
)),
10834 LOOP_VINFO_INT_NITERS (loop_vinfo
) / lowest_vf
);
10835 step_vector
= build_one_cst (TREE_TYPE (niters
));
10837 else if (vect_use_loop_mask_for_alignment_p (loop_vinfo
))
10838 vect_gen_vector_loop_niters (loop_vinfo
, niters
, &niters_vector
,
10839 &step_vector
, niters_no_overflow
);
10841 /* vect_do_peeling subtracted the number of peeled prologue
10842 iterations from LOOP_VINFO_NITERS. */
10843 vect_gen_vector_loop_niters (loop_vinfo
, LOOP_VINFO_NITERS (loop_vinfo
),
10844 &niters_vector
, &step_vector
,
10845 niters_no_overflow
);
10848 /* 1) Make sure the loop header has exactly two entries
10849 2) Make sure we have a preheader basic block. */
10851 gcc_assert (EDGE_COUNT (loop
->header
->preds
) == 2);
10853 split_edge (loop_preheader_edge (loop
));
10855 if (vect_use_loop_mask_for_alignment_p (loop_vinfo
))
10856 /* This will deal with any possible peeling. */
10857 vect_prepare_for_masked_peels (loop_vinfo
);
10859 /* Schedule the SLP instances first, then handle loop vectorization
10861 if (!loop_vinfo
->slp_instances
.is_empty ())
10863 DUMP_VECT_SCOPE ("scheduling SLP instances");
10864 vect_schedule_slp (loop_vinfo
, LOOP_VINFO_SLP_INSTANCES (loop_vinfo
));
10867 /* FORNOW: the vectorizer supports only loops which body consist
10868 of one basic block (header + empty latch). When the vectorizer will
10869 support more involved loop forms, the order by which the BBs are
10870 traversed need to be reconsidered. */
10872 for (i
= 0; i
< nbbs
; i
++)
10874 basic_block bb
= bbs
[i
];
10875 stmt_vec_info stmt_info
;
10877 for (gphi_iterator si
= gsi_start_phis (bb
); !gsi_end_p (si
);
10880 gphi
*phi
= si
.phi ();
10881 if (dump_enabled_p ())
10882 dump_printf_loc (MSG_NOTE
, vect_location
,
10883 "------>vectorizing phi: %G", (gimple
*) phi
);
10884 stmt_info
= loop_vinfo
->lookup_stmt (phi
);
10888 if (MAY_HAVE_DEBUG_BIND_STMTS
&& !STMT_VINFO_LIVE_P (stmt_info
))
10889 vect_loop_kill_debug_uses (loop
, stmt_info
);
10891 if (!STMT_VINFO_RELEVANT_P (stmt_info
)
10892 && !STMT_VINFO_LIVE_P (stmt_info
))
10895 if (STMT_VINFO_VECTYPE (stmt_info
)
10897 (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info
)), vf
))
10898 && dump_enabled_p ())
10899 dump_printf_loc (MSG_NOTE
, vect_location
, "multiple-types.\n");
10901 if ((STMT_VINFO_DEF_TYPE (stmt_info
) == vect_induction_def
10902 || STMT_VINFO_DEF_TYPE (stmt_info
) == vect_reduction_def
10903 || STMT_VINFO_DEF_TYPE (stmt_info
) == vect_double_reduction_def
10904 || STMT_VINFO_DEF_TYPE (stmt_info
) == vect_nested_cycle
10905 || STMT_VINFO_DEF_TYPE (stmt_info
) == vect_first_order_recurrence
10906 || STMT_VINFO_DEF_TYPE (stmt_info
) == vect_internal_def
)
10907 && ! PURE_SLP_STMT (stmt_info
))
10909 if (dump_enabled_p ())
10910 dump_printf_loc (MSG_NOTE
, vect_location
, "transform phi.\n");
10911 vect_transform_stmt (loop_vinfo
, stmt_info
, NULL
, NULL
, NULL
);
10915 for (gphi_iterator si
= gsi_start_phis (bb
); !gsi_end_p (si
);
10918 gphi
*phi
= si
.phi ();
10919 stmt_info
= loop_vinfo
->lookup_stmt (phi
);
10923 if (!STMT_VINFO_RELEVANT_P (stmt_info
)
10924 && !STMT_VINFO_LIVE_P (stmt_info
))
10927 if ((STMT_VINFO_DEF_TYPE (stmt_info
) == vect_induction_def
10928 || STMT_VINFO_DEF_TYPE (stmt_info
) == vect_reduction_def
10929 || STMT_VINFO_DEF_TYPE (stmt_info
) == vect_double_reduction_def
10930 || STMT_VINFO_DEF_TYPE (stmt_info
) == vect_nested_cycle
10931 || STMT_VINFO_DEF_TYPE (stmt_info
) == vect_internal_def
10932 || STMT_VINFO_DEF_TYPE (stmt_info
) == vect_first_order_recurrence
)
10933 && ! PURE_SLP_STMT (stmt_info
))
10934 maybe_set_vectorized_backedge_value (loop_vinfo
, stmt_info
);
10937 for (gimple_stmt_iterator si
= gsi_start_bb (bb
);
10940 stmt
= gsi_stmt (si
);
10941 /* During vectorization remove existing clobber stmts. */
10942 if (gimple_clobber_p (stmt
))
10944 unlink_stmt_vdef (stmt
);
10945 gsi_remove (&si
, true);
10946 release_defs (stmt
);
10950 /* Ignore vector stmts created in the outer loop. */
10951 stmt_info
= loop_vinfo
->lookup_stmt (stmt
);
10953 /* vector stmts created in the outer-loop during vectorization of
10954 stmts in an inner-loop may not have a stmt_info, and do not
10955 need to be vectorized. */
10956 stmt_vec_info seen_store
= NULL
;
10959 if (STMT_VINFO_IN_PATTERN_P (stmt_info
))
10961 gimple
*def_seq
= STMT_VINFO_PATTERN_DEF_SEQ (stmt_info
);
10962 for (gimple_stmt_iterator subsi
= gsi_start (def_seq
);
10963 !gsi_end_p (subsi
); gsi_next (&subsi
))
10965 stmt_vec_info pat_stmt_info
10966 = loop_vinfo
->lookup_stmt (gsi_stmt (subsi
));
10967 vect_transform_loop_stmt (loop_vinfo
, pat_stmt_info
,
10970 stmt_vec_info pat_stmt_info
10971 = STMT_VINFO_RELATED_STMT (stmt_info
);
10972 if (vect_transform_loop_stmt (loop_vinfo
, pat_stmt_info
,
10974 maybe_set_vectorized_backedge_value (loop_vinfo
,
10979 if (vect_transform_loop_stmt (loop_vinfo
, stmt_info
, &si
,
10981 maybe_set_vectorized_backedge_value (loop_vinfo
,
10988 if (STMT_VINFO_GROUPED_ACCESS (seen_store
))
10989 /* Interleaving. If IS_STORE is TRUE, the
10990 vectorization of the interleaving chain was
10991 completed - free all the stores in the chain. */
10992 vect_remove_stores (loop_vinfo
,
10993 DR_GROUP_FIRST_ELEMENT (seen_store
));
10995 /* Free the attached stmt_vec_info and remove the stmt. */
10996 loop_vinfo
->remove_stmt (stmt_info
);
11001 /* Stub out scalar statements that must not survive vectorization.
11002 Doing this here helps with grouped statements, or statements that
11003 are involved in patterns. */
11004 for (gimple_stmt_iterator gsi
= gsi_start_bb (bb
);
11005 !gsi_end_p (gsi
); gsi_next (&gsi
))
11007 gcall
*call
= dyn_cast
<gcall
*> (gsi_stmt (gsi
));
11008 if (!call
|| !gimple_call_internal_p (call
))
11010 internal_fn ifn
= gimple_call_internal_fn (call
);
11011 if (ifn
== IFN_MASK_LOAD
)
11013 tree lhs
= gimple_get_lhs (call
);
11014 if (!VECTOR_TYPE_P (TREE_TYPE (lhs
)))
11016 tree zero
= build_zero_cst (TREE_TYPE (lhs
));
11017 gimple
*new_stmt
= gimple_build_assign (lhs
, zero
);
11018 gsi_replace (&gsi
, new_stmt
, true);
11021 else if (conditional_internal_fn_code (ifn
) != ERROR_MARK
)
11023 tree lhs
= gimple_get_lhs (call
);
11024 if (!VECTOR_TYPE_P (TREE_TYPE (lhs
)))
11027 = gimple_call_arg (call
, gimple_call_num_args (call
) - 1);
11028 gimple
*new_stmt
= gimple_build_assign (lhs
, else_arg
);
11029 gsi_replace (&gsi
, new_stmt
, true);
11033 } /* BBs in loop */
11035 /* The vectorization factor is always > 1, so if we use an IV increment of 1.
11036 a zero NITERS becomes a nonzero NITERS_VECTOR. */
11037 if (integer_onep (step_vector
))
11038 niters_no_overflow
= true;
11039 vect_set_loop_condition (loop
, loop_vinfo
, niters_vector
, step_vector
,
11040 niters_vector_mult_vf
, !niters_no_overflow
);
11042 unsigned int assumed_vf
= vect_vf_for_cost (loop_vinfo
);
11043 scale_profile_for_vect_loop (loop
, assumed_vf
);
11045 /* True if the final iteration might not handle a full vector's
11046 worth of scalar iterations. */
11047 bool final_iter_may_be_partial
11048 = LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo
);
11049 /* The minimum number of iterations performed by the epilogue. This
11050 is 1 when peeling for gaps because we always need a final scalar
11052 int min_epilogue_iters
= LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo
) ? 1 : 0;
11053 /* +1 to convert latch counts to loop iteration counts,
11054 -min_epilogue_iters to remove iterations that cannot be performed
11055 by the vector code. */
11056 int bias_for_lowest
= 1 - min_epilogue_iters
;
11057 int bias_for_assumed
= bias_for_lowest
;
11058 int alignment_npeels
= LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
);
11059 if (alignment_npeels
&& LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo
))
11061 /* When the amount of peeling is known at compile time, the first
11062 iteration will have exactly alignment_npeels active elements.
11063 In the worst case it will have at least one. */
11064 int min_first_active
= (alignment_npeels
> 0 ? alignment_npeels
: 1);
11065 bias_for_lowest
+= lowest_vf
- min_first_active
;
11066 bias_for_assumed
+= assumed_vf
- min_first_active
;
11068 /* In these calculations the "- 1" converts loop iteration counts
11069 back to latch counts. */
11070 if (loop
->any_upper_bound
)
11072 loop_vec_info main_vinfo
= LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo
);
11073 loop
->nb_iterations_upper_bound
11074 = (final_iter_may_be_partial
11075 ? wi::udiv_ceil (loop
->nb_iterations_upper_bound
+ bias_for_lowest
,
11077 : wi::udiv_floor (loop
->nb_iterations_upper_bound
+ bias_for_lowest
,
11080 /* Both peeling for alignment and peeling for gaps can end up
11081 with the scalar epilogue running for more than VF-1 iterations. */
11082 && !main_vinfo
->peeling_for_alignment
11083 && !main_vinfo
->peeling_for_gaps
)
11085 unsigned int bound
;
11086 poly_uint64 main_iters
11087 = upper_bound (LOOP_VINFO_VECT_FACTOR (main_vinfo
),
11088 LOOP_VINFO_COST_MODEL_THRESHOLD (main_vinfo
));
11090 = upper_bound (main_iters
,
11091 LOOP_VINFO_VERSIONING_THRESHOLD (main_vinfo
));
11092 if (can_div_away_from_zero_p (main_iters
,
11093 LOOP_VINFO_VECT_FACTOR (loop_vinfo
),
11095 loop
->nb_iterations_upper_bound
11096 = wi::umin ((widest_int
) (bound
- 1),
11097 loop
->nb_iterations_upper_bound
);
11100 if (loop
->any_likely_upper_bound
)
11101 loop
->nb_iterations_likely_upper_bound
11102 = (final_iter_may_be_partial
11103 ? wi::udiv_ceil (loop
->nb_iterations_likely_upper_bound
11104 + bias_for_lowest
, lowest_vf
) - 1
11105 : wi::udiv_floor (loop
->nb_iterations_likely_upper_bound
11106 + bias_for_lowest
, lowest_vf
) - 1);
11107 if (loop
->any_estimate
)
11108 loop
->nb_iterations_estimate
11109 = (final_iter_may_be_partial
11110 ? wi::udiv_ceil (loop
->nb_iterations_estimate
+ bias_for_assumed
,
11112 : wi::udiv_floor (loop
->nb_iterations_estimate
+ bias_for_assumed
,
11115 if (dump_enabled_p ())
11117 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo
))
11119 dump_printf_loc (MSG_NOTE
, vect_location
,
11120 "LOOP VECTORIZED\n");
11122 dump_printf_loc (MSG_NOTE
, vect_location
,
11123 "OUTER LOOP VECTORIZED\n");
11124 dump_printf (MSG_NOTE
, "\n");
11127 dump_printf_loc (MSG_NOTE
, vect_location
,
11128 "LOOP EPILOGUE VECTORIZED (MODE=%s)\n",
11129 GET_MODE_NAME (loop_vinfo
->vector_mode
));
11132 /* Loops vectorized with a variable factor won't benefit from
11133 unrolling/peeling. */
11134 if (!vf
.is_constant ())
11137 if (dump_enabled_p ())
11138 dump_printf_loc (MSG_NOTE
, vect_location
, "Disabling unrolling due to"
11139 " variable-length vectorization factor\n");
11141 /* Free SLP instances here because otherwise stmt reference counting
11143 slp_instance instance
;
11144 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo
), i
, instance
)
11145 vect_free_slp_instance (instance
);
11146 LOOP_VINFO_SLP_INSTANCES (loop_vinfo
).release ();
11147 /* Clear-up safelen field since its value is invalid after vectorization
11148 since vectorized loop can have loop-carried dependencies. */
11153 update_epilogue_loop_vinfo (epilogue
, advance
);
11155 epilogue
->simduid
= loop
->simduid
;
11156 epilogue
->force_vectorize
= loop
->force_vectorize
;
11157 epilogue
->dont_vectorize
= false;
11163 /* The code below is trying to perform simple optimization - revert
11164 if-conversion for masked stores, i.e. if the mask of a store is zero
11165 do not perform it and all stored value producers also if possible.
11167 for (i=0; i<n; i++)
11173 this transformation will produce the following semi-hammock:
11175 if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
11177 vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
11178 vect__12.22_172 = vect__11.19_170 + vect_cst__171;
11179 MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
11180 vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
11181 vect__19.28_184 = vect__18.25_182 + vect_cst__183;
11182 MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
11187 optimize_mask_stores (class loop
*loop
)
/* Collect every IFN_MASK_STORE call in LOOP into a worklist, then sink
   each store (plus, where safe, the statements producing its stored
   value) into a new basic block that is only entered when the mask is
   not all-zero.  NOTE(review): this chunk is a lossy extraction --
   several original lines (declarations, braces, conditions) are elided,
   so parts of the control flow must be confirmed against upstream
   tree-vect-loop.cc.  */
11189 basic_block
*bbs
= get_loop_body (loop
);
11190 unsigned nbbs
= loop
->num_nodes
;
11193 class loop
*bb_loop
;
11194 gimple_stmt_iterator gsi
;
11196 auto_vec
<gimple
*> worklist
;
11197 auto_purge_vect_location sentinel
;
11199 vect_location
= find_loop_location (loop
);
11200 /* Pick up all masked stores in loop if any. */
11201 for (i
= 0; i
< nbbs
; i
++)
11204 for (gsi
= gsi_start_bb (bb
); !gsi_end_p (gsi
);
11207 stmt
= gsi_stmt (gsi
);
11208 if (gimple_call_internal_p (stmt
, IFN_MASK_STORE
))
11209 worklist
.safe_push (stmt
);
11214 if (worklist
.is_empty ())
11217 /* Loop has masked stores. */
11218 while (!worklist
.is_empty ())
11220 gimple
*last
, *last_store
;
11223 basic_block store_bb
, join_bb
;
11224 gimple_stmt_iterator gsi_to
;
11225 tree vdef
, new_vdef
;
/* Argument 2 of an IFN_MASK_STORE call is the mask operand, as the
   gimple_call_arg (last, 2) accesses below demonstrate.  */
11230 last
= worklist
.pop ();
11231 mask
= gimple_call_arg (last
, 2);
11232 bb
= gimple_bb (last
);
11233 /* Create then_bb and if-then structure in CFG, then_bb belongs to
11234 the same loop as if_bb. It could be different to LOOP when two
11235 level loop-nest is vectorized and mask_store belongs to the inner
11237 e
= split_block (bb
, last
);
11238 bb_loop
= bb
->loop_father
;
11239 gcc_assert (loop
== bb_loop
|| flow_loop_nested_p (loop
, bb_loop
));
11241 store_bb
= create_empty_bb (bb
);
11242 add_bb_to_loop (store_bb
, bb_loop
);
11243 e
->flags
= EDGE_TRUE_VALUE
;
11244 efalse
= make_edge (bb
, store_bb
, EDGE_FALSE_VALUE
);
11245 /* Put STORE_BB to likely part. */
/* NOTE(review): the upstream comment above says "likely" but the FALSE
   edge into STORE_BB gets profile_probability::unlikely (); since the
   guard below branches TRUE when the mask compares equal to zero,
   STORE_BB is on the mask-nonzero path -- confirm intent upstream.  */
11246 efalse
->probability
= profile_probability::unlikely ();
11247 store_bb
->count
= efalse
->count ();
11248 make_single_succ_edge (store_bb
, join_bb
, EDGE_FALLTHRU
);
11249 if (dom_info_available_p (CDI_DOMINATORS
))
11250 set_immediate_dominator (CDI_DOMINATORS
, store_bb
, bb
);
11251 if (dump_enabled_p ())
11252 dump_printf_loc (MSG_NOTE
, vect_location
,
11253 "Create new block %d to sink mask stores.",
11255 /* Create vector comparison with boolean result. */
11256 vectype
= TREE_TYPE (mask
);
11257 zero
= build_zero_cst (vectype
);
11258 stmt
= gimple_build_cond (EQ_EXPR
, mask
, zero
, NULL_TREE
, NULL_TREE
);
11259 gsi
= gsi_last_bb (bb
);
11260 gsi_insert_after (&gsi
, stmt
, GSI_SAME_STMT
);
11261 /* Create new PHI node for vdef of the last masked store:
11262 .MEM_2 = VDEF <.MEM_1>
11263 will be converted to
11264 .MEM.3 = VDEF <.MEM_1>
11265 and new PHI node will be created in join bb
11266 .MEM_2 = PHI <.MEM_1, .MEM_3>
11268 vdef
= gimple_vdef (last
);
11269 new_vdef
= make_ssa_name (gimple_vop (cfun
), last
);
11270 gimple_set_vdef (last
, new_vdef
);
11271 phi
= create_phi_node (vdef
, join_bb
);
11272 add_phi_arg (phi
, new_vdef
, EDGE_SUCC (store_bb
, 0), UNKNOWN_LOCATION
);
11274 /* Put all masked stores with the same mask to STORE_BB if possible. */
11277 gimple_stmt_iterator gsi_from
;
11278 gimple
*stmt1
= NULL
;
11280 /* Move masked store to STORE_BB. */
11282 gsi
= gsi_for_stmt (last
);
11284 /* Shift GSI to the previous stmt for further traversal. */
11286 gsi_to
= gsi_start_bb (store_bb
);
11287 gsi_move_before (&gsi_from
, &gsi_to
);
11288 /* Setup GSI_TO to the non-empty block start. */
11289 gsi_to
= gsi_start_bb (store_bb
);
11290 if (dump_enabled_p ())
11291 dump_printf_loc (MSG_NOTE
, vect_location
,
11292 "Move stmt to created bb\n%G", last
);
11293 /* Move all stored value producers if possible. */
11294 while (!gsi_end_p (gsi
))
11297 imm_use_iterator imm_iter
;
11298 use_operand_p use_p
;
11301 /* Skip debug statements. */
11302 if (is_gimple_debug (gsi_stmt (gsi
)))
11307 stmt1
= gsi_stmt (gsi
);
11308 /* Do not consider statements writing to memory or having
11309 volatile operand. */
11310 if (gimple_vdef (stmt1
)
11311 || gimple_has_volatile_ops (stmt1
))
11315 lhs
= gimple_get_lhs (stmt1
);
11319 /* LHS of vectorized stmt must be SSA_NAME. */
11320 if (TREE_CODE (lhs
) != SSA_NAME
)
11323 if (!VECTOR_TYPE_P (TREE_TYPE (lhs
)))
11325 /* Remove dead scalar statement. */
11326 if (has_zero_uses (lhs
))
11328 gsi_remove (&gsi_from
, true);
11333 /* Check that LHS does not have uses outside of STORE_BB. */
11335 FOR_EACH_IMM_USE_FAST (use_p
, imm_iter
, lhs
)
11338 use_stmt
= USE_STMT (use_p
);
11339 if (is_gimple_debug (use_stmt
))
11341 if (gimple_bb (use_stmt
) != store_bb
)
/* A producer that reads memory may only move if it shares the VUSE of
   the store already sunk into STORE_BB; otherwise the move could cross
   a conflicting memory access.  NOTE(review): the assignment of
   last_store is among the elided lines -- confirm upstream.  */
11350 if (gimple_vuse (stmt1
)
11351 && gimple_vuse (stmt1
) != gimple_vuse (last_store
))
11354 /* Can move STMT1 to STORE_BB. */
11355 if (dump_enabled_p ())
11356 dump_printf_loc (MSG_NOTE
, vect_location
,
11357 "Move stmt to created bb\n%G", stmt1
);
11358 gsi_move_before (&gsi_from
, &gsi_to
);
11359 /* Shift GSI_TO for further insertion. */
11360 gsi_prev (&gsi_to
);
11362 /* Put other masked stores with the same mask to STORE_BB. */
11363 if (worklist
.is_empty ()
11364 || gimple_call_arg (worklist
.last (), 2) != mask
11365 || worklist
.last () != stmt1
)
11367 last
= worklist
.pop ();
/* Record the memory state flowing around STORE_BB into the join-block
   PHI created above.  NOTE(review): presumably E is still the edge
   produced by split_block (the mask-all-zero path) -- confirm.  */
11369 add_phi_arg (phi
, gimple_vuse (last_store
), e
, UNKNOWN_LOCATION
);
11373 /* Decide whether it is possible to use a zero-based induction variable
11374 when vectorizing LOOP_VINFO with partial vectors. If it is, return
11375 the value that the induction variable must be able to hold in order
11376 to ensure that the rgroups eventually have no active vector elements.
11377 Return -1 otherwise. */
11380 vect_iv_limit_for_partial_vectors (loop_vec_info loop_vinfo
)
/* See the header comment above: compute the value a zero-based IV must
   be able to hold when vectorizing LOOP_VINFO with partial vectors, or
   -1 when no limit can be determined.  NOTE(review): the final return
   statement and closing brace are among the lines elided from this
   extraction.  */
11382 tree niters_skip
= LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo
);
11383 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
11384 unsigned HOST_WIDE_INT max_vf
= vect_max_vf (loop_vinfo
);
11386 /* Calculate the value that the induction variable must be able
11387 to hit in order to ensure that we end the loop with an all-false mask.
11388 This involves adding the maximum number of inactive trailing scalar
/* -1 doubles as the "unknown" sentinel demanded by the contract above;
   presumably it is returned unchanged when max_loop_iterations cannot
   bound the loop -- the early-return path is elided here.  */
11390 widest_int iv_limit
= -1;
11391 if (max_loop_iterations (loop
, &iv_limit
))
11395 /* Add the maximum number of skipped iterations to the
11396 maximum iteration count. */
11397 if (TREE_CODE (niters_skip
) == INTEGER_CST
)
11398 iv_limit
+= wi::to_widest (niters_skip
);
11400 iv_limit
+= max_vf
- 1;
/* NOTE(review): the "if" matching this "else if" is elided; presumably
   it tests whether NITERS_SKIP is nonnull -- confirm upstream.  */
11402 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
))
11403 /* Make a conservatively-correct assumption. */
11404 iv_limit
+= max_vf
- 1;
11406 /* IV_LIMIT is the maximum number of latch iterations, which is also
11407 the maximum in-range IV value. Round this value down to the previous
11408 vector alignment boundary and then add an extra full iteration. */
11409 poly_uint64 vf
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
11410 iv_limit
= (iv_limit
& -(int) known_alignment (vf
)) + max_vf
;
11415 /* For the given rgroup_controls RGC, check whether an induction variable
11416 would ever hit a value that produces a set of all-false masks or zero
11417 lengths before wrapping around. Return true if it's possible to wrap
11418 around before hitting the desirable value, otherwise return false. */
11421 vect_rgroup_iv_might_wrap_p (loop_vec_info loop_vinfo
, rgroup_controls
*rgc
)
11423 widest_int iv_limit
= vect_iv_limit_for_partial_vectors (loop_vinfo
);
11425 if (iv_limit
== -1)
11428 tree compare_type
= LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo
);
11429 unsigned int compare_precision
= TYPE_PRECISION (compare_type
);
11430 unsigned nitems
= rgc
->max_nscalars_per_iter
* rgc
->factor
;
11432 if (wi::min_precision (iv_limit
* nitems
, UNSIGNED
) > compare_precision
)