1 /* SLP - Basic Block Vectorization
2 Copyright (C) 2007-2019 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "tree-pass.h"
31 #include "ssa.h"
32 #include "optabs-tree.h"
33 #include "insn-config.h"
34 #include "recog.h" /* FIXME: for insn_data */
35 #include "params.h"
36 #include "fold-const.h"
37 #include "stor-layout.h"
38 #include "gimple-iterator.h"
39 #include "cfgloop.h"
40 #include "tree-vectorizer.h"
41 #include "langhooks.h"
42 #include "gimple-walk.h"
43 #include "dbgcnt.h"
44 #include "tree-vector-builder.h"
45 #include "vec-perm-indices.h"
46 #include "gimple-fold.h"
47 #include "internal-fn.h"
48
49
50 /* Recursively free the memory allocated for the SLP tree rooted at NODE.
51 FINAL_P is true if we have vectorized the instance or if we have
52 made a final decision not to vectorize the statements in any way. */
53
54 static void
55 vect_free_slp_tree (slp_tree node, bool final_p)
56 {
57 int i;
58 slp_tree child;
59
60 if (--node->refcnt != 0)
61 return;
62
63 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
64 vect_free_slp_tree (child, final_p);
65
66 /* Don't update STMT_VINFO_NUM_SLP_USES if it isn't relevant.
67 Some statements might no longer exist after having been
68 removed by vect_transform_stmt; updating the remaining
69 statements would be redundant. */
70 if (!final_p)
71 {
72 stmt_vec_info stmt_info;
73 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
74 {
75 gcc_assert (STMT_VINFO_NUM_SLP_USES (stmt_info) > 0);
76 STMT_VINFO_NUM_SLP_USES (stmt_info)--;
77 }
78 }
79
80 SLP_TREE_CHILDREN (node).release ();
81 SLP_TREE_SCALAR_STMTS (node).release ();
82 SLP_TREE_VEC_STMTS (node).release ();
83 SLP_TREE_LOAD_PERMUTATION (node).release ();
84
85 free (node);
86 }
87
88 /* Free the memory allocated for the SLP instance. FINAL_P is true if we
89 have vectorized the instance or if we have made a final decision not
90 to vectorize the statements in any way. */
91
92 void
93 vect_free_slp_instance (slp_instance instance, bool final_p)
94 {
95 vect_free_slp_tree (SLP_INSTANCE_TREE (instance), final_p);
96 SLP_INSTANCE_LOADS (instance).release ();
97 free (instance);
98 }
99
100
101 /* Create an SLP node for SCALAR_STMTS. */
102
103 static slp_tree
104 vect_create_new_slp_node (vec<stmt_vec_info> scalar_stmts)
105 {
106 slp_tree node;
107 stmt_vec_info stmt_info = scalar_stmts[0];
108 unsigned int nops;
109
110 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
111 nops = gimple_call_num_args (stmt);
112 else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
113 {
114 nops = gimple_num_ops (stmt) - 1;
115 if (gimple_assign_rhs_code (stmt) == COND_EXPR)
116 nops++;
117 }
118 else if (is_a <gphi *> (stmt_info->stmt))
119 nops = 0;
120 else
121 return NULL;
122
123 node = XNEW (struct _slp_tree);
124 SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
125 SLP_TREE_VEC_STMTS (node).create (0);
126 SLP_TREE_NUMBER_OF_VEC_STMTS (node) = 0;
127 SLP_TREE_CHILDREN (node).create (nops);
128 SLP_TREE_LOAD_PERMUTATION (node) = vNULL;
129 SLP_TREE_TWO_OPERATORS (node) = false;
130 SLP_TREE_DEF_TYPE (node) = vect_internal_def;
131 node->refcnt = 1;
132 node->max_nunits = 1;
133
134 unsigned i;
135 FOR_EACH_VEC_ELT (scalar_stmts, i, stmt_info)
136 STMT_VINFO_NUM_SLP_USES (stmt_info)++;
137
138 return node;
139 }
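/* As a worked example of the NOPS computation above: for a COND_EXPR
   assignment such as x = a < b ? c : d, gimple_num_ops is 4, so NOPS
   becomes (4 - 1) + 1 = 4: the two operands of the embedded comparison
   plus the two arms each get their own operand slot. */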
140
141
142 /* This structure is used in the creation of an SLP tree. Each instance
143 corresponds to the same operand in a group of scalar stmts in an SLP
144 node. */
145 typedef struct _slp_oprnd_info
146 {
147 /* Def-stmts for the operands. */
148 vec<stmt_vec_info> def_stmts;
149 /* Information about the first statement: its operand type, its vector
150 def-type, and an indication whether any def-stmt in the group is a
151 pattern stmt. */
152 tree first_op_type;
153 enum vect_def_type first_dt;
154 bool any_pattern;
155 } *slp_oprnd_info;
156
157
158 /* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
159 operand. */
160 static vec<slp_oprnd_info>
161 vect_create_oprnd_info (int nops, int group_size)
162 {
163 int i;
164 slp_oprnd_info oprnd_info;
165 vec<slp_oprnd_info> oprnds_info;
166
167 oprnds_info.create (nops);
168 for (i = 0; i < nops; i++)
169 {
170 oprnd_info = XNEW (struct _slp_oprnd_info);
171 oprnd_info->def_stmts.create (group_size);
172 oprnd_info->first_dt = vect_uninitialized_def;
173 oprnd_info->first_op_type = NULL_TREE;
174 oprnd_info->any_pattern = false;
175 oprnds_info.quick_push (oprnd_info);
176 }
177
178 return oprnds_info;
179 }
180
181
182 /* Free operands info. */
183
184 static void
185 vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
186 {
187 int i;
188 slp_oprnd_info oprnd_info;
189
190 FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
191 {
192 oprnd_info->def_stmts.release ();
193 XDELETE (oprnd_info);
194 }
195
196 oprnds_info.release ();
197 }
198
199
200 /* Find the place of the data-ref in STMT_INFO in the interleaving chain
201 that starts from FIRST_STMT_INFO. Return -1 if the data-ref is not a part
202 of the chain. */
203
204 int
205 vect_get_place_in_interleaving_chain (stmt_vec_info stmt_info,
206 stmt_vec_info first_stmt_info)
207 {
208 stmt_vec_info next_stmt_info = first_stmt_info;
209 int result = 0;
210
211 if (first_stmt_info != DR_GROUP_FIRST_ELEMENT (stmt_info))
212 return -1;
213
214 do
215 {
216 if (next_stmt_info == stmt_info)
217 return result;
218 next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
219 if (next_stmt_info)
220 result += DR_GROUP_GAP (next_stmt_info);
221 }
222 while (next_stmt_info);
223
224 return -1;
225 }
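/* For example, in an interleaving chain loading a[0], a[1] and a[3]
   (so the DR_GROUP_GAP of the a[3] element is 2), the places returned
   for the three elements are 0, 1 and 3 respectively. */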
226
227 /* Check whether it is possible to load COUNT elements of type ELT_MODE
228 using the method implemented by duplicate_and_interleave. Return true
229 if so, returning the number of intermediate vectors in *NVECTORS_OUT
230 (if nonnull) and the type of each intermediate vector in *VECTOR_TYPE_OUT
231 (if nonnull). */
232
233 bool
234 can_duplicate_and_interleave_p (unsigned int count, machine_mode elt_mode,
235 unsigned int *nvectors_out,
236 tree *vector_type_out,
237 tree *permutes)
238 {
239 poly_int64 elt_bytes = count * GET_MODE_SIZE (elt_mode);
240 poly_int64 nelts;
241 unsigned int nvectors = 1;
242 for (;;)
243 {
244 scalar_int_mode int_mode;
245 poly_int64 elt_bits = elt_bytes * BITS_PER_UNIT;
246 if (multiple_p (current_vector_size, elt_bytes, &nelts)
247 && int_mode_for_size (elt_bits, 0).exists (&int_mode))
248 {
249 tree int_type = build_nonstandard_integer_type
250 (GET_MODE_BITSIZE (int_mode), 1);
251 tree vector_type = build_vector_type (int_type, nelts);
252 if (VECTOR_MODE_P (TYPE_MODE (vector_type)))
253 {
254 vec_perm_builder sel1 (nelts, 2, 3);
255 vec_perm_builder sel2 (nelts, 2, 3);
256 poly_int64 half_nelts = exact_div (nelts, 2);
257 for (unsigned int i = 0; i < 3; ++i)
258 {
259 sel1.quick_push (i);
260 sel1.quick_push (i + nelts);
261 sel2.quick_push (half_nelts + i);
262 sel2.quick_push (half_nelts + i + nelts);
263 }
264 vec_perm_indices indices1 (sel1, 2, nelts);
265 vec_perm_indices indices2 (sel2, 2, nelts);
266 if (can_vec_perm_const_p (TYPE_MODE (vector_type), indices1)
267 && can_vec_perm_const_p (TYPE_MODE (vector_type), indices2))
268 {
269 if (nvectors_out)
270 *nvectors_out = nvectors;
271 if (vector_type_out)
272 *vector_type_out = vector_type;
273 if (permutes)
274 {
275 permutes[0] = vect_gen_perm_mask_checked (vector_type,
276 indices1);
277 permutes[1] = vect_gen_perm_mask_checked (vector_type,
278 indices2);
279 }
280 return true;
281 }
282 }
283 }
284 if (!multiple_p (elt_bytes, 2, &elt_bytes))
285 return false;
286 nvectors *= 2;
287 }
288 }
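/* As a worked example of the permutes built above: for NELTS == 4 the two
   masks are { 0, 4, 1, 5 } and { 2, 6, 3, 7 }, i.e. the low and high
   interleaves of the two input vectors. */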
289
290 /* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
291 they are of a valid type and that they match the defs of the first stmt of
292 the SLP group (stored in OPRNDS_INFO). This function tries to match stmts
293 by swapping operands of STMTS[STMT_NUM] when possible. On entry a non-zero
294 *SWAP indicates that a swap is required for a cond_expr stmt: *SWAP is 1
295 if STMT is a cond and the operands of its comparison need to be swapped;
296 *SWAP is 2 if STMT is a cond and the comparison code needs to be inverted.
297 If this function swaps any operands itself, *SWAP is set to a non-zero
298 value on return.
299 If there was a fatal error return -1; if the error could be corrected by
300 swapping operands of the parent node of this one, return 1; if everything
301 is OK return 0. */
302 static int
303 vect_get_and_check_slp_defs (vec_info *vinfo, unsigned char *swap,
304 vec<stmt_vec_info> stmts, unsigned stmt_num,
305 vec<slp_oprnd_info> *oprnds_info)
306 {
307 stmt_vec_info stmt_info = stmts[stmt_num];
308 tree oprnd;
309 unsigned int i, number_of_oprnds;
310 enum vect_def_type dt = vect_uninitialized_def;
311 slp_oprnd_info oprnd_info;
312 int first_op_idx = 1;
313 unsigned int commutative_op = -1U;
314 bool first_op_cond = false;
315 bool first = stmt_num == 0;
316
317 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
318 {
319 number_of_oprnds = gimple_call_num_args (stmt);
320 first_op_idx = 3;
321 if (gimple_call_internal_p (stmt))
322 {
323 internal_fn ifn = gimple_call_internal_fn (stmt);
324 commutative_op = first_commutative_argument (ifn);
325 }
326 }
327 else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
328 {
329 enum tree_code code = gimple_assign_rhs_code (stmt);
330 number_of_oprnds = gimple_num_ops (stmt) - 1;
331 /* Swapping can only be done for cond_expr if asked to, otherwise we
332 could end up with a different comparison code from the first stmt. */
333 if (gimple_assign_rhs_code (stmt) == COND_EXPR
334 && COMPARISON_CLASS_P (gimple_assign_rhs1 (stmt)))
335 {
336 first_op_cond = true;
337 number_of_oprnds++;
338 }
339 else
340 commutative_op = commutative_tree_code (code) ? 0U : -1U;
341 }
342 else
343 return -1;
344
345 bool swapped = (*swap != 0);
346 gcc_assert (!swapped || first_op_cond);
347 for (i = 0; i < number_of_oprnds; i++)
348 {
349 again:
350 if (first_op_cond)
351 {
352 /* Map indicating how operands of cond_expr should be swapped. */
353 int maps[3][4] = {{0, 1, 2, 3}, {1, 0, 2, 3}, {0, 1, 3, 2}};
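/* maps[1] swaps the two operands of the embedded comparison; maps[2]
   swaps the two arms (gimple operands 2 and 3 of the COND_EXPR stmt). */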
354 int *map = maps[*swap];
355
356 if (i < 2)
357 oprnd = TREE_OPERAND (gimple_op (stmt_info->stmt,
358 first_op_idx), map[i]);
359 else
360 oprnd = gimple_op (stmt_info->stmt, map[i]);
361 }
362 else
363 oprnd = gimple_op (stmt_info->stmt, first_op_idx + (swapped ? !i : i));
364
365 oprnd_info = (*oprnds_info)[i];
366
367 stmt_vec_info def_stmt_info;
368 if (!vect_is_simple_use (oprnd, vinfo, &dt, &def_stmt_info))
369 {
370 if (dump_enabled_p ())
371 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
372 "Build SLP failed: can't analyze def for %T\n",
373 oprnd);
374
375 return -1;
376 }
377
378 if (def_stmt_info && is_pattern_stmt_p (def_stmt_info))
379 oprnd_info->any_pattern = true;
380
381 if (first)
382 {
383 oprnd_info->first_dt = dt;
384 oprnd_info->first_op_type = TREE_TYPE (oprnd);
385 }
386 else
387 {
388 /* Not first stmt of the group, check that the def-stmt/s match
389 the def-stmt/s of the first stmt. Allow different definition
390 types for reduction chains: the first stmt must be a
391 vect_reduction_def (a phi node), and the rest
392 vect_internal_def. */
393 tree type = TREE_TYPE (oprnd);
394 if ((oprnd_info->first_dt != dt
395 && !(oprnd_info->first_dt == vect_reduction_def
396 && dt == vect_internal_def)
397 && !((oprnd_info->first_dt == vect_external_def
398 || oprnd_info->first_dt == vect_constant_def)
399 && (dt == vect_external_def
400 || dt == vect_constant_def)))
401 || !types_compatible_p (oprnd_info->first_op_type, type))
402 {
403 /* Try swapping operands if we got a mismatch. */
404 if (i == commutative_op && !swapped)
405 {
406 swapped = true;
407 goto again;
408 }
409
410 if (dump_enabled_p ())
411 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
412 "Build SLP failed: different types\n");
413
414 return 1;
415 }
416 if ((dt == vect_constant_def
417 || dt == vect_external_def)
418 && !current_vector_size.is_constant ()
419 && (TREE_CODE (type) == BOOLEAN_TYPE
420 || !can_duplicate_and_interleave_p (stmts.length (),
421 TYPE_MODE (type))))
422 {
423 if (dump_enabled_p ())
424 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
425 "Build SLP failed: invalid type of def "
426 "for variable-length SLP %T\n", oprnd);
427 return -1;
428 }
429 }
430
431 /* Check the types of the definitions. */
432 switch (dt)
433 {
434 case vect_constant_def:
435 case vect_external_def:
436 break;
437
438 case vect_reduction_def:
439 case vect_induction_def:
440 case vect_internal_def:
441 oprnd_info->def_stmts.quick_push (def_stmt_info);
442 break;
443
444 default:
445 /* FORNOW: Not supported. */
446 if (dump_enabled_p ())
447 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
448 "Build SLP failed: illegal type of def %T\n",
449 oprnd);
450
451 return -1;
452 }
453 }
454
455 /* Swap operands. */
456 if (swapped)
457 {
458 /* If there are already uses of this stmt in an SLP instance then
459 we've committed to the operand order and can't swap it. */
460 if (STMT_VINFO_NUM_SLP_USES (stmt_info) != 0)
461 {
462 if (dump_enabled_p ())
463 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
464 "Build SLP failed: cannot swap operands of "
465 "shared stmt %G", stmt_info->stmt);
466 return -1;
467 }
468
469 if (first_op_cond)
470 {
471 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
472 tree cond = gimple_assign_rhs1 (stmt);
473 enum tree_code code = TREE_CODE (cond);
474
475 /* Swap. */
476 if (*swap == 1)
477 {
478 swap_ssa_operands (stmt, &TREE_OPERAND (cond, 0),
479 &TREE_OPERAND (cond, 1));
480 TREE_SET_CODE (cond, swap_tree_comparison (code));
481 }
482 /* Invert. */
483 else
484 {
485 swap_ssa_operands (stmt, gimple_assign_rhs2_ptr (stmt),
486 gimple_assign_rhs3_ptr (stmt));
487 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond, 0));
488 code = invert_tree_comparison (TREE_CODE (cond), honor_nans);
489 gcc_assert (code != ERROR_MARK);
490 TREE_SET_CODE (cond, code);
491 }
492 }
493 else
494 {
495 unsigned int op = commutative_op + first_op_idx;
496 swap_ssa_operands (stmt_info->stmt,
497 gimple_op_ptr (stmt_info->stmt, op),
498 gimple_op_ptr (stmt_info->stmt, op + 1));
499 }
500 if (dump_enabled_p ())
501 dump_printf_loc (MSG_NOTE, vect_location,
502 "swapped operands to match def types in %G",
503 stmt_info->stmt);
504 }
505
506 *swap = swapped;
507 return 0;
508 }
509
510 /* Return true if call statements CALL1 and CALL2 are similar enough
511 to be combined into the same SLP group. */
512
513 static bool
514 compatible_calls_p (gcall *call1, gcall *call2)
515 {
516 unsigned int nargs = gimple_call_num_args (call1);
517 if (nargs != gimple_call_num_args (call2))
518 return false;
519
520 if (gimple_call_combined_fn (call1) != gimple_call_combined_fn (call2))
521 return false;
522
523 if (gimple_call_internal_p (call1))
524 {
525 if (!types_compatible_p (TREE_TYPE (gimple_call_lhs (call1)),
526 TREE_TYPE (gimple_call_lhs (call2))))
527 return false;
528 for (unsigned int i = 0; i < nargs; ++i)
529 if (!types_compatible_p (TREE_TYPE (gimple_call_arg (call1, i)),
530 TREE_TYPE (gimple_call_arg (call2, i))))
531 return false;
532 }
533 else
534 {
535 if (!operand_equal_p (gimple_call_fn (call1),
536 gimple_call_fn (call2), 0))
537 return false;
538
539 if (gimple_call_fntype (call1) != gimple_call_fntype (call2))
540 return false;
541 }
542 return true;
543 }
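/* For example, two calls to the same internal function are compatible when
   their lhs and argument types match pairwise; calls to different built-ins
   (say sqrtf vs. sqrt) have different combined_fn codes and are rejected
   immediately. */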
544
545 /* A subroutine of vect_build_slp_tree for checking VECTYPE, which is the
546 caller's attempt to find the vector type in STMT_INFO with the narrowest
547 element type. Return true if VECTYPE is nonnull and if it is valid
548 for STMT_INFO. When returning true, update MAX_NUNITS to reflect the
549 number of units in VECTYPE. GROUP_SIZE and MAX_NUNITS are as for
550 vect_build_slp_tree. */
551
552 static bool
553 vect_record_max_nunits (stmt_vec_info stmt_info, unsigned int group_size,
554 tree vectype, poly_uint64 *max_nunits)
555 {
556 if (!vectype)
557 {
558 if (dump_enabled_p ())
559 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
560 "Build SLP failed: unsupported data-type in %G\n",
561 stmt_info->stmt);
562 /* Fatal mismatch. */
563 return false;
564 }
565
566 /* If populating the vector type requires unrolling then fail
567 before adjusting *max_nunits for basic-block vectorization. */
568 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
569 unsigned HOST_WIDE_INT const_nunits;
570 if (STMT_VINFO_BB_VINFO (stmt_info)
571 && (!nunits.is_constant (&const_nunits)
572 || const_nunits > group_size))
573 {
574 if (dump_enabled_p ())
575 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
576 "Build SLP failed: unrolling required "
577 "in basic block SLP\n");
578 /* Fatal mismatch. */
579 return false;
580 }
581
582 /* In case of multiple types we need to detect the smallest type. */
583 vect_update_max_nunits (max_nunits, vectype);
584 return true;
585 }
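/* For example, in basic-block SLP a group of two scalar stmts cannot be
   vectorized with a four-element vector type: filling the vector would
   require unrolling, which only loop vectorization can do. */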
586
587 /* STMTS is a group of GROUP_SIZE SLP statements in which some
588 statements do the same operation as the first statement and in which
589 the others do ALT_STMT_CODE. Return true if we can take one vector
590 of the first operation and one vector of the second and permute them
591 to get the required result. VECTYPE is the type of the vector that
592 would be permuted. */
593
594 static bool
595 vect_two_operations_perm_ok_p (vec<stmt_vec_info> stmts,
596 unsigned int group_size, tree vectype,
597 tree_code alt_stmt_code)
598 {
599 unsigned HOST_WIDE_INT count;
600 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&count))
601 return false;
602
603 vec_perm_builder sel (count, count, 1);
604 for (unsigned int i = 0; i < count; ++i)
605 {
606 unsigned int elt = i;
607 gassign *stmt = as_a <gassign *> (stmts[i % group_size]->stmt);
608 if (gimple_assign_rhs_code (stmt) == alt_stmt_code)
609 elt += count;
610 sel.quick_push (elt);
611 }
612 vec_perm_indices indices (sel, 2, count);
613 return can_vec_perm_const_p (TYPE_MODE (vectype), indices);
614 }
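/* For example, for the two-operator group { +, -, +, - } and four-element
   vectors the permute is { 0, 5, 2, 7 }: even lanes come from the vector
   of additions (elements 0-3), odd lanes from the vector of subtractions
   (elements 4-7). */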
615
616 /* Verify that the scalar stmts STMTS are isomorphic, do not require
617 any data permutation and are not of unsupported operation types.
618 Return true if so; otherwise return false and indicate in *MATCHES
619 which stmts are not isomorphic to the first one. If MATCHES[0]
620 is false then this indicates the comparison could not be
621 carried out or the stmts will never be vectorized by SLP.
622
623 Note a COND_EXPR may be isomorphic to another one after swapping its
624 operands. Set SWAP[i] to 1 if stmt I is a COND_EXPR isomorphic to
625 the first stmt by swapping the two operands of its comparison; set
626 SWAP[i] to 2 if stmt I is isomorphic to the first stmt by inverting
627 the comparison code. Take A1 >= B1 ? X1 : Y1 as an example: it can
628 be swapped to (B1 <= A1 ? X1 : Y1) or inverted to (A1 < B1) ? Y1 : X1. */
629
630 static bool
631 vect_build_slp_tree_1 (unsigned char *swap,
632 vec<stmt_vec_info> stmts, unsigned int group_size,
633 poly_uint64 *max_nunits, bool *matches,
634 bool *two_operators)
635 {
636 unsigned int i;
637 stmt_vec_info first_stmt_info = stmts[0];
638 enum tree_code first_stmt_code = ERROR_MARK;
639 enum tree_code alt_stmt_code = ERROR_MARK;
640 enum tree_code rhs_code = ERROR_MARK;
641 enum tree_code first_cond_code = ERROR_MARK;
642 tree lhs;
643 bool need_same_oprnds = false;
644 tree vectype = NULL_TREE, first_op1 = NULL_TREE;
645 optab optab;
646 int icode;
647 machine_mode optab_op2_mode;
648 machine_mode vec_mode;
649 stmt_vec_info first_load = NULL, prev_first_load = NULL;
650
651 /* For every stmt in NODE find its def stmt/s. */
652 stmt_vec_info stmt_info;
653 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
654 {
655 gimple *stmt = stmt_info->stmt;
656 swap[i] = 0;
657 matches[i] = false;
658
659 if (dump_enabled_p ())
660 dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for %G", stmt);
661
662 /* Fail to vectorize statements marked as unvectorizable. */
663 if (!STMT_VINFO_VECTORIZABLE (stmt_info))
664 {
665 if (dump_enabled_p ())
666 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
667 "Build SLP failed: unvectorizable statement %G",
668 stmt);
669 /* Fatal mismatch. */
670 matches[0] = false;
671 return false;
672 }
673
674 lhs = gimple_get_lhs (stmt);
675 if (lhs == NULL_TREE)
676 {
677 if (dump_enabled_p ())
678 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
679 "Build SLP failed: not GIMPLE_ASSIGN nor "
680 "GIMPLE_CALL %G", stmt);
681 /* Fatal mismatch. */
682 matches[0] = false;
683 return false;
684 }
685
686 tree nunits_vectype;
687 if (!vect_get_vector_types_for_stmt (stmt_info, &vectype,
688 &nunits_vectype)
689 || (nunits_vectype
690 && !vect_record_max_nunits (stmt_info, group_size,
691 nunits_vectype, max_nunits)))
692 {
693 /* Fatal mismatch. */
694 matches[0] = false;
695 return false;
696 }
697
698 gcc_assert (vectype);
699
700 if (gcall *call_stmt = dyn_cast <gcall *> (stmt))
701 {
702 rhs_code = CALL_EXPR;
703 if ((gimple_call_internal_p (call_stmt)
704 && (!vectorizable_internal_fn_p
705 (gimple_call_internal_fn (call_stmt))))
706 || gimple_call_tail_p (call_stmt)
707 || gimple_call_noreturn_p (call_stmt)
708 || !gimple_call_nothrow_p (call_stmt)
709 || gimple_call_chain (call_stmt))
710 {
711 if (dump_enabled_p ())
712 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
713 "Build SLP failed: unsupported call type %G",
714 call_stmt);
715 /* Fatal mismatch. */
716 matches[0] = false;
717 return false;
718 }
719 }
720 else
721 rhs_code = gimple_assign_rhs_code (stmt);
722
723 /* Check the operation. */
724 if (i == 0)
725 {
726 first_stmt_code = rhs_code;
727
728 /* Shift arguments should be equal in all the packed stmts for a
729 vector shift with scalar shift operand. */
730 if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
731 || rhs_code == LROTATE_EXPR
732 || rhs_code == RROTATE_EXPR)
733 {
734 if (vectype == boolean_type_node)
735 {
736 if (dump_enabled_p ())
737 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
738 "Build SLP failed: shift of a"
739 " boolean.\n");
740 /* Fatal mismatch. */
741 matches[0] = false;
742 return false;
743 }
744
745 vec_mode = TYPE_MODE (vectype);
746
747 /* First see if we have a vector/vector shift. */
748 optab = optab_for_tree_code (rhs_code, vectype,
749 optab_vector);
750
751 if (!optab
752 || optab_handler (optab, vec_mode) == CODE_FOR_nothing)
753 {
754 /* No vector/vector shift, try for a vector/scalar shift. */
755 optab = optab_for_tree_code (rhs_code, vectype,
756 optab_scalar);
757
758 if (!optab)
759 {
760 if (dump_enabled_p ())
761 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
762 "Build SLP failed: no optab.\n");
763 /* Fatal mismatch. */
764 matches[0] = false;
765 return false;
766 }
767 icode = (int) optab_handler (optab, vec_mode);
768 if (icode == CODE_FOR_nothing)
769 {
770 if (dump_enabled_p ())
771 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
772 "Build SLP failed: "
773 "op not supported by target.\n");
774 /* Fatal mismatch. */
775 matches[0] = false;
776 return false;
777 }
778 optab_op2_mode = insn_data[icode].operand[2].mode;
779 if (!VECTOR_MODE_P (optab_op2_mode))
780 {
781 need_same_oprnds = true;
782 first_op1 = gimple_assign_rhs2 (stmt);
783 }
784 }
785 }
786 else if (rhs_code == WIDEN_LSHIFT_EXPR)
787 {
788 need_same_oprnds = true;
789 first_op1 = gimple_assign_rhs2 (stmt);
790 }
791 }
792 else
793 {
794 if (first_stmt_code != rhs_code
795 && alt_stmt_code == ERROR_MARK)
796 alt_stmt_code = rhs_code;
797 if (first_stmt_code != rhs_code
798 && (first_stmt_code != IMAGPART_EXPR
799 || rhs_code != REALPART_EXPR)
800 && (first_stmt_code != REALPART_EXPR
801 || rhs_code != IMAGPART_EXPR)
802 /* Handle mismatches in plus/minus by computing both
803 and merging the results. */
804 && !((first_stmt_code == PLUS_EXPR
805 || first_stmt_code == MINUS_EXPR)
806 && (alt_stmt_code == PLUS_EXPR
807 || alt_stmt_code == MINUS_EXPR)
808 && rhs_code == alt_stmt_code)
809 && !(STMT_VINFO_GROUPED_ACCESS (stmt_info)
810 && (first_stmt_code == ARRAY_REF
811 || first_stmt_code == BIT_FIELD_REF
812 || first_stmt_code == INDIRECT_REF
813 || first_stmt_code == COMPONENT_REF
814 || first_stmt_code == MEM_REF)))
815 {
816 if (dump_enabled_p ())
817 {
818 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
819 "Build SLP failed: different operation "
820 "in stmt %G", stmt);
821 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
822 "original stmt %G", first_stmt_info->stmt);
823 }
824 /* Mismatch. */
825 continue;
826 }
827
828 if (need_same_oprnds
829 && !operand_equal_p (first_op1, gimple_assign_rhs2 (stmt), 0))
830 {
831 if (dump_enabled_p ())
832 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
833 "Build SLP failed: different shift "
834 "arguments in %G", stmt);
835 /* Mismatch. */
836 continue;
837 }
838
839 if (rhs_code == CALL_EXPR)
840 {
841 if (!compatible_calls_p (as_a <gcall *> (stmts[0]->stmt),
842 as_a <gcall *> (stmt)))
843 {
844 if (dump_enabled_p ())
845 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
846 "Build SLP failed: different calls in %G",
847 stmt);
848 /* Mismatch. */
849 continue;
850 }
851 }
852 }
853
854 /* Grouped store or load. */
855 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
856 {
857 if (REFERENCE_CLASS_P (lhs))
858 {
859 /* Store. */
860 ;
861 }
862 else
863 {
864 /* Load. */
865 first_load = DR_GROUP_FIRST_ELEMENT (stmt_info);
866 if (prev_first_load)
867 {
868 /* Check that there are no loads from different interleaving
869 chains in the same node. */
870 if (prev_first_load != first_load)
871 {
872 if (dump_enabled_p ())
873 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
874 vect_location,
875 "Build SLP failed: different "
876 "interleaving chains in one node %G",
877 stmt);
878 /* Mismatch. */
879 continue;
880 }
881 }
882 else
883 prev_first_load = first_load;
884 }
885 } /* Grouped access. */
886 else
887 {
888 if (TREE_CODE_CLASS (rhs_code) == tcc_reference)
889 {
890 /* Non-grouped load. */
891 if (dump_enabled_p ())
892 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
893 "Build SLP failed: not grouped load %G", stmt);
894
895 /* FORNOW: Non-grouped loads are not supported. */
896 /* Fatal mismatch. */
897 matches[0] = false;
898 return false;
899 }
900
901 /* Not a memory operation. */
902 if (TREE_CODE_CLASS (rhs_code) != tcc_binary
903 && TREE_CODE_CLASS (rhs_code) != tcc_unary
904 && TREE_CODE_CLASS (rhs_code) != tcc_expression
905 && TREE_CODE_CLASS (rhs_code) != tcc_comparison
906 && rhs_code != CALL_EXPR)
907 {
908 if (dump_enabled_p ())
909 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
910 "Build SLP failed: operation unsupported %G",
911 stmt);
912 /* Fatal mismatch. */
913 matches[0] = false;
914 return false;
915 }
916
917 if (rhs_code == COND_EXPR)
918 {
919 tree cond_expr = gimple_assign_rhs1 (stmt);
920 enum tree_code cond_code = TREE_CODE (cond_expr);
921 enum tree_code swap_code = ERROR_MARK;
922 enum tree_code invert_code = ERROR_MARK;
923
924 if (i == 0)
925 first_cond_code = TREE_CODE (cond_expr);
926 else if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
927 {
928 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
929 swap_code = swap_tree_comparison (cond_code);
930 invert_code = invert_tree_comparison (cond_code, honor_nans);
931 }
932
933 if (first_cond_code == cond_code)
934 ;
935 /* Isomorphism can be achieved by swapping. */
936 else if (first_cond_code == swap_code)
937 swap[i] = 1;
938 /* Isomorphism can be achieved by inverting. */
939 else if (first_cond_code == invert_code)
940 swap[i] = 2;
941 else
942 {
943 if (dump_enabled_p ())
944 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
945 "Build SLP failed: different"
946 " operation %G", stmt);
947 /* Mismatch. */
948 continue;
949 }
950 }
951 }
952
953 matches[i] = true;
954 }
955
956 for (i = 0; i < group_size; ++i)
957 if (!matches[i])
958 return false;
959
960 /* If we allowed a two-operation SLP node verify the target can cope
961 with the permute we are going to use. */
962 if (alt_stmt_code != ERROR_MARK
963 && TREE_CODE_CLASS (alt_stmt_code) != tcc_reference)
964 {
965 if (vectype == boolean_type_node
966 || !vect_two_operations_perm_ok_p (stmts, group_size,
967 vectype, alt_stmt_code))
968 {
969 for (i = 0; i < group_size; ++i)
970 if (gimple_assign_rhs_code (stmts[i]->stmt) == alt_stmt_code)
971 {
972 matches[i] = false;
973 if (dump_enabled_p ())
974 {
975 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
976 "Build SLP failed: different operation "
977 "in stmt %G", stmts[i]->stmt);
978 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
979 "original stmt %G", first_stmt_info->stmt);
980 }
981 }
982 return false;
983 }
984 *two_operators = true;
985 }
986
987 return true;
988 }
989
990 /* Traits for the hash_map used to cache SLP build results (including
991 failures) for a stmt set. Note we never remove apart from at destruction
992 time so we do not need a special value for deleted that differs from empty. */
993 struct bst_traits
994 {
995 typedef vec <stmt_vec_info> value_type;
996 typedef vec <stmt_vec_info> compare_type;
997 static inline hashval_t hash (value_type);
998 static inline bool equal (value_type existing, value_type candidate);
999 static inline bool is_empty (value_type x) { return !x.exists (); }
1000 static inline bool is_deleted (value_type x) { return !x.exists (); }
1001 static inline void mark_empty (value_type &x) { x.release (); }
1002 static inline void mark_deleted (value_type &x) { x.release (); }
1003 static inline void remove (value_type &x) { x.release (); }
1004 };
1005 inline hashval_t
1006 bst_traits::hash (value_type x)
1007 {
1008 inchash::hash h;
1009 for (unsigned i = 0; i < x.length (); ++i)
1010 h.add_int (gimple_uid (x[i]->stmt));
1011 return h.end ();
1012 }
1013 inline bool
1014 bst_traits::equal (value_type existing, value_type candidate)
1015 {
1016 if (existing.length () != candidate.length ())
1017 return false;
1018 for (unsigned i = 0; i < existing.length (); ++i)
1019 if (existing[i] != candidate[i])
1020 return false;
1021 return true;
1022 }
1023
1024 typedef hash_map <vec <gimple *>, slp_tree,
1025 simple_hashmap_traits <bst_traits, slp_tree> >
1026 scalar_stmts_to_slp_tree_map_t;
1027
1028 static slp_tree
1029 vect_build_slp_tree_2 (vec_info *vinfo,
1030 vec<stmt_vec_info> stmts, unsigned int group_size,
1031 poly_uint64 *max_nunits,
1032 bool *matches, unsigned *npermutes, unsigned *tree_size,
1033 unsigned max_tree_size,
1034 scalar_stmts_to_slp_tree_map_t *bst_map);
1035
1036 static slp_tree
1037 vect_build_slp_tree (vec_info *vinfo,
1038 vec<stmt_vec_info> stmts, unsigned int group_size,
1039 poly_uint64 *max_nunits,
1040 bool *matches, unsigned *npermutes, unsigned *tree_size,
1041 unsigned max_tree_size,
1042 scalar_stmts_to_slp_tree_map_t *bst_map)
1043 {
1044 if (slp_tree *leader = bst_map->get (stmts))
1045 {
1046 if (dump_enabled_p ())
1047 dump_printf_loc (MSG_NOTE, vect_location, "re-using %sSLP tree %p\n",
1048 *leader ? "" : "failed ", *leader);
1049 if (*leader)
1050 {
1051 (*leader)->refcnt++;
1052 vect_update_max_nunits (max_nunits, (*leader)->max_nunits);
1053 }
1054 return *leader;
1055 }
1056 poly_uint64 this_max_nunits = 1;
1057 slp_tree res = vect_build_slp_tree_2 (vinfo, stmts, group_size,
1058 &this_max_nunits,
1059 matches, npermutes, tree_size,
1060 max_tree_size, bst_map);
1061 if (res)
1062 {
1063 res->max_nunits = this_max_nunits;
1064 vect_update_max_nunits (max_nunits, this_max_nunits);
1065 /* Keep a reference for the bst_map use. */
1066 res->refcnt++;
1067 }
1068 bst_map->put (stmts.copy (), res);
1069 return res;
1070 }
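/* Note the reference counting above: a node re-used from BST_MAP carries
   one reference per SLP parent that shares it plus one for the map itself,
   so vect_free_slp_tree only frees it once the last user has gone. */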
1071
1072 /* Recursively build an SLP tree for the group of stmts STMTS.
1073 Fail (and return NULL) if the def-stmts are not isomorphic, require
1074 data permutation, or are of unsupported operation types. Otherwise
1075 return the built SLP tree; MATCHES indicates which stmts failed to
1076 match the first stmt of the group, and *MAX_NUNITS is updated with
1077 the maximum number of vector units required by the tree. */
1078
1079 static slp_tree
1080 vect_build_slp_tree_2 (vec_info *vinfo,
1081 vec<stmt_vec_info> stmts, unsigned int group_size,
1082 poly_uint64 *max_nunits,
1083 bool *matches, unsigned *npermutes, unsigned *tree_size,
1084 unsigned max_tree_size,
1085 scalar_stmts_to_slp_tree_map_t *bst_map)
1086 {
1087 unsigned nops, i, this_tree_size = 0;
1088 poly_uint64 this_max_nunits = *max_nunits;
1089 slp_tree node;
1090
1091 matches[0] = false;
1092
1093 stmt_vec_info stmt_info = stmts[0];
1094 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
1095 nops = gimple_call_num_args (stmt);
1096 else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
1097 {
1098 nops = gimple_num_ops (stmt) - 1;
1099 if (gimple_assign_rhs_code (stmt) == COND_EXPR)
1100 nops++;
1101 }
1102 else if (is_a <gphi *> (stmt_info->stmt))
1103 nops = 0;
1104 else
1105 return NULL;
1106
1107 /* If the SLP node is a PHI (induction or reduction), terminate
1108 the recursion. */
1109 if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
1110 {
1111 tree scalar_type = TREE_TYPE (PHI_RESULT (stmt));
1112 tree vectype = get_vectype_for_scalar_type (scalar_type);
1113 if (!vect_record_max_nunits (stmt_info, group_size, vectype, max_nunits))
1114 return NULL;
1115
1116 vect_def_type def_type = STMT_VINFO_DEF_TYPE (stmt_info);
1117 /* Induction from different IVs is not supported. */
1118 if (def_type == vect_induction_def)
1119 {
1120 stmt_vec_info other_info;
1121 FOR_EACH_VEC_ELT (stmts, i, other_info)
1122 if (stmt_info != other_info)
1123 return NULL;
1124 }
1125 else if (def_type == vect_reduction_def
1126 || def_type == vect_double_reduction_def
1127 || def_type == vect_nested_cycle)
1128 {
1129 /* Else def types have to match. */
1130 stmt_vec_info other_info;
1131 FOR_EACH_VEC_ELT (stmts, i, other_info)
1132 {
1133 /* But for reduction chains only check on the first stmt. */
1134 if (!STMT_VINFO_DATA_REF (other_info)
1135 && REDUC_GROUP_FIRST_ELEMENT (other_info)
1136 && REDUC_GROUP_FIRST_ELEMENT (other_info) != stmt_info)
1137 continue;
1138 if (STMT_VINFO_DEF_TYPE (other_info) != def_type)
1139 return NULL;
1140 }
1141 }
1142 else
1143 return NULL;
1144 node = vect_create_new_slp_node (stmts);
1145 return node;
1146 }
1147
1148
1149 bool two_operators = false;
1150 unsigned char *swap = XALLOCAVEC (unsigned char, group_size);
1151 if (!vect_build_slp_tree_1 (swap, stmts, group_size,
1152 &this_max_nunits, matches, &two_operators))
1153 return NULL;
1154
1155 /* If the SLP node is a load, terminate the recursion. */
1156 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1157 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1158 {
1159 *max_nunits = this_max_nunits;
1160 node = vect_create_new_slp_node (stmts);
1161 return node;
1162 }
1163
1164 /* Get at the operands, verifying they are compatible. */
1165 vec<slp_oprnd_info> oprnds_info = vect_create_oprnd_info (nops, group_size);
1166 slp_oprnd_info oprnd_info;
1167 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
1168 {
1169 int res = vect_get_and_check_slp_defs (vinfo, &swap[i],
1170 stmts, i, &oprnds_info);
1171 if (res != 0)
1172 matches[(res == -1) ? 0 : i] = false;
1173 if (!matches[0])
1174 break;
1175 }
1176 for (i = 0; i < group_size; ++i)
1177 if (!matches[i])
1178 {
1179 vect_free_oprnd_info (oprnds_info);
1180 return NULL;
1181 }
1182
1183 auto_vec<slp_tree, 4> children;
1184
1185 stmt_info = stmts[0];
1186
1187 if (tree_size)
1188 max_tree_size -= *tree_size;
1189
1190 /* Create SLP_TREE nodes for the definition node/s. */
1191 FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
1192 {
1193 slp_tree child;
1194 unsigned old_tree_size = this_tree_size;
1195 unsigned int j;
1196
1197 if (oprnd_info->first_dt != vect_internal_def
1198 && oprnd_info->first_dt != vect_reduction_def
1199 && oprnd_info->first_dt != vect_induction_def)
1200 continue;
1201
1202 if (++this_tree_size > max_tree_size)
1203 {
1204 if (dump_enabled_p ())
1205 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
1206 vect_location,
1207 "Build SLP failed: SLP tree too large\n");
1208 FOR_EACH_VEC_ELT (children, j, child)
1209 vect_free_slp_tree (child, false);
1210 vect_free_oprnd_info (oprnds_info);
1211 return NULL;
1212 }
1213
1214 if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
1215 group_size, &this_max_nunits,
1216 matches, npermutes,
1217 &this_tree_size,
1218 max_tree_size, bst_map)) != NULL)
1219 {
1220 /* If we have all children of child built up from scalars then just
1221 throw that away and build this node up from scalars. */
1222 if (!SLP_TREE_CHILDREN (child).is_empty ()
1223 /* ??? Rejecting patterns this way doesn't work. We'd have to
1224 do extra work to cancel the pattern so the uses see the
1225 scalar version. */
1226 && !oprnd_info->any_pattern)
1227 {
1228 slp_tree grandchild;
1229
1230 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
1231 if (SLP_TREE_DEF_TYPE (grandchild) == vect_internal_def)
1232 break;
1233 if (!grandchild)
1234 {
1235 /* Roll back. */
1236 this_tree_size = old_tree_size;
1237 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
1238 vect_free_slp_tree (grandchild, false);
1239 SLP_TREE_CHILDREN (child).truncate (0);
1240
1241 if (dump_enabled_p ())
1242 dump_printf_loc (MSG_NOTE, vect_location,
1243 "Building parent vector operands from "
1244 "scalars instead\n");
1245 oprnd_info->def_stmts = vNULL;
1246 SLP_TREE_DEF_TYPE (child) = vect_external_def;
1247 children.safe_push (child);
1248 continue;
1249 }
1250 }
1251
1252 oprnd_info->def_stmts = vNULL;
1253 children.safe_push (child);
1254 continue;
1255 }
1256
1257 /* If the SLP build failed fatally and we analyze a basic-block
1258 simply treat nodes we fail to build as externally defined
1259 (and thus build vectors from the scalar defs).
1260 The cost model will reject outright expensive cases.
1261 ??? This doesn't treat cases where permutation ultimately
1262 fails (or we don't try permutation below). Ideally we'd
1263 even compute a permutation that will end up with the maximum
1264 SLP tree size... */
1265 if (is_a <bb_vec_info> (vinfo)
1266 && !matches[0]
1267 /* ??? Rejecting patterns this way doesn't work. We'd have to
1268 do extra work to cancel the pattern so the uses see the
1269 scalar version. */
1270 && !is_pattern_stmt_p (stmt_info)
1271 && !oprnd_info->any_pattern)
1272 {
1273 if (dump_enabled_p ())
1274 dump_printf_loc (MSG_NOTE, vect_location,
1275 "Building vector operands from scalars\n");
1276 child = vect_create_new_slp_node (oprnd_info->def_stmts);
1277 SLP_TREE_DEF_TYPE (child) = vect_external_def;
1278 children.safe_push (child);
1279 oprnd_info->def_stmts = vNULL;
1280 continue;
1281 }
1282
1283 /* If the SLP build for operand zero failed and operand zero
1284 and one can be commuted, try that for the scalar stmts
1285 that failed the match. */
1286 if (i == 0
1287 /* A first scalar stmt mismatch signals a fatal mismatch. */
1288 && matches[0]
1289 /* ??? For COND_EXPRs we can swap the comparison operands
1290 as well as the arms under some constraints. */
1291 && nops == 2
1292 && oprnds_info[1]->first_dt == vect_internal_def
1293 && is_gimple_assign (stmt_info->stmt)
1294 /* Swapping operands for reductions breaks assumptions later on. */
1295 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
1296 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
1297 /* Do so only if the number of unsuccessful permutes so far is no
1298 more than a cut-off, as re-trying the recursive match on
1299 possibly each level of the tree would expose exponential
1300 behavior. */
1301 && *npermutes < 4)
1302 {
1303 /* See whether we can swap the matching or the non-matching
1304 stmt operands. */
1305 bool swap_not_matching = true;
1306 do
1307 {
1308 for (j = 0; j < group_size; ++j)
1309 {
1310 if (matches[j] != !swap_not_matching)
1311 continue;
1312 stmt_vec_info stmt_info = stmts[j];
1313 /* Verify if we can swap operands of this stmt. */
1314 gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt);
1315 if (!stmt
1316 || !commutative_tree_code (gimple_assign_rhs_code (stmt)))
1317 {
1318 if (!swap_not_matching)
1319 goto fail;
1320 swap_not_matching = false;
1321 break;
1322 }
1323 /* Verify if we can safely swap or if we committed to a
1324 specific operand order already.
1325 ??? Instead of modifying GIMPLE stmts here we could
1326 record whether we want to swap operands in the SLP
1327 node and temporarily do that when processing it
1328 (or wrap operand accessors in a helper). */
1329 else if (swap[j] != 0
1330 || STMT_VINFO_NUM_SLP_USES (stmt_info))
1331 {
1332 if (!swap_not_matching)
1333 {
1334 if (dump_enabled_p ())
1335 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
1336 vect_location,
1337 "Build SLP failed: cannot swap "
1338 "operands of shared stmt %G",
1339 stmts[j]->stmt);
1340 goto fail;
1341 }
1342 swap_not_matching = false;
1343 break;
1344 }
1345 }
1346 }
1347 while (j != group_size);
1348
1349 /* Swap mismatched definition stmts. */
1350 if (dump_enabled_p ())
1351 dump_printf_loc (MSG_NOTE, vect_location,
1352 "Re-trying with swapped operands of stmts ");
1353 for (j = 0; j < group_size; ++j)
1354 if (matches[j] == !swap_not_matching)
1355 {
1356 std::swap (oprnds_info[0]->def_stmts[j],
1357 oprnds_info[1]->def_stmts[j]);
1358 if (dump_enabled_p ())
1359 dump_printf (MSG_NOTE, "%d ", j);
1360 }
1361 if (dump_enabled_p ())
1362 dump_printf (MSG_NOTE, "\n");
1363 /* And try again with scratch 'matches' ... */
1364 bool *tem = XALLOCAVEC (bool, group_size);
1365 if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
1366 group_size, &this_max_nunits,
1367 tem, npermutes,
1368 &this_tree_size,
1369 max_tree_size, bst_map)) != NULL)
1370 {
1371 /* ... so if successful we can apply the operand swapping
1372 to the GIMPLE IL. This is necessary because for example
1373 vect_get_slp_defs uses operand indexes and thus expects
1374 canonical operand order. This is also necessary even
1375 if we end up building the operand from scalars as
1376 we'll continue to process swapped operand two. */
1377 for (j = 0; j < group_size; ++j)
1378 gimple_set_plf (stmts[j]->stmt, GF_PLF_1, false);
1379 for (j = 0; j < group_size; ++j)
1380 if (matches[j] == !swap_not_matching)
1381 {
1382 gassign *stmt = as_a <gassign *> (stmts[j]->stmt);
1383 /* Avoid swapping operands twice. */
1384 if (gimple_plf (stmt, GF_PLF_1))
1385 continue;
1386 swap_ssa_operands (stmt, gimple_assign_rhs1_ptr (stmt),
1387 gimple_assign_rhs2_ptr (stmt));
1388 gimple_set_plf (stmt, GF_PLF_1, true);
1389 }
1390 /* Verify we swap all duplicates or none. */
1391 if (flag_checking)
1392 for (j = 0; j < group_size; ++j)
1393 gcc_assert (gimple_plf (stmts[j]->stmt, GF_PLF_1)
1394 == (matches[j] == !swap_not_matching));
1395
1396 /* If we have all children of child built up from scalars then
1397 just throw that away and build this node up from scalars. */
1398 if (!SLP_TREE_CHILDREN (child).is_empty ()
1399 /* ??? Rejecting patterns this way doesn't work. We'd have
1400 to do extra work to cancel the pattern so the uses see the
1401 scalar version. */
1402 && !oprnd_info->any_pattern)
1403 {
1404 unsigned int j;
1405 slp_tree grandchild;
1406
1407 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
1408 if (SLP_TREE_DEF_TYPE (grandchild) == vect_internal_def)
1409 break;
1410 if (!grandchild)
1411 {
1412 /* Roll back. */
1413 this_tree_size = old_tree_size;
1414 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
1415 vect_free_slp_tree (grandchild, false);
1416 SLP_TREE_CHILDREN (child).truncate (0);
1417
1418 if (dump_enabled_p ())
1419 dump_printf_loc (MSG_NOTE, vect_location,
1420 "Building parent vector operands from "
1421 "scalars instead\n");
1422 oprnd_info->def_stmts = vNULL;
1423 SLP_TREE_DEF_TYPE (child) = vect_external_def;
1424 children.safe_push (child);
1425 continue;
1426 }
1427 }
1428
1429 oprnd_info->def_stmts = vNULL;
1430 children.safe_push (child);
1431 continue;
1432 }
1433
1434 ++*npermutes;
1435 }
1436
1437 fail:
1438 gcc_assert (child == NULL);
1439 FOR_EACH_VEC_ELT (children, j, child)
1440 vect_free_slp_tree (child, false);
1441 vect_free_oprnd_info (oprnds_info);
1442 return NULL;
1443 }
1444
1445 vect_free_oprnd_info (oprnds_info);
1446
1447 if (tree_size)
1448 *tree_size += this_tree_size;
1449 *max_nunits = this_max_nunits;
1450
1451 node = vect_create_new_slp_node (stmts);
1452 SLP_TREE_TWO_OPERATORS (node) = two_operators;
1453 SLP_TREE_CHILDREN (node).splice (children);
1454 return node;
1455 }
1456
1457 /* Dump a slp tree NODE using flags specified in DUMP_KIND. */
1458
1459 static void
1460 vect_print_slp_tree (dump_flags_t dump_kind, dump_location_t loc,
1461 slp_tree node, hash_set<slp_tree> &visited)
1462 {
1463 int i;
1464 stmt_vec_info stmt_info;
1465 slp_tree child;
1466
1467 if (visited.add (node))
1468 return;
1469
1470 dump_metadata_t metadata (dump_kind, loc.get_impl_location ());
1471 dump_user_location_t user_loc = loc.get_user_location ();
1472 dump_printf_loc (metadata, user_loc, "node%s %p (max_nunits=%u)\n",
1473 SLP_TREE_DEF_TYPE (node) != vect_internal_def
1474 ? " (external)" : "", node,
1475 estimated_poly_value (node->max_nunits));
1476 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
1477 dump_printf_loc (metadata, user_loc, "\tstmt %d %G", i, stmt_info->stmt);
1478 if (SLP_TREE_CHILDREN (node).is_empty ())
1479 return;
1480 dump_printf_loc (metadata, user_loc, "\tchildren");
1481 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1482 dump_printf (dump_kind, " %p", (void *)child);
1483 dump_printf (dump_kind, "\n");
1484 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1485 vect_print_slp_tree (dump_kind, loc, child, visited);
1486 }
1487
1488 static void
1489 vect_print_slp_tree (dump_flags_t dump_kind, dump_location_t loc,
1490 slp_tree node)
1491 {
1492 hash_set<slp_tree> visited;
1493 vect_print_slp_tree (dump_kind, loc, node, visited);
1494 }
1495
1496 /* Mark the tree rooted at NODE with PURE_SLP. */
1497
1498 static void
1499 vect_mark_slp_stmts (slp_tree node, hash_set<slp_tree> &visited)
1500 {
1501 int i;
1502 stmt_vec_info stmt_info;
1503 slp_tree child;
1504
1505 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
1506 return;
1507
1508 if (visited.add (node))
1509 return;
1510
1511 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
1512 STMT_SLP_TYPE (stmt_info) = pure_slp;
1513
1514 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1515 vect_mark_slp_stmts (child, visited);
1516 }
1517
1518 static void
1519 vect_mark_slp_stmts (slp_tree node)
1520 {
1521 hash_set<slp_tree> visited;
1522 vect_mark_slp_stmts (node, visited);
1523 }
1524
1525 /* Mark the statements of the tree rooted at NODE as relevant (vect_used). */
1526
1527 static void
1528 vect_mark_slp_stmts_relevant (slp_tree node, hash_set<slp_tree> &visited)
1529 {
1530 int i;
1531 stmt_vec_info stmt_info;
1532 slp_tree child;
1533
1534 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
1535 return;
1536
1537 if (visited.add (node))
1538 return;
1539
1540 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
1541 {
1542 gcc_assert (!STMT_VINFO_RELEVANT (stmt_info)
1543 || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope);
1544 STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope;
1545 }
1546
1547 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1548 vect_mark_slp_stmts_relevant (child, visited);
1549 }
1550
1551 static void
1552 vect_mark_slp_stmts_relevant (slp_tree node)
1553 {
1554 hash_set<slp_tree> visited;
1555 vect_mark_slp_stmts_relevant (node, visited);
1556 }
1557
1558
1559 /* Rearrange the statements of NODE according to PERMUTATION. */
1560
1561 static void
1562 vect_slp_rearrange_stmts (slp_tree node, unsigned int group_size,
1563 vec<unsigned> permutation,
1564 hash_set<slp_tree> &visited)
1565 {
1566 stmt_vec_info stmt_info;
1567 vec<stmt_vec_info> tmp_stmts;
1568 unsigned int i;
1569 slp_tree child;
1570
1571 if (visited.add (node))
1572 return;
1573
1574 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1575 vect_slp_rearrange_stmts (child, group_size, permutation, visited);
1576
1577 gcc_assert (group_size == SLP_TREE_SCALAR_STMTS (node).length ());
1578 tmp_stmts.create (group_size);
1579 tmp_stmts.quick_grow_cleared (group_size);
1580
1581 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
1582 tmp_stmts[permutation[i]] = stmt_info;
1583
1584 SLP_TREE_SCALAR_STMTS (node).release ();
1585 SLP_TREE_SCALAR_STMTS (node) = tmp_stmts;
1586 }
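/* Note the permutation is applied "in reverse": the stmt at position I is
   moved to position PERMUTATION[I]. E.g. for PERMUTATION { 2, 0, 1 } the
   scalar stmts { s0, s1, s2 } become { s1, s2, s0 }. */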
1587
1588
1589 /* Attempt to reorder stmts in a reduction chain so that we don't
1590 require any load permutation. Return true if that was possible,
1591 otherwise return false. */
1592
1593 static bool
1594 vect_attempt_slp_rearrange_stmts (slp_instance slp_instn)
1595 {
1596 unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_instn);
1597 unsigned int i, j;
1598 unsigned int lidx;
1599 slp_tree node, load;
1600
1601 /* Compare all the permutation sequences to the first one. We know
1602 that at least one load is permuted. */
1603 node = SLP_INSTANCE_LOADS (slp_instn)[0];
1604 if (!node->load_permutation.exists ())
1605 return false;
1606 for (i = 1; SLP_INSTANCE_LOADS (slp_instn).iterate (i, &load); ++i)
1607 {
1608 if (!load->load_permutation.exists ())
1609 return false;
1610 FOR_EACH_VEC_ELT (load->load_permutation, j, lidx)
1611 if (lidx != node->load_permutation[j])
1612 return false;
1613 }
1614
1615 /* Check that the loads in the first sequence are different and there
1616 are no gaps between them. */
1617 auto_sbitmap load_index (group_size);
1618 bitmap_clear (load_index);
1619 FOR_EACH_VEC_ELT (node->load_permutation, i, lidx)
1620 {
1621 if (lidx >= group_size)
1622 return false;
1623 if (bitmap_bit_p (load_index, lidx))
1624 return false;
1625
1626 bitmap_set_bit (load_index, lidx);
1627 }
1628 for (i = 0; i < group_size; i++)
1629 if (!bitmap_bit_p (load_index, i))
1630 return false;
1631
1632 /* This permutation is valid for reduction. Since the order of the
1633 statements in the nodes is not important unless they are memory
1634 accesses, we can rearrange the statements in all the nodes
1635 according to the order of the loads. */
1636 hash_set<slp_tree> visited;
1637 vect_slp_rearrange_stmts (SLP_INSTANCE_TREE (slp_instn), group_size,
1638 node->load_permutation, visited);
1639
1640 /* We are done, no actual permutations need to be generated. */
1641 poly_uint64 unrolling_factor = SLP_INSTANCE_UNROLLING_FACTOR (slp_instn);
1642 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
1643 {
1644 stmt_vec_info first_stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
1645 first_stmt_info = DR_GROUP_FIRST_ELEMENT (first_stmt_info);
1646 /* But we have to keep those permutations that are required because
1647 of handling of gaps. */
1648 if (known_eq (unrolling_factor, 1U)
1649 || (group_size == DR_GROUP_SIZE (first_stmt_info)
1650 && DR_GROUP_GAP (first_stmt_info) == 0))
1651 SLP_TREE_LOAD_PERMUTATION (node).release ();
1652 else
1653 for (j = 0; j < SLP_TREE_LOAD_PERMUTATION (node).length (); ++j)
1654 SLP_TREE_LOAD_PERMUTATION (node)[j] = j;
1655 }
1656
1657 return true;
1658 }
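/* For example, a load permutation of { 1, 0, 3, 2 } over a group of four
   passes the checks above: every index below the group size occurs exactly
   once, so all nodes of the instance can be reordered to match the memory
   order of the loads. */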
1659
1660 /* Gather loads in the SLP graph NODE and populate the INST loads array. */
1661
1662 static void
1663 vect_gather_slp_loads (slp_instance inst, slp_tree node,
1664 hash_set<slp_tree> &visited)
1665 {
1666 if (visited.add (node))
1667 return;
1668
1669 if (SLP_TREE_CHILDREN (node).length () == 0)
1670 {
1671 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
1672 if (SLP_TREE_DEF_TYPE (node) == vect_internal_def
1673 && STMT_VINFO_GROUPED_ACCESS (stmt_info)
1674 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1675 SLP_INSTANCE_LOADS (inst).safe_push (node);
1676 }
1677 else
1678 {
1679 unsigned i;
1680 slp_tree child;
1681 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1682 vect_gather_slp_loads (inst, child, visited);
1683 }
1684 }
1685
1686 static void
1687 vect_gather_slp_loads (slp_instance inst, slp_tree node)
1688 {
1689 hash_set<slp_tree> visited;
1690 vect_gather_slp_loads (inst, node, visited);
1691 }
1692
1693 /* Check if the required load permutations in the SLP instance
1694 SLP_INSTN are supported. */
1695
1696 static bool
1697 vect_supported_load_permutation_p (slp_instance slp_instn)
1698 {
1699 unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_instn);
1700 unsigned int i, j, k, next;
1701 slp_tree node;
1702
1703 if (dump_enabled_p ())
1704 {
1705 dump_printf_loc (MSG_NOTE, vect_location, "Load permutation ");
1706 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
1707 if (node->load_permutation.exists ())
1708 FOR_EACH_VEC_ELT (node->load_permutation, j, next)
1709 dump_printf (MSG_NOTE, "%d ", next);
1710 else
1711 for (k = 0; k < group_size; ++k)
1712 dump_printf (MSG_NOTE, "%d ", k);
1713 dump_printf (MSG_NOTE, "\n");
1714 }
1715
1716 /* In case of reduction every load permutation is allowed, since the order
1717 of the reduction statements is not important (as opposed to the case of
1718 grouped stores). The only condition we need to check is that all the
1719 load nodes are of the same size and have the same permutation (and then
1720 rearrange all the nodes of the SLP instance according to this
1721 permutation). */
1722
1723 /* Check that all the load nodes are of the same size. */
1724 /* ??? Can't we assert this? */
1725 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
1726 if (SLP_TREE_SCALAR_STMTS (node).length () != (unsigned) group_size)
1727 return false;
1728
1729 node = SLP_INSTANCE_TREE (slp_instn);
1730 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
1731
1732 /* Reduction (there are no data-refs in the root).
1733 In a reduction chain the order of the loads is not important. */
1734 if (!STMT_VINFO_DATA_REF (stmt_info)
1735 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
1736 vect_attempt_slp_rearrange_stmts (slp_instn);
1737
1738 /* In basic block vectorization we allow any subchain of an interleaving
1739 chain.
1740 FORNOW: not supported in loop SLP because of realignment complications. */
1741 if (STMT_VINFO_BB_VINFO (stmt_info))
1742 {
1743 /* Check whether the loads in an instance form a subchain and thus
1744 no permutation is necessary. */
1745 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
1746 {
1747 if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
1748 continue;
1749 bool subchain_p = true;
1750 stmt_vec_info next_load_info = NULL;
1751 stmt_vec_info load_info;
1752 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
1753 {
1754 if (j != 0
1755 && (next_load_info != load_info
1756 || DR_GROUP_GAP (load_info) != 1))
1757 {
1758 subchain_p = false;
1759 break;
1760 }
1761 next_load_info = DR_GROUP_NEXT_ELEMENT (load_info);
1762 }
1763 if (subchain_p)
1764 SLP_TREE_LOAD_PERMUTATION (node).release ();
1765 else
1766 {
1767 stmt_vec_info group_info = SLP_TREE_SCALAR_STMTS (node)[0];
1768 group_info = DR_GROUP_FIRST_ELEMENT (group_info);
1769 unsigned HOST_WIDE_INT nunits;
1770 unsigned k, maxk = 0;
1771 FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (node), j, k)
1772 if (k > maxk)
1773 maxk = k;
1774 /* In BB vectorization we may not actually use a loaded vector
1775 accessing elements in excess of DR_GROUP_SIZE. */
1776 tree vectype = STMT_VINFO_VECTYPE (group_info);
1777 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nunits)
1778 || maxk >= (DR_GROUP_SIZE (group_info) & ~(nunits - 1)))
1779 {
1780 if (dump_enabled_p ())
1781 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1782 "BB vectorization with gaps at the end of "
1783 "a load is not supported\n");
1784 return false;
1785 }
1786
1787 /* Verify the permutation can be generated. */
1788 vec<tree> tem;
1789 unsigned n_perms;
1790 if (!vect_transform_slp_perm_load (node, tem, NULL,
1791 1, slp_instn, true, &n_perms))
1792 {
1793 if (dump_enabled_p ())
1794 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
1795 vect_location,
1796 "unsupported load permutation\n");
1797 return false;
1798 }
1799 }
1800 }
1801 return true;
1802 }
1803
1804 /* For loop vectorization verify we can generate the permutation. Be
1805 conservative about the vectorization factor, there are permutations
1806 that will use three vector inputs only starting from a specific factor
1807 and the vectorization factor is not yet final.
1808 ??? The SLP instance unrolling factor might not be the maximum one. */
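/* E.g. (numbers illustrative): an instance unrolling factor of 2
   combined with a current loop VF of 8 gives a test_vf of 8, so the
   permutation is checked as if eight scalar iterations were
   vectorized at once. */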
1809 unsigned n_perms;
1810 poly_uint64 test_vf
1811 = force_common_multiple (SLP_INSTANCE_UNROLLING_FACTOR (slp_instn),
1812 LOOP_VINFO_VECT_FACTOR
1813 (STMT_VINFO_LOOP_VINFO (stmt_info)));
1814 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
1815 if (node->load_permutation.exists ()
1816 && !vect_transform_slp_perm_load (node, vNULL, NULL, test_vf,
1817 slp_instn, true, &n_perms))
1818 return false;
1819
1820 return true;
1821 }
1822
1823
1824 /* Find the scalar stmt in SLP tree NODE that appears last in the code. */
1825
1826 stmt_vec_info
1827 vect_find_last_scalar_stmt_in_slp (slp_tree node)
1828 {
1829 stmt_vec_info last = NULL;
1830 stmt_vec_info stmt_vinfo;
1831
1832 for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
1833 {
1834 stmt_vinfo = vect_orig_stmt (stmt_vinfo);
1835 last = last ? get_later_stmt (stmt_vinfo, last) : stmt_vinfo;
1836 }
1837
1838 return last;
1839 }
1840
1841 /* Splits a group of stores, currently beginning at FIRST_VINFO, into
1842 two groups: one (still beginning at FIRST_VINFO) of size GROUP1_SIZE
1843 (also containing the first GROUP1_SIZE stmts, since stores are
1844 consecutive), the second containing the remainder.
1845 Return the first stmt in the second group. */
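/* A hypothetical trace: splitting a group {S1,...,S6} with GROUP1_SIZE 4
   leaves {S1,...,S4} with DR_GROUP_SIZE 4 and creates {S5,S6} with
   DR_GROUP_SIZE 2; DR_GROUP_GAP of {S5,S6} grows by 4 so it skips the
   first group, and DR_GROUP_GAP of {S1,...,S4} grows by 2 so it skips
   the second. */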
1846
1847 static stmt_vec_info
1848 vect_split_slp_store_group (stmt_vec_info first_vinfo, unsigned group1_size)
1849 {
1850 gcc_assert (DR_GROUP_FIRST_ELEMENT (first_vinfo) == first_vinfo);
1851 gcc_assert (group1_size > 0);
1852 int group2_size = DR_GROUP_SIZE (first_vinfo) - group1_size;
1853 gcc_assert (group2_size > 0);
1854 DR_GROUP_SIZE (first_vinfo) = group1_size;
1855
1856 stmt_vec_info stmt_info = first_vinfo;
1857 for (unsigned i = group1_size; i > 1; i--)
1858 {
1859 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info);
1860 gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
1861 }
1862 /* STMT is now the last element of the first group. */
1863 stmt_vec_info group2 = DR_GROUP_NEXT_ELEMENT (stmt_info);
1864 DR_GROUP_NEXT_ELEMENT (stmt_info) = 0;
1865
1866 DR_GROUP_SIZE (group2) = group2_size;
1867 for (stmt_info = group2; stmt_info;
1868 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info))
1869 {
1870 DR_GROUP_FIRST_ELEMENT (stmt_info) = group2;
1871 gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
1872 }
1873
1874 /* For the second group, the DR_GROUP_GAP is that before the original group,
1875 plus skipping over the first vector. */
1876 DR_GROUP_GAP (group2) = DR_GROUP_GAP (first_vinfo) + group1_size;
1877
1878 /* DR_GROUP_GAP of the first group now has to skip over the second group too. */
1879 DR_GROUP_GAP (first_vinfo) += group2_size;
1880
1881 if (dump_enabled_p ())
1882 dump_printf_loc (MSG_NOTE, vect_location, "Split group into %d and %d\n",
1883 group1_size, group2_size);
1884
1885 return group2;
1886 }
1887
1888 /* Calculate the unrolling factor for an SLP instance with GROUP_SIZE
1889 statements and a vector of NUNITS elements. */
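/* For example (numbers illustrative): with NUNITS 4 and GROUP_SIZE 6,
   common_multiple (4, 6) is 12 and the unrolling factor is
   12 / 6 = 2, i.e. two copies of the group are needed to fill whole
   vectors. */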
1890
1891 static poly_uint64
1892 calculate_unrolling_factor (poly_uint64 nunits, unsigned int group_size)
1893 {
1894 return exact_div (common_multiple (nunits, group_size), group_size);
1895 }
1896
1897 /* Analyze an SLP instance starting from a group of grouped stores. Call
1898 vect_build_slp_tree to build a tree of packed stmts if possible.
1899 Return FALSE if it's impossible to SLP any stmt in the loop. */
1900
1901 static bool
1902 vect_analyze_slp_instance (vec_info *vinfo,
1903 stmt_vec_info stmt_info, unsigned max_tree_size)
1904 {
1905 slp_instance new_instance;
1906 slp_tree node;
1907 unsigned int group_size;
1908 tree vectype, scalar_type = NULL_TREE;
1909 unsigned int i;
1910 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
1911 vec<stmt_vec_info> scalar_stmts;
1912
1913 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1914 {
1915 scalar_type = TREE_TYPE (DR_REF (dr));
1916 vectype = get_vectype_for_scalar_type (scalar_type);
1917 group_size = DR_GROUP_SIZE (stmt_info);
1918 }
1919 else if (!dr && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
1920 {
1921 gcc_assert (is_a <loop_vec_info> (vinfo));
1922 vectype = STMT_VINFO_VECTYPE (stmt_info);
1923 group_size = REDUC_GROUP_SIZE (stmt_info);
1924 }
1925 else
1926 {
1927 gcc_assert (is_a <loop_vec_info> (vinfo));
1928 vectype = STMT_VINFO_VECTYPE (stmt_info);
1929 group_size = as_a <loop_vec_info> (vinfo)->reductions.length ();
1930 }
1931
1932 if (!vectype)
1933 {
1934 if (dump_enabled_p ())
1935 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1936 "Build SLP failed: unsupported data-type %T\n",
1937 scalar_type);
1938
1939 return false;
1940 }
1941 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
1942
1943 /* Create a node (a root of the SLP tree) for the packed grouped stores. */
1944 scalar_stmts.create (group_size);
1945 stmt_vec_info next_info = stmt_info;
1946 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1947 {
1948 /* Collect the stores and store them in SLP_TREE_SCALAR_STMTS. */
1949 while (next_info)
1950 {
1951 scalar_stmts.safe_push (vect_stmt_to_vectorize (next_info));
1952 next_info = DR_GROUP_NEXT_ELEMENT (next_info);
1953 }
1954 }
1955 else if (!dr && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
1956 {
1957 /* Collect the reduction stmts and store them in
1958 SLP_TREE_SCALAR_STMTS. */
1959 while (next_info)
1960 {
1961 scalar_stmts.safe_push (vect_stmt_to_vectorize (next_info));
1962 next_info = REDUC_GROUP_NEXT_ELEMENT (next_info);
1963 }
1964 /* Mark the first element of the reduction chain as reduction to properly
1965 transform the node. In the reduction analysis phase only the last
1966 element of the chain is marked as reduction. */
1967 STMT_VINFO_DEF_TYPE (stmt_info) = vect_reduction_def;
1968 STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))
1969 = STMT_VINFO_REDUC_DEF (vect_orig_stmt (scalar_stmts.last ()));
1970 }
1971 else
1972 {
1973 /* Collect reduction statements. */
1974 vec<stmt_vec_info> reductions = as_a <loop_vec_info> (vinfo)->reductions;
1975 for (i = 0; reductions.iterate (i, &next_info); i++)
1976 scalar_stmts.safe_push (next_info);
1977 }
1978
1979 /* Build the tree for the SLP instance. */
1980 bool *matches = XALLOCAVEC (bool, group_size);
1981 unsigned npermutes = 0;
1982 scalar_stmts_to_slp_tree_map_t *bst_map
1983 = new scalar_stmts_to_slp_tree_map_t ();
1984 poly_uint64 max_nunits = nunits;
1985 node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
1986 &max_nunits, matches, &npermutes,
1987 NULL, max_tree_size, bst_map);
1988 /* The map keeps a reference on SLP nodes built, release that. */
1989 for (scalar_stmts_to_slp_tree_map_t::iterator it = bst_map->begin ();
1990 it != bst_map->end (); ++it)
1991 if ((*it).second)
1992 vect_free_slp_tree ((*it).second, false);
1993 delete bst_map;
1994 if (node != NULL)
1995 {
1996 /* Calculate the unrolling factor based on the smallest type. */
1997 poly_uint64 unrolling_factor
1998 = calculate_unrolling_factor (max_nunits, group_size);
1999
2000 if (maybe_ne (unrolling_factor, 1U)
2001 && is_a <bb_vec_info> (vinfo))
2002 {
2003 unsigned HOST_WIDE_INT const_max_nunits;
2004 if (!max_nunits.is_constant (&const_max_nunits)
2005 || const_max_nunits > group_size)
2006 {
2007 if (dump_enabled_p ())
2008 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2009 "Build SLP failed: store group "
2010 "size not a multiple of the vector size "
2011 "in basic block SLP\n");
2012 vect_free_slp_tree (node, false);
2013 return false;
2014 }
2015 /* Fatal mismatch. */
2016 matches[group_size / const_max_nunits * const_max_nunits] = false;
2017 vect_free_slp_tree (node, false);
2018 }
2019 else
2020 {
2021 /* Create a new SLP instance. */
2022 new_instance = XNEW (struct _slp_instance);
2023 SLP_INSTANCE_TREE (new_instance) = node;
2024 SLP_INSTANCE_GROUP_SIZE (new_instance) = group_size;
2025 SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
2026 SLP_INSTANCE_LOADS (new_instance) = vNULL;
2027 vect_gather_slp_loads (new_instance, node);
2028
2029 /* Compute the load permutation. */
2030 slp_tree load_node;
2031 bool loads_permuted = false;
2032 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (new_instance), i, load_node)
2033 {
2034 vec<unsigned> load_permutation;
2035 int j;
2036 stmt_vec_info load_info;
2037 bool this_load_permuted = false;
2038 load_permutation.create (group_size);
2039 stmt_vec_info first_stmt_info = DR_GROUP_FIRST_ELEMENT
2040 (SLP_TREE_SCALAR_STMTS (load_node)[0]);
2041 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load_info)
2042 {
2043 int load_place = vect_get_place_in_interleaving_chain
2044 (load_info, first_stmt_info);
2045 gcc_assert (load_place != -1);
2046 if (load_place != j)
2047 this_load_permuted = true;
2048 load_permutation.safe_push (load_place);
2049 }
2050 if (!this_load_permuted
2051 /* The load requires permutation when unrolling exposes
2052 a gap either because the group is larger than the SLP
2053 group-size or because there is a gap between the groups. */
2054 && (known_eq (unrolling_factor, 1U)
2055 || (group_size == DR_GROUP_SIZE (first_stmt_info)
2056 && DR_GROUP_GAP (first_stmt_info) == 0)))
2057 {
2058 load_permutation.release ();
2059 continue;
2060 }
2061 SLP_TREE_LOAD_PERMUTATION (load_node) = load_permutation;
2062 loads_permuted = true;
2063 }
2064
2065 if (loads_permuted)
2066 {
2067 if (!vect_supported_load_permutation_p (new_instance))
2068 {
2069 if (dump_enabled_p ())
2070 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2071 "Build SLP failed: unsupported load "
2072 "permutation %G", stmt_info->stmt);
2073 vect_free_slp_instance (new_instance, false);
2074 return false;
2075 }
2076 }
2077
2078 /* If the loads and stores can be handled with load/store-lanes
2079 instructions, do not generate this SLP instance. */
2080 if (is_a <loop_vec_info> (vinfo)
2081 && loads_permuted
2082 && dr && vect_store_lanes_supported (vectype, group_size, false))
2083 {
2084 slp_tree load_node;
2085 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (new_instance), i, load_node)
2086 {
2087 stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT
2088 (SLP_TREE_SCALAR_STMTS (load_node)[0]);
2089 /* Use SLP for strided accesses (or if we can't load-lanes). */
2090 if (STMT_VINFO_STRIDED_P (stmt_vinfo)
2091 || ! vect_load_lanes_supported
2092 (STMT_VINFO_VECTYPE (stmt_vinfo),
2093 DR_GROUP_SIZE (stmt_vinfo), false))
2094 break;
2095 }
2096 if (i == SLP_INSTANCE_LOADS (new_instance).length ())
2097 {
2098 if (dump_enabled_p ())
2099 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2100 "Built SLP cancelled: can use "
2101 "load/store-lanes\n");
2102 vect_free_slp_instance (new_instance, false);
2103 return false;
2104 }
2105 }
2106
2107 vinfo->slp_instances.safe_push (new_instance);
2108
2109 if (dump_enabled_p ())
2110 {
2111 dump_printf_loc (MSG_NOTE, vect_location,
2112 "Final SLP tree for instance:\n");
2113 vect_print_slp_tree (MSG_NOTE, vect_location, node);
2114 }
2115
2116 return true;
2117 }
2118 }
2119 else
2120 {
2121 /* Failed to SLP. */
2122 /* Free the allocated memory. */
2123 scalar_stmts.release ();
2124 }
2125
2126 /* For basic block SLP, try to break the group up into multiples of the
2127 vector size. */
2128 unsigned HOST_WIDE_INT const_nunits;
2129 if (is_a <bb_vec_info> (vinfo)
2130 && STMT_VINFO_GROUPED_ACCESS (stmt_info)
2131 && DR_GROUP_FIRST_ELEMENT (stmt_info)
2132 && nunits.is_constant (&const_nunits))
2133 {
2134 /* We consider breaking the group only on VF boundaries from the existing
2135 start. */
2136 for (i = 0; i < group_size; i++)
2137 if (!matches[i]) break;
2138
2139 if (i >= const_nunits && i < group_size)
2140 {
2141 /* Split into two groups at the first vector boundary before i. */
2142 gcc_assert ((const_nunits & (const_nunits - 1)) == 0);
2143 unsigned group1_size = i & ~(const_nunits - 1);
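/* E.g. (illustrative): with const_nunits 4 and the first mismatch at
   i == 6, group1_size becomes 4: stmts 0-3 are retried as their own
   group and the remainder, starting at the partially mismatched
   vector, is split off below. */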
2144
2145 stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
2146 group1_size);
2147 bool res = vect_analyze_slp_instance (vinfo, stmt_info,
2148 max_tree_size);
2149 /* If the first non-match was in the middle of a vector,
2150 skip the rest of that vector. */
2151 if (group1_size < i)
2152 {
2153 i = group1_size + const_nunits;
2154 if (i < group_size)
2155 rest = vect_split_slp_store_group (rest, const_nunits);
2156 }
2157 if (i < group_size)
2158 res |= vect_analyze_slp_instance (vinfo, rest, max_tree_size);
2159 return res;
2160 }
2161 /* Even though the first vector did not all match, we might be able to SLP
2162 (some) of the remainder. FORNOW ignore this possibility. */
2163 }
2164
2165 return false;
2166 }
2167
2168
2169 /* Check if there are stmts in the loop that can be vectorized using SLP. Build SLP
2170 trees of packed scalar stmts if SLP is possible. */
2171
2172 opt_result
2173 vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
2174 {
2175 unsigned int i;
2176 stmt_vec_info first_element;
2177
2178 DUMP_VECT_SCOPE ("vect_analyze_slp");
2179
2180 /* Find SLP sequences starting from groups of grouped stores. */
2181 FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
2182 vect_analyze_slp_instance (vinfo, first_element, max_tree_size);
2183
2184 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
2185 {
2186 if (loop_vinfo->reduction_chains.length () > 0)
2187 {
2188 /* Find SLP sequences starting from reduction chains. */
2189 FOR_EACH_VEC_ELT (loop_vinfo->reduction_chains, i, first_element)
2190 if (! vect_analyze_slp_instance (vinfo, first_element,
2191 max_tree_size))
2192 {
2193 /* Dissolve reduction chain group. */
2194 stmt_vec_info stmt_vinfo = first_element;
2195 while (stmt_vinfo)
2196 {
2197 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (stmt_vinfo);
2198 REDUC_GROUP_FIRST_ELEMENT (stmt_vinfo) = NULL;
2199 REDUC_GROUP_NEXT_ELEMENT (stmt_vinfo) = NULL;
2200 stmt_vinfo = next;
2201 }
2202 STMT_VINFO_DEF_TYPE (first_element) = vect_internal_def;
2203 }
2204 }
2205
2206 /* Find SLP sequences starting from groups of reductions. */
2207 if (loop_vinfo->reductions.length () > 1)
2208 vect_analyze_slp_instance (vinfo, loop_vinfo->reductions[0],
2209 max_tree_size);
2210 }
2211
2212 return opt_result::success ();
2213 }
2214
2215
2216 /* For each possible SLP instance decide whether to SLP it and calculate overall
2217 unrolling factor needed to SLP the loop. Return TRUE if decided to SLP at
2218 least one instance. */
2219
2220 bool
2221 vect_make_slp_decision (loop_vec_info loop_vinfo)
2222 {
2223 unsigned int i;
2224 poly_uint64 unrolling_factor = 1;
2225 vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
2226 slp_instance instance;
2227 int decided_to_slp = 0;
2228
2229 DUMP_VECT_SCOPE ("vect_make_slp_decision");
2230
2231 FOR_EACH_VEC_ELT (slp_instances, i, instance)
2232 {
2233 /* FORNOW: SLP if you can. */
2234 /* All unroll factors have the form current_vector_size * X for some
2235 rational X, so they must have a common multiple. */
2236 unrolling_factor
2237 = force_common_multiple (unrolling_factor,
2238 SLP_INSTANCE_UNROLLING_FACTOR (instance));
2239
2240 /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts. Later we
2241 call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
2242 loop-based vectorization. Such stmts will be marked as HYBRID. */
2243 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
2244 decided_to_slp++;
2245 }
2246
2247 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = unrolling_factor;
2248
2249 if (decided_to_slp && dump_enabled_p ())
2250 {
2251 dump_printf_loc (MSG_NOTE, vect_location,
2252 "Decided to SLP %d instances. Unrolling factor ",
2253 decided_to_slp);
2254 dump_dec (MSG_NOTE, unrolling_factor);
2255 dump_printf (MSG_NOTE, "\n");
2256 }
2257
2258 return (decided_to_slp > 0);
2259 }
2260
2261
2262 /* Find stmts that must be both vectorized and SLPed (since they feed stmts that
2263 can't be SLPed) in the tree rooted at NODE. Mark such stmts as HYBRID. */
2264
2265 static void
2266 vect_detect_hybrid_slp_stmts (slp_tree node, unsigned i, slp_vect_type stype,
2267 hash_map<slp_tree, unsigned> &visited)
2268 {
2269 stmt_vec_info stmt_vinfo = SLP_TREE_SCALAR_STMTS (node)[i];
2270 imm_use_iterator imm_iter;
2271 gimple *use_stmt;
2272 stmt_vec_info use_vinfo;
2273 slp_tree child;
2274 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
2275 int j;
2276
2277 /* We need to union stype over the incoming graph edges but we still
2278 want to limit recursion to stay O(N+E). */
2279 bool only_edge = (++visited.get_or_insert (node) < node->refcnt);
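/* The counter reaches NODE->refcnt only on the last incoming edge, so
   the children are walked exactly once per node while STYPE is still
   propagated along every edge. */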
2280
2281 /* Propagate hybrid down the SLP tree. */
2282 if (stype == hybrid)
2283 ;
2284 else if (HYBRID_SLP_STMT (stmt_vinfo))
2285 stype = hybrid;
2286 else if (!only_edge)
2287 {
2288 /* Check if a pure SLP stmt has uses in non-SLP stmts. */
2289 gcc_checking_assert (PURE_SLP_STMT (stmt_vinfo));
2290 /* If we get a pattern stmt here we have to use the LHS of the
2291 original stmt for immediate uses. */
2292 gimple *stmt = vect_orig_stmt (stmt_vinfo)->stmt;
2293 tree def;
2294 if (gimple_code (stmt) == GIMPLE_PHI)
2295 def = gimple_phi_result (stmt);
2296 else
2297 def = SINGLE_SSA_TREE_OPERAND (stmt, SSA_OP_DEF);
2298 if (def)
2299 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, def)
2300 {
2301 use_vinfo = loop_vinfo->lookup_stmt (use_stmt);
2302 if (!use_vinfo)
2303 continue;
2304 use_vinfo = vect_stmt_to_vectorize (use_vinfo);
2305 if (!STMT_SLP_TYPE (use_vinfo)
2306 && (STMT_VINFO_RELEVANT (use_vinfo)
2307 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (use_vinfo)))
2308 && !(gimple_code (use_stmt) == GIMPLE_PHI
2309 && STMT_VINFO_DEF_TYPE (use_vinfo) == vect_reduction_def))
2310 {
2311 if (dump_enabled_p ())
2312 dump_printf_loc (MSG_NOTE, vect_location, "use of SLP "
2313 "def in non-SLP stmt: %G", use_stmt);
2314 stype = hybrid;
2315 }
2316 }
2317 }
2318
2319 if (stype == hybrid
2320 && !HYBRID_SLP_STMT (stmt_vinfo))
2321 {
2322 if (dump_enabled_p ())
2323 dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: %G",
2324 stmt_vinfo->stmt);
2325 STMT_SLP_TYPE (stmt_vinfo) = hybrid;
2326 }
2327
2328 if (!only_edge)
2329 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
2330 if (SLP_TREE_DEF_TYPE (child) != vect_external_def)
2331 vect_detect_hybrid_slp_stmts (child, i, stype, visited);
2332 }
2333
2334 static void
2335 vect_detect_hybrid_slp_stmts (slp_tree node, unsigned i, slp_vect_type stype)
2336 {
2337 hash_map<slp_tree, unsigned> visited;
2338 vect_detect_hybrid_slp_stmts (node, i, stype, visited);
2339 }
2340
2341 /* Helpers for vect_detect_hybrid_slp walking pattern stmt uses. */
2342
2343 static tree
2344 vect_detect_hybrid_slp_1 (tree *tp, int *, void *data)
2345 {
2346 walk_stmt_info *wi = (walk_stmt_info *)data;
2347 loop_vec_info loop_vinfo = (loop_vec_info) wi->info;
2348
2349 if (wi->is_lhs)
2350 return NULL_TREE;
2351
2352 stmt_vec_info def_stmt_info = loop_vinfo->lookup_def (*tp);
2353 if (def_stmt_info && PURE_SLP_STMT (def_stmt_info))
2354 {
2355 if (dump_enabled_p ())
2356 dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: %G",
2357 def_stmt_info->stmt);
2358 STMT_SLP_TYPE (def_stmt_info) = hybrid;
2359 }
2360
2361 return NULL_TREE;
2362 }
2363
2364 static tree
2365 vect_detect_hybrid_slp_2 (gimple_stmt_iterator *gsi, bool *handled,
2366 walk_stmt_info *wi)
2367 {
2368 loop_vec_info loop_vinfo = (loop_vec_info) wi->info;
2369 stmt_vec_info use_vinfo = loop_vinfo->lookup_stmt (gsi_stmt (*gsi));
2370 /* If the stmt is in a SLP instance then this isn't a reason
2371 to mark use definitions in other SLP instances as hybrid. */
2372 if (! STMT_SLP_TYPE (use_vinfo)
2373 && (STMT_VINFO_RELEVANT (use_vinfo)
2374 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (use_vinfo)))
2375 && ! (gimple_code (gsi_stmt (*gsi)) == GIMPLE_PHI
2376 && STMT_VINFO_DEF_TYPE (use_vinfo) == vect_reduction_def))
2377 ;
2378 else
2379 *handled = true;
2380 return NULL_TREE;
2381 }
2382
2383 /* Find stmts that must be both vectorized and SLPed. */
2384
2385 void
2386 vect_detect_hybrid_slp (loop_vec_info loop_vinfo)
2387 {
2388 unsigned int i;
2389 vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
2390 slp_instance instance;
2391
2392 DUMP_VECT_SCOPE ("vect_detect_hybrid_slp");
2393
2394 /* First walk all pattern stmts in the loop and mark defs of uses as
2395 hybrid because immediate uses in them are not recorded. */
2396 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2397 {
2398 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2399 for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
2400 gsi_next (&gsi))
2401 {
2402 gimple *stmt = gsi_stmt (gsi);
2403 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
2404 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2405 {
2406 walk_stmt_info wi;
2407 memset (&wi, 0, sizeof (wi));
2408 wi.info = loop_vinfo;
2409 gimple_stmt_iterator gsi2
2410 = gsi_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info)->stmt);
2411 walk_gimple_stmt (&gsi2, vect_detect_hybrid_slp_2,
2412 vect_detect_hybrid_slp_1, &wi);
2413 walk_gimple_seq (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info),
2414 vect_detect_hybrid_slp_2,
2415 vect_detect_hybrid_slp_1, &wi);
2416 }
2417 }
2418 }
2419
2420 /* Then walk the SLP instance trees marking stmts with uses in
2421 non-SLP stmts as hybrid, also propagating hybrid down the
2422 SLP tree, collecting the above info on-the-fly. */
2423 FOR_EACH_VEC_ELT (slp_instances, i, instance)
2424 {
2425 for (unsigned i = 0; i < SLP_INSTANCE_GROUP_SIZE (instance); ++i)
2426 vect_detect_hybrid_slp_stmts (SLP_INSTANCE_TREE (instance),
2427 i, pure_slp);
2428 }
2429 }
2430
2431
2432 /* Initialize a bb_vec_info struct for the statements between
2433 REGION_BEGIN_IN (inclusive) and REGION_END_IN (exclusive). */
2434
2435 _bb_vec_info::_bb_vec_info (gimple_stmt_iterator region_begin_in,
2436 gimple_stmt_iterator region_end_in,
2437 vec_info_shared *shared)
2438 : vec_info (vec_info::bb, init_cost (NULL), shared),
2439 bb (gsi_bb (region_begin_in)),
2440 region_begin (region_begin_in),
2441 region_end (region_end_in)
2442 {
2443 gimple_stmt_iterator gsi;
2444
2445 for (gsi = region_begin; gsi_stmt (gsi) != gsi_stmt (region_end);
2446 gsi_next (&gsi))
2447 {
2448 gimple *stmt = gsi_stmt (gsi);
2449 gimple_set_uid (stmt, 0);
2450 add_stmt (stmt);
2451 }
2452
2453 bb->aux = this;
2454 }
2455
2456
2457 /* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
2458 stmts in the basic block. */
2459
2460 _bb_vec_info::~_bb_vec_info ()
2461 {
2462 for (gimple_stmt_iterator si = region_begin;
2463 gsi_stmt (si) != gsi_stmt (region_end); gsi_next (&si))
2464 /* Reset region marker. */
2465 gimple_set_uid (gsi_stmt (si), -1);
2466
2467 bb->aux = NULL;
2468 }
2469
2470 /* Subroutine of vect_slp_analyze_node_operations. Handle the root of NODE,
2471 given that child nodes have already been processed, and that
2472 their def types currently match their SLP node's def type. */
2473
2474 static bool
2475 vect_slp_analyze_node_operations_1 (vec_info *vinfo, slp_tree node,
2476 slp_instance node_instance,
2477 stmt_vector_for_cost *cost_vec)
2478 {
2479 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2480 gcc_assert (STMT_SLP_TYPE (stmt_info) != loop_vect);
2481
2482 /* For BB vectorization vector types are assigned here.
2483 Memory accesses already got their vector type assigned
2484 in vect_analyze_data_refs. */
2485 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2486 if (bb_vinfo
2487 && ! STMT_VINFO_DATA_REF (stmt_info))
2488 {
2489 tree vectype, nunits_vectype;
2490 if (!vect_get_vector_types_for_stmt (stmt_info, &vectype,
2491 &nunits_vectype))
2492 /* We checked this when building the node. */
2493 gcc_unreachable ();
2494 if (vectype == boolean_type_node)
2495 {
2496 vectype = vect_get_mask_type_for_stmt (stmt_info);
2497 if (!vectype)
2498 /* vect_get_mask_type_for_stmt has already explained the
2499 failure. */
2500 return false;
2501 }
2502
2503 stmt_vec_info sstmt_info;
2504 unsigned int i;
2505 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, sstmt_info)
2506 STMT_VINFO_VECTYPE (sstmt_info) = vectype;
2507 }
2508
2509 /* Calculate the number of vector statements to be created for the
2510 scalar stmts in this node. For SLP reductions it is equal to the
2511 number of vector statements in the children (which has already been
2512 calculated by the recursive call). Otherwise it is the number of
2513 scalar elements in one scalar iteration (DR_GROUP_SIZE) multiplied by
2514 VF divided by the number of elements in a vector. */
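/* For example (numbers hypothetical): with GROUP_SIZE 8, VF 2 and a
   vector type holding four elements, vect_get_num_vectors computes
   2 * 8 / 4 = 4 vector statements for this node. */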
2515 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2516 && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
2517 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
2518 = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_CHILDREN (node)[0]);
2519 else
2520 {
2521 poly_uint64 vf;
2522 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
2523 vf = loop_vinfo->vectorization_factor;
2524 else
2525 vf = 1;
2526 unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (node_instance);
2527 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2528 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
2529 = vect_get_num_vectors (vf * group_size, vectype);
2530 }
2531
2532 bool dummy;
2533 return vect_analyze_stmt (stmt_info, &dummy, node, node_instance, cost_vec);
2534 }
2535
2536 /* Analyze statements contained in SLP tree NODE after recursively analyzing
2537 the subtree. NODE_INSTANCE contains NODE and VINFO contains INSTANCE.
2538
2539 Return true if the operations are supported. */
2540
2541 static bool
2542 vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node,
2543 slp_instance node_instance,
2544 scalar_stmts_to_slp_tree_map_t *visited,
2545 scalar_stmts_to_slp_tree_map_t *lvisited,
2546 stmt_vector_for_cost *cost_vec)
2547 {
2548 int i, j;
2549 slp_tree child;
2550
2551 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2552 return true;
2553
2554 /* If we already analyzed the exact same set of scalar stmts we're done.
2555 We share the generated vector stmts for those. */
2556 slp_tree *leader;
2557 if ((leader = visited->get (SLP_TREE_SCALAR_STMTS (node)))
2558 || (leader = lvisited->get (SLP_TREE_SCALAR_STMTS (node))))
2559 {
2560 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
2561 = SLP_TREE_NUMBER_OF_VEC_STMTS (*leader);
2562 return true;
2563 }
2564
2565 /* The SLP graph is acyclic, so it is safe not to cache whether we
2566 failed or succeeded: the lvisited set is simply thrown away when
2567 we fail. */
2568 lvisited->put (SLP_TREE_SCALAR_STMTS (node).copy (), node);
2569
2570 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2571 if (!vect_slp_analyze_node_operations (vinfo, child, node_instance,
2572 visited, lvisited, cost_vec))
2573 return false;
2574
2575 /* ??? We have to catch late the case where the same first scalar stmt
2576 appears in multiple SLP children with different def types, and fail.
2577 Remember the original def types first since SLP_TREE_DEF_TYPE doesn't
2578 necessarily match it when that is vect_internal_def. */
2579 auto_vec<vect_def_type, 4> dt;
2580 dt.safe_grow (SLP_TREE_CHILDREN (node).length ());
2581 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
2582 dt[j] = STMT_VINFO_DEF_TYPE (SLP_TREE_SCALAR_STMTS (child)[0]);
2583
2584 /* Push SLP node def-type to stmt operands. */
2585 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
2586 if (SLP_TREE_DEF_TYPE (child) != vect_internal_def)
2587 STMT_VINFO_DEF_TYPE (SLP_TREE_SCALAR_STMTS (child)[0])
2588 = SLP_TREE_DEF_TYPE (child);
2589
2590 /* Check everything worked out. */
2591 bool res = true;
2592 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
2593 if (SLP_TREE_DEF_TYPE (child) != vect_internal_def)
2594 {
2595 if (STMT_VINFO_DEF_TYPE (SLP_TREE_SCALAR_STMTS (child)[0])
2596 != SLP_TREE_DEF_TYPE (child))
2597 res = false;
2598 }
2599 else if (STMT_VINFO_DEF_TYPE (SLP_TREE_SCALAR_STMTS (child)[0]) != dt[j])
2600 res = false;
2601 if (!res && dump_enabled_p ())
2602 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2603 "not vectorized: same operand with different "
2604 "def type in stmt.\n");
2605
2606 if (res)
2607 res = vect_slp_analyze_node_operations_1 (vinfo, node, node_instance,
2608 cost_vec);
2609
2610 /* Restore def-types. */
2611 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
2612 STMT_VINFO_DEF_TYPE (SLP_TREE_SCALAR_STMTS (child)[0]) = dt[j];
2613
2614 return res;
2615 }
2616
2617
2618 /* Analyze statements in SLP instances of VINFO. Return true if the
2619 operations are supported. */
2620
2621 bool
2622 vect_slp_analyze_operations (vec_info *vinfo)
2623 {
2624 slp_instance instance;
2625 int i;
2626
2627 DUMP_VECT_SCOPE ("vect_slp_analyze_operations");
2628
2629 scalar_stmts_to_slp_tree_map_t *visited
2630 = new scalar_stmts_to_slp_tree_map_t ();
2631 for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
2632 {
2633 scalar_stmts_to_slp_tree_map_t lvisited;
2634 stmt_vector_for_cost cost_vec;
2635 cost_vec.create (2);
2636 if (!vect_slp_analyze_node_operations (vinfo,
2637 SLP_INSTANCE_TREE (instance),
2638 instance, visited, &lvisited,
2639 &cost_vec))
2640 {
2641 slp_tree node = SLP_INSTANCE_TREE (instance);
2642 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2643 if (dump_enabled_p ())
2644 dump_printf_loc (MSG_NOTE, vect_location,
2645 "removing SLP instance operations starting from: %G",
2646 stmt_info->stmt);
2647 vect_free_slp_instance (instance, false);
2648 vinfo->slp_instances.ordered_remove (i);
2649 cost_vec.release ();
2650 }
2651 else
2652 {
2653 for (scalar_stmts_to_slp_tree_map_t::iterator x = lvisited.begin();
2654 x != lvisited.end(); ++x)
2655 visited->put ((*x).first.copy (), (*x).second);
2656 i++;
2657
2658 add_stmt_costs (vinfo->target_cost_data, &cost_vec);
2659 cost_vec.release ();
2660 }
2661 }
2662 delete visited;
2663
2664 return !vinfo->slp_instances.is_empty ();
2665 }
2666
2667
2668 /* Compute the scalar cost of the SLP node NODE and its children
2669 and record it in COST_VEC. Do not account for defs that are marked
2670 in LIFE, and update LIFE according to the uses of NODE. */
2671
2672 static void
2673 vect_bb_slp_scalar_cost (basic_block bb,
2674 slp_tree node, vec<bool, va_heap> *life,
2675 stmt_vector_for_cost *cost_vec,
2676 hash_set<slp_tree> &visited)
2677 {
2678 unsigned i;
2679 stmt_vec_info stmt_info;
2680 slp_tree child;
2681
2682 if (visited.add (node))
2683 return;
2684
2685 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2686 {
2687 gimple *stmt = stmt_info->stmt;
2688 vec_info *vinfo = stmt_info->vinfo;
2689 ssa_op_iter op_iter;
2690 def_operand_p def_p;
2691
2692 if ((*life)[i])
2693 continue;
2694
2695 /* If there is a non-vectorized use of the defs then the scalar
2696 stmt is kept live in which case we do not account it or any
2697 required defs in the SLP children in the scalar cost. This
2698 way we make the vectorization more costly when compared to
2699 the scalar cost. */
2700 FOR_EACH_SSA_DEF_OPERAND (def_p, stmt, op_iter, SSA_OP_DEF)
2701 {
2702 imm_use_iterator use_iter;
2703 gimple *use_stmt;
2704 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
2705 if (!is_gimple_debug (use_stmt))
2706 {
2707 stmt_vec_info use_stmt_info = vinfo->lookup_stmt (use_stmt);
2708 if (!use_stmt_info || !PURE_SLP_STMT (use_stmt_info))
2709 {
2710 (*life)[i] = true;
2711 BREAK_FROM_IMM_USE_STMT (use_iter);
2712 }
2713 }
2714 }
2715 if ((*life)[i])
2716 continue;
2717
2718 /* Count scalar stmts only once. */
2719 if (gimple_visited_p (stmt))
2720 continue;
2721 gimple_set_visited (stmt, true);
2722
2723 vect_cost_for_stmt kind;
2724 if (STMT_VINFO_DATA_REF (stmt_info))
2725 {
2726 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
2727 kind = scalar_load;
2728 else
2729 kind = scalar_store;
2730 }
2731 else
2732 kind = scalar_stmt;
2733 record_stmt_cost (cost_vec, 1, kind, stmt_info, 0, vect_body);
2734 }
2735
2736 auto_vec<bool, 20> subtree_life;
2737 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2738 {
2739 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
2740 {
2741 /* Do not directly pass LIFE to the recursive call, copy it to
2742 confine changes in the callee to the current child/subtree. */
2743 subtree_life.safe_splice (*life);
2744 vect_bb_slp_scalar_cost (bb, child, &subtree_life, cost_vec,
2745 visited);
2746 subtree_life.truncate (0);
2747 }
2748 }
2749 }
2750
2751 static void
2752 vect_bb_slp_scalar_cost (basic_block bb,
2753 slp_tree node, vec<bool, va_heap> *life,
2754 stmt_vector_for_cost *cost_vec)
2755 {
2756 hash_set<slp_tree> visited;
2757 vect_bb_slp_scalar_cost (bb, node, life, cost_vec, visited);
2758 }
2759
2760 /* Check if vectorization of the basic block is profitable. */
2761
2762 static bool
2763 vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo)
2764 {
2765 vec<slp_instance> slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
2766 slp_instance instance;
2767 int i;
2768 unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
2769 unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;
2770
2771 /* Calculate scalar cost. */
2772 stmt_vector_for_cost scalar_costs;
2773 scalar_costs.create (0);
2774 FOR_EACH_VEC_ELT (slp_instances, i, instance)
2775 {
2776 auto_vec<bool, 20> life;
2777 life.safe_grow_cleared (SLP_INSTANCE_GROUP_SIZE (instance));
2778 vect_bb_slp_scalar_cost (BB_VINFO_BB (bb_vinfo),
2779 SLP_INSTANCE_TREE (instance),
2780 &life, &scalar_costs);
2781 }
2782 void *target_cost_data = init_cost (NULL);
2783 add_stmt_costs (target_cost_data, &scalar_costs);
2784 scalar_costs.release ();
2785 unsigned dummy;
2786 finish_cost (target_cost_data, &dummy, &scalar_cost, &dummy);
2787 destroy_cost_data (target_cost_data);
2788
2789 /* Unset visited flag. */
2790 for (gimple_stmt_iterator gsi = bb_vinfo->region_begin;
2791 gsi_stmt (gsi) != gsi_stmt (bb_vinfo->region_end); gsi_next (&gsi))
2792 gimple_set_visited (gsi_stmt (gsi), false);
2793
2794 /* Complete the target-specific cost calculation. */
2795 finish_cost (BB_VINFO_TARGET_COST_DATA (bb_vinfo), &vec_prologue_cost,
2796 &vec_inside_cost, &vec_epilogue_cost);
2797
2798 vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;
2799
2800 if (dump_enabled_p ())
2801 {
2802 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
2803 dump_printf (MSG_NOTE, " Vector inside of basic block cost: %d\n",
2804 vec_inside_cost);
2805 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n", vec_prologue_cost);
2806 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n", vec_epilogue_cost);
2807 dump_printf (MSG_NOTE, " Scalar cost of basic block: %d\n", scalar_cost);
2808 }
2809
2810 /* Vectorization is profitable if its cost does not exceed the cost of the
2811 scalar version. Note that we err on the vector side for equal cost because
2812 the cost estimate is otherwise quite pessimistic (constant uses are
2813 free on the scalar side but cost a load on the vector side, for
2814 example). */
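/* E.g. (made-up costs): an inside cost of 6 plus an outside cost of 2
   against a scalar cost of 8 is still accepted, while 9 against 8 is
   rejected. */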
2815 if (vec_outside_cost + vec_inside_cost > scalar_cost)
2816 return false;
2817
2818 return true;
2819 }
2820
2821 /* Check if the basic block can be vectorized. Returns a bb_vec_info
2822 if so and sets fatal to true if failure is independent of
2823 current_vector_size. */
2824
2825 static bb_vec_info
2826 vect_slp_analyze_bb_1 (gimple_stmt_iterator region_begin,
2827 gimple_stmt_iterator region_end,
2828 vec<data_reference_p> datarefs, int n_stmts,
2829 bool &fatal, vec_info_shared *shared)
2830 {
2831 DUMP_VECT_SCOPE ("vect_slp_analyze_bb");
2832
2833 bb_vec_info bb_vinfo;
2834 slp_instance instance;
2835 int i;
2836 poly_uint64 min_vf = 2;
2837
2838 /* The first group of checks is independent of the vector size. */
2839 fatal = true;
2840
2841 if (n_stmts > PARAM_VALUE (PARAM_SLP_MAX_INSNS_IN_BB))
2842 {
2843 if (dump_enabled_p ())
2844 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2845 "not vectorized: too many instructions in "
2846 "basic block.\n");
2847 free_data_refs (datarefs);
2848 return NULL;
2849 }
2850
2851 bb_vinfo = new _bb_vec_info (region_begin, region_end, shared);
2852 if (!bb_vinfo)
2853 return NULL;
2854
2855 BB_VINFO_DATAREFS (bb_vinfo) = datarefs;
2856 bb_vinfo->shared->save_datarefs ();
2857
2858 /* Analyze the data references. */
2859
2860 if (!vect_analyze_data_refs (bb_vinfo, &min_vf))
2861 {
2862 if (dump_enabled_p ())
2863 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2864 "not vectorized: unhandled data-ref in basic "
2865 "block.\n");
2866
2867 delete bb_vinfo;
2868 return NULL;
2869 }
2870
2871 if (BB_VINFO_DATAREFS (bb_vinfo).length () < 2)
2872 {
2873 if (dump_enabled_p ())
2874 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2875 "not vectorized: not enough data-refs in "
2876 "basic block.\n");
2877
2878 delete bb_vinfo;
2879 return NULL;
2880 }
2881
2882 if (!vect_analyze_data_ref_accesses (bb_vinfo))
2883 {
2884 if (dump_enabled_p ())
2885 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2886 "not vectorized: unhandled data access in "
2887 "basic block.\n");
2888
2889 delete bb_vinfo;
2890 return NULL;
2891 }
2892
2893 /* If there are no grouped stores in the region there is no need
2894 to continue with pattern recog as vect_analyze_slp will fail
2895 anyway. */
2896 if (bb_vinfo->grouped_stores.is_empty ())
2897 {
2898 if (dump_enabled_p ())
2899 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2900 "not vectorized: no grouped stores in "
2901 "basic block.\n");
2902
2903 delete bb_vinfo;
2904 return NULL;
2905 }
2906
2907 /* The rest of the analysis below depends on the vector size in some way, so from here on failures are no longer fatal. */
2908 fatal = false;
2909
2910 vect_pattern_recog (bb_vinfo);
2911
2912 /* Check the SLP opportunities in the basic block, analyze and build SLP
2913 trees. */
2914 if (!vect_analyze_slp (bb_vinfo, n_stmts))
2915 {
2916 if (dump_enabled_p ())
2917 {
2918 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2919 "Failed to SLP the basic block.\n");
2920 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2921 "not vectorized: failed to find SLP opportunities "
2922 "in basic block.\n");
2923 }
2924
2925 delete bb_vinfo;
2926 return NULL;
2927 }
2928
2929 vect_record_base_alignments (bb_vinfo);
2930
2931 /* Analyze and verify the alignment of data references and the
2932 dependence in the SLP instances. */
2933 for (i = 0; BB_VINFO_SLP_INSTANCES (bb_vinfo).iterate (i, &instance); )
2934 {
2935 if (! vect_slp_analyze_and_verify_instance_alignment (instance)
2936 || ! vect_slp_analyze_instance_dependence (instance))
2937 {
2938 slp_tree node = SLP_INSTANCE_TREE (instance);
2939 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2940 if (dump_enabled_p ())
2941 dump_printf_loc (MSG_NOTE, vect_location,
2942 "removing SLP instance operations starting from: %G",
2943 stmt_info->stmt);
2944 vect_free_slp_instance (instance, false);
2945 BB_VINFO_SLP_INSTANCES (bb_vinfo).ordered_remove (i);
2946 continue;
2947 }
2948
2949 /* Mark all the statements that we want to vectorize as pure SLP and
2950 relevant. */
2951 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
2952 vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));
2953
2954 i++;
2955 }
2956 if (! BB_VINFO_SLP_INSTANCES (bb_vinfo).length ())
2957 {
2958 delete bb_vinfo;
2959 return NULL;
2960 }
2961
2962 if (!vect_slp_analyze_operations (bb_vinfo))
2963 {
2964 if (dump_enabled_p ())
2965 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2966 "not vectorized: bad operation in basic block.\n");
2967
2968 delete bb_vinfo;
2969 return NULL;
2970 }
2971
2972 /* Cost model: check if the vectorization is worthwhile. */
2973 if (!unlimited_cost_model (NULL)
2974 && !vect_bb_vectorization_profitable_p (bb_vinfo))
2975 {
2976 if (dump_enabled_p ())
2977 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2978 "not vectorized: vectorization is not "
2979 "profitable.\n");
2980
2981 delete bb_vinfo;
2982 return NULL;
2983 }
2984
2985 if (dump_enabled_p ())
2986 dump_printf_loc (MSG_NOTE, vect_location,
2987 "Basic block will be vectorized using SLP\n");
2988
2989 return bb_vinfo;
2990 }
2991
2992
2993 /* Main entry for the BB vectorizer. Analyze and transform BB, returns
2994 true if anything in the basic-block was vectorized. */
2995
2996 bool
2997 vect_slp_bb (basic_block bb)
2998 {
2999 bb_vec_info bb_vinfo;
3000 gimple_stmt_iterator gsi;
3001 bool any_vectorized = false;
3002 auto_vector_sizes vector_sizes;
3003
3004 /* Autodetect first vector size we try. */
3005 current_vector_size = 0;
3006 targetm.vectorize.autovectorize_vector_sizes (&vector_sizes);
3007 unsigned int next_size = 0;
3008
3009 gsi = gsi_start_bb (bb);
3010
3011 poly_uint64 autodetected_vector_size = 0;
3012 while (1)
3013 {
3014 if (gsi_end_p (gsi))
3015 break;
3016
3017 gimple_stmt_iterator region_begin = gsi;
3018 vec<data_reference_p> datarefs = vNULL;
3019 int insns = 0;
3020
3021 for (; !gsi_end_p (gsi); gsi_next (&gsi))
3022 {
3023 gimple *stmt = gsi_stmt (gsi);
3024 if (is_gimple_debug (stmt))
3025 continue;
3026 insns++;
3027
3028 if (gimple_location (stmt) != UNKNOWN_LOCATION)
3029 vect_location = stmt;
3030
3031 if (!vect_find_stmt_data_reference (NULL, stmt, &datarefs))
3032 break;
3033 }
3034
3035 /* Skip leading unhandled stmts. */
3036 if (gsi_stmt (region_begin) == gsi_stmt (gsi))
3037 {
3038 gsi_next (&gsi);
3039 continue;
3040 }
3041
3042 gimple_stmt_iterator region_end = gsi;
3043
3044 bool vectorized = false;
3045 bool fatal = false;
3046 vec_info_shared shared;
3047 bb_vinfo = vect_slp_analyze_bb_1 (region_begin, region_end,
3048 datarefs, insns, fatal, &shared);
3049 if (bb_vinfo
3050 && dbg_cnt (vect_slp))
3051 {
3052 if (dump_enabled_p ())
3053 dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB part\n");
3054
3055 bb_vinfo->shared->check_datarefs ();
3056 vect_schedule_slp (bb_vinfo);
3057
3058 unsigned HOST_WIDE_INT bytes;
3059 if (dump_enabled_p ())
3060 {
3061 if (current_vector_size.is_constant (&bytes))
3062 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
3063 "basic block part vectorized using %wu byte "
3064 "vectors\n", bytes);
3065 else
3066 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
3067 "basic block part vectorized using variable "
3068 "length vectors\n");
3069 }
3070
3071 vectorized = true;
3072 }
3073 delete bb_vinfo;
3074
3075 any_vectorized |= vectorized;
3076
3077 if (next_size == 0)
3078 autodetected_vector_size = current_vector_size;
3079
3080 if (next_size < vector_sizes.length ()
3081 && known_eq (vector_sizes[next_size], autodetected_vector_size))
3082 next_size += 1;
3083
3084 if (vectorized
3085 || next_size == vector_sizes.length ()
3086 || known_eq (current_vector_size, 0U)
3087 /* If vect_slp_analyze_bb_1 signaled that analysis for all
3088 vector sizes will fail, do not bother iterating. */
3089 || fatal)
3090 {
3091 if (gsi_end_p (region_end))
3092 break;
3093
3094 /* Skip the unhandled stmt. */
3095 gsi_next (&gsi);
3096
3097 /* And reset vector sizes. */
3098 current_vector_size = 0;
3099 next_size = 0;
3100 }
3101 else
3102 {
3103 /* Try the next biggest vector size. */
3104 current_vector_size = vector_sizes[next_size++];
3105 if (dump_enabled_p ())
3106 {
3107 dump_printf_loc (MSG_NOTE, vect_location,
3108 "***** Re-trying analysis with "
3109 "vector size ");
3110 dump_dec (MSG_NOTE, current_vector_size);
3111 dump_printf (MSG_NOTE, "\n");
3112 }
3113
3114 /* Start over. */
3115 gsi = region_begin;
3116 }
3117 }
3118
3119 return any_vectorized;
3120 }
3121
3122
3123 /* Return true if a constant operand of STMT_VINFO has to be vectorized using a boolean vector type. */
3124
3125 static bool
3126 vect_mask_constant_operand_p (stmt_vec_info stmt_vinfo)
3127 {
3128 enum tree_code code = gimple_expr_code (stmt_vinfo->stmt);
3129 tree op, vectype;
3130 enum vect_def_type dt;
3131
3132 /* For comparisons and COND_EXPRs the type is chosen depending
3133 on the other, non-constant comparison operand. */
3134 if (TREE_CODE_CLASS (code) == tcc_comparison)
3135 {
3136 gassign *stmt = as_a <gassign *> (stmt_vinfo->stmt);
3137 op = gimple_assign_rhs1 (stmt);
3138
3139 if (!vect_is_simple_use (op, stmt_vinfo->vinfo, &dt, &vectype))
3140 gcc_unreachable ();
3141
3142 return !vectype || VECTOR_BOOLEAN_TYPE_P (vectype);
3143 }
3144
3145 if (code == COND_EXPR)
3146 {
3147 gassign *stmt = as_a <gassign *> (stmt_vinfo->stmt);
3148 tree cond = gimple_assign_rhs1 (stmt);
3149
3150 if (TREE_CODE (cond) == SSA_NAME)
3151 op = cond;
3152 else
3153 op = TREE_OPERAND (cond, 0);
3154
3155 if (!vect_is_simple_use (op, stmt_vinfo->vinfo, &dt, &vectype))
3156 gcc_unreachable ();
3157
3158 return !vectype || VECTOR_BOOLEAN_TYPE_P (vectype);
3159 }
3160
3161 return VECTOR_BOOLEAN_TYPE_P (STMT_VINFO_VECTYPE (stmt_vinfo));
3162 }
3163
3164 /* Build a variable-length vector in which the elements in ELTS are repeated
3165 to fill NRESULTS vectors of type VECTOR_TYPE. Store the vectors in
3166 RESULTS and add any new instructions to SEQ.
3167
3168 The approach we use is:
3169
3170 (1) Find a vector mode VM with integer elements of mode IM.
3171
3172 (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
3173 ELTS' has mode IM. This involves creating NELTS' VIEW_CONVERT_EXPRs
3174 from small vectors to IM.
3175
3176 (3) Duplicate each ELTS'[I] into a vector of mode VM.
3177
3178 (4) Use a tree of interleaving VEC_PERM_EXPRs to create VMs with the
3179 correct byte contents.
3180
3181 (5) Use VIEW_CONVERT_EXPR to cast the final VMs to the required type.
3182
3183 We try to find the largest IM for which this sequence works, in order
3184 to cut down on the number of interleaves. */
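/* A worked example (modes and sizes hypothetical): to fill VNx4SI
   vectors with the four 32-bit elements {a, b, c, d}, step (1) might
   pick VM = VNx2DI with IM = DI; step (2) view-converts {a, b} and
   {c, d} into two DImode values; step (3) duplicates each across a
   VNx2DI vector; step (4) interleaves the two vectors so the byte
   pattern a b c d repeats; and step (5) view-converts the result back
   to VNx4SI. */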
3185
3186 void
3187 duplicate_and_interleave (gimple_seq *seq, tree vector_type, vec<tree> elts,
3188 unsigned int nresults, vec<tree> &results)
3189 {
3190 unsigned int nelts = elts.length ();
3191 tree element_type = TREE_TYPE (vector_type);
3192
3193 /* (1) Find a vector mode VM with integer elements of mode IM. */
3194 unsigned int nvectors = 1;
3195 tree new_vector_type;
3196 tree permutes[2];
3197 if (!can_duplicate_and_interleave_p (nelts, TYPE_MODE (element_type),
3198 &nvectors, &new_vector_type,
3199 permutes))
3200 gcc_unreachable ();
3201
3202 /* Get a vector type that holds ELTS[0:NELTS/NELTS']. */
3203 unsigned int partial_nelts = nelts / nvectors;
3204 tree partial_vector_type = build_vector_type (element_type, partial_nelts);
3205
3206 tree_vector_builder partial_elts;
3207 auto_vec<tree, 32> pieces (nvectors * 2);
3208 pieces.quick_grow (nvectors * 2);
3209 for (unsigned int i = 0; i < nvectors; ++i)
3210 {
3211 /* (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
3212 ELTS' has mode IM. */
3213 partial_elts.new_vector (partial_vector_type, partial_nelts, 1);
3214 for (unsigned int j = 0; j < partial_nelts; ++j)
3215 partial_elts.quick_push (elts[i * partial_nelts + j]);
3216 tree t = gimple_build_vector (seq, &partial_elts);
3217 t = gimple_build (seq, VIEW_CONVERT_EXPR,
3218 TREE_TYPE (new_vector_type), t);
3219
3220 /* (3) Duplicate each ELTS'[I] into a vector of mode VM. */
3221 pieces[i] = gimple_build_vector_from_val (seq, new_vector_type, t);
3222 }
3223
3224 /* (4) Use a tree of VEC_PERM_EXPRs to create a single VM with the
3225 correct byte contents.
3226
3227 We need to repeat the following operation log2(nvectors) times:
3228
3229 out[i * 2] = VEC_PERM_EXPR (in[i], in[i + hi_start], lo_permute);
3230 out[i * 2 + 1] = VEC_PERM_EXPR (in[i], in[i + hi_start], hi_permute);
3231
3232 However, if each input repeats every N elements and the VF is
3233 a multiple of N * 2, the HI result is the same as the LO. */
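/* E.g. (illustrative): if an input already repeats every two elements
   and the vectors hold a multiple of four, the high-half interleave
   produces the same byte pattern as the low-half one, so the HI output
   below simply reuses the LO output. */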
3234 unsigned int in_start = 0;
3235 unsigned int out_start = nvectors;
3236 unsigned int hi_start = nvectors / 2;
3237 /* A bound on the number of outputs needed to produce NRESULTS results
3238 in the final iteration. */
3239 unsigned int noutputs_bound = nvectors * nresults;
3240 for (unsigned int in_repeat = 1; in_repeat < nvectors; in_repeat *= 2)
3241 {
3242 noutputs_bound /= 2;
3243 unsigned int limit = MIN (noutputs_bound, nvectors);
3244 for (unsigned int i = 0; i < limit; ++i)
3245 {
3246 if ((i & 1) != 0
3247 && multiple_p (TYPE_VECTOR_SUBPARTS (new_vector_type),
3248 2 * in_repeat))
3249 {
3250 pieces[out_start + i] = pieces[out_start + i - 1];
3251 continue;
3252 }
3253
3254 tree output = make_ssa_name (new_vector_type);
3255 tree input1 = pieces[in_start + (i / 2)];
3256 tree input2 = pieces[in_start + (i / 2) + hi_start];
3257 gassign *stmt = gimple_build_assign (output, VEC_PERM_EXPR,
3258 input1, input2,
3259 permutes[i & 1]);
3260 gimple_seq_add_stmt (seq, stmt);
3261 pieces[out_start + i] = output;
3262 }
3263 std::swap (in_start, out_start);
3264 }
3265
3266 /* (5) Use VIEW_CONVERT_EXPR to cast the final VM to the required type. */
3267 results.reserve (nresults);
3268 for (unsigned int i = 0; i < nresults; ++i)
3269 if (i < nvectors)
3270 results.quick_push (gimple_build (seq, VIEW_CONVERT_EXPR, vector_type,
3271 pieces[in_start + i]));
3272 else
3273 results.quick_push (results[i - nvectors]);
3274 }
3275
3276
3277 /* For constant and loop invariant defs of SLP_NODE this function returns
3278 (vector) defs (VEC_OPRNDS) that will be used in the vectorized stmts.
3279 OP_NUM determines if we gather defs for operand 0 or operand 1 of the RHS of
3280 scalar stmts. NUMBER_OF_VECTORS is the number of vector defs to create. */
3283
3284 static void
3285 vect_get_constant_vectors (tree op, slp_tree slp_node,
3286 vec<tree> *vec_oprnds,
3287 unsigned int op_num, unsigned int number_of_vectors)
3288 {
3289 vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
3290 stmt_vec_info stmt_vinfo = stmts[0];
3291 gimple *stmt = stmt_vinfo->stmt;
3292 unsigned HOST_WIDE_INT nunits;
3293 tree vec_cst;
3294 unsigned j, number_of_places_left_in_vector;
3295 tree vector_type;
3296 tree vop;
3297 int group_size = stmts.length ();
3298 unsigned int vec_num, i;
3299 unsigned number_of_copies = 1;
3300 vec<tree> voprnds;
3301 voprnds.create (number_of_vectors);
3302 bool constant_p, is_store;
3303 tree neutral_op = NULL;
3304 enum tree_code code = gimple_expr_code (stmt);
3305 gimple_seq ctor_seq = NULL;
3306 auto_vec<tree, 16> permute_results;
3307
3308 /* Check if vector type is a boolean vector. */
3309 if (VECT_SCALAR_BOOLEAN_TYPE_P (TREE_TYPE (op))
3310 && vect_mask_constant_operand_p (stmt_vinfo))
3311 vector_type
3312 = build_same_sized_truth_vector_type (STMT_VINFO_VECTYPE (stmt_vinfo));
3313 else
3314 vector_type = get_vectype_for_scalar_type (TREE_TYPE (op));
3315
3316 if (STMT_VINFO_DATA_REF (stmt_vinfo))
3317 {
3318 is_store = true;
3319 op = gimple_assign_rhs1 (stmt);
3320 }
3321 else
3322 is_store = false;
3323
3324 gcc_assert (op);
3325
3326 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
3327 created vectors. It is greater than 1 if unrolling is performed.
3328
3329 For example, we have two scalar operands, s1 and s2 (e.g., group of
3330 strided accesses of size two), while NUNITS is four (i.e., four scalars
3331 of this type can be packed in a vector). The output vector will contain
3332 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
3333 will be 2).
3334
3335 If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
3336 containing the operands.
3337
3338 For example, NUNITS is four as before, and the group size is 8
3339 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
3340 {s5, s6, s7, s8}. */
3341
3342 /* When using duplicate_and_interleave, we just need one element for
3343 each scalar statement. */
3344 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
3345 nunits = group_size;
3346
3347 number_of_copies = nunits * number_of_vectors / group_size;
3348
3349 number_of_places_left_in_vector = nunits;
3350 constant_p = true;
3351 tree_vector_builder elts (vector_type, nunits, 1);
3352 elts.quick_grow (nunits);
3353 bool place_after_defs = false;
3354 for (j = 0; j < number_of_copies; j++)
3355 {
3356 for (i = group_size - 1; stmts.iterate (i, &stmt_vinfo); i--)
3357 {
3358 stmt = stmt_vinfo->stmt;
3359 if (is_store)
3360 op = gimple_assign_rhs1 (stmt);
3361 else
3362 {
3363 switch (code)
3364 {
3365 case COND_EXPR:
3366 {
3367 tree cond = gimple_assign_rhs1 (stmt);
3368 if (TREE_CODE (cond) == SSA_NAME)
3369 op = gimple_op (stmt, op_num + 1);
3370 else if (op_num == 0 || op_num == 1)
3371 op = TREE_OPERAND (cond, op_num);
3372 else
3373 {
3374 if (op_num == 2)
3375 op = gimple_assign_rhs2 (stmt);
3376 else
3377 op = gimple_assign_rhs3 (stmt);
3378 }
3379 }
3380 break;
3381
3382 case CALL_EXPR:
3383 op = gimple_call_arg (stmt, op_num);
3384 break;
3385
3386 case LSHIFT_EXPR:
3387 case RSHIFT_EXPR:
3388 case LROTATE_EXPR:
3389 case RROTATE_EXPR:
3390 op = gimple_op (stmt, op_num + 1);
3391 /* Unlike the other binary operators, shifts/rotates have
3392 the shift count being int, instead of the same type as
3393 the lhs, so make sure the scalar is the right type if
3394 we are dealing with vectors of
3395 long long/long/short/char. */
3396 if (op_num == 1 && TREE_CODE (op) == INTEGER_CST)
3397 op = fold_convert (TREE_TYPE (vector_type), op);
3398 break;
3399
3400 default:
3401 op = gimple_op (stmt, op_num + 1);
3402 break;
3403 }
3404 }
3405
3406 /* Create 'vect_ = {op0,op1,...,opn}'. */
3407 number_of_places_left_in_vector--;
3408 tree orig_op = op;
3409 if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op)))
3410 {
3411 if (CONSTANT_CLASS_P (op))
3412 {
3413 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
3414 {
3415 /* Can't use VIEW_CONVERT_EXPR for booleans because
3416 of possibly different sizes of scalar value and
3417 vector element. */
3418 if (integer_zerop (op))
3419 op = build_int_cst (TREE_TYPE (vector_type), 0);
3420 else if (integer_onep (op))
3421 op = build_all_ones_cst (TREE_TYPE (vector_type));
3422 else
3423 gcc_unreachable ();
3424 }
3425 else
3426 op = fold_unary (VIEW_CONVERT_EXPR,
3427 TREE_TYPE (vector_type), op);
3428 gcc_assert (op && CONSTANT_CLASS_P (op));
3429 }
3430 else
3431 {
3432 tree new_temp = make_ssa_name (TREE_TYPE (vector_type));
3433 gimple *init_stmt;
3434 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
3435 {
3436 tree true_val
3437 = build_all_ones_cst (TREE_TYPE (vector_type));
3438 tree false_val
3439 = build_zero_cst (TREE_TYPE (vector_type));
3440 gcc_assert (INTEGRAL_TYPE_P (TREE_TYPE (op)));
3441 init_stmt = gimple_build_assign (new_temp, COND_EXPR,
3442 op, true_val,
3443 false_val);
3444 }
3445 else
3446 {
3447 op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type),
3448 op);
3449 init_stmt
3450 = gimple_build_assign (new_temp, VIEW_CONVERT_EXPR,
3451 op);
3452 }
3453 gimple_seq_add_stmt (&ctor_seq, init_stmt);
3454 op = new_temp;
3455 }
3456 }
3457 elts[number_of_places_left_in_vector] = op;
3458 if (!CONSTANT_CLASS_P (op))
3459 constant_p = false;
3460 if (TREE_CODE (orig_op) == SSA_NAME
3461 && !SSA_NAME_IS_DEFAULT_DEF (orig_op)
3462 && STMT_VINFO_BB_VINFO (stmt_vinfo)
3463 && (STMT_VINFO_BB_VINFO (stmt_vinfo)->bb
3464 == gimple_bb (SSA_NAME_DEF_STMT (orig_op))))
3465 place_after_defs = true;
3466
3467 if (number_of_places_left_in_vector == 0)
3468 {
3469 if (constant_p
3470 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
3471 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
3472 vec_cst = gimple_build_vector (&ctor_seq, &elts);
3473 else
3474 {
3475 if (vec_oprnds->is_empty ())
3476 duplicate_and_interleave (&ctor_seq, vector_type, elts,
3477 number_of_vectors,
3478 permute_results);
3479 vec_cst = permute_results[number_of_vectors - j - 1];
3480 }
3481 tree init;
3482 gimple_stmt_iterator gsi;
3483 if (place_after_defs)
3484 {
3485 stmt_vec_info last_stmt_info
3486 = vect_find_last_scalar_stmt_in_slp (slp_node);
3487 gsi = gsi_for_stmt (last_stmt_info->stmt);
3488 init = vect_init_vector (stmt_vinfo, vec_cst, vector_type,
3489 &gsi);
3490 }
3491 else
3492 init = vect_init_vector (stmt_vinfo, vec_cst, vector_type,
3493 NULL);
3494 if (ctor_seq != NULL)
3495 {
3496 gsi = gsi_for_stmt (SSA_NAME_DEF_STMT (init));
3497 gsi_insert_seq_before (&gsi, ctor_seq, GSI_SAME_STMT);
3498 ctor_seq = NULL;
3499 }
3500 voprnds.quick_push (init);
3501 place_after_defs = false;
3502 number_of_places_left_in_vector = nunits;
3503 constant_p = true;
3504 elts.new_vector (vector_type, nunits, 1);
3505 elts.quick_grow (nunits);
3506 }
3507 }
3508 }
3509
3510 /* Since the vectors were created in reverse order, put them back in
3511 the correct order here. */
3512 vec_num = voprnds.length ();
3513 for (j = vec_num; j != 0; j--)
3514 {
3515 vop = voprnds[j - 1];
3516 vec_oprnds->quick_push (vop);
3517 }
3518
3519 voprnds.release ();
3520
3521 /* If VF is greater than the unrolling factor needed for the SLP group
3522 of stmts, the NUMBER_OF_VECTORS to be created is greater than
3523 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we
3524 have to replicate the vectors. */
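/* A worked example with assumed numbers: for a group of four scalar
   constants, a V4SI vector type (NUNITS == 4) and NUMBER_OF_VECTORS
   == 2, only one distinct vector can be built from the scalars, so the
   loop below pushes either a NEUTRAL_OP broadcast or the already
   created vector a second time.  */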
3525 while (number_of_vectors > vec_oprnds->length ())
3526 {
3527 tree neutral_vec = NULL;
3528
3529 if (neutral_op)
3530 {
3531 if (!neutral_vec)
3532 neutral_vec = build_vector_from_val (vector_type, neutral_op);
3533
3534 vec_oprnds->quick_push (neutral_vec);
3535 }
3536 else
3537 {
3538 for (i = 0; vec_oprnds->iterate (i, &vop) && i < vec_num; i++)
3539 vec_oprnds->quick_push (vop);
3540 }
3541 }
3542 }
3543
3544
3545 /* Get the vectorized definitions from SLP_NODE, which contains the
3546 corresponding vectorized def-stmts. */
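/* E.g. (illustrative GIMPLE, names assumed): if SLP_NODE's vector
   stmts are
     vect__1 = MEM[base];
     vect__2 = MEM[base + 16];
   this pushes vect__1 and vect__2 (the stmt lhs's, or PHI results for
   PHI nodes) onto VEC_OPRNDS.  */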
3547
3548 static void
3549 vect_get_slp_vect_defs (slp_tree slp_node, vec<tree> *vec_oprnds)
3550 {
3551 tree vec_oprnd;
3552 stmt_vec_info vec_def_stmt_info;
3553 unsigned int i;
3554
3555 gcc_assert (SLP_TREE_VEC_STMTS (slp_node).exists ());
3556
3557 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (slp_node), i, vec_def_stmt_info)
3558 {
3559 gcc_assert (vec_def_stmt_info);
3560 if (gphi *vec_def_phi = dyn_cast <gphi *> (vec_def_stmt_info->stmt))
3561 vec_oprnd = gimple_phi_result (vec_def_phi);
3562 else
3563 vec_oprnd = gimple_get_lhs (vec_def_stmt_info->stmt);
3564 vec_oprnds->quick_push (vec_oprnd);
3565 }
3566 }
3567
3568
3569 /* Get vectorized definitions for SLP_NODE.
3570 If the scalar definitions are loop invariants or constants, collect them
3571 and call vect_get_constant_vectors () to create vector stmts.
3572 Otherwise, the def-stmts must already be vectorized, with the vectorized
3573 stmts stored in the corresponding child of SLP_NODE, in which case we
3574 call vect_get_slp_vect_defs () to retrieve them. */
3575
3576 void
3577 vect_get_slp_defs (vec<tree> ops, slp_tree slp_node,
3578 vec<vec<tree> > *vec_oprnds)
3579 {
3580 int number_of_vects = 0, i;
3581 unsigned int child_index = 0;
3582 HOST_WIDE_INT lhs_size_unit, rhs_size_unit;
3583 slp_tree child = NULL;
3584 vec<tree> vec_defs;
3585 tree oprnd;
3586 bool vectorized_defs;
3587
3588 stmt_vec_info first_stmt_info = SLP_TREE_SCALAR_STMTS (slp_node)[0];
3589 FOR_EACH_VEC_ELT (ops, i, oprnd)
3590 {
3591 /* For each operand we check whether it has vectorized definitions in a
3592 child node or whether we need to create them (for invariants and
3593 constants). We check if the LHS of the first stmt of the next child
3594 matches OPRND. If it does, we found the correct child. Otherwise, we
3595 call vect_get_constant_vectors () and do not advance CHILD_INDEX, so
3596 that this child node is checked again for the next operand. */
3597 vectorized_defs = false;
3598 if (SLP_TREE_CHILDREN (slp_node).length () > child_index)
3599 {
3600 child = SLP_TREE_CHILDREN (slp_node)[child_index];
3601
3602 /* We have to check both pattern and original def, if available. */
3603 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
3604 {
3605 stmt_vec_info first_def_info = SLP_TREE_SCALAR_STMTS (child)[0];
3606 stmt_vec_info related = STMT_VINFO_RELATED_STMT (first_def_info);
3607 tree first_def_op;
3608
3609 if (gphi *first_def = dyn_cast <gphi *> (first_def_info->stmt))
3610 first_def_op = gimple_phi_result (first_def);
3611 else
3612 first_def_op = gimple_get_lhs (first_def_info->stmt);
3613 if (operand_equal_p (oprnd, first_def_op, 0)
3614 || (related
3615 && operand_equal_p (oprnd,
3616 gimple_get_lhs (related->stmt), 0)))
3617 {
3618 /* The number of vector defs is determined by the number of
3619 vector statements in the node from which we get those
3620 statements. */
3621 number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (child);
3622 vectorized_defs = true;
3623 child_index++;
3624 }
3625 }
3626 else
3627 child_index++;
3628 }
3629
3630 if (!vectorized_defs)
3631 {
3632 if (i == 0)
3633 {
3634 number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
3635 /* The number of vector stmts was calculated according to the LHS in
3636 vect_schedule_slp_instance (); fix it by replacing the LHS with
3637 the RHS, if necessary. See vect_get_smallest_scalar_type () for
3638 details. */
3639 vect_get_smallest_scalar_type (first_stmt_info, &lhs_size_unit,
3640 &rhs_size_unit);
3641 if (rhs_size_unit != lhs_size_unit)
3642 {
3643 number_of_vects *= rhs_size_unit;
3644 number_of_vects /= lhs_size_unit;
3645 }
3646 }
3647 }
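/* A worked example with assumed types: for a group of conversion
   stmts short_5 = (short) int_7 with a V8HI LHS vector type, one
   vector stmt covers eight shorts, but the eight int operands need
   two V4SI vectors; with lhs_size_unit == 2 and rhs_size_unit == 4
   the scaling above doubles NUMBER_OF_VECTS.  */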
3648
3649 /* Allocate memory for vectorized defs. */
3650 vec_defs = vNULL;
3651 vec_defs.create (number_of_vects);
3652
3653 /* For reduction defs we call vect_get_constant_vectors (), since we are
3654 looking for initial loop invariant values. */
3655 if (vectorized_defs)
3656 /* The defs are already vectorized. */
3657 vect_get_slp_vect_defs (child, &vec_defs);
3658 else
3659 /* Build vectors from scalar defs. */
3660 vect_get_constant_vectors (oprnd, slp_node, &vec_defs, i,
3661 number_of_vects);
3662
3663 vec_oprnds->quick_push (vec_defs);
3664 }
3665 }
3666
3667 /* Generate vector permute statements from a list of loads in DR_CHAIN.
3668 If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
3669 permute statements for the SLP node NODE of the SLP instance
3670 SLP_NODE_INSTANCE. */
3671
3672 bool
3673 vect_transform_slp_perm_load (slp_tree node, vec<tree> dr_chain,
3674 gimple_stmt_iterator *gsi, poly_uint64 vf,
3675 slp_instance slp_node_instance, bool analyze_only,
3676 unsigned *n_perms)
3677 {
3678 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
3679 vec_info *vinfo = stmt_info->vinfo;
3680 int vec_index = 0;
3681 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3682 unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_node_instance);
3683 unsigned int mask_element;
3684 machine_mode mode;
3685
3686 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
3687 return false;
3688
3689 stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
3690
3691 mode = TYPE_MODE (vectype);
3692 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
3693
3694 /* Initialize the vector stmts of NODE so that the generated stmts can
3695 be stored at their proper positions later. */
3696 if (! analyze_only)
3697 for (unsigned i = SLP_TREE_VEC_STMTS (node).length ();
3698 i < SLP_TREE_NUMBER_OF_VEC_STMTS (node); i++)
3699 SLP_TREE_VEC_STMTS (node).quick_push (NULL);
3700
3701 /* Generate permutation masks for every NODE. The number of masks for
3702 each NODE is equal to GROUP_SIZE.
3703 E.g., we have a group of three nodes with three loads from the same
3704 location in each node, and the vector size is 4. I.e., we have an
3705 a0b0c0a1b1c1... sequence and we need to create the following vectors:
3706 for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
3707 for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
3708 ...
3709
3710 The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
3711 The last mask is illegal since the permute operation takes only two
3712 operands, and mask element values cannot index outside of them.
3713 Hence, the last mask must be converted into {2,5,5,5}.
3714 For the first two permutations we need the first and the second input
3715 vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
3716 we need the second and the third vectors: {b1,c1,a2,b2} and
3717 {c2,a3,b3,c3}. */
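/* A worked instance of the index arithmetic below, with an assumed
   NUNITS of 4: scalar index i == 9 gives vec_index == 9 / 4 == 2 and
   mask_element == 9 % 4 == 1; vector 2 is the permute's second input,
   so mask_element is biased by NUNITS to 5.  This is how {6,9,9,9}
   above becomes {2,5,5,5}.  */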
3718
3719 int vect_stmts_counter = 0;
3720 unsigned int index = 0;
3721 int first_vec_index = -1;
3722 int second_vec_index = -1;
3723 bool noop_p = true;
3724 *n_perms = 0;
3725
3726 vec_perm_builder mask;
3727 unsigned int nelts_to_build;
3728 unsigned int nvectors_per_build;
3729 bool repeating_p = (group_size == DR_GROUP_SIZE (stmt_info)
3730 && multiple_p (nunits, group_size));
3731 if (repeating_p)
3732 {
3733 /* A single vector contains a whole number of copies of the node, so:
3734 (a) all permutes can use the same mask; and
3735 (b) the permutes only need a single vector input. */
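/* E.g. (illustrative): with GROUP_SIZE == 2, a load permutation of
   {1,0} and a V8SI vector, every vector stmt can use the selector
   {1,0,3,2,5,4,7,6}, which swaps adjacent lanes and never indexes
   a second input vector.  */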
3736 mask.new_vector (nunits, group_size, 3);
3737 nelts_to_build = mask.encoded_nelts ();
3738 nvectors_per_build = SLP_TREE_VEC_STMTS (node).length ();
3739 }
3740 else
3741 {
3742 /* We need to construct a separate mask for each vector statement. */
3743 unsigned HOST_WIDE_INT const_nunits, const_vf;
3744 if (!nunits.is_constant (&const_nunits)
3745 || !vf.is_constant (&const_vf))
3746 return false;
3747 mask.new_vector (const_nunits, const_nunits, 1);
3748 nelts_to_build = const_vf * group_size;
3749 nvectors_per_build = 1;
3750 }
3751
3752 unsigned int count = mask.encoded_nelts ();
3753 mask.quick_grow (count);
3754 vec_perm_indices indices;
3755
3756 for (unsigned int j = 0; j < nelts_to_build; j++)
3757 {
3758 unsigned int iter_num = j / group_size;
3759 unsigned int stmt_num = j % group_size;
3760 unsigned int i = (iter_num * DR_GROUP_SIZE (stmt_info)
3761 + SLP_TREE_LOAD_PERMUTATION (node)[stmt_num]);
3762 if (repeating_p)
3763 {
3764 first_vec_index = 0;
3765 mask_element = i;
3766 }
3767 else
3768 {
3769 /* Enforced before the loop when !repeating_p. */
3770 unsigned int const_nunits = nunits.to_constant ();
3771 vec_index = i / const_nunits;
3772 mask_element = i % const_nunits;
3773 if (vec_index == first_vec_index
3774 || first_vec_index == -1)
3775 {
3776 first_vec_index = vec_index;
3777 }
3778 else if (vec_index == second_vec_index
3779 || second_vec_index == -1)
3780 {
3781 second_vec_index = vec_index;
3782 mask_element += const_nunits;
3783 }
3784 else
3785 {
3786 if (dump_enabled_p ())
3787 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3788 "permutation requires at "
3789 "least three vectors %G",
3790 stmt_info->stmt);
3791 gcc_assert (analyze_only);
3792 return false;
3793 }
3794
3795 gcc_assert (mask_element < 2 * const_nunits);
3796 }
3797
3798 if (mask_element != index)
3799 noop_p = false;
3800 mask[index++] = mask_element;
3801
3802 if (index == count && !noop_p)
3803 {
3804 indices.new_vector (mask, second_vec_index == -1 ? 1 : 2, nunits);
3805 if (!can_vec_perm_const_p (mode, indices))
3806 {
3807 if (dump_enabled_p ())
3808 {
3809 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
3810 vect_location,
3811 "unsupported vect permute { ");
3812 for (i = 0; i < count; ++i)
3813 {
3814 dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
3815 dump_printf (MSG_MISSED_OPTIMIZATION, " ");
3816 }
3817 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
3818 }
3819 gcc_assert (analyze_only);
3820 return false;
3821 }
3822
3823 ++*n_perms;
3824 }
3825
3826 if (index == count)
3827 {
3828 if (!analyze_only)
3829 {
3830 tree mask_vec = NULL_TREE;
3831
3832 if (! noop_p)
3833 mask_vec = vect_gen_perm_mask_checked (vectype, indices);
3834
3835 if (second_vec_index == -1)
3836 second_vec_index = first_vec_index;
3837
3838 for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
3839 {
3840 /* Generate the permute statement if necessary. */
3841 tree first_vec = dr_chain[first_vec_index + ri];
3842 tree second_vec = dr_chain[second_vec_index + ri];
3843 stmt_vec_info perm_stmt_info;
3844 if (! noop_p)
3845 {
3846 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
3847 tree perm_dest
3848 = vect_create_destination_var (gimple_assign_lhs (stmt),
3849 vectype);
3850 perm_dest = make_ssa_name (perm_dest);
3851 gassign *perm_stmt
3852 = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
3853 first_vec, second_vec,
3854 mask_vec);
3855 perm_stmt_info
3856 = vect_finish_stmt_generation (stmt_info, perm_stmt,
3857 gsi);
3858 }
3859 else
3860 /* If the mask was NULL_TREE, generate the requested
3861 identity transform. */
3862 perm_stmt_info = vinfo->lookup_def (first_vec);
3863
3864 /* Store the vector statement in NODE. */
3865 SLP_TREE_VEC_STMTS (node)[vect_stmts_counter++]
3866 = perm_stmt_info;
3867 }
3868 }
3869
3870 index = 0;
3871 first_vec_index = -1;
3872 second_vec_index = -1;
3873 noop_p = true;
3874 }
3875 }
3876
3877 return true;
3878 }
3879
3880 /* Vectorize SLP instance tree in postorder. */
3881
3882 static void
3883 vect_schedule_slp_instance (slp_tree node, slp_instance instance,
3884 scalar_stmts_to_slp_tree_map_t *bst_map)
3885 {
3886 gimple_stmt_iterator si;
3887 stmt_vec_info stmt_info;
3888 unsigned int group_size;
3889 tree vectype;
3890 int i, j;
3891 slp_tree child;
3892
3893 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
3894 return;
3895
3896 /* See if we have already vectorized the node in the graph of the
3897 SLP instance. */
3898 if (SLP_TREE_VEC_STMTS (node).exists ())
3899 return;
3900
3901 /* See if we have already vectorized the same set of stmts and, if so,
3902 reuse their vectorized stmts across instances. */
3903 if (slp_tree *leader = bst_map->get (SLP_TREE_SCALAR_STMTS (node)))
3904 {
3905 SLP_TREE_VEC_STMTS (node).safe_splice (SLP_TREE_VEC_STMTS (*leader));
3906 return;
3907 }
3908
3909 bst_map->put (SLP_TREE_SCALAR_STMTS (node).copy (), node);
3910 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3911 vect_schedule_slp_instance (child, instance, bst_map);
3912
3913 /* Push SLP node def-type to stmts. */
3914 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3915 if (SLP_TREE_DEF_TYPE (child) != vect_internal_def)
3916 {
3917 stmt_vec_info child_stmt_info;
3918 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (child), j, child_stmt_info)
3919 STMT_VINFO_DEF_TYPE (child_stmt_info) = SLP_TREE_DEF_TYPE (child);
3920 }
3921
3922 stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
3923
3924 /* VECTYPE is the type of the destination. */
3925 vectype = STMT_VINFO_VECTYPE (stmt_info);
3926 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
3927 group_size = SLP_INSTANCE_GROUP_SIZE (instance);
3928
3929 gcc_assert (SLP_TREE_NUMBER_OF_VEC_STMTS (node) != 0);
3930 SLP_TREE_VEC_STMTS (node).create (SLP_TREE_NUMBER_OF_VEC_STMTS (node));
3931
3932 if (dump_enabled_p ())
3933 dump_printf_loc (MSG_NOTE, vect_location,
3934 "------>vectorizing SLP node starting from: %G",
3935 stmt_info->stmt);
3936
3937 /* Vectorized stmts go before the last scalar stmt, which is where
3938 all uses are ready. */
3939 stmt_vec_info last_stmt_info = vect_find_last_scalar_stmt_in_slp (node);
3940 si = gsi_for_stmt (last_stmt_info->stmt);
3941
3942 /* Mark the first element of the reduction chain as a reduction to
3943 properly transform the node. In the analysis phase only the last
3944 element of the chain is marked as a reduction. */
3945 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
3946 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
3947 && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info)
3948 {
3949 STMT_VINFO_DEF_TYPE (stmt_info) = vect_reduction_def;
3950 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
3951 }
3952
3953 /* Handle two-operation SLP nodes by vectorizing the group with
3954 both operations and then performing a merge. */
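/* A sketch with assumed lanes: for the scalar group
     { a0 + b0, a1 - b1, a2 + b2, a3 - b3 }
   and V4SI vectors, CODE0 is PLUS_EXPR, OCODE is MINUS_EXPR and MASK
   is {0,1,0,1}; the node is vectorized once as all-plus (v0) and once
   as all-minus (v1), and the selector built below is {0,5,2,7}, i.e.
   a VEC_PERM_EXPR taking even lanes from v0 and odd lanes from v1.  */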
3955 if (SLP_TREE_TWO_OPERATORS (node))
3956 {
3957 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
3958 enum tree_code code0 = gimple_assign_rhs_code (stmt);
3959 enum tree_code ocode = ERROR_MARK;
3960 stmt_vec_info ostmt_info;
3961 vec_perm_builder mask (group_size, group_size, 1);
3962 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, ostmt_info)
3963 {
3964 gassign *ostmt = as_a <gassign *> (ostmt_info->stmt);
3965 if (gimple_assign_rhs_code (ostmt) != code0)
3966 {
3967 mask.quick_push (1);
3968 ocode = gimple_assign_rhs_code (ostmt);
3969 }
3970 else
3971 mask.quick_push (0);
3972 }
3973 if (ocode != ERROR_MARK)
3974 {
3975 vec<stmt_vec_info> v0;
3976 vec<stmt_vec_info> v1;
3977 unsigned j;
3978 tree tmask = NULL_TREE;
3979 vect_transform_stmt (stmt_info, &si, node, instance);
3980 v0 = SLP_TREE_VEC_STMTS (node).copy ();
3981 SLP_TREE_VEC_STMTS (node).truncate (0);
3982 gimple_assign_set_rhs_code (stmt, ocode);
3983 vect_transform_stmt (stmt_info, &si, node, instance);
3984 gimple_assign_set_rhs_code (stmt, code0);
3985 v1 = SLP_TREE_VEC_STMTS (node).copy ();
3986 SLP_TREE_VEC_STMTS (node).truncate (0);
3987 tree meltype = build_nonstandard_integer_type
3988 (GET_MODE_BITSIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype))), 1);
3989 tree mvectype = get_same_sized_vectype (meltype, vectype);
3990 unsigned k = 0, l;
3991 for (j = 0; j < v0.length (); ++j)
3992 {
3993 /* Enforced by vect_build_slp_tree, which rejects variable-length
3994 vectors for SLP_TREE_TWO_OPERATORS. */
3995 unsigned int const_nunits = nunits.to_constant ();
3996 tree_vector_builder melts (mvectype, const_nunits, 1);
3997 for (l = 0; l < const_nunits; ++l)
3998 {
3999 if (k >= group_size)
4000 k = 0;
4001 tree t = build_int_cst (meltype,
4002 mask[k++] * const_nunits + l);
4003 melts.quick_push (t);
4004 }
4005 tmask = melts.build ();
4006
4007 /* ??? Not all targets support a VEC_PERM_EXPR with a
4008 constant mask that would translate to a vec_merge RTX
4009 (with their vec_perm_const_ok). We can either not
4010 vectorize in that case or let veclower do its job.
4011 Unfortunately that isn't too great and at least for
4012 plus/minus we'd eventually like to match targets'
4013 vector addsub instructions. */
4014 gimple *vstmt;
4015 vstmt = gimple_build_assign (make_ssa_name (vectype),
4016 VEC_PERM_EXPR,
4017 gimple_assign_lhs (v0[j]->stmt),
4018 gimple_assign_lhs (v1[j]->stmt),
4019 tmask);
4020 SLP_TREE_VEC_STMTS (node).quick_push
4021 (vect_finish_stmt_generation (stmt_info, vstmt, &si));
4022 }
4023 v0.release ();
4024 v1.release ();
4025 return;
4026 }
4027 }
4028 vect_transform_stmt (stmt_info, &si, node, instance);
4029
4030 /* Restore stmt def-types. */
4031 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4032 if (SLP_TREE_DEF_TYPE (child) != vect_internal_def)
4033 {
4034 stmt_vec_info child_stmt_info;
4035 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (child), j, child_stmt_info)
4036 STMT_VINFO_DEF_TYPE (child_stmt_info) = vect_internal_def;
4037 }
4038 }
4039
4040 /* Replace scalar calls from SLP node NODE by setting their lhs to zero.
4041 For loop vectorization this is done in vectorizable_call, but for SLP
4042 it needs to be deferred until the end of vect_schedule_slp, because
4043 multiple SLP instances may refer to the same scalar stmt. */
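/* For example (illustrative GIMPLE): a fully SLP-vectorized call
     x_1 = sqrtf (y_2);
   is rewritten as
     x_1 = 0.0;
   leaving a trivially dead stmt for later cleanup.  */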
4044
4045 static void
4046 vect_remove_slp_scalar_calls (slp_tree node, hash_set<slp_tree> &visited)
4047 {
4048 gimple *new_stmt;
4049 gimple_stmt_iterator gsi;
4050 int i;
4051 slp_tree child;
4052 tree lhs;
4053 stmt_vec_info stmt_info;
4054
4055 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
4056 return;
4057
4058 if (visited.add (node))
4059 return;
4060
4061 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4062 vect_remove_slp_scalar_calls (child, visited);
4063
4064 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
4065 {
4066 gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt);
4067 if (!stmt || gimple_bb (stmt) == NULL)
4068 continue;
4069 if (is_pattern_stmt_p (stmt_info)
4070 || !PURE_SLP_STMT (stmt_info))
4071 continue;
4072 lhs = gimple_call_lhs (stmt);
4073 new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
4074 gsi = gsi_for_stmt (stmt);
4075 stmt_info->vinfo->replace_stmt (&gsi, stmt_info, new_stmt);
4076 SSA_NAME_DEF_STMT (gimple_assign_lhs (new_stmt)) = new_stmt;
4077 }
4078 }
4079
4080 static void
4081 vect_remove_slp_scalar_calls (slp_tree node)
4082 {
4083 hash_set<slp_tree> visited;
4084 vect_remove_slp_scalar_calls (node, visited);
4085 }
4086
4087 /* Generate vector code for all SLP instances in the loop/basic block. */
4088
4089 void
4090 vect_schedule_slp (vec_info *vinfo)
4091 {
4092 vec<slp_instance> slp_instances;
4093 slp_instance instance;
4094 unsigned int i;
4095
4096 scalar_stmts_to_slp_tree_map_t *bst_map
4097 = new scalar_stmts_to_slp_tree_map_t ();
4098 slp_instances = vinfo->slp_instances;
4099 FOR_EACH_VEC_ELT (slp_instances, i, instance)
4100 {
4101 /* Schedule the tree of INSTANCE. */
4102 vect_schedule_slp_instance (SLP_INSTANCE_TREE (instance),
4103 instance, bst_map);
4104 if (dump_enabled_p ())
4105 dump_printf_loc (MSG_NOTE, vect_location,
4106 "vectorizing stmts using SLP.\n");
4107 }
4108 delete bst_map;
4109
4110 FOR_EACH_VEC_ELT (slp_instances, i, instance)
4111 {
4112 slp_tree root = SLP_INSTANCE_TREE (instance);
4113 stmt_vec_info store_info;
4114 unsigned int j;
4115
4116 /* Remove scalar call stmts. Do not do this for basic-block
4117 vectorization as not all uses may be vectorized.
4118 ??? Why should this be necessary? DCE should be able to
4119 remove the stmts itself.
4120 ??? For BB vectorization we could just as well remove
4121 scalar stmts starting from the SLP tree root if they
4122 have no uses. */
4123 if (is_a <loop_vec_info> (vinfo))
4124 vect_remove_slp_scalar_calls (root);
4125
4126 for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store_info)
4127 && j < SLP_INSTANCE_GROUP_SIZE (instance); j++)
4128 {
4129 if (!STMT_VINFO_DATA_REF (store_info))
4130 break;
4131
4132 store_info = vect_orig_stmt (store_info);
4133 /* Free the attached stmt_vec_info and remove the stmt. */
4134 vinfo->remove_stmt (store_info);
4135 }
4136 }
4137 }