/* SLP - Basic Block Vectorization
   Copyright (C) 2007-2018 Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com>
   and Ira Rosen <irar@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "target.h"
#include "rtl.h"
#include "tree.h"
#include "gimple.h"
#include "tree-pass.h"
#include "ssa.h"
#include "optabs-tree.h"
#include "insn-config.h"
#include "recog.h"  /* FIXME: for insn_data */
#include "params.h"
#include "fold-const.h"
#include "stor-layout.h"
#include "gimple-iterator.h"
#include "cfgloop.h"
#include "tree-vectorizer.h"
#include "langhooks.h"
#include "gimple-walk.h"
#include "dbgcnt.h"
#include "tree-vector-builder.h"
#include "vec-perm-indices.h"
#include "gimple-fold.h"
#include "internal-fn.h"


/* Recursively free the memory allocated for the SLP tree rooted at NODE.
   FINAL_P is true if we have vectorized the instance or if we have
   made a final decision not to vectorize the statements in any way.  */

static void
vect_free_slp_tree (slp_tree node, bool final_p)
{
  int i;
  slp_tree child;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_free_slp_tree (child, final_p);

  /* Don't update STMT_VINFO_NUM_SLP_USES if it isn't relevant.
     Some statements might no longer exist, after having been
     removed by vect_transform_stmt.  Updating the remaining
     statements would be redundant.  */
  if (!final_p)
    {
      gimple *stmt;
      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
        {
          gcc_assert (STMT_VINFO_NUM_SLP_USES (vinfo_for_stmt (stmt)) > 0);
          STMT_VINFO_NUM_SLP_USES (vinfo_for_stmt (stmt))--;
        }
    }

  SLP_TREE_CHILDREN (node).release ();
  SLP_TREE_SCALAR_STMTS (node).release ();
  SLP_TREE_VEC_STMTS (node).release ();
  SLP_TREE_LOAD_PERMUTATION (node).release ();

  free (node);
}


/* Free the memory allocated for the SLP instance.  FINAL_P is true if we
   have vectorized the instance or if we have made a final decision not
   to vectorize the statements in any way.  */

void
vect_free_slp_instance (slp_instance instance, bool final_p)
{
  vect_free_slp_tree (SLP_INSTANCE_TREE (instance), final_p);
  SLP_INSTANCE_LOADS (instance).release ();
  free (instance);
}


/* Create an SLP node for SCALAR_STMTS.  */

static slp_tree
vect_create_new_slp_node (vec<gimple *> scalar_stmts)
{
  slp_tree node;
  gimple *stmt = scalar_stmts[0];
  unsigned int nops;

  if (is_gimple_call (stmt))
    nops = gimple_call_num_args (stmt);
  else if (is_gimple_assign (stmt))
    {
      nops = gimple_num_ops (stmt) - 1;
      if (gimple_assign_rhs_code (stmt) == COND_EXPR)
        nops++;
    }
  else if (gimple_code (stmt) == GIMPLE_PHI)
    nops = 0;
  else
    return NULL;

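  /* At this point NOPS is the number of operands and hence of child
     nodes; e.g. for the assignment lhs = (a < b) ? c : d we get
     gimple_num_ops () == 4 and so NOPS == 3 + 1 == 4, since the
     embedded comparison contributes A and B as separate operands in
     addition to the two arms C and D.  */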
  node = XNEW (struct _slp_tree);
  SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
  SLP_TREE_VEC_STMTS (node).create (0);
  SLP_TREE_NUMBER_OF_VEC_STMTS (node) = 0;
  SLP_TREE_CHILDREN (node).create (nops);
  SLP_TREE_LOAD_PERMUTATION (node) = vNULL;
  SLP_TREE_TWO_OPERATORS (node) = false;
  SLP_TREE_DEF_TYPE (node) = vect_internal_def;

  unsigned i;
  FOR_EACH_VEC_ELT (scalar_stmts, i, stmt)
    STMT_VINFO_NUM_SLP_USES (vinfo_for_stmt (stmt))++;

  return node;
}


/* This structure is used in creation of an SLP tree.  Each instance
   corresponds to the same operand in a group of scalar stmts in an SLP
   node.  */
typedef struct _slp_oprnd_info
{
  /* Def-stmts for the operands.  */
  vec<gimple *> def_stmts;
  /* Information about the first statement, its vector def-type, type, the
     operand itself in case it's constant, and an indication if it's a pattern
     stmt.  */
  tree first_op_type;
  enum vect_def_type first_dt;
  bool first_pattern;
  bool second_pattern;
} *slp_oprnd_info;


/* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
   operand.  */
static vec<slp_oprnd_info>
vect_create_oprnd_info (int nops, int group_size)
{
  int i;
  slp_oprnd_info oprnd_info;
  vec<slp_oprnd_info> oprnds_info;

  oprnds_info.create (nops);
  for (i = 0; i < nops; i++)
    {
      oprnd_info = XNEW (struct _slp_oprnd_info);
      oprnd_info->def_stmts.create (group_size);
      oprnd_info->first_dt = vect_uninitialized_def;
      oprnd_info->first_op_type = NULL_TREE;
      oprnd_info->first_pattern = false;
      oprnd_info->second_pattern = false;
      oprnds_info.quick_push (oprnd_info);
    }

  return oprnds_info;
}


/* Free operands info.  */

static void
vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
{
  int i;
  slp_oprnd_info oprnd_info;

  FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
    {
      oprnd_info->def_stmts.release ();
      XDELETE (oprnd_info);
    }

  oprnds_info.release ();
}


/* Find the place of the data-ref in STMT in the interleaving chain that starts
   from FIRST_STMT.  Return -1 if the data-ref is not a part of the chain.  */

int
vect_get_place_in_interleaving_chain (gimple *stmt, gimple *first_stmt)
{
  gimple *next_stmt = first_stmt;
  int result = 0;

  if (first_stmt != DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
    return -1;

  do
    {
      if (next_stmt == stmt)
        return result;
      next_stmt = DR_GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
      if (next_stmt)
        result += DR_GROUP_GAP (vinfo_for_stmt (next_stmt));
    }
  while (next_stmt);

  return -1;
}

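/* Illustrative sketch: for a chain loading a[0], a[1] and a[3] (the
   element a[3] carrying a DR_GROUP_GAP of 2, accounting for the skipped
   a[2]), asking for the place of the stmt loading a[3] accumulates
   0 + 1 + 2 == 3, its index within the chain's full footprint.  */
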
/* Check whether it is possible to load COUNT elements of type ELT_MODE
   using the method implemented by duplicate_and_interleave.  Return true
   if so, returning the number of intermediate vectors in *NVECTORS_OUT
   (if nonnull) and the type of each intermediate vector in *VECTOR_TYPE_OUT
   (if nonnull).  */

bool
can_duplicate_and_interleave_p (unsigned int count, machine_mode elt_mode,
                                unsigned int *nvectors_out,
                                tree *vector_type_out,
                                tree *permutes)
{
  poly_int64 elt_bytes = count * GET_MODE_SIZE (elt_mode);
  poly_int64 nelts;
  unsigned int nvectors = 1;
  for (;;)
    {
      scalar_int_mode int_mode;
      poly_int64 elt_bits = elt_bytes * BITS_PER_UNIT;
      if (multiple_p (current_vector_size, elt_bytes, &nelts)
          && int_mode_for_size (elt_bits, 0).exists (&int_mode))
        {
          tree int_type = build_nonstandard_integer_type
            (GET_MODE_BITSIZE (int_mode), 1);
          tree vector_type = build_vector_type (int_type, nelts);
          if (VECTOR_MODE_P (TYPE_MODE (vector_type)))
            {
              vec_perm_builder sel1 (nelts, 2, 3);
              vec_perm_builder sel2 (nelts, 2, 3);
              poly_int64 half_nelts = exact_div (nelts, 2);
              for (unsigned int i = 0; i < 3; ++i)
                {
                  sel1.quick_push (i);
                  sel1.quick_push (i + nelts);
                  sel2.quick_push (half_nelts + i);
                  sel2.quick_push (half_nelts + i + nelts);
                }
              vec_perm_indices indices1 (sel1, 2, nelts);
              vec_perm_indices indices2 (sel2, 2, nelts);
              if (can_vec_perm_const_p (TYPE_MODE (vector_type), indices1)
                  && can_vec_perm_const_p (TYPE_MODE (vector_type), indices2))
                {
                  if (nvectors_out)
                    *nvectors_out = nvectors;
                  if (vector_type_out)
                    *vector_type_out = vector_type;
                  if (permutes)
                    {
                      permutes[0] = vect_gen_perm_mask_checked (vector_type,
                                                                indices1);
                      permutes[1] = vect_gen_perm_mask_checked (vector_type,
                                                                indices2);
                    }
                  return true;
                }
            }
        }
      if (!multiple_p (elt_bytes, 2, &elt_bytes))
        return false;
      nvectors *= 2;
    }
}

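/* As a worked example (assuming NELTS == 8), the two masks built above
   encode the low and high interleave of two input vectors:
     sel1 = { 0, 8, 1, 9, 2, 10, ... }
     sel2 = { 4, 12, 5, 13, 6, 14, ... }
   each expressed as two interleaved series of three explicit elements,
   which vec_perm_builder extends to the full vector length.  */
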
/* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
   they are of a valid type and that they match the defs of the first stmt of
   the SLP group (stored in OPRNDS_INFO).  This function tries to match stmts
   by swapping operands of STMTS[STMT_NUM] when possible.  Non-zero *SWAP
   indicates that a swap is required for cond_expr stmts.  Specifically, *SWAP
   is 1 if STMT is a cond and the operands of the comparison need to be
   swapped; *SWAP is 2 if STMT is a cond and the code of the comparison needs
   to be inverted.  If this function swaps any operands, *SWAP is set to a
   non-zero value.
   If there was a fatal error, return -1; if the error could be corrected by
   swapping operands of the parent node of this one, return 1; if everything
   is OK, return 0.  */
static int
vect_get_and_check_slp_defs (vec_info *vinfo, unsigned char *swap,
                             vec<gimple *> stmts, unsigned stmt_num,
                             vec<slp_oprnd_info> *oprnds_info)
{
  gimple *stmt = stmts[stmt_num];
  tree oprnd;
  unsigned int i, number_of_oprnds;
  gimple *def_stmt;
  enum vect_def_type dt = vect_uninitialized_def;
  bool pattern = false;
  slp_oprnd_info oprnd_info;
  int first_op_idx = 1;
  bool commutative = false;
  bool first_op_cond = false;
  bool first = stmt_num == 0;
  bool second = stmt_num == 1;

  if (is_gimple_call (stmt))
    {
      number_of_oprnds = gimple_call_num_args (stmt);
      first_op_idx = 3;
    }
  else if (is_gimple_assign (stmt))
    {
      enum tree_code code = gimple_assign_rhs_code (stmt);
      number_of_oprnds = gimple_num_ops (stmt) - 1;
      /* Swap can only be done for cond_expr if asked to, otherwise we
         could end up with a different comparison code from the first
         stmt.  */
      if (gimple_assign_rhs_code (stmt) == COND_EXPR
          && COMPARISON_CLASS_P (gimple_assign_rhs1 (stmt)))
        {
          first_op_cond = true;
          number_of_oprnds++;
        }
      else
        commutative = commutative_tree_code (code);
    }
  else
    return -1;

  bool swapped = (*swap != 0);
  gcc_assert (!swapped || first_op_cond);
  for (i = 0; i < number_of_oprnds; i++)
    {
    again:
      if (first_op_cond)
        {
          /* Map indicating how operands of cond_expr should be swapped.  */
          int maps[3][4] = {{0, 1, 2, 3}, {1, 0, 2, 3}, {0, 1, 3, 2}};
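          /* Row 0 is the identity, row 1 (*SWAP == 1) exchanges the two
             operands of the embedded comparison, and row 2 (*SWAP == 2)
             exchanges the THEN and ELSE arms (gimple operands 2 and 3).  */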
          int *map = maps[*swap];

          if (i < 2)
            oprnd = TREE_OPERAND (gimple_op (stmt, first_op_idx), map[i]);
          else
            oprnd = gimple_op (stmt, map[i]);
        }
      else
        oprnd = gimple_op (stmt, first_op_idx + (swapped ? !i : i));

      oprnd_info = (*oprnds_info)[i];

      if (!vect_is_simple_use (oprnd, vinfo, &dt, &def_stmt))
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "Build SLP failed: can't analyze def for ");
              dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, oprnd);
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
            }

          return -1;
        }

      /* Check if DEF_STMT is a part of a pattern in LOOP and get the def stmt
         from the pattern.  Check that all the stmts of the node are in the
         pattern.  */
      if (def_stmt && gimple_bb (def_stmt)
          && vect_stmt_in_region_p (vinfo, def_stmt)
          && vinfo_for_stmt (def_stmt)
          && is_pattern_stmt_p (vinfo_for_stmt (def_stmt)))
        {
          pattern = true;
          if (!first && !oprnd_info->first_pattern
              /* Allow different pattern state for the defs of the
                 first stmt in reduction chains.  */
              && (oprnd_info->first_dt != vect_reduction_def
                  || (!second && !oprnd_info->second_pattern)))
            {
              if (i == 0
                  && !swapped
                  && commutative)
                {
                  swapped = true;
                  goto again;
                }

              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: some of the stmts"
                                   " are in a pattern, and others are not ");
                  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, oprnd);
                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                }

              return 1;
            }

          dt = STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt));

          if (dt == vect_unknown_def_type)
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Unsupported pattern.\n");
              return -1;
            }

          switch (gimple_code (def_stmt))
            {
            case GIMPLE_PHI:
            case GIMPLE_ASSIGN:
              break;

            default:
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "unsupported defining stmt:\n");
              return -1;
            }
        }

      if (second)
        oprnd_info->second_pattern = pattern;

      if (first)
        {
          oprnd_info->first_dt = dt;
          oprnd_info->first_pattern = pattern;
          oprnd_info->first_op_type = TREE_TYPE (oprnd);
        }
      else
        {
          /* Not the first stmt of the group, check that the def-stmt/s match
             the def-stmt/s of the first stmt.  Allow different definition
             types for reduction chains: the first stmt must be a
             vect_reduction_def (a phi node), and the rest
             vect_internal_def.  */
          tree type = TREE_TYPE (oprnd);
          if ((oprnd_info->first_dt != dt
               && !(oprnd_info->first_dt == vect_reduction_def
                    && dt == vect_internal_def)
               && !((oprnd_info->first_dt == vect_external_def
                     || oprnd_info->first_dt == vect_constant_def)
                    && (dt == vect_external_def
                        || dt == vect_constant_def)))
              || !types_compatible_p (oprnd_info->first_op_type, type))
            {
              /* Try swapping operands if we got a mismatch.  */
              if (i == 0
                  && !swapped
                  && commutative)
                {
                  swapped = true;
                  goto again;
                }

              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Build SLP failed: different types\n");

              return 1;
            }
          if ((dt == vect_constant_def
               || dt == vect_external_def)
              && !current_vector_size.is_constant ()
              && (TREE_CODE (type) == BOOLEAN_TYPE
                  || !can_duplicate_and_interleave_p (stmts.length (),
                                                      TYPE_MODE (type))))
            {
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: invalid type of def "
                                   "for variable-length SLP ");
                  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, oprnd);
                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                }
              return -1;
            }
        }

      /* Check the types of the definitions.  */
      switch (dt)
        {
        case vect_constant_def:
        case vect_external_def:
          break;

        case vect_reduction_def:
        case vect_induction_def:
        case vect_internal_def:
          oprnd_info->def_stmts.quick_push (def_stmt);
          break;

        default:
          /* FORNOW: Not supported.  */
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "Build SLP failed: illegal type of def ");
              dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, oprnd);
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
            }

          return -1;
        }
    }

  /* Swap operands.  */
  if (swapped)
    {
      /* If there are already uses of this stmt in an SLP instance then
         we've committed to the operand order and can't swap it.  */
      if (STMT_VINFO_NUM_SLP_USES (vinfo_for_stmt (stmt)) != 0)
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "Build SLP failed: cannot swap operands of "
                               "shared stmt ");
              dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
            }
          return -1;
        }

      if (first_op_cond)
        {
          tree cond = gimple_assign_rhs1 (stmt);
          enum tree_code code = TREE_CODE (cond);

          /* Swap.  */
          if (*swap == 1)
            {
              swap_ssa_operands (stmt, &TREE_OPERAND (cond, 0),
                                 &TREE_OPERAND (cond, 1));
              TREE_SET_CODE (cond, swap_tree_comparison (code));
            }
          /* Invert.  */
          else
            {
              swap_ssa_operands (stmt, gimple_assign_rhs2_ptr (stmt),
                                 gimple_assign_rhs3_ptr (stmt));
              bool honor_nans = HONOR_NANS (TREE_OPERAND (cond, 0));
              code = invert_tree_comparison (TREE_CODE (cond), honor_nans);
              gcc_assert (code != ERROR_MARK);
              TREE_SET_CODE (cond, code);
            }
        }
      else
        swap_ssa_operands (stmt, gimple_assign_rhs1_ptr (stmt),
                           gimple_assign_rhs2_ptr (stmt));
      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_NOTE, vect_location,
                           "swapped operands to match def types in ");
          dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
        }
    }

  *swap = swapped;
  return 0;
}

/* Return true if call statements CALL1 and CALL2 are similar enough
   to be combined into the same SLP group.  */

static bool
compatible_calls_p (gcall *call1, gcall *call2)
{
  unsigned int nargs = gimple_call_num_args (call1);
  if (nargs != gimple_call_num_args (call2))
    return false;

  if (gimple_call_combined_fn (call1) != gimple_call_combined_fn (call2))
    return false;

  if (gimple_call_internal_p (call1))
    {
      if (!types_compatible_p (TREE_TYPE (gimple_call_lhs (call1)),
                               TREE_TYPE (gimple_call_lhs (call2))))
        return false;
      for (unsigned int i = 0; i < nargs; ++i)
        if (!types_compatible_p (TREE_TYPE (gimple_call_arg (call1, i)),
                                 TREE_TYPE (gimple_call_arg (call2, i))))
          return false;
    }
  else
    {
      if (!operand_equal_p (gimple_call_fn (call1),
                            gimple_call_fn (call2), 0))
        return false;

      if (gimple_call_fntype (call1) != gimple_call_fntype (call2))
        return false;
    }
  return true;
}

/* A subroutine of vect_build_slp_tree for checking VECTYPE, which is the
   caller's attempt to find the vector type in STMT with the narrowest
   element type.  Return true if VECTYPE is nonnull and if it is valid
   for VINFO.  When returning true, update MAX_NUNITS to reflect the
   number of units in VECTYPE.  VINFO, GROUP_SIZE and MAX_NUNITS are
   as for vect_build_slp_tree.  */

static bool
vect_record_max_nunits (vec_info *vinfo, gimple *stmt, unsigned int group_size,
                        tree vectype, poly_uint64 *max_nunits)
{
  if (!vectype)
    {
      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                           "Build SLP failed: unsupported data-type in ");
          dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
          dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
        }
      /* Fatal mismatch.  */
      return false;
    }

  /* If populating the vector type requires unrolling then fail
     before adjusting *max_nunits for basic-block vectorization.  */
  poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
  unsigned HOST_WIDE_INT const_nunits;
  if (is_a <bb_vec_info> (vinfo)
      && (!nunits.is_constant (&const_nunits)
          || const_nunits > group_size))
    {
      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                       "Build SLP failed: unrolling required "
                       "in basic block SLP\n");
      /* Fatal mismatch.  */
      return false;
    }

  /* In case of multiple types we need to detect the smallest type.  */
  vect_update_max_nunits (max_nunits, vectype);
  return true;
}

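/* For instance, in basic-block SLP a group of two ints with a V4SI
   vector type would need two further group members to fill a vector,
   i.e. unrolling, so the check above rejects it (4 > 2).  */
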
/* STMTS is a group of GROUP_SIZE SLP statements in which some
   statements do the same operation as the first statement and in which
   the others do ALT_STMT_CODE.  Return true if we can take one vector
   of the first operation and one vector of the second and permute them
   to get the required result.  VECTYPE is the type of the vector that
   would be permuted.  */

static bool
vect_two_operations_perm_ok_p (vec<gimple *> stmts, unsigned int group_size,
                               tree vectype, tree_code alt_stmt_code)
{
  unsigned HOST_WIDE_INT count;
  if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&count))
    return false;

  vec_perm_builder sel (count, count, 1);
  for (unsigned int i = 0; i < count; ++i)
    {
      unsigned int elt = i;
      if (gimple_assign_rhs_code (stmts[i % group_size]) == alt_stmt_code)
        elt += count;
      sel.quick_push (elt);
    }
  vec_perm_indices indices (sel, 2, count);
  return can_vec_perm_const_p (TYPE_MODE (vectype), indices);
}

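/* Worked example: for the two-statement group { a + b, a - b } and a
   four-element VECTYPE, the selector built above is { 0, 5, 2, 7 },
   taking even lanes from the PLUS vector and odd lanes from the MINUS
   vector -- the classic addsub pattern.  */
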
/* Verify if the scalar stmts STMTS are isomorphic, require data
   permutation or are of unsupported types of operation.  Return
   true if they are, otherwise return false and indicate in *MATCHES
   which stmts are not isomorphic to the first one.  If MATCHES[0]
   is false then this indicates the comparison could not be
   carried out or the stmts will never be vectorized by SLP.

   Note COND_EXPR is possibly isomorphic to another one after swapping its
   operands.  Set SWAP[i] to 1 if stmt I is a COND_EXPR and isomorphic to
   the first stmt by swapping the two operands of the comparison; set SWAP[i]
   to 2 if stmt I is isomorphic to the first stmt by inverting the code
   of the comparison.  Take A1 >= B1 ? X1 : Y1 as an example: it can be
   swapped to (B1 <= A1 ? X1 : Y1), or be inverted to (A1 < B1) ? Y1 : X1.  */

static bool
vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
                       vec<gimple *> stmts, unsigned int group_size,
                       poly_uint64 *max_nunits, bool *matches,
                       bool *two_operators)
{
  unsigned int i;
  gimple *first_stmt = stmts[0], *stmt = stmts[0];
  enum tree_code first_stmt_code = ERROR_MARK;
  enum tree_code alt_stmt_code = ERROR_MARK;
  enum tree_code rhs_code = ERROR_MARK;
  enum tree_code first_cond_code = ERROR_MARK;
  tree lhs;
  bool need_same_oprnds = false;
  tree vectype = NULL_TREE, first_op1 = NULL_TREE;
  optab optab;
  int icode;
  machine_mode optab_op2_mode;
  machine_mode vec_mode;
  gimple *first_load = NULL, *prev_first_load = NULL;

  /* For every stmt in NODE find its def stmt/s.  */
  FOR_EACH_VEC_ELT (stmts, i, stmt)
    {
      stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
      swap[i] = 0;
      matches[i] = false;

      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for ");
          dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
        }

      /* Fail to vectorize statements marked as unvectorizable.  */
      if (!STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (stmt)))
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "Build SLP failed: unvectorizable statement ");
              dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
            }
          /* Fatal mismatch.  */
          matches[0] = false;
          return false;
        }

      lhs = gimple_get_lhs (stmt);
      if (lhs == NULL_TREE)
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "Build SLP failed: not GIMPLE_ASSIGN nor "
                               "GIMPLE_CALL ");
              dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
            }
          /* Fatal mismatch.  */
          matches[0] = false;
          return false;
        }

      tree nunits_vectype;
      if (!vect_get_vector_types_for_stmt (stmt_info, &vectype,
                                           &nunits_vectype)
          || (nunits_vectype
              && !vect_record_max_nunits (vinfo, stmt, group_size,
                                          nunits_vectype, max_nunits)))
        {
          /* Fatal mismatch.  */
          matches[0] = false;
          return false;
        }

      gcc_assert (vectype);

      if (gcall *call_stmt = dyn_cast <gcall *> (stmt))
        {
          rhs_code = CALL_EXPR;
          if ((gimple_call_internal_p (call_stmt)
               && (!vectorizable_internal_fn_p
                     (gimple_call_internal_fn (call_stmt))))
              || gimple_call_tail_p (call_stmt)
              || gimple_call_noreturn_p (call_stmt)
              || !gimple_call_nothrow_p (call_stmt)
              || gimple_call_chain (call_stmt))
            {
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: unsupported call type ");
                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                    call_stmt, 0);
                }
              /* Fatal mismatch.  */
              matches[0] = false;
              return false;
            }
        }
      else
        rhs_code = gimple_assign_rhs_code (stmt);

798 if (i == 0)
799 {
800 first_stmt_code = rhs_code;
801
802 /* Shift arguments should be equal in all the packed stmts for a
803 vector shift with scalar shift operand. */
804 if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
805 || rhs_code == LROTATE_EXPR
806 || rhs_code == RROTATE_EXPR)
807 {
808 if (vectype == boolean_type_node)
809 {
810 if (dump_enabled_p ())
811 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
812 "Build SLP failed: shift of a"
813 " boolean.\n");
814 /* Fatal mismatch. */
815 matches[0] = false;
816 return false;
817 }
818
819 vec_mode = TYPE_MODE (vectype);
820
821 /* First see if we have a vector/vector shift. */
822 optab = optab_for_tree_code (rhs_code, vectype,
823 optab_vector);
824
825 if (!optab
826 || optab_handler (optab, vec_mode) == CODE_FOR_nothing)
827 {
828 /* No vector/vector shift, try for a vector/scalar shift. */
829 optab = optab_for_tree_code (rhs_code, vectype,
830 optab_scalar);
831
832 if (!optab)
833 {
834 if (dump_enabled_p ())
835 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
836 "Build SLP failed: no optab.\n");
837 /* Fatal mismatch. */
838 matches[0] = false;
839 return false;
840 }
841 icode = (int) optab_handler (optab, vec_mode);
842 if (icode == CODE_FOR_nothing)
843 {
844 if (dump_enabled_p ())
845 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
846 "Build SLP failed: "
847 "op not supported by target.\n");
848 /* Fatal mismatch. */
849 matches[0] = false;
850 return false;
851 }
852 optab_op2_mode = insn_data[icode].operand[2].mode;
853 if (!VECTOR_MODE_P (optab_op2_mode))
854 {
855 need_same_oprnds = true;
856 first_op1 = gimple_assign_rhs2 (stmt);
857 }
858 }
859 }
860 else if (rhs_code == WIDEN_LSHIFT_EXPR)
861 {
862 need_same_oprnds = true;
863 first_op1 = gimple_assign_rhs2 (stmt);
864 }
865 }
866 else
867 {
868 if (first_stmt_code != rhs_code
869 && alt_stmt_code == ERROR_MARK)
870 alt_stmt_code = rhs_code;
871 if (first_stmt_code != rhs_code
872 && (first_stmt_code != IMAGPART_EXPR
873 || rhs_code != REALPART_EXPR)
874 && (first_stmt_code != REALPART_EXPR
875 || rhs_code != IMAGPART_EXPR)
876 /* Handle mismatches in plus/minus by computing both
877 and merging the results. */
878 && !((first_stmt_code == PLUS_EXPR
879 || first_stmt_code == MINUS_EXPR)
880 && (alt_stmt_code == PLUS_EXPR
881 || alt_stmt_code == MINUS_EXPR)
882 && rhs_code == alt_stmt_code)
883 && !(STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt))
884 && (first_stmt_code == ARRAY_REF
885 || first_stmt_code == BIT_FIELD_REF
886 || first_stmt_code == INDIRECT_REF
887 || first_stmt_code == COMPONENT_REF
888 || first_stmt_code == MEM_REF)))
889 {
890 if (dump_enabled_p ())
891 {
892 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
893 "Build SLP failed: different operation "
894 "in stmt ");
895 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
896 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
897 "original stmt ");
898 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
899 first_stmt, 0);
900 }
901 /* Mismatch. */
902 continue;
903 }
904
905 if (need_same_oprnds
906 && !operand_equal_p (first_op1, gimple_assign_rhs2 (stmt), 0))
907 {
908 if (dump_enabled_p ())
909 {
910 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
911 "Build SLP failed: different shift "
912 "arguments in ");
913 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
914 }
915 /* Mismatch. */
916 continue;
917 }
918
919 if (rhs_code == CALL_EXPR)
920 {
921 gimple *first_stmt = stmts[0];
922 if (!compatible_calls_p (as_a <gcall *> (first_stmt),
923 as_a <gcall *> (stmt)))
924 {
925 if (dump_enabled_p ())
926 {
927 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
928 "Build SLP failed: different calls in ");
929 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
930 stmt, 0);
931 }
932 /* Mismatch. */
933 continue;
934 }
935 }
936 }
937
      /* Grouped store or load.  */
      if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt)))
        {
          if (REFERENCE_CLASS_P (lhs))
            {
              /* Store.  */
              ;
            }
          else
            {
              /* Load.  */
              first_load = DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
              if (prev_first_load)
                {
                  /* Check that there are no loads from different interleaving
                     chains in the same node.  */
                  if (prev_first_load != first_load)
                    {
                      if (dump_enabled_p ())
                        {
                          dump_printf_loc (MSG_MISSED_OPTIMIZATION,
                                           vect_location,
                                           "Build SLP failed: different "
                                           "interleaving chains in one node ");
                          dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                            stmt, 0);
                        }
                      /* Mismatch.  */
                      continue;
                    }
                }
              else
                prev_first_load = first_load;
            }
        } /* Grouped access.  */
      else
        {
          if (TREE_CODE_CLASS (rhs_code) == tcc_reference)
            {
              /* Not a grouped load.  */
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: not grouped load ");
                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
                }

              /* FORNOW: Not grouped loads are not supported.  */
              /* Fatal mismatch.  */
              matches[0] = false;
              return false;
            }

          /* Not a memory operation.  */
          if (TREE_CODE_CLASS (rhs_code) != tcc_binary
              && TREE_CODE_CLASS (rhs_code) != tcc_unary
              && TREE_CODE_CLASS (rhs_code) != tcc_expression
              && TREE_CODE_CLASS (rhs_code) != tcc_comparison
              && rhs_code != CALL_EXPR)
            {
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: operation");
                  dump_printf (MSG_MISSED_OPTIMIZATION, " unsupported ");
                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
                }
              /* Fatal mismatch.  */
              matches[0] = false;
              return false;
            }

          if (rhs_code == COND_EXPR)
            {
              tree cond_expr = gimple_assign_rhs1 (stmt);
              enum tree_code cond_code = TREE_CODE (cond_expr);
              enum tree_code swap_code = ERROR_MARK;
              enum tree_code invert_code = ERROR_MARK;

              if (i == 0)
                first_cond_code = TREE_CODE (cond_expr);
              else if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
                {
                  bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
                  swap_code = swap_tree_comparison (cond_code);
                  invert_code = invert_tree_comparison (cond_code, honor_nans);
                }

              if (first_cond_code == cond_code)
                ;
              /* Isomorphism can be achieved by swapping.  */
              else if (first_cond_code == swap_code)
                swap[i] = 1;
              /* Isomorphism can be achieved by inverting.  */
              else if (first_cond_code == invert_code)
                swap[i] = 2;
              else
                {
                  if (dump_enabled_p ())
                    {
                      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                       "Build SLP failed: different"
                                       " operation");
                      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                        stmt, 0);
                    }
                  /* Mismatch.  */
                  continue;
                }
            }
        }

      matches[i] = true;
    }

  for (i = 0; i < group_size; ++i)
    if (!matches[i])
      return false;

  /* If we allowed a two-operation SLP node verify the target can cope
     with the permute we are going to use.  */
  if (alt_stmt_code != ERROR_MARK
      && TREE_CODE_CLASS (alt_stmt_code) != tcc_reference)
    {
      if (vectype == boolean_type_node
          || !vect_two_operations_perm_ok_p (stmts, group_size,
                                             vectype, alt_stmt_code))
        {
          for (i = 0; i < group_size; ++i)
            if (gimple_assign_rhs_code (stmts[i]) == alt_stmt_code)
              {
                matches[i] = false;
                if (dump_enabled_p ())
                  {
                    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                     "Build SLP failed: different operation "
                                     "in stmt ");
                    dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                      stmts[i], 0);
                    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                     "original stmt ");
                    dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                      first_stmt, 0);
                  }
              }
          return false;
        }
      *two_operators = true;
    }

  return true;
}

/* Traits for the hash_set to record failed SLP builds for a stmt set.
   Note we never remove apart from at destruction time so we do not
   need a special value for deleted that differs from empty.  */
struct bst_traits
{
  typedef vec <gimple *> value_type;
  typedef vec <gimple *> compare_type;
  static inline hashval_t hash (value_type);
  static inline bool equal (value_type existing, value_type candidate);
  static inline bool is_empty (value_type x) { return !x.exists (); }
  static inline bool is_deleted (value_type x) { return !x.exists (); }
  static inline void mark_empty (value_type &x) { x.release (); }
  static inline void mark_deleted (value_type &x) { x.release (); }
  static inline void remove (value_type &x) { x.release (); }
};
inline hashval_t
bst_traits::hash (value_type x)
{
  inchash::hash h;
  for (unsigned i = 0; i < x.length (); ++i)
    h.add_int (gimple_uid (x[i]));
  return h.end ();
}
inline bool
bst_traits::equal (value_type existing, value_type candidate)
{
  if (existing.length () != candidate.length ())
    return false;
  for (unsigned i = 0; i < existing.length (); ++i)
    if (existing[i] != candidate[i])
      return false;
  return true;
}

typedef hash_set <vec <gimple *>, bst_traits> scalar_stmts_set_t;
static scalar_stmts_set_t *bst_fail;

typedef hash_map <vec <gimple *>, slp_tree,
                  simple_hashmap_traits <bst_traits, slp_tree> >
  scalar_stmts_to_slp_tree_map_t;

static slp_tree
vect_build_slp_tree_2 (vec_info *vinfo,
                       vec<gimple *> stmts, unsigned int group_size,
                       poly_uint64 *max_nunits,
                       vec<slp_tree> *loads,
                       bool *matches, unsigned *npermutes, unsigned *tree_size,
                       unsigned max_tree_size);

static slp_tree
vect_build_slp_tree (vec_info *vinfo,
                     vec<gimple *> stmts, unsigned int group_size,
                     poly_uint64 *max_nunits, vec<slp_tree> *loads,
                     bool *matches, unsigned *npermutes, unsigned *tree_size,
                     unsigned max_tree_size)
{
  if (bst_fail->contains (stmts))
    return NULL;
  slp_tree res = vect_build_slp_tree_2 (vinfo, stmts, group_size, max_nunits,
                                        loads, matches, npermutes, tree_size,
                                        max_tree_size);
  /* When the SLP build fails for STMTS, record this; otherwise the SLP
     build can be exponential in time when we allow constructing parts
     from scalars, see PR81723.  */
  if (! res)
    {
      vec <gimple *> x;
      x.create (stmts.length ());
      x.splice (stmts);
      bst_fail->add (x);
    }
  return res;
}

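/* Note the memoization key is the stmt vector itself (hashed on the
   stmt UIDs, see bst_traits above), so a second attempt to build a
   node for the exact same sequence of scalar stmts bails out early.  */
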
/* Recursively build an SLP tree starting from the group of stmts STMTS.
   Fail (and return NULL) if the def-stmts are not isomorphic, require data
   permutation, or are of unsupported types of operation.  Otherwise return
   the built SLP node.  MATCHES records, at the depth where a mismatch was
   found, which stmts failed to match the first one.  */

static slp_tree
vect_build_slp_tree_2 (vec_info *vinfo,
                       vec<gimple *> stmts, unsigned int group_size,
                       poly_uint64 *max_nunits,
                       vec<slp_tree> *loads,
                       bool *matches, unsigned *npermutes, unsigned *tree_size,
                       unsigned max_tree_size)
{
  unsigned nops, i, this_tree_size = 0;
  poly_uint64 this_max_nunits = *max_nunits;
  gimple *stmt;
  slp_tree node;

  matches[0] = false;

  stmt = stmts[0];
  if (is_gimple_call (stmt))
    nops = gimple_call_num_args (stmt);
  else if (is_gimple_assign (stmt))
    {
      nops = gimple_num_ops (stmt) - 1;
      if (gimple_assign_rhs_code (stmt) == COND_EXPR)
        nops++;
    }
  else if (gimple_code (stmt) == GIMPLE_PHI)
    nops = 0;
  else
    return NULL;

  /* If the SLP node is a PHI (induction or reduction), terminate
     the recursion.  */
  if (gimple_code (stmt) == GIMPLE_PHI)
    {
      tree scalar_type = TREE_TYPE (PHI_RESULT (stmt));
      tree vectype = get_vectype_for_scalar_type (scalar_type);
      if (!vect_record_max_nunits (vinfo, stmt, group_size, vectype,
                                   max_nunits))
        return NULL;

      vect_def_type def_type = STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt));
      /* Induction from different IVs is not supported.  */
      if (def_type == vect_induction_def)
        {
          FOR_EACH_VEC_ELT (stmts, i, stmt)
            if (stmt != stmts[0])
              return NULL;
        }
      else
        {
          /* Else def types have to match.  */
          FOR_EACH_VEC_ELT (stmts, i, stmt)
            {
              /* But for reduction chains only check on the first stmt.  */
              if (REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt))
                  && REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) != stmt)
                continue;
              if (STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != def_type)
                return NULL;
            }
        }
      node = vect_create_new_slp_node (stmts);
      return node;
    }

  bool two_operators = false;
  unsigned char *swap = XALLOCAVEC (unsigned char, group_size);
  if (!vect_build_slp_tree_1 (vinfo, swap, stmts, group_size,
                              &this_max_nunits, matches, &two_operators))
    return NULL;

  /* If the SLP node is a load, terminate the recursion.  */
  if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt))
      && DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))))
    {
      *max_nunits = this_max_nunits;
      node = vect_create_new_slp_node (stmts);
      loads->safe_push (node);
      return node;
    }

  /* Get at the operands, verifying they are compatible.  */
  vec<slp_oprnd_info> oprnds_info = vect_create_oprnd_info (nops, group_size);
  slp_oprnd_info oprnd_info;
  FOR_EACH_VEC_ELT (stmts, i, stmt)
    {
      int res = vect_get_and_check_slp_defs (vinfo, &swap[i],
                                             stmts, i, &oprnds_info);
      if (res != 0)
        matches[(res == -1) ? 0 : i] = false;
      if (!matches[0])
        break;
    }
  for (i = 0; i < group_size; ++i)
    if (!matches[i])
      {
        vect_free_oprnd_info (oprnds_info);
        return NULL;
      }

  auto_vec<slp_tree, 4> children;
  auto_vec<slp_tree> this_loads;

  stmt = stmts[0];

  if (tree_size)
    max_tree_size -= *tree_size;

  /* Create SLP_TREE nodes for the definition node/s.  */
  FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
    {
      slp_tree child;
      unsigned old_nloads = this_loads.length ();
      unsigned old_tree_size = this_tree_size;
      unsigned int j;

      if (oprnd_info->first_dt != vect_internal_def
          && oprnd_info->first_dt != vect_reduction_def
          && oprnd_info->first_dt != vect_induction_def)
        continue;

      if (++this_tree_size > max_tree_size)
        {
          FOR_EACH_VEC_ELT (children, j, child)
            vect_free_slp_tree (child, false);
          vect_free_oprnd_info (oprnds_info);
          return NULL;
        }

      if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
                                        group_size, &this_max_nunits,
                                        &this_loads, matches, npermutes,
                                        &this_tree_size,
                                        max_tree_size)) != NULL)
        {
          /* If all children of this child were built up from scalars then
             just throw that away and build this node up from scalars
             instead.  */
          if (!SLP_TREE_CHILDREN (child).is_empty ()
              /* ??? Rejecting patterns this way doesn't work.  We'd have to
                 do extra work to cancel the pattern so the uses see the
                 scalar version.  */
              && !is_pattern_stmt_p
                    (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (child)[0])))
            {
              slp_tree grandchild;

              FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
                if (SLP_TREE_DEF_TYPE (grandchild) == vect_internal_def)
                  break;
              if (!grandchild)
                {
                  /* Roll back.  */
                  this_loads.truncate (old_nloads);
                  this_tree_size = old_tree_size;
                  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
                    vect_free_slp_tree (grandchild, false);
                  SLP_TREE_CHILDREN (child).truncate (0);

                  dump_printf_loc (MSG_NOTE, vect_location,
                                   "Building parent vector operands from "
                                   "scalars instead\n");
                  oprnd_info->def_stmts = vNULL;
                  SLP_TREE_DEF_TYPE (child) = vect_external_def;
                  children.safe_push (child);
                  continue;
                }
            }

          oprnd_info->def_stmts = vNULL;
          children.safe_push (child);
          continue;
        }

      /* If the SLP build failed fatally and we analyze a basic-block
         simply treat nodes we fail to build as externally defined
         (and thus build vectors from the scalar defs).
         The cost model will reject outright expensive cases.
         ??? This doesn't treat cases where permutation ultimately
         fails (or we don't try permutation below).  Ideally we'd
         even compute a permutation that will end up with the maximum
         SLP tree size...  */
      if (is_a <bb_vec_info> (vinfo)
          && !matches[0]
          /* ??? Rejecting patterns this way doesn't work.  We'd have to
             do extra work to cancel the pattern so the uses see the
             scalar version.  */
          && !is_pattern_stmt_p (vinfo_for_stmt (stmt)))
        {
          dump_printf_loc (MSG_NOTE, vect_location,
                           "Building vector operands from scalars\n");
          child = vect_create_new_slp_node (oprnd_info->def_stmts);
          SLP_TREE_DEF_TYPE (child) = vect_external_def;
          children.safe_push (child);
          oprnd_info->def_stmts = vNULL;
          continue;
        }

      /* If the SLP build for operand zero failed and operand zero
         and one can be commuted, try that for the scalar stmts
         that failed the match.  */
      if (i == 0
          /* A first scalar stmt mismatch signals a fatal mismatch.  */
          && matches[0]
          /* ??? For COND_EXPRs we can swap the comparison operands
             as well as the arms under some constraints.  */
          && nops == 2
          && oprnds_info[1]->first_dt == vect_internal_def
          && is_gimple_assign (stmt)
          /* Do so only if the number of unsuccessful permutes was no more
             than a cut-off, as re-trying the recursive match on
             possibly each level of the tree would expose exponential
             behavior.  */
          && *npermutes < 4)
        {
          /* See whether we can swap the matching or the non-matching
             stmt operands.  */
          bool swap_not_matching = true;
          do
            {
              for (j = 0; j < group_size; ++j)
                {
                  if (matches[j] != !swap_not_matching)
                    continue;
                  gimple *stmt = stmts[j];
                  /* Verify if we can swap operands of this stmt.  */
                  if (!is_gimple_assign (stmt)
                      || !commutative_tree_code (gimple_assign_rhs_code (stmt)))
                    {
                      if (!swap_not_matching)
                        goto fail;
                      swap_not_matching = false;
                      break;
                    }
                  /* Verify if we can safely swap or if we committed to a
                     specific operand order already.
                     ??? Instead of modifying GIMPLE stmts here we could
                     record whether we want to swap operands in the SLP
                     node and temporarily do that when processing it
                     (or wrap operand accessors in a helper).  */
                  else if (swap[j] != 0
                           || STMT_VINFO_NUM_SLP_USES (vinfo_for_stmt (stmt)))
                    {
                      if (!swap_not_matching)
                        {
                          if (dump_enabled_p ())
                            {
                              dump_printf_loc (MSG_MISSED_OPTIMIZATION,
                                               vect_location,
                                               "Build SLP failed: cannot swap "
                                               "operands of shared stmt ");
                              dump_gimple_stmt (MSG_MISSED_OPTIMIZATION,
                                                TDF_SLIM, stmts[j], 0);
                            }
                          goto fail;
                        }
                      swap_not_matching = false;
                      break;
                    }
                }
            }
          while (j != group_size);

          /* Swap mismatched definition stmts.  */
          dump_printf_loc (MSG_NOTE, vect_location,
                           "Re-trying with swapped operands of stmts ");
          for (j = 0; j < group_size; ++j)
            if (matches[j] == !swap_not_matching)
              {
                std::swap (oprnds_info[0]->def_stmts[j],
                           oprnds_info[1]->def_stmts[j]);
                dump_printf (MSG_NOTE, "%d ", j);
              }
          dump_printf (MSG_NOTE, "\n");
          /* And try again with scratch 'matches' ...  */
          bool *tem = XALLOCAVEC (bool, group_size);
          if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
                                            group_size, &this_max_nunits,
                                            &this_loads, tem, npermutes,
                                            &this_tree_size,
                                            max_tree_size)) != NULL)
            {
              /* ... so if successful we can apply the operand swapping
                 to the GIMPLE IL.  This is necessary because for example
                 vect_get_slp_defs uses operand indexes and thus expects
                 canonical operand order.  This is also necessary even
                 if we end up building the operand from scalars as
                 we'll continue to process swapped operand two.  */
              for (j = 0; j < group_size; ++j)
                {
                  gimple *stmt = stmts[j];
                  gimple_set_plf (stmt, GF_PLF_1, false);
                }
              for (j = 0; j < group_size; ++j)
                {
                  gimple *stmt = stmts[j];
                  if (matches[j] == !swap_not_matching)
                    {
                      /* Avoid swapping operands twice.  */
                      if (gimple_plf (stmt, GF_PLF_1))
                        continue;
                      swap_ssa_operands (stmt, gimple_assign_rhs1_ptr (stmt),
                                         gimple_assign_rhs2_ptr (stmt));
                      gimple_set_plf (stmt, GF_PLF_1, true);
                    }
                }
              /* Verify we swap all duplicates or none.  */
              if (flag_checking)
                for (j = 0; j < group_size; ++j)
                  {
                    gimple *stmt = stmts[j];
                    gcc_assert (gimple_plf (stmt, GF_PLF_1)
                                == (matches[j] == !swap_not_matching));
                  }

              /* If all children of this child were built up from scalars
                 then just throw that away and build this node up from
                 scalars instead.  */
              if (!SLP_TREE_CHILDREN (child).is_empty ()
                  /* ??? Rejecting patterns this way doesn't work.  We'd have
                     to do extra work to cancel the pattern so the uses see the
                     scalar version.  */
                  && !is_pattern_stmt_p
                        (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (child)[0])))
                {
                  unsigned int j;
                  slp_tree grandchild;

                  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
                    if (SLP_TREE_DEF_TYPE (grandchild) == vect_internal_def)
                      break;
                  if (!grandchild)
                    {
                      /* Roll back.  */
                      this_loads.truncate (old_nloads);
                      this_tree_size = old_tree_size;
                      FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
                        vect_free_slp_tree (grandchild, false);
                      SLP_TREE_CHILDREN (child).truncate (0);

                      dump_printf_loc (MSG_NOTE, vect_location,
                                       "Building parent vector operands from "
                                       "scalars instead\n");
                      oprnd_info->def_stmts = vNULL;
                      SLP_TREE_DEF_TYPE (child) = vect_external_def;
                      children.safe_push (child);
                      continue;
                    }
                }

              oprnd_info->def_stmts = vNULL;
              children.safe_push (child);
              continue;
            }

          ++*npermutes;
        }

    fail:
      gcc_assert (child == NULL);
      FOR_EACH_VEC_ELT (children, j, child)
        vect_free_slp_tree (child, false);
      vect_free_oprnd_info (oprnds_info);
      return NULL;
    }

  vect_free_oprnd_info (oprnds_info);

  if (tree_size)
    *tree_size += this_tree_size;
  *max_nunits = this_max_nunits;
  loads->safe_splice (this_loads);

  node = vect_create_new_slp_node (stmts);
  SLP_TREE_TWO_OPERATORS (node) = two_operators;
  SLP_TREE_CHILDREN (node).splice (children);
  return node;
}

/* Dump a slp tree NODE using flags specified in DUMP_KIND.  */

static void
vect_print_slp_tree (dump_flags_t dump_kind, dump_location_t loc,
                     slp_tree node)
{
  int i;
  gimple *stmt;
  slp_tree child;

  dump_printf_loc (dump_kind, loc, "node%s\n",
                   SLP_TREE_DEF_TYPE (node) != vect_internal_def
                   ? " (external)" : "");
  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    {
      dump_printf_loc (dump_kind, loc, "\tstmt %d ", i);
      dump_gimple_stmt (dump_kind, TDF_SLIM, stmt, 0);
    }
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_print_slp_tree (dump_kind, loc, child);
}


/* Mark the tree rooted at NODE with MARK (PURE_SLP or HYBRID).
   If MARK is HYBRID, it refers to a specific stmt in NODE (the stmt at index
   J).  Otherwise, MARK is PURE_SLP and J is -1, which indicates that all the
   stmts in NODE are to be marked.  */

static void
vect_mark_slp_stmts (slp_tree node, enum slp_vect_type mark, int j)
{
  int i;
  gimple *stmt;
  slp_tree child;

  if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    if (j < 0 || i == j)
      STMT_SLP_TYPE (vinfo_for_stmt (stmt)) = mark;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_mark_slp_stmts (child, mark, j);
}


/* Mark the statements of the tree rooted at NODE as relevant (vect_used).  */

static void
vect_mark_slp_stmts_relevant (slp_tree node)
{
  int i;
  gimple *stmt;
  stmt_vec_info stmt_info;
  slp_tree child;

  if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    {
      stmt_info = vinfo_for_stmt (stmt);
      gcc_assert (!STMT_VINFO_RELEVANT (stmt_info)
                  || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope);
      STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope;
    }

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_mark_slp_stmts_relevant (child);
}


/* Rearrange the statements of NODE according to PERMUTATION.  */

static void
vect_slp_rearrange_stmts (slp_tree node, unsigned int group_size,
                          vec<unsigned> permutation)
{
  gimple *stmt;
  vec<gimple *> tmp_stmts;
  unsigned int i;
  slp_tree child;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_slp_rearrange_stmts (child, group_size, permutation);

  gcc_assert (group_size == SLP_TREE_SCALAR_STMTS (node).length ());
  tmp_stmts.create (group_size);
  tmp_stmts.quick_grow_cleared (group_size);

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    tmp_stmts[permutation[i]] = stmt;

  SLP_TREE_SCALAR_STMTS (node).release ();
  SLP_TREE_SCALAR_STMTS (node) = tmp_stmts;
}

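/* Note the stmt at position I moves to position PERMUTATION[I]; e.g.
   with PERMUTATION == { 2, 0, 1 } the stmts { s0, s1, s2 } are
   rearranged into { s1, s2, s0 }.  */
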
/* Attempt to reorder stmts in a reduction chain so that we don't
   require any load permutation.  Return true if that was possible,
   otherwise return false.  */

static bool
vect_attempt_slp_rearrange_stmts (slp_instance slp_instn)
{
  unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_instn);
  unsigned int i, j;
  unsigned int lidx;
  slp_tree node, load;

  /* Compare all the permutation sequences to the first one.  We know
     that at least one load is permuted.  */
  node = SLP_INSTANCE_LOADS (slp_instn)[0];
  if (!node->load_permutation.exists ())
    return false;
  for (i = 1; SLP_INSTANCE_LOADS (slp_instn).iterate (i, &load); ++i)
    {
      if (!load->load_permutation.exists ())
        return false;
      FOR_EACH_VEC_ELT (load->load_permutation, j, lidx)
        if (lidx != node->load_permutation[j])
          return false;
    }

  /* Check that the loads in the first sequence are different and there
     are no gaps between them.  */
  auto_sbitmap load_index (group_size);
  bitmap_clear (load_index);
  FOR_EACH_VEC_ELT (node->load_permutation, i, lidx)
    {
      if (lidx >= group_size)
        return false;
      if (bitmap_bit_p (load_index, lidx))
        return false;

      bitmap_set_bit (load_index, lidx);
    }
  for (i = 0; i < group_size; i++)
    if (!bitmap_bit_p (load_index, i))
      return false;

  /* This permutation is valid for reduction.  Since the order of the
     statements in the nodes is not important unless they are memory
     accesses, we can rearrange the statements in all the nodes
     according to the order of the loads.  */
  vect_slp_rearrange_stmts (SLP_INSTANCE_TREE (slp_instn), group_size,
                            node->load_permutation);

  /* We are done, no actual permutations need to be generated.  */
  poly_uint64 unrolling_factor = SLP_INSTANCE_UNROLLING_FACTOR (slp_instn);
  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
    {
      gimple *first_stmt = SLP_TREE_SCALAR_STMTS (node)[0];
      first_stmt = DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_stmt));
      /* But we have to keep those permutations that are required because
         of handling of gaps.  */
      if (known_eq (unrolling_factor, 1U)
          || (group_size == DR_GROUP_SIZE (vinfo_for_stmt (first_stmt))
              && DR_GROUP_GAP (vinfo_for_stmt (first_stmt)) == 0))
        SLP_TREE_LOAD_PERMUTATION (node).release ();
      else
        for (j = 0; j < SLP_TREE_LOAD_PERMUTATION (node).length (); ++j)
          SLP_TREE_LOAD_PERMUTATION (node)[j] = j;
    }

  return true;
}

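/* Sketch of the effect: if every load node of a reduction uses the same
   permutation { 1, 0 }, the scalar stmts of all nodes are swapped into
   load order instead, and the (now redundant) load permutations can
   usually be dropped entirely -- a reduction doesn't care in which
   order its elements are combined.  */
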
1718 /* Check if the required load permutations in the SLP instance
1719 SLP_INSTN are supported. */
1720
1721 static bool
1722 vect_supported_load_permutation_p (slp_instance slp_instn)
1723 {
1724 unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_instn);
1725 unsigned int i, j, k, next;
1726 slp_tree node;
1727 gimple *stmt, *load, *next_load;
1728
1729 if (dump_enabled_p ())
1730 {
1731 dump_printf_loc (MSG_NOTE, vect_location, "Load permutation ");
1732 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
1733 if (node->load_permutation.exists ())
1734 FOR_EACH_VEC_ELT (node->load_permutation, j, next)
1735 dump_printf (MSG_NOTE, "%d ", next);
1736 else
1737 for (k = 0; k < group_size; ++k)
1738 dump_printf (MSG_NOTE, "%d ", k);
1739 dump_printf (MSG_NOTE, "\n");
1740 }
1741
1742 /* In case of reduction every load permutation is allowed, since the order
1743 of the reduction statements is not important (as opposed to the case of
1744 grouped stores). The only condition we need to check is that all the
1745 load nodes are of the same size and have the same permutation (and then
1746 rearrange all the nodes of the SLP instance according to this
1747 permutation). */
1748
1749 /* Check that all the load nodes are of the same size. */
1750 /* ??? Can't we assert this? */
1751 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
1752 if (SLP_TREE_SCALAR_STMTS (node).length () != (unsigned) group_size)
1753 return false;
1754
1755 node = SLP_INSTANCE_TREE (slp_instn);
1756 stmt = SLP_TREE_SCALAR_STMTS (node)[0];
1757
1758 /* Reduction (there are no data-refs in the root).
1759 In reduction chain the order of the loads is not important. */
1760 if (!STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))
1761 && !REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
1762 vect_attempt_slp_rearrange_stmts (slp_instn);
1763
1764 /* In basic block vectorization we allow any subchain of an interleaving
1765 chain.
1766 FORNOW: not supported in loop SLP because of realignment compications. */
1767 if (STMT_VINFO_BB_VINFO (vinfo_for_stmt (stmt)))
1768 {
1769 /* Check whether the loads in an instance form a subchain and thus
1770 no permutation is necessary. */
1771 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
1772 {
1773 if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
1774 continue;
1775 bool subchain_p = true;
1776 next_load = NULL;
1777 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load)
1778 {
1779 if (j != 0
1780 && (next_load != load
1781 || DR_GROUP_GAP (vinfo_for_stmt (load)) != 1))
1782 {
1783 subchain_p = false;
1784 break;
1785 }
1786 next_load = DR_GROUP_NEXT_ELEMENT (vinfo_for_stmt (load));
1787 }
1788 if (subchain_p)
1789 SLP_TREE_LOAD_PERMUTATION (node).release ();
1790 else
1791 {
1792 stmt_vec_info group_info
1793 = vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]);
1794 group_info = vinfo_for_stmt (DR_GROUP_FIRST_ELEMENT (group_info));
1795 unsigned HOST_WIDE_INT nunits;
1796 unsigned k, maxk = 0;
1797 FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (node), j, k)
1798 if (k > maxk)
1799 maxk = k;
1800 /* In BB vectorization we may not actually use a loaded vector
1801 accessing elements in excess of DR_GROUP_SIZE. */
1802 tree vectype = STMT_VINFO_VECTYPE (group_info);
1803 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nunits)
1804 || maxk >= (DR_GROUP_SIZE (group_info) & ~(nunits - 1)))
1805 {
1806 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1807 "BB vectorization with gaps at the end of "
1808 "a load is not supported\n");
1809 return false;
1810 }
1811
1812 /* Verify the permutation can be generated. */
1813 vec<tree> tem;
1814 unsigned n_perms;
1815 if (!vect_transform_slp_perm_load (node, tem, NULL,
1816 1, slp_instn, true, &n_perms))
1817 {
1818 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
1819 vect_location,
1820 "unsupported load permutation\n");
1821 return false;
1822 }
1823 }
1824 }
1825 return true;
1826 }
1827
1828 /* For loop vectorization verify we can generate the permutation. Be
1829 conservative about the vectorization factor: there are permutations
1830 that will use three vector inputs only starting from a specific factor,
1831 and the vectorization factor is not yet final.
1832 ??? The SLP instance unrolling factor might not be the maximum one. */
1833 unsigned n_perms;
1834 poly_uint64 test_vf
1835 = force_common_multiple (SLP_INSTANCE_UNROLLING_FACTOR (slp_instn),
1836 LOOP_VINFO_VECT_FACTOR
1837 (STMT_VINFO_LOOP_VINFO (vinfo_for_stmt (stmt))));
1838 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
1839 if (node->load_permutation.exists ()
1840 && !vect_transform_slp_perm_load (node, vNULL, NULL, test_vf,
1841 slp_instn, true, &n_perms))
1842 return false;
1843
1844 return true;
1845 }
1846
1847
1848 /* Find the last scalar stmt in NODE, taking pattern stmts into account. */
1849
1850 gimple *
1851 vect_find_last_scalar_stmt_in_slp (slp_tree node)
1852 {
1853 gimple *last = NULL, *stmt;
1854
1855 for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt); i++)
1856 {
1857 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
1858 if (is_pattern_stmt_p (stmt_vinfo))
1859 last = get_later_stmt (STMT_VINFO_RELATED_STMT (stmt_vinfo), last);
1860 else
1861 last = get_later_stmt (stmt, last);
1862 }
1863
1864 return last;
1865 }
1866
1867 /* Splits a group of stores, currently beginning at FIRST_STMT, into two groups:
1868 one (still beginning at FIRST_STMT) of size GROUP1_SIZE (also containing
1869 the first GROUP1_SIZE stmts, since stores are consecutive), the second
1870 containing the remainder.
1871 Return the first stmt in the second group. */
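/* For example (illustrative numbers): splitting a group of size 6 with
   DR_GROUP_GAP 0 at GROUP1_SIZE 4 gives a first group of size 4 with gap 2
   (skipping the second group's elements) and a second group of size 2 with
   gap 4 (the original gap plus the first group's elements), so both groups
   still advance by the original 6 elements per scalar iteration. */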
1872
1873 static gimple *
1874 vect_split_slp_store_group (gimple *first_stmt, unsigned group1_size)
1875 {
1876 stmt_vec_info first_vinfo = vinfo_for_stmt (first_stmt);
1877 gcc_assert (DR_GROUP_FIRST_ELEMENT (first_vinfo) == first_stmt);
1878 gcc_assert (group1_size > 0);
1879 int group2_size = DR_GROUP_SIZE (first_vinfo) - group1_size;
1880 gcc_assert (group2_size > 0);
1881 DR_GROUP_SIZE (first_vinfo) = group1_size;
1882
1883 gimple *stmt = first_stmt;
1884 for (unsigned i = group1_size; i > 1; i--)
1885 {
1886 stmt = DR_GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
1887 gcc_assert (DR_GROUP_GAP (vinfo_for_stmt (stmt)) == 1);
1888 }
1889 /* STMT is now the last element of the first group. */
1890 gimple *group2 = DR_GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
1891 DR_GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt)) = 0;
1892
1893 DR_GROUP_SIZE (vinfo_for_stmt (group2)) = group2_size;
1894 for (stmt = group2; stmt; stmt = DR_GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt)))
1895 {
1896 DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = group2;
1897 gcc_assert (DR_GROUP_GAP (vinfo_for_stmt (stmt)) == 1);
1898 }
1899
1900 /* For the second group, the DR_GROUP_GAP is that before the original group,
1901 plus skipping over the first group. */
1902 DR_GROUP_GAP (vinfo_for_stmt (group2))
1903 = DR_GROUP_GAP (first_vinfo) + group1_size;
1904
1905 /* DR_GROUP_GAP of the first group now has to skip over the second group too. */
1906 DR_GROUP_GAP (first_vinfo) += group2_size;
1907
1908 if (dump_enabled_p ())
1909 dump_printf_loc (MSG_NOTE, vect_location, "Split group into %d and %d\n",
1910 group1_size, group2_size);
1911
1912 return group2;
1913 }
1914
1915 /* Calculate the unrolling factor for an SLP instance with GROUP_SIZE
1916 statements and a vector of NUNITS elements. */
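/* For example, with NUNITS 4 and GROUP_SIZE 6 the least common multiple is
   12, giving an unrolling factor of 12 / 6 = 2: two copies of the group
   are needed to fill a whole number (three) of vectors. */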
1917
1918 static poly_uint64
1919 calculate_unrolling_factor (poly_uint64 nunits, unsigned int group_size)
1920 {
1921 return exact_div (common_multiple (nunits, group_size), group_size);
1922 }
1923
1924 /* Analyze an SLP instance starting from a group of grouped stores. Call
1925 vect_build_slp_tree to build a tree of packed stmts if possible.
1926 Return FALSE if it's impossible to SLP any stmt in the loop. */
1927
1928 static bool
1929 vect_analyze_slp_instance (vec_info *vinfo,
1930 gimple *stmt, unsigned max_tree_size)
1931 {
1932 slp_instance new_instance;
1933 slp_tree node;
1934 unsigned int group_size;
1935 tree vectype, scalar_type = NULL_TREE;
1936 gimple *next;
1937 unsigned int i;
1938 vec<slp_tree> loads;
1939 struct data_reference *dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
1940 vec<gimple *> scalar_stmts;
1941
1942 if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt)))
1943 {
1944 scalar_type = TREE_TYPE (DR_REF (dr));
1945 vectype = get_vectype_for_scalar_type (scalar_type);
1946 group_size = DR_GROUP_SIZE (vinfo_for_stmt (stmt));
1947 }
1948 else if (!dr && REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
1949 {
1950 gcc_assert (is_a <loop_vec_info> (vinfo));
1951 vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
1952 group_size = REDUC_GROUP_SIZE (vinfo_for_stmt (stmt));
1953 }
1954 else
1955 {
1956 gcc_assert (is_a <loop_vec_info> (vinfo));
1957 vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
1958 group_size = as_a <loop_vec_info> (vinfo)->reductions.length ();
1959 }
1960
1961 if (!vectype)
1962 {
1963 if (dump_enabled_p ())
1964 {
1965 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1966 "Build SLP failed: unsupported data-type ");
1967 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, scalar_type);
1968 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1969 }
1970
1971 return false;
1972 }
1973 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
1974
1975 /* Create a node (a root of the SLP tree) for the packed grouped stores. */
1976 scalar_stmts.create (group_size);
1977 next = stmt;
1978 if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt)))
1979 {
1980 /* Collect the stores and store them in SLP_TREE_SCALAR_STMTS. */
1981 while (next)
1982 {
1983 if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (next))
1984 && STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next)))
1985 scalar_stmts.safe_push (
1986 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next)));
1987 else
1988 scalar_stmts.safe_push (next);
1989 next = DR_GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
1990 }
1991 }
1992 else if (!dr && REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
1993 {
1994 /* Collect the reduction stmts and store them in
1995 SLP_TREE_SCALAR_STMTS. */
1996 while (next)
1997 {
1998 if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (next))
1999 && STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next)))
2000 scalar_stmts.safe_push (
2001 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next)));
2002 else
2003 scalar_stmts.safe_push (next);
2004 next = REDUC_GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2005 }
2006 /* Mark the first element of the reduction chain as reduction to properly
2007 transform the node. In the reduction analysis phase only the last
2008 element of the chain is marked as reduction. */
2009 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) = vect_reduction_def;
2010 }
2011 else
2012 {
2013 /* Collect reduction statements. */
2014 vec<gimple *> reductions = as_a <loop_vec_info> (vinfo)->reductions;
2015 for (i = 0; reductions.iterate (i, &next); i++)
2016 scalar_stmts.safe_push (next);
2017 }
2018
2019 loads.create (group_size);
2020
2021 /* Build the tree for the SLP instance. */
2022 bool *matches = XALLOCAVEC (bool, group_size);
2023 unsigned npermutes = 0;
2024 bst_fail = new scalar_stmts_set_t ();
2025 poly_uint64 max_nunits = nunits;
2026 node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
2027 &max_nunits, &loads, matches, &npermutes,
2028 NULL, max_tree_size);
2029 delete bst_fail;
2030 if (node != NULL)
2031 {
2032 /* Calculate the unrolling factor based on the smallest type. */
2033 poly_uint64 unrolling_factor
2034 = calculate_unrolling_factor (max_nunits, group_size);
2035
2036 if (maybe_ne (unrolling_factor, 1U)
2037 && is_a <bb_vec_info> (vinfo))
2038 {
2039 unsigned HOST_WIDE_INT const_max_nunits;
2040 if (!max_nunits.is_constant (&const_max_nunits)
2041 || const_max_nunits > group_size)
2042 {
2043 if (dump_enabled_p ())
2044 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2045 "Build SLP failed: store group "
2046 "size not a multiple of the vector size "
2047 "in basic block SLP\n");
2048 vect_free_slp_tree (node, false);
2049 loads.release ();
2050 return false;
2051 }
2052 /* Fatal mismatch. */
2053 matches[group_size / const_max_nunits * const_max_nunits] = false;
2054 vect_free_slp_tree (node, false);
2055 loads.release ();
2056 }
2057 else
2058 {
2059 /* Create a new SLP instance. */
2060 new_instance = XNEW (struct _slp_instance);
2061 SLP_INSTANCE_TREE (new_instance) = node;
2062 SLP_INSTANCE_GROUP_SIZE (new_instance) = group_size;
2063 SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
2064 SLP_INSTANCE_LOADS (new_instance) = loads;
2065
2066 /* Compute the load permutation. */
2067 slp_tree load_node;
2068 bool loads_permuted = false;
2069 FOR_EACH_VEC_ELT (loads, i, load_node)
2070 {
2071 vec<unsigned> load_permutation;
2072 int j;
2073 gimple *load, *first_stmt;
2074 bool this_load_permuted = false;
2075 load_permutation.create (group_size);
2076 first_stmt = DR_GROUP_FIRST_ELEMENT
2077 (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (load_node)[0]));
2078 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load)
2079 {
2080 int load_place = vect_get_place_in_interleaving_chain
2081 (load, first_stmt);
2082 gcc_assert (load_place != -1);
2083 if (load_place != j)
2084 this_load_permuted = true;
2085 load_permutation.safe_push (load_place);
2086 }
2087 if (!this_load_permuted
2088 /* The load requires permutation when unrolling exposes
2089 a gap either because the group is larger than the SLP
2090 group-size or because there is a gap between the groups. */
2091 && (known_eq (unrolling_factor, 1U)
2092 || (group_size == DR_GROUP_SIZE (vinfo_for_stmt (first_stmt))
2093 && DR_GROUP_GAP (vinfo_for_stmt (first_stmt)) == 0)))
2094 {
2095 load_permutation.release ();
2096 continue;
2097 }
2098 SLP_TREE_LOAD_PERMUTATION (load_node) = load_permutation;
2099 loads_permuted = true;
2100 }
2101
2102 if (loads_permuted)
2103 {
2104 if (!vect_supported_load_permutation_p (new_instance))
2105 {
2106 if (dump_enabled_p ())
2107 {
2108 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2109 "Build SLP failed: unsupported load "
2110 "permutation ");
2111 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION,
2112 TDF_SLIM, stmt, 0);
2113 }
2114 vect_free_slp_instance (new_instance, false);
2115 return false;
2116 }
2117 }
2118
2119 /* If the loads and stores can be handled with load/store-lanes
2120 instructions, do not generate this SLP instance. */
2121 if (is_a <loop_vec_info> (vinfo)
2122 && loads_permuted
2123 && dr && vect_store_lanes_supported (vectype, group_size, false))
2124 {
2125 slp_tree load_node;
2126 FOR_EACH_VEC_ELT (loads, i, load_node)
2127 {
2128 gimple *first_stmt = DR_GROUP_FIRST_ELEMENT
2129 (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (load_node)[0]));
2130 stmt_vec_info stmt_vinfo = vinfo_for_stmt (first_stmt);
2131 /* Use SLP for strided accesses (or if we
2132 can't load-lanes). */
2133 if (STMT_VINFO_STRIDED_P (stmt_vinfo)
2134 || ! vect_load_lanes_supported
2135 (STMT_VINFO_VECTYPE (stmt_vinfo),
2136 DR_GROUP_SIZE (stmt_vinfo), false))
2137 break;
2138 }
2139 if (i == loads.length ())
2140 {
2141 if (dump_enabled_p ())
2142 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2143 "Built SLP cancelled: can use "
2144 "load/store-lanes\n");
2145 vect_free_slp_instance (new_instance, false);
2146 return false;
2147 }
2148 }
2149
2150 vinfo->slp_instances.safe_push (new_instance);
2151
2152 if (dump_enabled_p ())
2153 {
2154 dump_printf_loc (MSG_NOTE, vect_location,
2155 "Final SLP tree for instance:\n");
2156 vect_print_slp_tree (MSG_NOTE, vect_location, node);
2157 }
2158
2159 return true;
2160 }
2161 }
2162 else
2163 {
2164 /* Failed to SLP. */
2165 /* Free the allocated memory. */
2166 scalar_stmts.release ();
2167 loads.release ();
2168 }
2169
2170 /* For basic block SLP, try to break the group up into multiples of the
2171 vector size. */
2172 unsigned HOST_WIDE_INT const_nunits;
2173 if (is_a <bb_vec_info> (vinfo)
2174 && STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt))
2175 && DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt))
2176 && nunits.is_constant (&const_nunits))
2177 {
2178 /* We consider breaking the group only on VF boundaries from the existing
2179 start. */
2180 for (i = 0; i < group_size; i++)
2181 if (!matches[i]) break;
2182
2183 if (i >= const_nunits && i < group_size)
2184 {
2185 /* Split into two groups at the first vector boundary before i. */
2186 gcc_assert ((const_nunits & (const_nunits - 1)) == 0);
2187 unsigned group1_size = i & ~(const_nunits - 1);
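/* For example, with CONST_NUNITS 4 and the first mismatch at i = 6,
   GROUP1_SIZE becomes 4: the first subgroup covers one full vector,
   the vector containing the mismatch (elements 4..7) is skipped below,
   and analysis restarts at element 8 (illustrative numbers). */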
2188
2189 gimple *rest = vect_split_slp_store_group (stmt, group1_size);
2190 bool res = vect_analyze_slp_instance (vinfo, stmt, max_tree_size);
2191 /* If the first non-match was in the middle of a vector,
2192 skip the rest of that vector. */
2193 if (group1_size < i)
2194 {
2195 i = group1_size + const_nunits;
2196 if (i < group_size)
2197 rest = vect_split_slp_store_group (rest, const_nunits);
2198 }
2199 if (i < group_size)
2200 res |= vect_analyze_slp_instance (vinfo, rest, max_tree_size);
2201 return res;
2202 }
2203 /* Even though the first vector did not all match, we might be able to SLP
2204 (some of) the remainder. FORNOW ignore this possibility. */
2205 }
2206
2207 return false;
2208 }
2209
2210
2211 /* Check if there are stmts in the loop that can be vectorized using SLP.
2212 Build SLP trees of packed scalar stmts if SLP is possible. */
2213
2214 bool
2215 vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
2216 {
2217 unsigned int i;
2218 gimple *first_element;
2219
2220 DUMP_VECT_SCOPE ("vect_analyze_slp");
2221
2222 /* Find SLP sequences starting from groups of grouped stores. */
2223 FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
2224 vect_analyze_slp_instance (vinfo, first_element, max_tree_size);
2225
2226 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
2227 {
2228 if (loop_vinfo->reduction_chains.length () > 0)
2229 {
2230 /* Find SLP sequences starting from reduction chains. */
2231 FOR_EACH_VEC_ELT (loop_vinfo->reduction_chains, i, first_element)
2232 if (! vect_analyze_slp_instance (vinfo, first_element,
2233 max_tree_size))
2234 {
2235 /* Dissolve reduction chain group. */
2236 gimple *next, *stmt = first_element;
2237 while (stmt)
2238 {
2239 stmt_vec_info vinfo = vinfo_for_stmt (stmt);
2240 next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
2241 REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
2242 REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
2243 stmt = next;
2244 }
2245 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (first_element))
2246 = vect_internal_def;
2247 }
2248 }
2249
2250 /* Find SLP sequences starting from groups of reductions. */
2251 if (loop_vinfo->reductions.length () > 1)
2252 vect_analyze_slp_instance (vinfo, loop_vinfo->reductions[0],
2253 max_tree_size);
2254 }
2255
2256 return true;
2257 }
2258
2259
2260 /* For each possible SLP instance decide whether to SLP it and calculate the
2261 overall unrolling factor needed to SLP the loop. Return TRUE if we decided
2262 to SLP at least one instance. */
2263
2264 bool
2265 vect_make_slp_decision (loop_vec_info loop_vinfo)
2266 {
2267 unsigned int i;
2268 poly_uint64 unrolling_factor = 1;
2269 vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
2270 slp_instance instance;
2271 int decided_to_slp = 0;
2272
2273 DUMP_VECT_SCOPE ("vect_make_slp_decision");
2274
2275 FOR_EACH_VEC_ELT (slp_instances, i, instance)
2276 {
2277 /* FORNOW: SLP if you can. */
2278 /* All unroll factors have the form current_vector_size * X for some
2279 rational X, so they must have a common multiple. */
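/* For example, instances with unrolling factors 2 and 3 force an overall
   unrolling factor of 6. */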
2280 unrolling_factor
2281 = force_common_multiple (unrolling_factor,
2282 SLP_INSTANCE_UNROLLING_FACTOR (instance));
2283
2284 /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts. Later we
2285 call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
2286 loop-based vectorization. Such stmts will be marked as HYBRID. */
2287 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance), pure_slp, -1);
2288 decided_to_slp++;
2289 }
2290
2291 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = unrolling_factor;
2292
2293 if (decided_to_slp && dump_enabled_p ())
2294 {
2295 dump_printf_loc (MSG_NOTE, vect_location,
2296 "Decided to SLP %d instances. Unrolling factor ",
2297 decided_to_slp);
2298 dump_dec (MSG_NOTE, unrolling_factor);
2299 dump_printf (MSG_NOTE, "\n");
2300 }
2301
2302 return (decided_to_slp > 0);
2303 }
2304
2305
2306 /* Find stmts that must be both vectorized and SLPed (since they feed stmts that
2307 can't be SLPed) in the tree rooted at NODE. Mark such stmts as HYBRID. */
2308
2309 static void
2310 vect_detect_hybrid_slp_stmts (slp_tree node, unsigned i, slp_vect_type stype)
2311 {
2312 gimple *stmt = SLP_TREE_SCALAR_STMTS (node)[i];
2313 imm_use_iterator imm_iter;
2314 gimple *use_stmt;
2315 stmt_vec_info use_vinfo, stmt_vinfo = vinfo_for_stmt (stmt);
2316 slp_tree child;
2317 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
2318 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2319 int j;
2320
2321 /* Propagate hybrid down the SLP tree. */
2322 if (stype == hybrid)
2323 ;
2324 else if (HYBRID_SLP_STMT (stmt_vinfo))
2325 stype = hybrid;
2326 else
2327 {
2328 /* Check if a pure SLP stmt has uses in non-SLP stmts. */
2329 gcc_checking_assert (PURE_SLP_STMT (stmt_vinfo));
2330 /* If we get a pattern stmt here we have to use the LHS of the
2331 original stmt for immediate uses. */
2332 if (! STMT_VINFO_IN_PATTERN_P (stmt_vinfo)
2333 && STMT_VINFO_RELATED_STMT (stmt_vinfo))
2334 stmt = STMT_VINFO_RELATED_STMT (stmt_vinfo);
2335 tree def;
2336 if (gimple_code (stmt) == GIMPLE_PHI)
2337 def = gimple_phi_result (stmt);
2338 else
2339 def = SINGLE_SSA_TREE_OPERAND (stmt, SSA_OP_DEF);
2340 if (def)
2341 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, def)
2342 {
2343 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2344 continue;
2345 use_vinfo = vinfo_for_stmt (use_stmt);
2346 if (STMT_VINFO_IN_PATTERN_P (use_vinfo)
2347 && STMT_VINFO_RELATED_STMT (use_vinfo))
2348 use_vinfo = vinfo_for_stmt (STMT_VINFO_RELATED_STMT (use_vinfo));
2349 if (!STMT_SLP_TYPE (use_vinfo)
2350 && (STMT_VINFO_RELEVANT (use_vinfo)
2351 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (use_vinfo)))
2352 && !(gimple_code (use_stmt) == GIMPLE_PHI
2353 && STMT_VINFO_DEF_TYPE (use_vinfo) == vect_reduction_def))
2354 {
2355 if (dump_enabled_p ())
2356 {
2357 dump_printf_loc (MSG_NOTE, vect_location, "use of SLP "
2358 "def in non-SLP stmt: ");
2359 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, use_stmt, 0);
2360 }
2361 stype = hybrid;
2362 }
2363 }
2364 }
2365
2366 if (stype == hybrid
2367 && !HYBRID_SLP_STMT (stmt_vinfo))
2368 {
2369 if (dump_enabled_p ())
2370 {
2371 dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: ");
2372 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
2373 }
2374 STMT_SLP_TYPE (stmt_vinfo) = hybrid;
2375 }
2376
2377 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
2378 if (SLP_TREE_DEF_TYPE (child) != vect_external_def)
2379 vect_detect_hybrid_slp_stmts (child, i, stype);
2380 }
2381
2382 /* Helpers for vect_detect_hybrid_slp walking pattern stmt uses. */
2383
2384 static tree
2385 vect_detect_hybrid_slp_1 (tree *tp, int *, void *data)
2386 {
2387 walk_stmt_info *wi = (walk_stmt_info *)data;
2388 struct loop *loopp = (struct loop *)wi->info;
2389
2390 if (wi->is_lhs)
2391 return NULL_TREE;
2392
2393 if (TREE_CODE (*tp) == SSA_NAME
2394 && !SSA_NAME_IS_DEFAULT_DEF (*tp))
2395 {
2396 gimple *def_stmt = SSA_NAME_DEF_STMT (*tp);
2397 if (flow_bb_inside_loop_p (loopp, gimple_bb (def_stmt))
2398 && PURE_SLP_STMT (vinfo_for_stmt (def_stmt)))
2399 {
2400 if (dump_enabled_p ())
2401 {
2402 dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: ");
2403 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, def_stmt, 0);
2404 }
2405 STMT_SLP_TYPE (vinfo_for_stmt (def_stmt)) = hybrid;
2406 }
2407 }
2408
2409 return NULL_TREE;
2410 }
2411
2412 static tree
2413 vect_detect_hybrid_slp_2 (gimple_stmt_iterator *gsi, bool *handled,
2414 walk_stmt_info *)
2415 {
2416 stmt_vec_info use_vinfo = vinfo_for_stmt (gsi_stmt (*gsi));
2417 /* If the stmt is in an SLP instance then this isn't a reason
2418 to mark use definitions in other SLP instances as hybrid. */
2419 if (! STMT_SLP_TYPE (use_vinfo)
2420 && (STMT_VINFO_RELEVANT (use_vinfo)
2421 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (use_vinfo)))
2422 && ! (gimple_code (gsi_stmt (*gsi)) == GIMPLE_PHI
2423 && STMT_VINFO_DEF_TYPE (use_vinfo) == vect_reduction_def))
2424 ;
2425 else
2426 *handled = true;
2427 return NULL_TREE;
2428 }
2429
2430 /* Find stmts that must be both vectorized and SLPed. */
2431
2432 void
2433 vect_detect_hybrid_slp (loop_vec_info loop_vinfo)
2434 {
2435 unsigned int i;
2436 vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
2437 slp_instance instance;
2438
2439 DUMP_VECT_SCOPE ("vect_detect_hybrid_slp");
2440
2441 /* First walk all pattern stmts in the loop and mark defs of uses as
2442 hybrid because immediate uses in pattern stmts are not recorded. */
2443 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2444 {
2445 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2446 for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
2447 gsi_next (&gsi))
2448 {
2449 gimple *stmt = gsi_stmt (gsi);
2450 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2451 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2452 {
2453 walk_stmt_info wi;
2454 memset (&wi, 0, sizeof (wi));
2455 wi.info = LOOP_VINFO_LOOP (loop_vinfo);
2456 gimple_stmt_iterator gsi2
2457 = gsi_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info));
2458 walk_gimple_stmt (&gsi2, vect_detect_hybrid_slp_2,
2459 vect_detect_hybrid_slp_1, &wi);
2460 walk_gimple_seq (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info),
2461 vect_detect_hybrid_slp_2,
2462 vect_detect_hybrid_slp_1, &wi);
2463 }
2464 }
2465 }
2466
2467 /* Then walk the SLP instance trees marking stmts with uses in
2468 non-SLP stmts as hybrid, also propagating hybrid down the
2469 SLP tree, collecting the above info on-the-fly. */
2470 FOR_EACH_VEC_ELT (slp_instances, i, instance)
2471 {
2472 for (unsigned i = 0; i < SLP_INSTANCE_GROUP_SIZE (instance); ++i)
2473 vect_detect_hybrid_slp_stmts (SLP_INSTANCE_TREE (instance),
2474 i, pure_slp);
2475 }
2476 }
2477
2478
2479 /* Initialize a bb_vec_info struct for the statements between
2480 REGION_BEGIN_IN (inclusive) and REGION_END_IN (exclusive). */
2481
2482 _bb_vec_info::_bb_vec_info (gimple_stmt_iterator region_begin_in,
2483 gimple_stmt_iterator region_end_in,
2484 vec_info_shared *shared)
2485 : vec_info (vec_info::bb, init_cost (NULL), shared),
2486 bb (gsi_bb (region_begin_in)),
2487 region_begin (region_begin_in),
2488 region_end (region_end_in)
2489 {
2490 gimple_stmt_iterator gsi;
2491
2492 for (gsi = region_begin; gsi_stmt (gsi) != gsi_stmt (region_end);
2493 gsi_next (&gsi))
2494 {
2495 gimple *stmt = gsi_stmt (gsi);
2496 gimple_set_uid (stmt, 0);
2497 add_stmt (stmt);
2498 }
2499
2500 bb->aux = this;
2501 }
2502
2503
2504 /* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
2505 stmts in the basic block. */
2506
2507 _bb_vec_info::~_bb_vec_info ()
2508 {
2509 for (gimple_stmt_iterator si = region_begin;
2510 gsi_stmt (si) != gsi_stmt (region_end); gsi_next (&si))
2511 {
2512 gimple *stmt = gsi_stmt (si);
2513 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2514
2515 if (stmt_info)
2516 /* Free stmt_vec_info. */
2517 free_stmt_vec_info (stmt);
2518
2519 /* Reset region marker. */
2520 gimple_set_uid (stmt, -1);
2521 }
2522
2523 bb->aux = NULL;
2524 }
2525
2526 /* Subroutine of vect_slp_analyze_node_operations. Handle the root of NODE,
2527 given that child nodes have already been processed, and that
2528 their def types currently match their SLP node's def type. */
2529
2530 static bool
2531 vect_slp_analyze_node_operations_1 (vec_info *vinfo, slp_tree node,
2532 slp_instance node_instance,
2533 stmt_vector_for_cost *cost_vec)
2534 {
2535 gimple *stmt = SLP_TREE_SCALAR_STMTS (node)[0];
2536 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2537 gcc_assert (stmt_info);
2538 gcc_assert (STMT_SLP_TYPE (stmt_info) != loop_vect);
2539
2540 /* For BB vectorization vector types are assigned here.
2541 Memory accesses already got their vector type assigned
2542 in vect_analyze_data_refs. */
2543 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2544 if (bb_vinfo
2545 && ! STMT_VINFO_DATA_REF (stmt_info))
2546 {
2547 tree vectype, nunits_vectype;
2548 if (!vect_get_vector_types_for_stmt (stmt_info, &vectype,
2549 &nunits_vectype))
2550 /* We checked this when building the node. */
2551 gcc_unreachable ();
2552 if (vectype == boolean_type_node)
2553 {
2554 vectype = vect_get_mask_type_for_stmt (stmt_info);
2555 if (!vectype)
2556 /* vect_get_mask_type_for_stmt has already explained the
2557 failure. */
2558 return false;
2559 }
2560
2561 gimple *sstmt;
2562 unsigned int i;
2563 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, sstmt)
2564 STMT_VINFO_VECTYPE (vinfo_for_stmt (sstmt)) = vectype;
2565 }
2566
2567 /* Calculate the number of vector statements to be created for the
2568 scalar stmts in this node. For SLP reductions it is equal to the
2569 number of vector statements in the children (which has already been
2570 calculated by the recursive call). Otherwise it is the number of
2571 scalar elements in one scalar iteration (DR_GROUP_SIZE) multiplied by
2572 VF divided by the number of elements in a vector. */
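/* For example (illustrative numbers), with a VF of 4, a group of 2 scalar
   stmts and a vectype of 8 elements this is 4 * 2 / 8 = 1 vector
   statement; with a group of 16 it would be 4 * 16 / 8 = 8. */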
2573 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2574 && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
2575 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
2576 = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_CHILDREN (node)[0]);
2577 else
2578 {
2579 poly_uint64 vf;
2580 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
2581 vf = loop_vinfo->vectorization_factor;
2582 else
2583 vf = 1;
2584 unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (node_instance);
2585 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2586 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
2587 = vect_get_num_vectors (vf * group_size, vectype);
2588 }
2589
2590 bool dummy;
2591 return vect_analyze_stmt (stmt, &dummy, node, node_instance, cost_vec);
2592 }
2593
2594 /* Analyze statements contained in SLP tree NODE after recursively analyzing
2595 the subtree. NODE_INSTANCE is the SLP instance of VINFO that contains NODE.
2596
2597 Return true if the operations are supported. */
2598
2599 static bool
2600 vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node,
2601 slp_instance node_instance,
2602 scalar_stmts_to_slp_tree_map_t *visited,
2603 scalar_stmts_to_slp_tree_map_t *lvisited,
2604 stmt_vector_for_cost *cost_vec)
2605 {
2606 int i, j;
2607 slp_tree child;
2608
2609 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2610 return true;
2611
2612 /* If we already analyzed the exact same set of scalar stmts we're done.
2613 We share the generated vector stmts for those. */
2614 slp_tree *leader;
2615 if ((leader = visited->get (SLP_TREE_SCALAR_STMTS (node)))
2616 || (leader = lvisited->get (SLP_TREE_SCALAR_STMTS (node))))
2617 {
2618 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
2619 = SLP_TREE_NUMBER_OF_VEC_STMTS (*leader);
2620 return true;
2621 }
2622
2623 /* The SLP graph is acyclic, so not caching whether we failed or succeeded
2624 causes no issue: we throw away the lvisited set when we fail anyway. */
2626 lvisited->put (SLP_TREE_SCALAR_STMTS (node).copy (), node);
2627
2628 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2629 if (!vect_slp_analyze_node_operations (vinfo, child, node_instance,
2630 visited, lvisited, cost_vec))
2631 return false;
2632
2633 /* Push SLP node def-type to stmt operands. */
2634 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
2635 if (SLP_TREE_DEF_TYPE (child) != vect_internal_def)
2636 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (child)[0]))
2637 = SLP_TREE_DEF_TYPE (child);
2638 bool res = vect_slp_analyze_node_operations_1 (vinfo, node, node_instance,
2639 cost_vec);
2640 /* Restore def-types. */
2641 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
2642 if (SLP_TREE_DEF_TYPE (child) != vect_internal_def)
2643 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (child)[0]))
2644 = vect_internal_def;
2645 if (! res)
2646 return false;
2647
2648 return true;
2649 }
2650
2651
2652 /* Analyze statements in SLP instances of VINFO. Return true if the
2653 operations are supported. */
2654
2655 bool
2656 vect_slp_analyze_operations (vec_info *vinfo)
2657 {
2658 slp_instance instance;
2659 int i;
2660
2661 DUMP_VECT_SCOPE ("vect_slp_analyze_operations");
2662
2663 scalar_stmts_to_slp_tree_map_t *visited
2664 = new scalar_stmts_to_slp_tree_map_t ();
2665 for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
2666 {
2667 scalar_stmts_to_slp_tree_map_t lvisited;
2668 stmt_vector_for_cost cost_vec;
2669 cost_vec.create (2);
2670 if (!vect_slp_analyze_node_operations (vinfo,
2671 SLP_INSTANCE_TREE (instance),
2672 instance, visited, &lvisited,
2673 &cost_vec))
2674 {
2675 dump_printf_loc (MSG_NOTE, vect_location,
2676 "removing SLP instance operations starting from: ");
2677 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
2678 SLP_TREE_SCALAR_STMTS
2679 (SLP_INSTANCE_TREE (instance))[0], 0);
2680 vect_free_slp_instance (instance, false);
2681 vinfo->slp_instances.ordered_remove (i);
2682 cost_vec.release ();
2683 }
2684 else
2685 {
2686 for (scalar_stmts_to_slp_tree_map_t::iterator x = lvisited.begin();
2687 x != lvisited.end(); ++x)
2688 visited->put ((*x).first.copy (), (*x).second);
2689 i++;
2690
2691 add_stmt_costs (vinfo->target_cost_data, &cost_vec);
2692 cost_vec.release ();
2693 }
2694 }
2695 delete visited;
2696
2697 return !vinfo->slp_instances.is_empty ();
2698 }
2699
2700
2701 /* Compute the scalar cost of the SLP node NODE and its children
2702 and record it in COST_VEC. Do not account defs that are marked
2703 in LIFE and update LIFE according to uses of NODE. */
2704
2705 static void
2706 vect_bb_slp_scalar_cost (basic_block bb,
2707 slp_tree node, vec<bool, va_heap> *life,
2708 stmt_vector_for_cost *cost_vec)
2709 {
2710 unsigned i;
2711 gimple *stmt;
2712 slp_tree child;
2713
2714 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
2715 {
2716 ssa_op_iter op_iter;
2717 def_operand_p def_p;
2718 stmt_vec_info stmt_info;
2719
2720 if ((*life)[i])
2721 continue;
2722
2723 /* If there is a non-vectorized use of the defs then the scalar
2724 stmt is kept live in which case we do not account it or any
2725 required defs in the SLP children in the scalar cost. This
2726 way we make the vectorization more costly when compared to
2727 the scalar cost. */
2728 FOR_EACH_SSA_DEF_OPERAND (def_p, stmt, op_iter, SSA_OP_DEF)
2729 {
2730 imm_use_iterator use_iter;
2731 gimple *use_stmt;
2732 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
2733 if (!is_gimple_debug (use_stmt)
2734 && (! vect_stmt_in_region_p (vinfo_for_stmt (stmt)->vinfo,
2735 use_stmt)
2736 || ! PURE_SLP_STMT (vinfo_for_stmt (use_stmt))))
2737 {
2738 (*life)[i] = true;
2739 BREAK_FROM_IMM_USE_STMT (use_iter);
2740 }
2741 }
2742 if ((*life)[i])
2743 continue;
2744
2745 /* Count scalar stmts only once. */
2746 if (gimple_visited_p (stmt))
2747 continue;
2748 gimple_set_visited (stmt, true);
2749
2750 stmt_info = vinfo_for_stmt (stmt);
2751 vect_cost_for_stmt kind;
2752 if (STMT_VINFO_DATA_REF (stmt_info))
2753 {
2754 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
2755 kind = scalar_load;
2756 else
2757 kind = scalar_store;
2758 }
2759 else
2760 kind = scalar_stmt;
2761 record_stmt_cost (cost_vec, 1, kind, stmt_info, 0, vect_body);
2762 }
2763
2764 auto_vec<bool, 20> subtree_life;
2765 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2766 {
2767 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
2768 {
2769 /* Do not directly pass LIFE to the recursive call, copy it to
2770 confine changes in the callee to the current child/subtree. */
2771 subtree_life.safe_splice (*life);
2772 vect_bb_slp_scalar_cost (bb, child, &subtree_life, cost_vec);
2773 subtree_life.truncate (0);
2774 }
2775 }
2776 }
2777
2778 /* Check if vectorization of the basic block is profitable. */
2779
2780 static bool
2781 vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo)
2782 {
2783 vec<slp_instance> slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
2784 slp_instance instance;
2785 int i;
2786 unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
2787 unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;
2788
2789 /* Calculate scalar cost. */
2790 stmt_vector_for_cost scalar_costs;
2791 scalar_costs.create (0);
2792 FOR_EACH_VEC_ELT (slp_instances, i, instance)
2793 {
2794 auto_vec<bool, 20> life;
2795 life.safe_grow_cleared (SLP_INSTANCE_GROUP_SIZE (instance));
2796 vect_bb_slp_scalar_cost (BB_VINFO_BB (bb_vinfo),
2797 SLP_INSTANCE_TREE (instance),
2798 &life, &scalar_costs);
2799 }
2800 void *target_cost_data = init_cost (NULL);
2801 add_stmt_costs (target_cost_data, &scalar_costs);
2802 scalar_costs.release ();
2803 unsigned dummy;
2804 finish_cost (target_cost_data, &dummy, &scalar_cost, &dummy);
2805 destroy_cost_data (target_cost_data);
2806
2807 /* Unset visited flag. */
2808 for (gimple_stmt_iterator gsi = bb_vinfo->region_begin;
2809 gsi_stmt (gsi) != gsi_stmt (bb_vinfo->region_end); gsi_next (&gsi))
2810 gimple_set_visited (gsi_stmt (gsi), false);
2811
2812 /* Complete the target-specific cost calculation. */
2813 finish_cost (BB_VINFO_TARGET_COST_DATA (bb_vinfo), &vec_prologue_cost,
2814 &vec_inside_cost, &vec_epilogue_cost);
2815
2816 vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;
2817
2818 if (dump_enabled_p ())
2819 {
2820 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
2821 dump_printf (MSG_NOTE, " Vector inside of basic block cost: %d\n",
2822 vec_inside_cost);
2823 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n", vec_prologue_cost);
2824 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n", vec_epilogue_cost);
2825 dump_printf (MSG_NOTE, " Scalar cost of basic block: %d\n", scalar_cost);
2826 }
2827
2828 /* Vectorization is profitable if its cost does not exceed the cost of the
2829 scalar version. Note that we err on the vector side for equal cost
2830 because the cost estimate is otherwise quite pessimistic (constant uses
2831 are free on the scalar side but cost a load on the vector side for
2832 example). */
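/* E.g. a scalar cost of 8 against a vector cost of 6 + 2 = 8 still
   vectorizes. */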
2833 if (vec_outside_cost + vec_inside_cost > scalar_cost)
2834 return false;
2835
2836 return true;
2837 }
2838
2839 /* Check if the basic block can be vectorized. Returns a bb_vec_info
2840 if so and sets FATAL to true if failure is independent of
2841 current_vector_size. */
2842
2843 static bb_vec_info
2844 vect_slp_analyze_bb_1 (gimple_stmt_iterator region_begin,
2845 gimple_stmt_iterator region_end,
2846 vec<data_reference_p> datarefs, int n_stmts,
2847 bool &fatal, vec_info_shared *shared)
2848 {
2849 bb_vec_info bb_vinfo;
2850 slp_instance instance;
2851 int i;
2852 poly_uint64 min_vf = 2;
2853
2854 /* The first group of checks is independent of the vector size. */
2855 fatal = true;
2856
2857 if (n_stmts > PARAM_VALUE (PARAM_SLP_MAX_INSNS_IN_BB))
2858 {
2859 if (dump_enabled_p ())
2860 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2861 "not vectorized: too many instructions in "
2862 "basic block.\n");
2863 free_data_refs (datarefs);
2864 return NULL;
2865 }
2866
2867 bb_vinfo = new _bb_vec_info (region_begin, region_end, shared);
2868 if (!bb_vinfo)
2869 return NULL;
2870
2871 BB_VINFO_DATAREFS (bb_vinfo) = datarefs;
2872 bb_vinfo->shared->save_datarefs ();
2873
2874 /* Analyze the data references. */
2875
2876 if (!vect_analyze_data_refs (bb_vinfo, &min_vf))
2877 {
2878 if (dump_enabled_p ())
2879 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2880 "not vectorized: unhandled data-ref in basic "
2881 "block.\n");
2882
2883 delete bb_vinfo;
2884 return NULL;
2885 }
2886
2887 if (BB_VINFO_DATAREFS (bb_vinfo).length () < 2)
2888 {
2889 if (dump_enabled_p ())
2890 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2891 "not vectorized: not enough data-refs in "
2892 "basic block.\n");
2893
2894 delete bb_vinfo;
2895 return NULL;
2896 }
2897
2898 if (!vect_analyze_data_ref_accesses (bb_vinfo))
2899 {
2900 if (dump_enabled_p ())
2901 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2902 "not vectorized: unhandled data access in "
2903 "basic block.\n");
2904
2905 delete bb_vinfo;
2906 return NULL;
2907 }
2908
2909 /* If there are no grouped stores in the region there is no need
2910 to continue with pattern recog as vect_analyze_slp will fail
2911 anyway. */
2912 if (bb_vinfo->grouped_stores.is_empty ())
2913 {
2914 if (dump_enabled_p ())
2915 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2916 "not vectorized: no grouped stores in "
2917 "basic block.\n");
2918
2919 delete bb_vinfo;
2920 return NULL;
2921 }
2922
2923 /* The rest of the analysis below depends on the vector size in some way. */
2924 fatal = false;
2925
2926 vect_pattern_recog (bb_vinfo);
2927
2928 /* Check the SLP opportunities in the basic block, analyze and build SLP
2929 trees. */
2930 if (!vect_analyze_slp (bb_vinfo, n_stmts))
2931 {
2932 if (dump_enabled_p ())
2933 {
2934 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2935 "Failed to SLP the basic block.\n");
2936 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2937 "not vectorized: failed to find SLP opportunities "
2938 "in basic block.\n");
2939 }
2940
2941 delete bb_vinfo;
2942 return NULL;
2943 }
2944
2945 vect_record_base_alignments (bb_vinfo);
2946
2947 /* Analyze and verify the alignment of data references and the
2948 dependence in the SLP instances. */
2949 for (i = 0; BB_VINFO_SLP_INSTANCES (bb_vinfo).iterate (i, &instance); )
2950 {
2951 if (! vect_slp_analyze_and_verify_instance_alignment (instance)
2952 || ! vect_slp_analyze_instance_dependence (instance))
2953 {
2954 dump_printf_loc (MSG_NOTE, vect_location,
2955 "removing SLP instance operations starting from: ");
2956 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
2957 SLP_TREE_SCALAR_STMTS
2958 (SLP_INSTANCE_TREE (instance))[0], 0);
2959 vect_free_slp_instance (instance, false);
2960 BB_VINFO_SLP_INSTANCES (bb_vinfo).ordered_remove (i);
2961 continue;
2962 }
2963
2964 /* Mark all the statements that we want to vectorize as pure SLP and
2965 relevant. */
2966 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance), pure_slp, -1);
2967 vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));
2968
2969 i++;
2970 }
2971 if (! BB_VINFO_SLP_INSTANCES (bb_vinfo).length ())
2972 {
2973 delete bb_vinfo;
2974 return NULL;
2975 }
2976
2977 if (!vect_slp_analyze_operations (bb_vinfo))
2978 {
2979 if (dump_enabled_p ())
2980 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2981 "not vectorized: bad operation in basic block.\n");
2982
2983 delete bb_vinfo;
2984 return NULL;
2985 }
2986
2987 /* Cost model: check if the vectorization is worthwhile. */
2988 if (!unlimited_cost_model (NULL)
2989 && !vect_bb_vectorization_profitable_p (bb_vinfo))
2990 {
2991 if (dump_enabled_p ())
2992 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2993 "not vectorized: vectorization is not "
2994 "profitable.\n");
2995
2996 delete bb_vinfo;
2997 return NULL;
2998 }
2999
3000 if (dump_enabled_p ())
3001 dump_printf_loc (MSG_NOTE, vect_location,
3002 "Basic block will be vectorized using SLP\n");
3003
3004 return bb_vinfo;
3005 }
3006
3007
3008 /* Main entry for the BB vectorizer. Analyze and transform BB, returns
3009 true if anything in the basic-block was vectorized. */
3010
3011 bool
3012 vect_slp_bb (basic_block bb)
3013 {
3014 bb_vec_info bb_vinfo;
3015 gimple_stmt_iterator gsi;
3016 bool any_vectorized = false;
3017 auto_vector_sizes vector_sizes;
3018
3019 DUMP_VECT_SCOPE ("vect_slp_analyze_bb");
3020
3021 /* Autodetect first vector size we try. */
3022 current_vector_size = 0;
3023 targetm.vectorize.autovectorize_vector_sizes (&vector_sizes);
3024 unsigned int next_size = 0;
3025
3026 gsi = gsi_start_bb (bb);
3027
3028 poly_uint64 autodetected_vector_size = 0;
3029 while (1)
3030 {
3031 if (gsi_end_p (gsi))
3032 break;
3033
3034 gimple_stmt_iterator region_begin = gsi;
3035 vec<data_reference_p> datarefs = vNULL;
3036 int insns = 0;
3037
3038 for (; !gsi_end_p (gsi); gsi_next (&gsi))
3039 {
3040 gimple *stmt = gsi_stmt (gsi);
3041 if (is_gimple_debug (stmt))
3042 continue;
3043 insns++;
3044
3045 if (gimple_location (stmt) != UNKNOWN_LOCATION)
3046 vect_location = stmt;
3047
3048 if (!vect_find_stmt_data_reference (NULL, stmt, &datarefs))
3049 break;
3050 }
3051
3052 /* Skip leading unhandled stmts. */
3053 if (gsi_stmt (region_begin) == gsi_stmt (gsi))
3054 {
3055 gsi_next (&gsi);
3056 continue;
3057 }
3058
3059 gimple_stmt_iterator region_end = gsi;
3060
3061 bool vectorized = false;
3062 bool fatal = false;
3063 vec_info_shared shared;
3064 bb_vinfo = vect_slp_analyze_bb_1 (region_begin, region_end,
3065 datarefs, insns, fatal, &shared);
3066 if (bb_vinfo
3067 && dbg_cnt (vect_slp))
3068 {
3069 if (dump_enabled_p ())
3070 dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB part\n");
3071
3072 bb_vinfo->shared->check_datarefs ();
3073 vect_schedule_slp (bb_vinfo);
3074
3075 unsigned HOST_WIDE_INT bytes;
3076 if (current_vector_size.is_constant (&bytes))
3077 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
3078 "basic block part vectorized using "
3079 HOST_WIDE_INT_PRINT_UNSIGNED " byte "
3080 "vectors\n", bytes);
3081 else
3082 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
3083 "basic block part vectorized using variable "
3084 "length vectors\n");
3085
3086 vectorized = true;
3087 }
3088 delete bb_vinfo;
3089
3090 any_vectorized |= vectorized;
3091
3092 if (next_size == 0)
3093 autodetected_vector_size = current_vector_size;
3094
3095 if (next_size < vector_sizes.length ()
3096 && known_eq (vector_sizes[next_size], autodetected_vector_size))
3097 next_size += 1;
3098
3099 if (vectorized
3100 || next_size == vector_sizes.length ()
3101 || known_eq (current_vector_size, 0U)
3102 /* If vect_slp_analyze_bb_1 signaled that analysis for all
3103 vector sizes will fail do not bother iterating. */
3104 || fatal)
3105 {
3106 if (gsi_end_p (region_end))
3107 break;
3108
3109 /* Skip the unhandled stmt. */
3110 gsi_next (&gsi);
3111
3112 /* And reset vector sizes. */
3113 current_vector_size = 0;
3114 next_size = 0;
3115 }
3116 else
3117 {
3118 /* Try the next biggest vector size. */
3119 current_vector_size = vector_sizes[next_size++];
3120 if (dump_enabled_p ())
3121 {
3122 dump_printf_loc (MSG_NOTE, vect_location,
3123 "***** Re-trying analysis with "
3124 "vector size ");
3125 dump_dec (MSG_NOTE, current_vector_size);
3126 dump_printf (MSG_NOTE, "\n");
3127 }
3128
3129 /* Start over. */
3130 gsi = region_begin;
3131 }
3132 }
3133
3134 return any_vectorized;
3135 }
3136
3137
3138 /* Return true if the vector type chosen for the boolean constant that is
3139 operand OPNUM of statement STMT is a boolean vector. */
3140
3141 static bool
3142 vect_mask_constant_operand_p (gimple *stmt, int opnum)
3143 {
3144 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
3145 enum tree_code code = gimple_expr_code (stmt);
3146 tree op, vectype;
3147 enum vect_def_type dt;
3148
3149 /* For comparisons and COND_EXPRs the type is chosen depending
3150 on the other comparison operand. */
3151 if (TREE_CODE_CLASS (code) == tcc_comparison)
3152 {
3153 if (opnum)
3154 op = gimple_assign_rhs1 (stmt);
3155 else
3156 op = gimple_assign_rhs2 (stmt);
3157
3158 if (!vect_is_simple_use (op, stmt_vinfo->vinfo, &dt, &vectype))
3159 gcc_unreachable ();
3160
3161 return !vectype || VECTOR_BOOLEAN_TYPE_P (vectype);
3162 }
3163
3164 if (code == COND_EXPR)
3165 {
3166 tree cond = gimple_assign_rhs1 (stmt);
3167
3168 if (TREE_CODE (cond) == SSA_NAME)
3169 op = cond;
3170 else if (opnum)
3171 op = TREE_OPERAND (cond, 1);
3172 else
3173 op = TREE_OPERAND (cond, 0);
3174
3175 if (!vect_is_simple_use (op, stmt_vinfo->vinfo, &dt, &vectype))
3176 gcc_unreachable ();
3177
3178 return !vectype || VECTOR_BOOLEAN_TYPE_P (vectype);
3179 }
3180
3181 return VECTOR_BOOLEAN_TYPE_P (STMT_VINFO_VECTYPE (stmt_vinfo));
3182 }
3183
3184 /* Build a variable-length vector in which the elements in ELTS are repeated
3185 to fill NRESULTS vectors of type VECTOR_TYPE. Store the vectors in
3186 RESULTS and add any new instructions to SEQ.
3187
3188 The approach we use is:
3189
3190 (1) Find a vector mode VM with integer elements of mode IM.
3191
3192 (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
3193 ELTS' has mode IM. This involves creating NELTS' VIEW_CONVERT_EXPRs
3194 from small vectors to IM.
3195
3196 (3) Duplicate each ELTS'[I] into a vector of mode VM.
3197
3198 (4) Use a tree of interleaving VEC_PERM_EXPRs to create VMs with the
3199 correct byte contents.
3200
3201 (5) Use VIEW_CONVERT_EXPR to cast the final VMs to the required type.
3202
3203 We try to find the largest IM for which this sequence works, in order
3204 to cut down on the number of interleaves. */
3205
3206 void
3207 duplicate_and_interleave (gimple_seq *seq, tree vector_type, vec<tree> elts,
3208 unsigned int nresults, vec<tree> &results)
3209 {
3210 unsigned int nelts = elts.length ();
3211 tree element_type = TREE_TYPE (vector_type);
3212
3213 /* (1) Find a vector mode VM with integer elements of mode IM. */
3214 unsigned int nvectors = 1;
3215 tree new_vector_type;
3216 tree permutes[2];
3217 if (!can_duplicate_and_interleave_p (nelts, TYPE_MODE (element_type),
3218 &nvectors, &new_vector_type,
3219 permutes))
3220 gcc_unreachable ();
3221
3222 /* Get a vector type that holds ELTS[0:NELTS/NELTS']. */
3223 unsigned int partial_nelts = nelts / nvectors;
3224 tree partial_vector_type = build_vector_type (element_type, partial_nelts);
3225
3226 tree_vector_builder partial_elts;
3227 auto_vec<tree, 32> pieces (nvectors * 2);
3228 pieces.quick_grow (nvectors * 2);
3229 for (unsigned int i = 0; i < nvectors; ++i)
3230 {
3231 /* (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
3232 ELTS' has mode IM. */
3233 partial_elts.new_vector (partial_vector_type, partial_nelts, 1);
3234 for (unsigned int j = 0; j < partial_nelts; ++j)
3235 partial_elts.quick_push (elts[i * partial_nelts + j]);
3236 tree t = gimple_build_vector (seq, &partial_elts);
3237 t = gimple_build (seq, VIEW_CONVERT_EXPR,
3238 TREE_TYPE (new_vector_type), t);
3239
3240 /* (3) Duplicate each ELTS'[I] into a vector of mode VM. */
3241 pieces[i] = gimple_build_vector_from_val (seq, new_vector_type, t);
3242 }
3243
3244 /* (4) Use a tree of VEC_PERM_EXPRs to create a single VM with the
3245 correct byte contents.
3246
3247 We need to repeat the following operation log2(nvectors) times:
3248
3249 out[i * 2] = VEC_PERM_EXPR (in[i], in[i + hi_start], lo_permute);
3250 out[i * 2 + 1] = VEC_PERM_EXPR (in[i], in[i + hi_start], hi_permute);
3251
3252 However, if each input repeats every N elements and the VF is
3253 a multiple of N * 2, the HI result is the same as the LO. */
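/* For example, with NVECTORS = 2 the loop runs once: given duplicated
   inputs A A A A ... and B B B B ... (A = ELTS'[0], B = ELTS'[1]), the
   "lo" permute produces A B A B ..., whose byte contents repeat the
   original ELTS sequence as required (illustrative case). */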
3254 unsigned int in_start = 0;
3255 unsigned int out_start = nvectors;
3256 unsigned int hi_start = nvectors / 2;
3257 /* A bound on the number of outputs needed to produce NRESULTS results
3258 in the final iteration. */
3259 unsigned int noutputs_bound = nvectors * nresults;
3260 for (unsigned int in_repeat = 1; in_repeat < nvectors; in_repeat *= 2)
3261 {
3262 noutputs_bound /= 2;
3263 unsigned int limit = MIN (noutputs_bound, nvectors);
3264 for (unsigned int i = 0; i < limit; ++i)
3265 {
3266 if ((i & 1) != 0
3267 && multiple_p (TYPE_VECTOR_SUBPARTS (new_vector_type),
3268 2 * in_repeat))
3269 {
3270 pieces[out_start + i] = pieces[out_start + i - 1];
3271 continue;
3272 }
3273
3274 tree output = make_ssa_name (new_vector_type);
3275 tree input1 = pieces[in_start + (i / 2)];
3276 tree input2 = pieces[in_start + (i / 2) + hi_start];
3277 gassign *stmt = gimple_build_assign (output, VEC_PERM_EXPR,
3278 input1, input2,
3279 permutes[i & 1]);
3280 gimple_seq_add_stmt (seq, stmt);
3281 pieces[out_start + i] = output;
3282 }
3283 std::swap (in_start, out_start);
3284 }
3285
3286 /* (5) Use VIEW_CONVERT_EXPR to cast the final VM to the required type. */
3287 results.reserve (nresults);
3288 for (unsigned int i = 0; i < nresults; ++i)
3289 if (i < nvectors)
3290 results.quick_push (gimple_build (seq, VIEW_CONVERT_EXPR, vector_type,
3291 pieces[in_start + i]));
3292 else
3293 results.quick_push (results[i - nvectors]);
3294 }
3295
3296
3297 /* For constant and loop invariant defs of SLP_NODE this function returns
3298 (vector) defs (VEC_OPRNDS) that will be used in the vectorized stmts.
3299 OP_NUM determines if we gather defs for operand 0 or operand 1 of the RHS of
3300 scalar stmts. NUMBER_OF_VECTORS is the number of vector defs to create. */
3303
3304 static void
3305 vect_get_constant_vectors (tree op, slp_tree slp_node,
3306 vec<tree> *vec_oprnds,
3307 unsigned int op_num, unsigned int number_of_vectors)
3308 {
3309 vec<gimple *> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
3310 gimple *stmt = stmts[0];
3311 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
3312 unsigned HOST_WIDE_INT nunits;
3313 tree vec_cst;
3314 unsigned j, number_of_places_left_in_vector;
3315 tree vector_type;
3316 tree vop;
3317 int group_size = stmts.length ();
3318 unsigned int vec_num, i;
3319 unsigned number_of_copies = 1;
3320 vec<tree> voprnds;
3321 voprnds.create (number_of_vectors);
3322 bool constant_p, is_store;
3323 tree neutral_op = NULL;
3324 enum tree_code code = gimple_expr_code (stmt);
3325 gimple_seq ctor_seq = NULL;
3326 auto_vec<tree, 16> permute_results;
3327
3328 /* Check if vector type is a boolean vector. */
3329 if (VECT_SCALAR_BOOLEAN_TYPE_P (TREE_TYPE (op))
3330 && vect_mask_constant_operand_p (stmt, op_num))
3331 vector_type
3332 = build_same_sized_truth_vector_type (STMT_VINFO_VECTYPE (stmt_vinfo));
3333 else
3334 vector_type = get_vectype_for_scalar_type (TREE_TYPE (op));
3335
3336 if (STMT_VINFO_DATA_REF (stmt_vinfo))
3337 {
3338 is_store = true;
3339 op = gimple_assign_rhs1 (stmt);
3340 }
3341 else
3342 is_store = false;
3343
3344 gcc_assert (op);
3345
3346 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
3347 created vectors. It is greater than 1 if unrolling is performed.
3348
3349 For example, we have two scalar operands, s1 and s2 (e.g., group of
3350 strided accesses of size two), while NUNITS is four (i.e., four scalars
3351 of this type can be packed in a vector). The output vector will contain
3352 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
3353 will be 2).
3354
3355 If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
3356 containing the operands.
3357
3358 For example, NUNITS is four as before, and the group size is 8
3359 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
3360 {s5, s6, s7, s8}. */
3361
3362 /* When using duplicate_and_interleave, we just need one element for
3363 each scalar statement. */
3364 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
3365 nunits = group_size;
3366
3367 number_of_copies = nunits * number_of_vectors / group_size;
3368
3369 number_of_places_left_in_vector = nunits;
3370 constant_p = true;
3371 tree_vector_builder elts (vector_type, nunits, 1);
3372 elts.quick_grow (nunits);
3373 bool place_after_defs = false;
3374 for (j = 0; j < number_of_copies; j++)
3375 {
3376 for (i = group_size - 1; stmts.iterate (i, &stmt); i--)
3377 {
3378 if (is_store)
3379 op = gimple_assign_rhs1 (stmt);
3380 else
3381 {
3382 switch (code)
3383 {
3384 case COND_EXPR:
3385 {
3386 tree cond = gimple_assign_rhs1 (stmt);
3387 if (TREE_CODE (cond) == SSA_NAME)
3388 op = gimple_op (stmt, op_num + 1);
3389 else if (op_num == 0 || op_num == 1)
3390 op = TREE_OPERAND (cond, op_num);
3391 else
3392 {
3393 if (op_num == 2)
3394 op = gimple_assign_rhs2 (stmt);
3395 else
3396 op = gimple_assign_rhs3 (stmt);
3397 }
3398 }
3399 break;
3400
3401 case CALL_EXPR:
3402 op = gimple_call_arg (stmt, op_num);
3403 break;
3404
3405 case LSHIFT_EXPR:
3406 case RSHIFT_EXPR:
3407 case LROTATE_EXPR:
3408 case RROTATE_EXPR:
3409 op = gimple_op (stmt, op_num + 1);
3410 /* Unlike the other binary operators, shifts/rotates have
3411 the shift count being int, instead of the same type as
3412 the lhs, so make sure the scalar is the right type if
3413 we are dealing with vectors of
3414 long long/long/short/char. */
3415 if (op_num == 1 && TREE_CODE (op) == INTEGER_CST)
3416 op = fold_convert (TREE_TYPE (vector_type), op);
3417 break;
3418
3419 default:
3420 op = gimple_op (stmt, op_num + 1);
3421 break;
3422 }
3423 }
3424
3425 /* Create 'vect_ = {op0,op1,...,opn}'. */
3426 number_of_places_left_in_vector--;
3427 tree orig_op = op;
3428 if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op)))
3429 {
3430 if (CONSTANT_CLASS_P (op))
3431 {
3432 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
3433 {
3434 /* Can't use VIEW_CONVERT_EXPR for booleans because
3435 of possibly different sizes of scalar value and
3436 vector element. */
3437 if (integer_zerop (op))
3438 op = build_int_cst (TREE_TYPE (vector_type), 0);
3439 else if (integer_onep (op))
3440 op = build_all_ones_cst (TREE_TYPE (vector_type));
3441 else
3442 gcc_unreachable ();
3443 }
3444 else
3445 op = fold_unary (VIEW_CONVERT_EXPR,
3446 TREE_TYPE (vector_type), op);
3447 gcc_assert (op && CONSTANT_CLASS_P (op));
3448 }
3449 else
3450 {
3451 tree new_temp = make_ssa_name (TREE_TYPE (vector_type));
3452 gimple *init_stmt;
3453 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
3454 {
3455 tree true_val
3456 = build_all_ones_cst (TREE_TYPE (vector_type));
3457 tree false_val
3458 = build_zero_cst (TREE_TYPE (vector_type));
3459 gcc_assert (INTEGRAL_TYPE_P (TREE_TYPE (op)));
3460 init_stmt = gimple_build_assign (new_temp, COND_EXPR,
3461 op, true_val,
3462 false_val);
3463 }
3464 else
3465 {
3466 op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type),
3467 op);
3468 init_stmt
3469 = gimple_build_assign (new_temp, VIEW_CONVERT_EXPR,
3470 op);
3471 }
3472 gimple_seq_add_stmt (&ctor_seq, init_stmt);
3473 op = new_temp;
3474 }
3475 }
3476 elts[number_of_places_left_in_vector] = op;
3477 if (!CONSTANT_CLASS_P (op))
3478 constant_p = false;
3479 if (TREE_CODE (orig_op) == SSA_NAME
3480 && !SSA_NAME_IS_DEFAULT_DEF (orig_op)
3481 && STMT_VINFO_BB_VINFO (stmt_vinfo)
3482 && (STMT_VINFO_BB_VINFO (stmt_vinfo)->bb
3483 == gimple_bb (SSA_NAME_DEF_STMT (orig_op))))
3484 place_after_defs = true;
3485
3486 if (number_of_places_left_in_vector == 0)
3487 {
3488 if (constant_p
3489 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
3490 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
3491 vec_cst = gimple_build_vector (&ctor_seq, &elts);
3492 else
3493 {
3494 if (vec_oprnds->is_empty ())
3495 duplicate_and_interleave (&ctor_seq, vector_type, elts,
3496 number_of_vectors,
3497 permute_results);
3498 vec_cst = permute_results[number_of_vectors - j - 1];
3499 }
3500 tree init;
3501 gimple_stmt_iterator gsi;
3502 if (place_after_defs)
3503 {
3504 gsi = gsi_for_stmt
3505 (vect_find_last_scalar_stmt_in_slp (slp_node));
3506 init = vect_init_vector (stmt, vec_cst, vector_type, &gsi);
3507 }
3508 else
3509 init = vect_init_vector (stmt, vec_cst, vector_type, NULL);
3510 if (ctor_seq != NULL)
3511 {
3512 gsi = gsi_for_stmt (SSA_NAME_DEF_STMT (init));
3513 gsi_insert_seq_before (&gsi, ctor_seq, GSI_SAME_STMT);
3514 ctor_seq = NULL;
3515 }
3516 voprnds.quick_push (init);
3517 place_after_defs = false;
3518 number_of_places_left_in_vector = nunits;
3519 constant_p = true;
3520 elts.new_vector (vector_type, nunits, 1);
3521 elts.quick_grow (nunits);
3522 }
3523 }
3524 }
3525
3526 /* Since the vectors are created in reverse order, we have to invert
3527 them here. */
3528 vec_num = voprnds.length ();
3529 for (j = vec_num; j != 0; j--)
3530 {
3531 vop = voprnds[j - 1];
3532 vec_oprnds->quick_push (vop);
3533 }
3534
3535 voprnds.release ();
3536
3537 /* If the VF is greater than the unrolling factor needed for the SLP
3538 group of stmts, NUMBER_OF_VECTORS is greater than the number of
3539 vectors created above, and hence we have to replicate them. */
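/* E.g. if four vectors are needed but only two were built, the two are
   pushed again, giving v0, v1, v0, v1. */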
3541 while (number_of_vectors > vec_oprnds->length ())
3542 {
3543 tree neutral_vec = NULL;
3544
3545 if (neutral_op)
3546 {
3547 if (!neutral_vec)
3548 neutral_vec = build_vector_from_val (vector_type, neutral_op);
3549
3550 vec_oprnds->quick_push (neutral_vec);
3551 }
3552 else
3553 {
3554 for (i = 0; vec_oprnds->iterate (i, &vop) && i < vec_num; i++)
3555 vec_oprnds->quick_push (vop);
3556 }
3557 }
3558 }
3559
3560
3561 /* Get the vectorized definitions from SLP_NODE, which already contains
3562 the corresponding vectorized def-stmts. */
3563
3564 static void
3565 vect_get_slp_vect_defs (slp_tree slp_node, vec<tree> *vec_oprnds)
3566 {
3567 tree vec_oprnd;
3568 gimple *vec_def_stmt;
3569 unsigned int i;
3570
3571 gcc_assert (SLP_TREE_VEC_STMTS (slp_node).exists ());
3572
3573 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (slp_node), i, vec_def_stmt)
3574 {
3575 gcc_assert (vec_def_stmt);
3576 if (gimple_code (vec_def_stmt) == GIMPLE_PHI)
3577 vec_oprnd = gimple_phi_result (vec_def_stmt);
3578 else
3579 vec_oprnd = gimple_get_lhs (vec_def_stmt);
3580 vec_oprnds->quick_push (vec_oprnd);
3581 }
3582 }
3583
3584
3585 /* Get vectorized definitions for SLP_NODE.
3586 If the scalar definitions are loop invariants or constants, collect them
3587 and call vect_get_constant_vectors () to create vector stmts.
3588 Otherwise, the def-stmts must already be vectorized and the vectorized
3589 stmts must be stored in the corresponding child of SLP_NODE; in that
3590 case we call vect_get_slp_vect_defs () to retrieve them. */
3591
3592 void
3593 vect_get_slp_defs (vec<tree> ops, slp_tree slp_node,
3594 vec<vec<tree> > *vec_oprnds)
3595 {
3596 gimple *first_stmt;
3597 int number_of_vects = 0, i;
3598 unsigned int child_index = 0;
3599 HOST_WIDE_INT lhs_size_unit, rhs_size_unit;
3600 slp_tree child = NULL;
3601 vec<tree> vec_defs;
3602 tree oprnd;
3603 bool vectorized_defs;
3604
3605 first_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[0];
3606 FOR_EACH_VEC_ELT (ops, i, oprnd)
3607 {
3608 /* For each operand we check whether it has vectorized definitions in a
3609 child node or whether we need to create them (for invariants and
3610 constants). We check if the LHS of the first stmt of the next child
3611 matches OPRND. If it does, we found the correct child. Otherwise, we
3612 call vect_get_constant_vectors () and do not advance CHILD_INDEX, so
3613 that this child node is checked again for the next operand. */
3614 vectorized_defs = false;
3615 if (SLP_TREE_CHILDREN (slp_node).length () > child_index)
3616 {
3617 child = SLP_TREE_CHILDREN (slp_node)[child_index];
3618
3619 /* We have to check both pattern and original def, if available. */
3620 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
3621 {
3622 gimple *first_def = SLP_TREE_SCALAR_STMTS (child)[0];
3623 gimple *related
3624 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (first_def));
3625 tree first_def_op;
3626
3627 if (gimple_code (first_def) == GIMPLE_PHI)
3628 first_def_op = gimple_phi_result (first_def);
3629 else
3630 first_def_op = gimple_get_lhs (first_def);
3631 if (operand_equal_p (oprnd, first_def_op, 0)
3632 || (related
3633 && operand_equal_p (oprnd, gimple_get_lhs (related), 0)))
3634 {
3635 /* The number of vector defs is determined by the number of
3636 vector statements in the node from which we get those
3637 statements. */
3638 number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (child);
3639 vectorized_defs = true;
3640 child_index++;
3641 }
3642 }
3643 else
3644 child_index++;
3645 }
3646
3647 if (!vectorized_defs)
3648 {
3649 if (i == 0)
3650 {
3651 number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
3652 /* Number of vector stmts was calculated according to LHS in
3653 vect_schedule_slp_instance (), fix it by replacing LHS with
3654 RHS, if necessary. See vect_get_smallest_scalar_type () for
3655 details. */
3656 vect_get_smallest_scalar_type (first_stmt, &lhs_size_unit,
3657 &rhs_size_unit);
3658 if (rhs_size_unit != lhs_size_unit)
3659 {
3660 number_of_vects *= rhs_size_unit;
3661 number_of_vects /= lhs_size_unit;
3662 }
3663 }
3664 }
3665
3666 /* Allocate memory for vectorized defs. */
3667 vec_defs = vNULL;
3668 vec_defs.create (number_of_vects);
3669
3670 /* For reduction defs we call vect_get_constant_vectors (), since we are
3671 looking for initial loop invariant values. */
3672 if (vectorized_defs)
3673 /* The defs are already vectorized. */
3674 vect_get_slp_vect_defs (child, &vec_defs);
3675 else
3676 /* Build vectors from scalar defs. */
3677 vect_get_constant_vectors (oprnd, slp_node, &vec_defs, i,
3678 number_of_vects);
3679
3680 vec_oprnds->quick_push (vec_defs);
3681 }
3682 }
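
/* Illustrative sketch only: the typical shape of a vect_get_slp_defs call
from a vectorizable_* routine for a two-operand stmt, mirroring what
vect_get_vec_defs in tree-vect-stmts.c does. The function name is
hypothetical and all checking is elided. */

static void
example_get_binary_slp_defs (gimple *stmt, slp_tree slp_node)
{
auto_vec<tree> ops (2);
ops.quick_push (gimple_assign_rhs1 (stmt));
ops.quick_push (gimple_assign_rhs2 (stmt));

/* One inner vector per operand; each holds one vectorized def per vector
stmt of the corresponding SLP child (or built constant vector). */
auto_vec<vec<tree> > vec_defs (2);
vect_get_slp_defs (ops, slp_node, &vec_defs);

vec_defs[0].release ();
vec_defs[1].release ();
}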
3683
3684 /* Generate vector permute statements from a list of loads in DR_CHAIN.
3685 If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
3686 permute statements for the SLP node NODE of the SLP instance
3687 SLP_NODE_INSTANCE; record the number of non-identity permutations in *N_PERMS. */
3688
3689 bool
3690 vect_transform_slp_perm_load (slp_tree node, vec<tree> dr_chain,
3691 gimple_stmt_iterator *gsi, poly_uint64 vf,
3692 slp_instance slp_node_instance, bool analyze_only,
3693 unsigned *n_perms)
3694 {
3695 gimple *stmt = SLP_TREE_SCALAR_STMTS (node)[0];
3696 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3697 tree mask_element_type = NULL_TREE, mask_type;
3698 int vec_index = 0;
3699 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3700 int group_size = SLP_INSTANCE_GROUP_SIZE (slp_node_instance);
3701 unsigned int mask_element;
3702 machine_mode mode;
3703 unsigned HOST_WIDE_INT nunits, const_vf;
3704
3705 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
3706 return false;
3707
3708 stmt_info = vinfo_for_stmt (DR_GROUP_FIRST_ELEMENT (stmt_info));
3709
3710 mode = TYPE_MODE (vectype);
3711
3712 /* At the moment, all permutations are represented using per-element
3713 indices, so we can't cope with variable vector lengths or
3714 vectorization factors. */
3715 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nunits)
3716 || !vf.is_constant (&const_vf))
3717 return false;
3718
3719 /* The generic VEC_PERM_EXPR code always uses an integral type of the
3720 same size as the vector element being permuted. */
3721 mask_element_type = lang_hooks.types.type_for_mode
3722 (int_mode_for_mode (TYPE_MODE (TREE_TYPE (vectype))).require (), 1);
3723 mask_type = get_vectype_for_scalar_type (mask_element_type);
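/* E.g., for V4SF elements the mask element type would be a 32-bit integer
type and MASK_TYPE the corresponding four-element integer vector type
(illustrative modes). */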
3724 vec_perm_builder mask (nunits, nunits, 1);
3725 mask.quick_grow (nunits);
3726 vec_perm_indices indices;
3727
3728 /* Initialize the vect stmts of NODE so that the generated stmts can be
3729 inserted at the proper places later. */
3730 if (! analyze_only)
3731 for (unsigned i = SLP_TREE_VEC_STMTS (node).length ();
3732 i < SLP_TREE_NUMBER_OF_VEC_STMTS (node); i++)
3733 SLP_TREE_VEC_STMTS (node).quick_push (NULL);
3734
3735 /* Generate permutation masks for every NODE. The number of masks for
3736 each NODE is equal to GROUP_SIZE.
3737 E.g., suppose we have a group of three nodes with three loads from the
3738 same location in each node, and the vector size is 4. I.e., we have an
3739 a0b0c0a1b1c1... sequence and we need to create the following vectors:
3740 for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
3741 for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
3742 ...
3743
3744 The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
3745 The last mask is illegal since we assume two input operands for the
3746 permute operation, so mask element values cannot exceed 2*NUNITS-1.
3747 Hence, the last mask must be converted into {2,5,5,5}.
3748 For the first two permutations we need the first and the second input
3749 vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
3750 we need the second and the third vectors: {b1,c1,a2,b2} and
3751 {c2,a3,b3,c3}. */
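/* Tracing the arithmetic below for the a's in the example above
(NUNITS == 4, DR_GROUP_SIZE == 3, load permutation {0,0,0}):
i = 0 + j * 3 gives 0,0,0,3 | 3,3,6,6 | 6,9,9,9; within each mask,
elements taken from the second input vector have NUNITS added, so the
emitted masks are {0,0,0,3} (inputs v0,v0), {3,3,6,6} (inputs v0,v1)
and {2,5,5,5} (inputs v1,v2). */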
3752
3753 int vect_stmts_counter = 0;
3754 unsigned int index = 0;
3755 int first_vec_index = -1;
3756 int second_vec_index = -1;
3757 bool noop_p = true;
3758 *n_perms = 0;
3759
3760 for (unsigned int j = 0; j < const_vf; j++)
3761 {
3762 for (int k = 0; k < group_size; k++)
3763 {
3764 unsigned int i = (SLP_TREE_LOAD_PERMUTATION (node)[k]
3765 + j * DR_GROUP_SIZE (stmt_info));
3766 vec_index = i / nunits;
3767 mask_element = i % nunits;
3768 if (vec_index == first_vec_index
3769 || first_vec_index == -1)
3770 {
3771 first_vec_index = vec_index;
3772 }
3773 else if (vec_index == second_vec_index
3774 || second_vec_index == -1)
3775 {
3776 second_vec_index = vec_index;
3777 mask_element += nunits;
3778 }
3779 else
3780 {
3781 if (dump_enabled_p ())
3782 {
3783 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3784 "permutation requires at "
3785 "least three vectors ");
3786 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
3787 stmt, 0);
3788 }
3789 gcc_assert (analyze_only);
3790 return false;
3791 }
3792
3793 gcc_assert (mask_element < 2 * nunits);
3794 if (mask_element != index)
3795 noop_p = false;
3796 mask[index++] = mask_element;
3797
3798 if (index == nunits && !noop_p)
3799 {
3800 indices.new_vector (mask, 2, nunits);
3801 if (!can_vec_perm_const_p (mode, indices))
3802 {
3803 if (dump_enabled_p ())
3804 {
3805 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
3806 vect_location,
3807 "unsupported vect permute { ");
3808 for (i = 0; i < nunits; ++i)
3809 {
3810 dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
3811 dump_printf (MSG_MISSED_OPTIMIZATION, " ");
3812 }
3813 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
3814 }
3815 gcc_assert (analyze_only);
3816 return false;
3817 }
3818
3819 ++*n_perms;
3820 }
3821
3822 if (index == nunits)
3823 {
3824 if (!analyze_only)
3825 {
3826 tree mask_vec = NULL_TREE;
3827
3828 if (! noop_p)
3829 mask_vec = vec_perm_indices_to_tree (mask_type, indices);
3830
3831 if (second_vec_index == -1)
3832 second_vec_index = first_vec_index;
3833
3834 /* Generate the permute statement if necessary. */
3835 tree first_vec = dr_chain[first_vec_index];
3836 tree second_vec = dr_chain[second_vec_index];
3837 gimple *perm_stmt;
3838 if (! noop_p)
3839 {
3840 tree perm_dest
3841 = vect_create_destination_var (gimple_assign_lhs (stmt),
3842 vectype);
3843 perm_dest = make_ssa_name (perm_dest);
3844 perm_stmt = gimple_build_assign (perm_dest,
3845 VEC_PERM_EXPR,
3846 first_vec, second_vec,
3847 mask_vec);
3848 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
3849 }
3850 else
3851 /* If mask was NULL_TREE generate the requested
3852 identity transform. */
3853 perm_stmt = SSA_NAME_DEF_STMT (first_vec);
3854
3855 /* Store the vector statement in NODE. */
3856 SLP_TREE_VEC_STMTS (node)[vect_stmts_counter++] = perm_stmt;
3857 }
3858
3859 index = 0;
3860 first_vec_index = -1;
3861 second_vec_index = -1;
3862 noop_p = true;
3863 }
3864 }
3865 }
3866
3867 return true;
3868 }
3869
3870 /* Vectorize the SLP instance tree rooted at NODE in postorder. */
3871
3872 static bool
3873 vect_schedule_slp_instance (slp_tree node, slp_instance instance,
3874 scalar_stmts_to_slp_tree_map_t *bst_map)
3875 {
3876 gimple *stmt;
3877 bool grouped_store, is_store;
3878 gimple_stmt_iterator si;
3879 stmt_vec_info stmt_info;
3880 unsigned int group_size;
3881 tree vectype;
3882 int i, j;
3883 slp_tree child;
3884
3885 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
3886 return false;
3887
3888 /* See if we have already vectorized the same set of stmts and reuse their
3889 vectorized stmts. */
3890 if (slp_tree *leader = bst_map->get (SLP_TREE_SCALAR_STMTS (node)))
3891 {
3892 SLP_TREE_VEC_STMTS (node).safe_splice (SLP_TREE_VEC_STMTS (*leader));
3893 return false;
3894 }
3895
3896 bst_map->put (SLP_TREE_SCALAR_STMTS (node).copy (), node);
3897 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3898 vect_schedule_slp_instance (child, instance, bst_map);
3899
3900 /* Push SLP node def-type to stmts. */
3901 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3902 if (SLP_TREE_DEF_TYPE (child) != vect_internal_def)
3903 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (child), j, stmt)
3904 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) = SLP_TREE_DEF_TYPE (child);
3905
3906 stmt = SLP_TREE_SCALAR_STMTS (node)[0];
3907 stmt_info = vinfo_for_stmt (stmt);
3908
3909 /* VECTYPE is the type of the destination. */
3910 vectype = STMT_VINFO_VECTYPE (stmt_info);
3911 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
3912 group_size = SLP_INSTANCE_GROUP_SIZE (instance);
3913
3914 gcc_assert (SLP_TREE_NUMBER_OF_VEC_STMTS (node) != 0);
3915 if (!SLP_TREE_VEC_STMTS (node).exists ())
3916 SLP_TREE_VEC_STMTS (node).create (SLP_TREE_NUMBER_OF_VEC_STMTS (node));
3917
3918 if (dump_enabled_p ())
3919 {
3920 dump_printf_loc (MSG_NOTE, vect_location,
3921 "------>vectorizing SLP node starting from: ");
3922 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
3923 }
3924
3925 /* Vectorized stmts go before the last scalar stmt, which is where
3926 all uses are ready. */
3927 si = gsi_for_stmt (vect_find_last_scalar_stmt_in_slp (node));
3928
3929 /* Mark the first element of the reduction chain as a reduction so that
3930 the node is transformed properly. In the analysis phase only the last
3931 element of the chain is marked as a reduction. */
3932 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
3933 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
3934 && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt)
3935 {
3936 STMT_VINFO_DEF_TYPE (stmt_info) = vect_reduction_def;
3937 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
3938 }
3939
3940 /* Handle two-operation SLP nodes by vectorizing the group with
3941 both operations and then performing a merge. */
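/* E.g., for a group { a0 + b0, a1 - b1, a2 + b2, a3 - b3 } the node is
transformed once as a vector PLUS_EXPR and once as a vector MINUS_EXPR,
and a VEC_PERM_EXPR with mask {0, 5, 2, 7} (illustrative, NUNITS == 4)
then selects the even lanes of the PLUS result and the odd lanes of the
MINUS result. */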
3942 if (SLP_TREE_TWO_OPERATORS (node))
3943 {
3944 enum tree_code code0 = gimple_assign_rhs_code (stmt);
3945 enum tree_code ocode = ERROR_MARK;
3946 gimple *ostmt;
3947 vec_perm_builder mask (group_size, group_size, 1);
3948 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, ostmt)
3949 if (gimple_assign_rhs_code (ostmt) != code0)
3950 {
3951 mask.quick_push (1);
3952 ocode = gimple_assign_rhs_code (ostmt);
3953 }
3954 else
3955 mask.quick_push (0);
3956 if (ocode != ERROR_MARK)
3957 {
3958 vec<gimple *> v0;
3959 vec<gimple *> v1;
3960 unsigned j;
3961 tree tmask = NULL_TREE;
3962 vect_transform_stmt (stmt, &si, &grouped_store, node, instance);
3963 v0 = SLP_TREE_VEC_STMTS (node).copy ();
3964 SLP_TREE_VEC_STMTS (node).truncate (0);
3965 gimple_assign_set_rhs_code (stmt, ocode);
3966 vect_transform_stmt (stmt, &si, &grouped_store, node, instance);
3967 gimple_assign_set_rhs_code (stmt, code0);
3968 v1 = SLP_TREE_VEC_STMTS (node).copy ();
3969 SLP_TREE_VEC_STMTS (node).truncate (0);
3970 tree meltype = build_nonstandard_integer_type
3971 (GET_MODE_BITSIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype))), 1);
3972 tree mvectype = get_same_sized_vectype (meltype, vectype);
3973 unsigned k = 0, l;
3974 for (j = 0; j < v0.length (); ++j)
3975 {
3976 /* Enforced by vect_build_slp_tree, which rejects variable-length
3977 vectors for SLP_TREE_TWO_OPERATORS. */
3978 unsigned int const_nunits = nunits.to_constant ();
3979 tree_vector_builder melts (mvectype, const_nunits, 1);
3980 for (l = 0; l < const_nunits; ++l)
3981 {
3982 if (k >= group_size)
3983 k = 0;
3984 tree t = build_int_cst (meltype,
3985 mask[k++] * const_nunits + l);
3986 melts.quick_push (t);
3987 }
3988 tmask = melts.build ();
3989
3990 /* ??? Not all targets support a VEC_PERM_EXPR with a
3991 constant mask that would translate to a vec_merge RTX
3992 (with their vec_perm_const_ok). We can either not
3993 vectorize in that case or let veclower do its job.
3994 Unfortunately that isn't too great and at least for
3995 plus/minus we'd eventually like to match targets'
3996 vector addsub instructions. */
3997 gimple *vstmt;
3998 vstmt = gimple_build_assign (make_ssa_name (vectype),
3999 VEC_PERM_EXPR,
4000 gimple_assign_lhs (v0[j]),
4001 gimple_assign_lhs (v1[j]), tmask);
4002 vect_finish_stmt_generation (stmt, vstmt, &si);
4003 SLP_TREE_VEC_STMTS (node).quick_push (vstmt);
4004 }
4005 v0.release ();
4006 v1.release ();
4007 return false;
4008 }
4009 }
4010 is_store = vect_transform_stmt (stmt, &si, &grouped_store, node, instance);
4011
4012 /* Restore stmt def-types. */
4013 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4014 if (SLP_TREE_DEF_TYPE (child) != vect_internal_def)
4015 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (child), j, stmt)
4016 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) = vect_internal_def;
4017
4018 return is_store;
4019 }
4020
4021 /* Replace scalar calls from SLP node NODE with setting of their lhs to zero.
4022 For loop vectorization this is done in vectorizable_call, but for SLP
4023 it needs to be deferred until the end of vect_schedule_slp, because multiple
4024 SLP instances may refer to the same scalar stmt. */
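/* For instance (illustrative): a call "x_1 = foo (a_2);" whose result has
been vectorized is rewritten here to "x_1 = 0;"; DCE is then expected to
remove the dead scalar assignment. */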
4025
4026 static void
4027 vect_remove_slp_scalar_calls (slp_tree node)
4028 {
4029 gimple *stmt, *new_stmt;
4030 gimple_stmt_iterator gsi;
4031 int i;
4032 slp_tree child;
4033 tree lhs;
4034 stmt_vec_info stmt_info;
4035
4036 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
4037 return;
4038
4039 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4040 vect_remove_slp_scalar_calls (child);
4041
4042 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
4043 {
4044 if (!is_gimple_call (stmt) || gimple_bb (stmt) == NULL)
4045 continue;
4046 stmt_info = vinfo_for_stmt (stmt);
4047 if (stmt_info == NULL
4048 || is_pattern_stmt_p (stmt_info)
4049 || !PURE_SLP_STMT (stmt_info))
4050 continue;
4051 lhs = gimple_call_lhs (stmt);
4052 new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
4053 set_vinfo_for_stmt (new_stmt, stmt_info);
4054 set_vinfo_for_stmt (stmt, NULL);
4055 STMT_VINFO_STMT (stmt_info) = new_stmt;
4056 gsi = gsi_for_stmt (stmt);
4057 gsi_replace (&gsi, new_stmt, false);
4058 SSA_NAME_DEF_STMT (gimple_assign_lhs (new_stmt)) = new_stmt;
4059 }
4060 }
4061
4062 /* Generate vector code for all SLP instances in the loop/basic block. */
4063
4064 bool
4065 vect_schedule_slp (vec_info *vinfo)
4066 {
4067 vec<slp_instance> slp_instances;
4068 slp_instance instance;
4069 unsigned int i;
4070 bool is_store = false;
4071
4072
4073 scalar_stmts_to_slp_tree_map_t *bst_map
4074 = new scalar_stmts_to_slp_tree_map_t ();
4075 slp_instances = vinfo->slp_instances;
4076 FOR_EACH_VEC_ELT (slp_instances, i, instance)
4077 {
4078 /* Schedule the tree of INSTANCE. */
4079 is_store = vect_schedule_slp_instance (SLP_INSTANCE_TREE (instance),
4080 instance, bst_map);
4081 if (dump_enabled_p ())
4082 dump_printf_loc (MSG_NOTE, vect_location,
4083 "vectorizing stmts using SLP.\n");
4084 }
4085 delete bst_map;
4086
4087 FOR_EACH_VEC_ELT (slp_instances, i, instance)
4088 {
4089 slp_tree root = SLP_INSTANCE_TREE (instance);
4090 gimple *store;
4091 unsigned int j;
4092 gimple_stmt_iterator gsi;
4093
4094 /* Remove scalar call stmts. Do not do this for basic-block
4095 vectorization as not all uses may be vectorized.
4096 ??? Why should this be necessary? DCE should be able to
4097 remove the stmts itself.
4098 ??? For BB vectorization we can as well remove scalar
4099 stmts starting from the SLP tree root if they have no
4100 uses. */
4101 if (is_a <loop_vec_info> (vinfo))
4102 vect_remove_slp_scalar_calls (root);
4103
4104 for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store)
4105 && j < SLP_INSTANCE_GROUP_SIZE (instance); j++)
4106 {
4107 if (!STMT_VINFO_DATA_REF (vinfo_for_stmt (store)))
4108 break;
4109
4110 if (is_pattern_stmt_p (vinfo_for_stmt (store)))
4111 store = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (store));
4112 /* Free the attached stmt_vec_info and remove the stmt. */
4113 gsi = gsi_for_stmt (store);
4114 unlink_stmt_vdef (store);
4115 gsi_remove (&gsi, true);
4116 release_defs (store);
4117 free_stmt_vec_info (store);
4118 }
4119 }
4120
4121 return is_store;
4122 }