gcc.git / gcc/tree-vect-slp.cc
(blob at commit: vect: Move VMAT_LOAD_STORE_LANES handlings from final loop nest)
1 /* SLP - Basic Block Vectorization
2 Copyright (C) 2007-2023 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #define INCLUDE_ALGORITHM
24 #include "system.h"
25 #include "coretypes.h"
26 #include "backend.h"
27 #include "target.h"
28 #include "rtl.h"
29 #include "tree.h"
30 #include "gimple.h"
31 #include "tree-pass.h"
32 #include "ssa.h"
33 #include "optabs-tree.h"
34 #include "insn-config.h"
35 #include "recog.h" /* FIXME: for insn_data */
36 #include "fold-const.h"
37 #include "stor-layout.h"
38 #include "gimple-iterator.h"
39 #include "cfgloop.h"
40 #include "tree-vectorizer.h"
41 #include "langhooks.h"
42 #include "gimple-walk.h"
43 #include "dbgcnt.h"
44 #include "tree-vector-builder.h"
45 #include "vec-perm-indices.h"
46 #include "gimple-fold.h"
47 #include "internal-fn.h"
48 #include "dump-context.h"
49 #include "cfganal.h"
50 #include "tree-eh.h"
51 #include "tree-cfg.h"
52 #include "alloc-pool.h"
53 #include "sreal.h"
54 #include "predict.h"
55
56 static bool vect_transform_slp_perm_load_1 (vec_info *, slp_tree,
57 load_permutation_t &,
58 const vec<tree> &,
59 gimple_stmt_iterator *,
60 poly_uint64, bool, bool,
61 unsigned *,
62 unsigned * = nullptr,
63 bool = false);
64 static int vectorizable_slp_permutation_1 (vec_info *, gimple_stmt_iterator *,
65 slp_tree, lane_permutation_t &,
66 vec<slp_tree> &, bool);
67 static bool vectorizable_slp_permutation (vec_info *, gimple_stmt_iterator *,
68 slp_tree, stmt_vector_for_cost *);
69 static void vect_print_slp_tree (dump_flags_t, dump_location_t, slp_tree);
70
71 static object_allocator<_slp_tree> *slp_tree_pool;
72 static slp_tree slp_first_node;
73
74 void
75 vect_slp_init (void)
76 {
77 slp_tree_pool = new object_allocator<_slp_tree> ("SLP nodes");
78 }
79
80 void
81 vect_slp_fini (void)
82 {
83 while (slp_first_node)
84 delete slp_first_node;
85 delete slp_tree_pool;
86 slp_tree_pool = NULL;
87 }
88
89 void *
90 _slp_tree::operator new (size_t n)
91 {
92 gcc_assert (n == sizeof (_slp_tree));
93 return slp_tree_pool->allocate_raw ();
94 }
95
96 void
97 _slp_tree::operator delete (void *node, size_t n)
98 {
99 gcc_assert (n == sizeof (_slp_tree));
100 slp_tree_pool->remove_raw (node);
101 }
102
103
104 /* Initialize a SLP node. */
105
106 _slp_tree::_slp_tree ()
107 {
108 this->prev_node = NULL;
109 if (slp_first_node)
110 slp_first_node->prev_node = this;
111 this->next_node = slp_first_node;
112 slp_first_node = this;
113 SLP_TREE_SCALAR_STMTS (this) = vNULL;
114 SLP_TREE_SCALAR_OPS (this) = vNULL;
115 SLP_TREE_VEC_DEFS (this) = vNULL;
116 SLP_TREE_NUMBER_OF_VEC_STMTS (this) = 0;
117 SLP_TREE_CHILDREN (this) = vNULL;
118 SLP_TREE_LOAD_PERMUTATION (this) = vNULL;
119 SLP_TREE_LANE_PERMUTATION (this) = vNULL;
120 SLP_TREE_DEF_TYPE (this) = vect_uninitialized_def;
121 SLP_TREE_CODE (this) = ERROR_MARK;
122 SLP_TREE_VECTYPE (this) = NULL_TREE;
123 SLP_TREE_REPRESENTATIVE (this) = NULL;
124 SLP_TREE_REF_COUNT (this) = 1;
125 this->failed = NULL;
126 this->max_nunits = 1;
127 this->lanes = 0;
128 }
129
130 /* Tear down a SLP node. */
131
132 _slp_tree::~_slp_tree ()
133 {
134 if (this->prev_node)
135 this->prev_node->next_node = this->next_node;
136 else
137 slp_first_node = this->next_node;
138 if (this->next_node)
139 this->next_node->prev_node = this->prev_node;
140 SLP_TREE_CHILDREN (this).release ();
141 SLP_TREE_SCALAR_STMTS (this).release ();
142 SLP_TREE_SCALAR_OPS (this).release ();
143 SLP_TREE_VEC_DEFS (this).release ();
144 SLP_TREE_LOAD_PERMUTATION (this).release ();
145 SLP_TREE_LANE_PERMUTATION (this).release ();
146 if (this->failed)
147 free (failed);
148 }
149
150 /* Push the single SSA definition in DEF to the vector of vector defs. */
151
152 void
153 _slp_tree::push_vec_def (gimple *def)
154 {
155 if (gphi *phi = dyn_cast <gphi *> (def))
156 vec_defs.quick_push (gimple_phi_result (phi));
157 else
158 {
159 def_operand_p defop = single_ssa_def_operand (def, SSA_OP_ALL_DEFS);
160 vec_defs.quick_push (get_def_from_ptr (defop));
161 }
162 }
163
164 /* Recursively free the memory allocated for the SLP tree rooted at NODE. */
165
166 void
167 vect_free_slp_tree (slp_tree node)
168 {
169 int i;
170 slp_tree child;
171
172 if (--SLP_TREE_REF_COUNT (node) != 0)
173 return;
174
175 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
176 if (child)
177 vect_free_slp_tree (child);
178
179 /* If the node defines any SLP only patterns then those patterns are no
180 longer valid and should be removed. */
181 stmt_vec_info rep_stmt_info = SLP_TREE_REPRESENTATIVE (node);
182 if (rep_stmt_info && STMT_VINFO_SLP_VECT_ONLY_PATTERN (rep_stmt_info))
183 {
184 stmt_vec_info stmt_info = vect_orig_stmt (rep_stmt_info);
185 STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
186 STMT_SLP_TYPE (stmt_info) = STMT_SLP_TYPE (rep_stmt_info);
187 }
188
189 delete node;
190 }
191
192 /* Return a location suitable for dumps related to the SLP instance. */
193
194 dump_user_location_t
195 _slp_instance::location () const
196 {
197 if (!root_stmts.is_empty ())
198 return root_stmts[0]->stmt;
199 else
200 return SLP_TREE_SCALAR_STMTS (root)[0]->stmt;
201 }
202
203
204 /* Free the memory allocated for the SLP instance. */
205
206 void
207 vect_free_slp_instance (slp_instance instance)
208 {
209 vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
210 SLP_INSTANCE_LOADS (instance).release ();
211 SLP_INSTANCE_ROOT_STMTS (instance).release ();
212 SLP_INSTANCE_REMAIN_DEFS (instance).release ();
213 instance->subgraph_entries.release ();
214 instance->cost_vec.release ();
215 free (instance);
216 }
217
218
219 /* Create an SLP node for SCALAR_STMTS. */
220
221 slp_tree
222 vect_create_new_slp_node (unsigned nops, tree_code code)
223 {
224 slp_tree node = new _slp_tree;
225 SLP_TREE_SCALAR_STMTS (node) = vNULL;
226 SLP_TREE_CHILDREN (node).create (nops);
227 SLP_TREE_DEF_TYPE (node) = vect_internal_def;
228 SLP_TREE_CODE (node) = code;
229 return node;
230 }
231 /* Create an SLP node for SCALAR_STMTS. */
232
233 static slp_tree
234 vect_create_new_slp_node (slp_tree node,
235 vec<stmt_vec_info> scalar_stmts, unsigned nops)
236 {
237 SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
238 SLP_TREE_CHILDREN (node).create (nops);
239 SLP_TREE_DEF_TYPE (node) = vect_internal_def;
240 SLP_TREE_REPRESENTATIVE (node) = scalar_stmts[0];
241 SLP_TREE_LANES (node) = scalar_stmts.length ();
242 return node;
243 }
244
245 /* Create an SLP node for SCALAR_STMTS. */
246
247 static slp_tree
248 vect_create_new_slp_node (vec<stmt_vec_info> scalar_stmts, unsigned nops)
249 {
250 return vect_create_new_slp_node (new _slp_tree, scalar_stmts, nops);
251 }
252
253 /* Create an SLP node for OPS. */
254
255 static slp_tree
256 vect_create_new_slp_node (slp_tree node, vec<tree> ops)
257 {
258 SLP_TREE_SCALAR_OPS (node) = ops;
259 SLP_TREE_DEF_TYPE (node) = vect_external_def;
260 SLP_TREE_LANES (node) = ops.length ();
261 return node;
262 }
263
264 /* Create an SLP node for OPS. */
265
266 static slp_tree
267 vect_create_new_slp_node (vec<tree> ops)
268 {
269 return vect_create_new_slp_node (new _slp_tree, ops);
270 }
271
272
273 /* This structure is used in creation of an SLP tree. Each instance
274 corresponds to the same operand in a group of scalar stmts in an SLP
275 node. */
276 typedef struct _slp_oprnd_info
277 {
278 /* Def-stmts for the operands. */
279 vec<stmt_vec_info> def_stmts;
280 /* Operands. */
281 vec<tree> ops;
282 /* Information about the first statement: its vector def-type, its
283 operand type, and an indication whether any of the defs is a
284 pattern stmt. */
285 tree first_op_type;
286 enum vect_def_type first_dt;
287 bool any_pattern;
288 } *slp_oprnd_info;
289
290
291 /* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
292 operand. */
293 static vec<slp_oprnd_info>
294 vect_create_oprnd_info (int nops, int group_size)
295 {
296 int i;
297 slp_oprnd_info oprnd_info;
298 vec<slp_oprnd_info> oprnds_info;
299
300 oprnds_info.create (nops);
301 for (i = 0; i < nops; i++)
302 {
303 oprnd_info = XNEW (struct _slp_oprnd_info);
304 oprnd_info->def_stmts.create (group_size);
305 oprnd_info->ops.create (group_size);
306 oprnd_info->first_dt = vect_uninitialized_def;
307 oprnd_info->first_op_type = NULL_TREE;
308 oprnd_info->any_pattern = false;
309 oprnds_info.quick_push (oprnd_info);
310 }
311
312 return oprnds_info;
313 }
314
315
316 /* Free operands info. */
317
318 static void
319 vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
320 {
321 int i;
322 slp_oprnd_info oprnd_info;
323
324 FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
325 {
326 oprnd_info->def_stmts.release ();
327 oprnd_info->ops.release ();
328 XDELETE (oprnd_info);
329 }
330
331 oprnds_info.release ();
332 }
333
334 /* Return the execution frequency of NODE (so that a higher value indicates
335 a "more important" node when optimizing for speed). */
336
337 static sreal
338 vect_slp_node_weight (slp_tree node)
339 {
340 stmt_vec_info stmt_info = vect_orig_stmt (SLP_TREE_REPRESENTATIVE (node));
341 basic_block bb = gimple_bb (stmt_info->stmt);
342 return bb->count.to_sreal_scale (ENTRY_BLOCK_PTR_FOR_FN (cfun)->count);
343 }
344
345 /* Return true if STMTS contains a pattern statement. */
346
347 static bool
348 vect_contains_pattern_stmt_p (vec<stmt_vec_info> stmts)
349 {
350 stmt_vec_info stmt_info;
351 unsigned int i;
352 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
353 if (is_pattern_stmt_p (stmt_info))
354 return true;
355 return false;
356 }
357
358 /* Return true when all lanes in the external or constant NODE have
359 the same value. */
360
361 static bool
362 vect_slp_tree_uniform_p (slp_tree node)
363 {
364 gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_constant_def
365 || SLP_TREE_DEF_TYPE (node) == vect_external_def);
366
367 /* Pre-existing vectors. */
368 if (SLP_TREE_SCALAR_OPS (node).is_empty ())
369 return false;
370
371 unsigned i;
372 tree op, first = NULL_TREE;
373 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
374 if (!first)
375 first = op;
376 else if (!operand_equal_p (first, op, 0))
377 return false;
378
379 return true;
380 }
381
382 /* Find the place of the data-ref in STMT_INFO in the interleaving chain
383 that starts from FIRST_STMT_INFO. Return -1 if the data-ref is not a part
384 of the chain. */
385
386 int
387 vect_get_place_in_interleaving_chain (stmt_vec_info stmt_info,
388 stmt_vec_info first_stmt_info)
389 {
390 stmt_vec_info next_stmt_info = first_stmt_info;
391 int result = 0;
392
393 if (first_stmt_info != DR_GROUP_FIRST_ELEMENT (stmt_info))
394 return -1;
395
396 do
397 {
398 if (next_stmt_info == stmt_info)
399 return result;
400 next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
401 if (next_stmt_info)
402 result += DR_GROUP_GAP (next_stmt_info);
403 }
404 while (next_stmt_info);
405
406 return -1;
407 }
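
/* A worked example (a sketch, assuming the usual DR_GROUP_GAP convention
   that every non-leading chain member records its element distance from
   the previous member): for an interleaving chain that loads a[0], a[1]
   and a[3] with FIRST_STMT_INFO being the load of a[0], the gaps stored
   on the second and third members are 1 and 2, so the places returned
   for the three loads are 0, 1 and 3.  A stmt whose
   DR_GROUP_FIRST_ELEMENT is not FIRST_STMT_INFO yields -1.  */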
408
409 /* Check whether it is possible to load COUNT elements of type ELT_TYPE
410 using the method implemented by duplicate_and_interleave. Return true
411 if so, returning the number of intermediate vectors in *NVECTORS_OUT
412 (if nonnull) and the type of each intermediate vector in *VECTOR_TYPE_OUT
413 (if nonnull). */
414
415 bool
416 can_duplicate_and_interleave_p (vec_info *vinfo, unsigned int count,
417 tree elt_type, unsigned int *nvectors_out,
418 tree *vector_type_out,
419 tree *permutes)
420 {
421 tree base_vector_type = get_vectype_for_scalar_type (vinfo, elt_type, count);
422 if (!base_vector_type || !VECTOR_MODE_P (TYPE_MODE (base_vector_type)))
423 return false;
424
425 machine_mode base_vector_mode = TYPE_MODE (base_vector_type);
426 poly_int64 elt_bytes = count * GET_MODE_UNIT_SIZE (base_vector_mode);
427 unsigned int nvectors = 1;
428 for (;;)
429 {
430 scalar_int_mode int_mode;
431 poly_int64 elt_bits = elt_bytes * BITS_PER_UNIT;
432 if (int_mode_for_size (elt_bits, 1).exists (&int_mode))
433 {
434 /* Get the natural vector type for this SLP group size. */
435 tree int_type = build_nonstandard_integer_type
436 (GET_MODE_BITSIZE (int_mode), 1);
437 tree vector_type
438 = get_vectype_for_scalar_type (vinfo, int_type, count);
439 poly_int64 half_nelts;
440 if (vector_type
441 && VECTOR_MODE_P (TYPE_MODE (vector_type))
442 && known_eq (GET_MODE_SIZE (TYPE_MODE (vector_type)),
443 GET_MODE_SIZE (base_vector_mode))
444 && multiple_p (GET_MODE_NUNITS (TYPE_MODE (vector_type)),
445 2, &half_nelts))
446 {
447 /* Try fusing consecutive sequences of COUNT / NVECTORS elements
448 together into elements of type INT_TYPE and using the result
449 to build NVECTORS vectors. */
450 poly_uint64 nelts = GET_MODE_NUNITS (TYPE_MODE (vector_type));
451 vec_perm_builder sel1 (nelts, 2, 3);
452 vec_perm_builder sel2 (nelts, 2, 3);
453
454 for (unsigned int i = 0; i < 3; ++i)
455 {
456 sel1.quick_push (i);
457 sel1.quick_push (i + nelts);
458 sel2.quick_push (half_nelts + i);
459 sel2.quick_push (half_nelts + i + nelts);
460 }
461 vec_perm_indices indices1 (sel1, 2, nelts);
462 vec_perm_indices indices2 (sel2, 2, nelts);
463 machine_mode vmode = TYPE_MODE (vector_type);
464 if (can_vec_perm_const_p (vmode, vmode, indices1)
465 && can_vec_perm_const_p (vmode, vmode, indices2))
466 {
467 if (nvectors_out)
468 *nvectors_out = nvectors;
469 if (vector_type_out)
470 *vector_type_out = vector_type;
471 if (permutes)
472 {
473 permutes[0] = vect_gen_perm_mask_checked (vector_type,
474 indices1);
475 permutes[1] = vect_gen_perm_mask_checked (vector_type,
476 indices2);
477 }
478 return true;
479 }
480 }
481 }
482 if (!multiple_p (elt_bytes, 2, &elt_bytes))
483 return false;
484 nvectors *= 2;
485 }
486 }
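
/* Illustration of the permutations built above (a sketch; the numbers
   assume a vector mode with NELTS == 8, although the function mostly
   matters for variable-length vectors): INDICES1 encodes the interleave
   of the low halves of the two input vectors,
     { 0, 8, 1, 9, 2, 10, 3, 11 }
   and INDICES2 the interleave of the high halves,
     { 4, 12, 5, 13, 6, 14, 7, 15 }.
   The outer loop first tries to fuse all COUNT elements into a single
   integer element (NVECTORS == 1); each time that fails ELT_BYTES is
   halved and NVECTORS doubled, so COUNT / NVECTORS elements are fused
   per integer element instead.  */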
487
488 /* Return true if DTA and DTB match. */
489
490 static bool
491 vect_def_types_match (enum vect_def_type dta, enum vect_def_type dtb)
492 {
493 return (dta == dtb
494 || ((dta == vect_external_def || dta == vect_constant_def)
495 && (dtb == vect_external_def || dtb == vect_constant_def)));
496 }
497
498 static const int cond_expr_maps[3][5] = {
499 { 4, -1, -2, 1, 2 },
500 { 4, -2, -1, 1, 2 },
501 { 4, -1, -2, 2, 1 }
502 };
503 static const int arg1_map[] = { 1, 1 };
504 static const int arg2_map[] = { 1, 2 };
505 static const int arg1_arg4_map[] = { 2, 1, 4 };
506 static const int op1_op0_map[] = { 2, 1, 0 };
507
508 /* For most SLP statements, there is a one-to-one mapping between
509 gimple arguments and child nodes. If that is not true for STMT,
510 return an array that contains:
511
512 - the number of child nodes, followed by
513 - for each child node, the index of the argument associated with that node.
514 The special index -1 is the first operand of an embedded comparison and
515 the special index -2 is the second operand of an embedded comparison.
516
517 SWAP is as for vect_get_and_check_slp_defs. */
518
519 static const int *
520 vect_get_operand_map (const gimple *stmt, unsigned char swap = 0)
521 {
522 if (auto assign = dyn_cast<const gassign *> (stmt))
523 {
524 if (gimple_assign_rhs_code (assign) == COND_EXPR
525 && COMPARISON_CLASS_P (gimple_assign_rhs1 (assign)))
526 return cond_expr_maps[swap];
527 if (TREE_CODE_CLASS (gimple_assign_rhs_code (assign)) == tcc_comparison
528 && swap)
529 return op1_op0_map;
530 }
531 gcc_assert (!swap);
532 if (auto call = dyn_cast<const gcall *> (stmt))
533 {
534 if (gimple_call_internal_p (call))
535 switch (gimple_call_internal_fn (call))
536 {
537 case IFN_MASK_LOAD:
538 return arg2_map;
539
540 case IFN_GATHER_LOAD:
541 return arg1_map;
542
543 case IFN_MASK_GATHER_LOAD:
544 return arg1_arg4_map;
545
546 default:
547 break;
548 }
549 }
550 return nullptr;
551 }
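
/* Two illustrative uses of the map encoding (a sketch, not part of the
   original file): for x = a < b ? c : d, a COND_EXPR with an embedded
   comparison, the map returned for SWAP == 0 is cond_expr_maps[0]
   == { 4, -1, -2, 1, 2 }, i.e. four child nodes built from a and b
   (the comparison operands, special indices -1/-2) and from gimple
   args 1 and 2 (c and d); SWAP == 1 instead yields b, a, c, d.  For an
   IFN_MASK_GATHER_LOAD call, arg1_arg4_map == { 2, 1, 4 } says only
   call arguments 1 and 4 (the vector offset and the mask) become SLP
   children.  */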
552
553 /* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
554 they are of a valid type and that they match the defs of the first stmt of
555 the SLP group (stored in OPRNDS_INFO). This function tries to match stmts
556 by swapping operands of STMTS[STMT_NUM] when possible. Non-zero SWAP
557 indicates a swap is required for cond_expr stmts. Specifically, SWAP
558 is 1 if STMT is a cond and the operands of the comparison need to be swapped;
559 SWAP is 2 if STMT is a cond and the code of the comparison needs to be inverted.
560
561 If there was a fatal error return -1; if the error could be corrected by
562 swapping operands of the father node of this one, return 1; if everything
563 is ok return 0. */
564 static int
565 vect_get_and_check_slp_defs (vec_info *vinfo, unsigned char swap,
566 bool *skip_args,
567 vec<stmt_vec_info> stmts, unsigned stmt_num,
568 vec<slp_oprnd_info> *oprnds_info)
569 {
570 stmt_vec_info stmt_info = stmts[stmt_num];
571 tree oprnd;
572 unsigned int i, number_of_oprnds;
573 enum vect_def_type dt = vect_uninitialized_def;
574 slp_oprnd_info oprnd_info;
575 unsigned int commutative_op = -1U;
576 bool first = stmt_num == 0;
577
578 if (!is_a<gcall *> (stmt_info->stmt)
579 && !is_a<gassign *> (stmt_info->stmt)
580 && !is_a<gphi *> (stmt_info->stmt))
581 return -1;
582
583 number_of_oprnds = gimple_num_args (stmt_info->stmt);
584 const int *map = vect_get_operand_map (stmt_info->stmt, swap);
585 if (map)
586 number_of_oprnds = *map++;
587 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
588 {
589 if (gimple_call_internal_p (stmt))
590 {
591 internal_fn ifn = gimple_call_internal_fn (stmt);
592 commutative_op = first_commutative_argument (ifn);
593 }
594 }
595 else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
596 {
597 if (commutative_tree_code (gimple_assign_rhs_code (stmt)))
598 commutative_op = 0;
599 }
600
601 bool swapped = (swap != 0);
602 bool backedge = false;
603 enum vect_def_type *dts = XALLOCAVEC (enum vect_def_type, number_of_oprnds);
604 for (i = 0; i < number_of_oprnds; i++)
605 {
606 int opno = map ? map[i] : int (i);
607 if (opno < 0)
608 oprnd = TREE_OPERAND (gimple_arg (stmt_info->stmt, 0), -1 - opno);
609 else
610 {
611 oprnd = gimple_arg (stmt_info->stmt, opno);
612 if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
613 backedge = dominated_by_p (CDI_DOMINATORS,
614 gimple_phi_arg_edge (stmt, opno)->src,
615 gimple_bb (stmt_info->stmt));
616 }
617 if (TREE_CODE (oprnd) == VIEW_CONVERT_EXPR)
618 oprnd = TREE_OPERAND (oprnd, 0);
619
620 oprnd_info = (*oprnds_info)[i];
621
622 stmt_vec_info def_stmt_info;
623 if (!vect_is_simple_use (oprnd, vinfo, &dts[i], &def_stmt_info))
624 {
625 if (dump_enabled_p ())
626 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
627 "Build SLP failed: can't analyze def for %T\n",
628 oprnd);
629
630 return -1;
631 }
632
633 if (skip_args[i])
634 {
635 oprnd_info->def_stmts.quick_push (NULL);
636 oprnd_info->ops.quick_push (NULL_TREE);
637 oprnd_info->first_dt = vect_uninitialized_def;
638 continue;
639 }
640
641 oprnd_info->def_stmts.quick_push (def_stmt_info);
642 oprnd_info->ops.quick_push (oprnd);
643
644 if (def_stmt_info
645 && is_pattern_stmt_p (def_stmt_info))
646 {
647 if (STMT_VINFO_RELATED_STMT (vect_orig_stmt (def_stmt_info))
648 != def_stmt_info)
649 oprnd_info->any_pattern = true;
650 else
651 /* If we promote this to external use the original stmt def. */
652 oprnd_info->ops.last ()
653 = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
654 }
655
656 /* If there's an extern def on a backedge make sure we can
657 code-generate at the region start.
658 ??? This is another case that could be fixed by adjusting
659 how we split the function but at the moment we'd have conflicting
660 goals there. */
661 if (backedge
662 && dts[i] == vect_external_def
663 && is_a <bb_vec_info> (vinfo)
664 && TREE_CODE (oprnd) == SSA_NAME
665 && !SSA_NAME_IS_DEFAULT_DEF (oprnd)
666 && !dominated_by_p (CDI_DOMINATORS,
667 as_a <bb_vec_info> (vinfo)->bbs[0],
668 gimple_bb (SSA_NAME_DEF_STMT (oprnd))))
669 {
670 if (dump_enabled_p ())
671 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
672 "Build SLP failed: extern def %T only defined "
673 "on backedge\n", oprnd);
674 return -1;
675 }
676
677 if (first)
678 {
679 tree type = TREE_TYPE (oprnd);
680 dt = dts[i];
681 if ((dt == vect_constant_def
682 || dt == vect_external_def)
683 && !GET_MODE_SIZE (vinfo->vector_mode).is_constant ()
684 && (TREE_CODE (type) == BOOLEAN_TYPE
685 || !can_duplicate_and_interleave_p (vinfo, stmts.length (),
686 type)))
687 {
688 if (dump_enabled_p ())
689 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
690 "Build SLP failed: invalid type of def "
691 "for variable-length SLP %T\n", oprnd);
692 return -1;
693 }
694
695 /* For the swapping logic below force vect_reduction_def
696 for the reduction op in a SLP reduction group. */
697 if (!STMT_VINFO_DATA_REF (stmt_info)
698 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
699 && (int)i == STMT_VINFO_REDUC_IDX (stmt_info)
700 && def_stmt_info)
701 dts[i] = dt = vect_reduction_def;
702
703 /* Check the types of the definition. */
704 switch (dt)
705 {
706 case vect_external_def:
707 case vect_constant_def:
708 case vect_internal_def:
709 case vect_reduction_def:
710 case vect_induction_def:
711 case vect_nested_cycle:
712 case vect_first_order_recurrence:
713 break;
714
715 default:
716 /* FORNOW: Not supported. */
717 if (dump_enabled_p ())
718 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
719 "Build SLP failed: illegal type of def %T\n",
720 oprnd);
721 return -1;
722 }
723
724 oprnd_info->first_dt = dt;
725 oprnd_info->first_op_type = type;
726 }
727 }
728 if (first)
729 return 0;
730
731 /* Now match the operand definition types to that of the first stmt. */
732 for (i = 0; i < number_of_oprnds;)
733 {
734 if (skip_args[i])
735 {
736 ++i;
737 continue;
738 }
739
740 oprnd_info = (*oprnds_info)[i];
741 dt = dts[i];
742 stmt_vec_info def_stmt_info = oprnd_info->def_stmts[stmt_num];
743 oprnd = oprnd_info->ops[stmt_num];
744 tree type = TREE_TYPE (oprnd);
745
746 if (!types_compatible_p (oprnd_info->first_op_type, type))
747 {
748 if (dump_enabled_p ())
749 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
750 "Build SLP failed: different operand types\n");
751 return 1;
752 }
753
754 /* Not first stmt of the group, check that the def-stmt/s match
755 the def-stmt/s of the first stmt. Allow different definition
756 types for reduction chains: the first stmt must be a
757 vect_reduction_def (a phi node), and the rest
758 end in the reduction chain. */
759 if ((!vect_def_types_match (oprnd_info->first_dt, dt)
760 && !(oprnd_info->first_dt == vect_reduction_def
761 && !STMT_VINFO_DATA_REF (stmt_info)
762 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
763 && def_stmt_info
764 && !STMT_VINFO_DATA_REF (def_stmt_info)
765 && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
766 == REDUC_GROUP_FIRST_ELEMENT (stmt_info))))
767 || (!STMT_VINFO_DATA_REF (stmt_info)
768 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
769 && ((!def_stmt_info
770 || STMT_VINFO_DATA_REF (def_stmt_info)
771 || (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
772 != REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
773 != (oprnd_info->first_dt != vect_reduction_def))))
774 {
775 /* Try swapping operands if we got a mismatch. For BB
776 vectorization only in case it will clearly improve things. */
777 if (i == commutative_op && !swapped
778 && (!is_a <bb_vec_info> (vinfo)
779 || (!vect_def_types_match ((*oprnds_info)[i+1]->first_dt,
780 dts[i+1])
781 && (vect_def_types_match (oprnd_info->first_dt, dts[i+1])
782 || vect_def_types_match
783 ((*oprnds_info)[i+1]->first_dt, dts[i])))))
784 {
785 if (dump_enabled_p ())
786 dump_printf_loc (MSG_NOTE, vect_location,
787 "trying swapped operands\n");
788 std::swap (dts[i], dts[i+1]);
789 std::swap ((*oprnds_info)[i]->def_stmts[stmt_num],
790 (*oprnds_info)[i+1]->def_stmts[stmt_num]);
791 std::swap ((*oprnds_info)[i]->ops[stmt_num],
792 (*oprnds_info)[i+1]->ops[stmt_num]);
793 swapped = true;
794 continue;
795 }
796
797 if (is_a <bb_vec_info> (vinfo)
798 && !oprnd_info->any_pattern)
799 {
800 /* Now for commutative ops we should see whether we can
801 make the other operand match. */
802 if (dump_enabled_p ())
803 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
804 "treating operand as external\n");
805 oprnd_info->first_dt = dt = vect_external_def;
806 }
807 else
808 {
809 if (dump_enabled_p ())
810 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
811 "Build SLP failed: different types\n");
812 return 1;
813 }
814 }
815
816 /* Make sure to demote the overall operand to external. */
817 if (dt == vect_external_def)
818 oprnd_info->first_dt = vect_external_def;
819 /* For a SLP reduction chain we want to duplicate the reduction to
820 each of the chain members. That gets us a sane SLP graph (though
821 the stmts are not 100% correct wrt the initial values). */
822 else if ((dt == vect_internal_def
823 || dt == vect_reduction_def)
824 && oprnd_info->first_dt == vect_reduction_def
825 && !STMT_VINFO_DATA_REF (stmt_info)
826 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
827 && !STMT_VINFO_DATA_REF (def_stmt_info)
828 && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
829 == REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
830 {
831 oprnd_info->def_stmts[stmt_num] = oprnd_info->def_stmts[0];
832 oprnd_info->ops[stmt_num] = oprnd_info->ops[0];
833 }
834
835 ++i;
836 }
837
838 /* Swap operands. */
839 if (swapped)
840 {
841 if (dump_enabled_p ())
842 dump_printf_loc (MSG_NOTE, vect_location,
843 "swapped operands to match def types in %G",
844 stmt_info->stmt);
845 }
846
847 return 0;
848 }
849
850 /* Return true if call statements CALL1 and CALL2 are similar enough
851 to be combined into the same SLP group. */
852
853 bool
854 compatible_calls_p (gcall *call1, gcall *call2)
855 {
856 unsigned int nargs = gimple_call_num_args (call1);
857 if (nargs != gimple_call_num_args (call2))
858 return false;
859
860 if (gimple_call_combined_fn (call1) != gimple_call_combined_fn (call2))
861 return false;
862
863 if (gimple_call_internal_p (call1))
864 {
865 if (!types_compatible_p (TREE_TYPE (gimple_call_lhs (call1)),
866 TREE_TYPE (gimple_call_lhs (call2))))
867 return false;
868 for (unsigned int i = 0; i < nargs; ++i)
869 if (!types_compatible_p (TREE_TYPE (gimple_call_arg (call1, i)),
870 TREE_TYPE (gimple_call_arg (call2, i))))
871 return false;
872 }
873 else
874 {
875 if (!operand_equal_p (gimple_call_fn (call1),
876 gimple_call_fn (call2), 0))
877 return false;
878
879 if (gimple_call_fntype (call1) != gimple_call_fntype (call2))
880 return false;
881 }
882
883 /* Check that any unvectorized arguments are equal. */
884 if (const int *map = vect_get_operand_map (call1))
885 {
886 unsigned int nkept = *map++;
887 unsigned int mapi = 0;
888 for (unsigned int i = 0; i < nargs; ++i)
889 if (mapi < nkept && map[mapi] == int (i))
890 mapi += 1;
891 else if (!operand_equal_p (gimple_call_arg (call1, i),
892 gimple_call_arg (call2, i)))
893 return false;
894 }
895
896 return true;
897 }
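
/* For example (a sketch): two IFN_MASK_GATHER_LOAD calls can only be
   grouped when their lhs and argument types are compatible and the
   arguments that do not become SLP children per vect_get_operand_map
   (everything except arguments 1 and 4, e.g. the base pointer and the
   scale) are operand_equal_p; the mapped arguments themselves may
   differ per lane.  */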
898
899 /* A subroutine of vect_build_slp_tree for checking VECTYPE, which is the
900 caller's attempt to find the vector type in STMT_INFO with the narrowest
901 element type. Return true if VECTYPE is nonnull and if it is valid
902 for STMT_INFO. When returning true, update MAX_NUNITS to reflect the
903 number of units in VECTYPE. GROUP_SIZE and MAX_NUNITS are as for
904 vect_build_slp_tree. */
905
906 static bool
907 vect_record_max_nunits (vec_info *vinfo, stmt_vec_info stmt_info,
908 unsigned int group_size,
909 tree vectype, poly_uint64 *max_nunits)
910 {
911 if (!vectype)
912 {
913 if (dump_enabled_p ())
914 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
915 "Build SLP failed: unsupported data-type in %G\n",
916 stmt_info->stmt);
917 /* Fatal mismatch. */
918 return false;
919 }
920
921 /* If populating the vector type requires unrolling then fail
922 before adjusting *max_nunits for basic-block vectorization. */
923 if (is_a <bb_vec_info> (vinfo)
924 && !multiple_p (group_size, TYPE_VECTOR_SUBPARTS (vectype)))
925 {
926 if (dump_enabled_p ())
927 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
928 "Build SLP failed: unrolling required "
929 "in basic block SLP\n");
930 /* Fatal mismatch. */
931 return false;
932 }
933
934 /* In case of multiple types we need to detect the smallest type. */
935 vect_update_max_nunits (max_nunits, vectype);
936 return true;
937 }
938
939 /* Check whether the scalar stmts STMTS are isomorphic, whether they
940 require data permutation and whether they use supported operations.
941 Return true if they can form an SLP node, otherwise return false and
942 indicate in *MATCHES which stmts are not isomorphic to the first one.
943 If MATCHES[0] is false then this indicates the comparison could not
944 be carried out or the stmts will never be vectorized by SLP.
945
946 Note a COND_EXPR is possibly isomorphic to another one after swapping
947 its operands. Set SWAP[i] to 1 if stmt I is a COND_EXPR and isomorphic
948 to the first stmt by swapping the two operands of the comparison; set
949 SWAP[i] to 2 if stmt I is isomorphic to the first stmt by inverting the
950 comparison code. Taking A1 >= B1 ? X1 : Y1 as an example, it can be
951 swapped to (B1 <= A1 ? X1 : Y1) or inverted to (A1 < B1) ? Y1 : X1. */
952
953 static bool
954 vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
955 vec<stmt_vec_info> stmts, unsigned int group_size,
956 poly_uint64 *max_nunits, bool *matches,
957 bool *two_operators, tree *node_vectype)
958 {
959 unsigned int i;
960 stmt_vec_info first_stmt_info = stmts[0];
961 code_helper first_stmt_code = ERROR_MARK;
962 code_helper alt_stmt_code = ERROR_MARK;
963 code_helper rhs_code = ERROR_MARK;
964 code_helper first_cond_code = ERROR_MARK;
965 tree lhs;
966 bool need_same_oprnds = false;
967 tree vectype = NULL_TREE, first_op1 = NULL_TREE;
968 stmt_vec_info first_load = NULL, prev_first_load = NULL;
969 bool first_stmt_load_p = false, load_p = false;
970 bool first_stmt_phi_p = false, phi_p = false;
971 bool maybe_soft_fail = false;
972 tree soft_fail_nunits_vectype = NULL_TREE;
973
974 /* For every stmt in NODE find its def stmt/s. */
975 stmt_vec_info stmt_info;
976 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
977 {
978 gimple *stmt = stmt_info->stmt;
979 swap[i] = 0;
980 matches[i] = false;
981
982 if (dump_enabled_p ())
983 dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for %G", stmt);
984
985 /* Fail to vectorize statements marked as unvectorizable, that can
986 throw internally or that have volatile operands. */
987 if (!STMT_VINFO_VECTORIZABLE (stmt_info)
988 || stmt_can_throw_internal (cfun, stmt)
989 || gimple_has_volatile_ops (stmt))
990 {
991 if (dump_enabled_p ())
992 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
993 "Build SLP failed: unvectorizable statement %G",
994 stmt);
995 /* ??? For BB vectorization we want to commutate operands in a way
996 to shuffle all unvectorizable defs into one operand and have
997 the other still vectorized. The following doesn't reliably
998 achieve this, but it's the easiest we can do here. */
999 if (is_a <bb_vec_info> (vinfo) && i != 0)
1000 continue;
1001 /* Fatal mismatch. */
1002 matches[0] = false;
1003 return false;
1004 }
1005
1006 lhs = gimple_get_lhs (stmt);
1007 if (lhs == NULL_TREE)
1008 {
1009 if (dump_enabled_p ())
1010 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1011 "Build SLP failed: not GIMPLE_ASSIGN nor "
1012 "GIMPLE_CALL %G", stmt);
1013 if (is_a <bb_vec_info> (vinfo) && i != 0)
1014 continue;
1015 /* Fatal mismatch. */
1016 matches[0] = false;
1017 return false;
1018 }
1019
1020 tree nunits_vectype;
1021 if (!vect_get_vector_types_for_stmt (vinfo, stmt_info, &vectype,
1022 &nunits_vectype, group_size))
1023 {
1024 if (is_a <bb_vec_info> (vinfo) && i != 0)
1025 continue;
1026 /* Fatal mismatch. */
1027 matches[0] = false;
1028 return false;
1029 }
1030 /* Record nunits required but continue analysis, producing matches[]
1031 as if nunits was not an issue. This allows splitting of groups
1032 to happen. */
1033 if (nunits_vectype
1034 && !vect_record_max_nunits (vinfo, stmt_info, group_size,
1035 nunits_vectype, max_nunits))
1036 {
1037 gcc_assert (is_a <bb_vec_info> (vinfo));
1038 maybe_soft_fail = true;
1039 soft_fail_nunits_vectype = nunits_vectype;
1040 }
1041
1042 gcc_assert (vectype);
1043
1044 gcall *call_stmt = dyn_cast <gcall *> (stmt);
1045 if (call_stmt)
1046 {
1047 combined_fn cfn = gimple_call_combined_fn (call_stmt);
1048 if (cfn != CFN_LAST)
1049 rhs_code = cfn;
1050 else
1051 rhs_code = CALL_EXPR;
1052
1053 if (cfn == CFN_MASK_LOAD
1054 || cfn == CFN_GATHER_LOAD
1055 || cfn == CFN_MASK_GATHER_LOAD)
1056 load_p = true;
1057 else if ((internal_fn_p (cfn)
1058 && !vectorizable_internal_fn_p (as_internal_fn (cfn)))
1059 || gimple_call_tail_p (call_stmt)
1060 || gimple_call_noreturn_p (call_stmt)
1061 || gimple_call_chain (call_stmt))
1062 {
1063 if (dump_enabled_p ())
1064 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1065 "Build SLP failed: unsupported call type %G",
1066 (gimple *) call_stmt);
1067 if (is_a <bb_vec_info> (vinfo) && i != 0)
1068 continue;
1069 /* Fatal mismatch. */
1070 matches[0] = false;
1071 return false;
1072 }
1073 }
1074 else if (gimple_code (stmt) == GIMPLE_PHI)
1075 {
1076 rhs_code = ERROR_MARK;
1077 phi_p = true;
1078 }
1079 else
1080 {
1081 rhs_code = gimple_assign_rhs_code (stmt);
1082 load_p = gimple_vuse (stmt);
1083 }
1084
1085 /* Check the operation. */
1086 if (i == 0)
1087 {
1088 *node_vectype = vectype;
1089 first_stmt_code = rhs_code;
1090 first_stmt_load_p = load_p;
1091 first_stmt_phi_p = phi_p;
1092
1093 /* Shift arguments should be equal in all the packed stmts for a
1094 vector shift with a scalar shift operand. */
1095 if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
1096 || rhs_code == LROTATE_EXPR
1097 || rhs_code == RROTATE_EXPR)
1098 {
1099 /* First see if we have a vector/vector shift. */
1100 if (!directly_supported_p (rhs_code, vectype, optab_vector))
1101 {
1102 /* No vector/vector shift, try for a vector/scalar shift. */
1103 if (!directly_supported_p (rhs_code, vectype, optab_scalar))
1104 {
1105 if (dump_enabled_p ())
1106 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1107 "Build SLP failed: "
1108 "op not supported by target.\n");
1109 if (is_a <bb_vec_info> (vinfo) && i != 0)
1110 continue;
1111 /* Fatal mismatch. */
1112 matches[0] = false;
1113 return false;
1114 }
1115 need_same_oprnds = true;
1116 first_op1 = gimple_assign_rhs2 (stmt);
1117 }
1118 }
1119 else if (rhs_code == WIDEN_LSHIFT_EXPR)
1120 {
1121 need_same_oprnds = true;
1122 first_op1 = gimple_assign_rhs2 (stmt);
1123 }
1124 else if (!load_p
1125 && rhs_code == BIT_FIELD_REF)
1126 {
1127 tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt), 0);
1128 if (!is_a <bb_vec_info> (vinfo)
1129 || TREE_CODE (vec) != SSA_NAME
1130 /* When the element types are not compatible we pun the
1131 source to the target vectype which requires equal size. */
1132 || ((!VECTOR_TYPE_P (TREE_TYPE (vec))
1133 || !types_compatible_p (TREE_TYPE (vectype),
1134 TREE_TYPE (TREE_TYPE (vec))))
1135 && !operand_equal_p (TYPE_SIZE (vectype),
1136 TYPE_SIZE (TREE_TYPE (vec)))))
1137 {
1138 if (dump_enabled_p ())
1139 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1140 "Build SLP failed: "
1141 "BIT_FIELD_REF not supported\n");
1142 /* Fatal mismatch. */
1143 matches[0] = false;
1144 return false;
1145 }
1146 }
1147 else if (rhs_code == CFN_DIV_POW2)
1148 {
1149 need_same_oprnds = true;
1150 first_op1 = gimple_call_arg (call_stmt, 1);
1151 }
1152 }
1153 else
1154 {
1155 if (first_stmt_code != rhs_code
1156 && alt_stmt_code == ERROR_MARK)
1157 alt_stmt_code = rhs_code;
1158 if ((first_stmt_code != rhs_code
1159 && (first_stmt_code != IMAGPART_EXPR
1160 || rhs_code != REALPART_EXPR)
1161 && (first_stmt_code != REALPART_EXPR
1162 || rhs_code != IMAGPART_EXPR)
1163 /* Handle mismatches in plus/minus by computing both
1164 and merging the results. */
1165 && !((first_stmt_code == PLUS_EXPR
1166 || first_stmt_code == MINUS_EXPR)
1167 && (alt_stmt_code == PLUS_EXPR
1168 || alt_stmt_code == MINUS_EXPR)
1169 && rhs_code == alt_stmt_code)
1170 && !(first_stmt_code.is_tree_code ()
1171 && rhs_code.is_tree_code ()
1172 && (TREE_CODE_CLASS (tree_code (first_stmt_code))
1173 == tcc_comparison)
1174 && (swap_tree_comparison (tree_code (first_stmt_code))
1175 == tree_code (rhs_code)))
1176 && !(STMT_VINFO_GROUPED_ACCESS (stmt_info)
1177 && (first_stmt_code == ARRAY_REF
1178 || first_stmt_code == BIT_FIELD_REF
1179 || first_stmt_code == INDIRECT_REF
1180 || first_stmt_code == COMPONENT_REF
1181 || first_stmt_code == MEM_REF)
1182 && (rhs_code == ARRAY_REF
1183 || rhs_code == BIT_FIELD_REF
1184 || rhs_code == INDIRECT_REF
1185 || rhs_code == COMPONENT_REF
1186 || rhs_code == MEM_REF)))
1187 || first_stmt_load_p != load_p
1188 || first_stmt_phi_p != phi_p)
1189 {
1190 if (dump_enabled_p ())
1191 {
1192 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1193 "Build SLP failed: different operation "
1194 "in stmt %G", stmt);
1195 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1196 "original stmt %G", first_stmt_info->stmt);
1197 }
1198 /* Mismatch. */
1199 continue;
1200 }
1201
1202 if (!load_p
1203 && first_stmt_code == BIT_FIELD_REF
1204 && (TREE_OPERAND (gimple_assign_rhs1 (first_stmt_info->stmt), 0)
1205 != TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0)))
1206 {
1207 if (dump_enabled_p ())
1208 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1209 "Build SLP failed: different BIT_FIELD_REF "
1210 "arguments in %G", stmt);
1211 /* Mismatch. */
1212 continue;
1213 }
1214
1215 if (call_stmt && first_stmt_code != CFN_MASK_LOAD)
1216 {
1217 if (!compatible_calls_p (as_a <gcall *> (stmts[0]->stmt),
1218 call_stmt))
1219 {
1220 if (dump_enabled_p ())
1221 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1222 "Build SLP failed: different calls in %G",
1223 stmt);
1224 /* Mismatch. */
1225 continue;
1226 }
1227 }
1228
1229 if ((phi_p || gimple_could_trap_p (stmt_info->stmt))
1230 && (gimple_bb (first_stmt_info->stmt)
1231 != gimple_bb (stmt_info->stmt)))
1232 {
1233 if (dump_enabled_p ())
1234 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1235 "Build SLP failed: different BB for PHI "
1236 "or possibly trapping operation in %G", stmt);
1237 /* Mismatch. */
1238 continue;
1239 }
1240
1241 if (need_same_oprnds)
1242 {
1243 tree other_op1 = gimple_arg (stmt, 1);
1244 if (!operand_equal_p (first_op1, other_op1, 0))
1245 {
1246 if (dump_enabled_p ())
1247 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1248 "Build SLP failed: different shift "
1249 "arguments in %G", stmt);
1250 /* Mismatch. */
1251 continue;
1252 }
1253 }
1254
1255 if (!types_compatible_p (vectype, *node_vectype))
1256 {
1257 if (dump_enabled_p ())
1258 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1259 "Build SLP failed: different vector type "
1260 "in %G", stmt);
1261 /* Mismatch. */
1262 continue;
1263 }
1264 }
1265
1266 /* Grouped store or load. */
1267 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1268 {
1269 if (REFERENCE_CLASS_P (lhs))
1270 {
1271 /* Store. */
1272 ;
1273 }
1274 else
1275 {
1276 /* Load. */
1277 first_load = DR_GROUP_FIRST_ELEMENT (stmt_info);
1278 if (prev_first_load)
1279 {
1280 /* Check that there are no loads from different interleaving
1281 chains in the same node. */
1282 if (prev_first_load != first_load)
1283 {
1284 if (dump_enabled_p ())
1285 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
1286 vect_location,
1287 "Build SLP failed: different "
1288 "interleaving chains in one node %G",
1289 stmt);
1290 /* Mismatch. */
1291 continue;
1292 }
1293 }
1294 else
1295 prev_first_load = first_load;
1296 }
1297 } /* Grouped access. */
1298 else
1299 {
1300 if (load_p
1301 && rhs_code != CFN_GATHER_LOAD
1302 && rhs_code != CFN_MASK_GATHER_LOAD
1303 /* Non-grouped loads are handled as externals for BB
1304 vectorization. For loop vectorization we can handle
1305 splats the same way we handle single element interleaving. */
1306 && (is_a <bb_vec_info> (vinfo)
1307 || stmt_info != first_stmt_info
1308 || STMT_VINFO_GATHER_SCATTER_P (stmt_info)))
1309 {
1310 /* Not grouped load. */
1311 if (dump_enabled_p ())
1312 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1313 "Build SLP failed: not grouped load %G", stmt);
1314
1315 if (i != 0)
1316 continue;
1317 /* Fatal mismatch. */
1318 matches[0] = false;
1319 return false;
1320 }
1321
1322 /* Not memory operation. */
1323 if (!load_p
1324 && !phi_p
1325 && rhs_code.is_tree_code ()
1326 && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_binary
1327 && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_unary
1328 && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_expression
1329 && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_comparison
1330 && rhs_code != VIEW_CONVERT_EXPR
1331 && rhs_code != CALL_EXPR
1332 && rhs_code != BIT_FIELD_REF)
1333 {
1334 if (dump_enabled_p ())
1335 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1336 "Build SLP failed: operation unsupported %G",
1337 stmt);
1338 if (is_a <bb_vec_info> (vinfo) && i != 0)
1339 continue;
1340 /* Fatal mismatch. */
1341 matches[0] = false;
1342 return false;
1343 }
1344
1345 if (rhs_code == COND_EXPR)
1346 {
1347 tree cond_expr = gimple_assign_rhs1 (stmt);
1348 enum tree_code cond_code = TREE_CODE (cond_expr);
1349 enum tree_code swap_code = ERROR_MARK;
1350 enum tree_code invert_code = ERROR_MARK;
1351
1352 if (i == 0)
1353 first_cond_code = TREE_CODE (cond_expr);
1354 else if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
1355 {
1356 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
1357 swap_code = swap_tree_comparison (cond_code);
1358 invert_code = invert_tree_comparison (cond_code, honor_nans);
1359 }
1360
1361 if (first_cond_code == cond_code)
1362 ;
1363 /* Isomorphism can be achieved by swapping. */
1364 else if (first_cond_code == swap_code)
1365 swap[i] = 1;
1366 /* Isomorphism can be achieved by inverting. */
1367 else if (first_cond_code == invert_code)
1368 swap[i] = 2;
1369 else
1370 {
1371 if (dump_enabled_p ())
1372 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1373 "Build SLP failed: different"
1374 " operation %G", stmt);
1375 /* Mismatch. */
1376 continue;
1377 }
1378 }
1379
1380 if (rhs_code.is_tree_code ()
1381 && TREE_CODE_CLASS ((tree_code)rhs_code) == tcc_comparison
1382 && (swap_tree_comparison ((tree_code)first_stmt_code)
1383 == (tree_code)rhs_code))
1384 swap[i] = 1;
1385 }
1386
1387 matches[i] = true;
1388 }
1389
1390 for (i = 0; i < group_size; ++i)
1391 if (!matches[i])
1392 return false;
1393
1394 /* If we allowed a two-operation SLP node verify the target can cope
1395 with the permute we are going to use. */
1396 if (alt_stmt_code != ERROR_MARK
1397 && (!alt_stmt_code.is_tree_code ()
1398 || (TREE_CODE_CLASS (tree_code (alt_stmt_code)) != tcc_reference
1399 && TREE_CODE_CLASS (tree_code (alt_stmt_code)) != tcc_comparison)))
1400 {
1401 *two_operators = true;
1402 }
1403
1404 if (maybe_soft_fail)
1405 {
1406 unsigned HOST_WIDE_INT const_nunits;
1407 if (!TYPE_VECTOR_SUBPARTS
1408 (soft_fail_nunits_vectype).is_constant (&const_nunits)
1409 || const_nunits > group_size)
1410 matches[0] = false;
1411 else
1412 {
1413 /* With a constant number of vector elements simulate a mismatch
1414 at the point we need to split. */
1415 unsigned tail = group_size & (const_nunits - 1);
1416 memset (&matches[group_size - tail], 0, sizeof (bool) * tail);
1417 }
1418 return false;
1419 }
1420
1421 return true;
1422 }
1423
1424 /* Traits for the hash_set to record failed SLP builds for a stmt set.
1425 Note we never remove apart from at destruction time so we do not
1426 need a special value for deleted that differs from empty. */
1427 struct bst_traits
1428 {
1429 typedef vec <stmt_vec_info> value_type;
1430 typedef vec <stmt_vec_info> compare_type;
1431 static inline hashval_t hash (value_type);
1432 static inline bool equal (value_type existing, value_type candidate);
1433 static inline bool is_empty (value_type x) { return !x.exists (); }
1434 static inline bool is_deleted (value_type x) { return !x.exists (); }
1435 static const bool empty_zero_p = true;
1436 static inline void mark_empty (value_type &x) { x.release (); }
1437 static inline void mark_deleted (value_type &x) { x.release (); }
1438 static inline void remove (value_type &x) { x.release (); }
1439 };
1440 inline hashval_t
1441 bst_traits::hash (value_type x)
1442 {
1443 inchash::hash h;
1444 for (unsigned i = 0; i < x.length (); ++i)
1445 h.add_int (gimple_uid (x[i]->stmt));
1446 return h.end ();
1447 }
1448 inline bool
1449 bst_traits::equal (value_type existing, value_type candidate)
1450 {
1451 if (existing.length () != candidate.length ())
1452 return false;
1453 for (unsigned i = 0; i < existing.length (); ++i)
1454 if (existing[i] != candidate[i])
1455 return false;
1456 return true;
1457 }
1458
1459 /* ??? This was std::pair<std::pair<tree_code, vect_def_type>, tree>
1460 but then vec::insert does memmove and that's not compatible with
1461 std::pair. */
1462 struct chain_op_t
1463 {
1464 chain_op_t (tree_code code_, vect_def_type dt_, tree op_)
1465 : code (code_), dt (dt_), op (op_) {}
1466 tree_code code;
1467 vect_def_type dt;
1468 tree op;
1469 };
1470
1471 /* Comparator for sorting associatable chains. */
1472
1473 static int
1474 dt_sort_cmp (const void *op1_, const void *op2_, void *)
1475 {
1476 auto *op1 = (const chain_op_t *) op1_;
1477 auto *op2 = (const chain_op_t *) op2_;
1478 if (op1->dt != op2->dt)
1479 return (int)op1->dt - (int)op2->dt;
1480 return (int)op1->code - (int)op2->code;
1481 }
1482
1483 /* Linearize the associatable expression chain at START with the
1484 associatable operation CODE (where PLUS_EXPR also allows MINUS_EXPR),
1485 filling CHAIN with the result and using WORKLIST as intermediate storage.
1486 CODE_STMT and ALT_CODE_STMT are filled with the first stmt using CODE
1487 or MINUS_EXPR. *CHAIN_STMTS if not NULL is filled with all computation
1488 stmts, starting with START. */
1489
1490 static void
1491 vect_slp_linearize_chain (vec_info *vinfo,
1492 vec<std::pair<tree_code, gimple *> > &worklist,
1493 vec<chain_op_t> &chain,
1494 enum tree_code code, gimple *start,
1495 gimple *&code_stmt, gimple *&alt_code_stmt,
1496 vec<gimple *> *chain_stmts)
1497 {
1498 /* For each lane linearize the addition/subtraction (or other
1499 uniform associatable operation) expression tree. */
1500 worklist.safe_push (std::make_pair (code, start));
1501 while (!worklist.is_empty ())
1502 {
1503 auto entry = worklist.pop ();
1504 gassign *stmt = as_a <gassign *> (entry.second);
1505 enum tree_code in_code = entry.first;
1506 enum tree_code this_code = gimple_assign_rhs_code (stmt);
1507 /* Pick some stmts suitable for SLP_TREE_REPRESENTATIVE. */
1508 if (!code_stmt
1509 && gimple_assign_rhs_code (stmt) == code)
1510 code_stmt = stmt;
1511 else if (!alt_code_stmt
1512 && gimple_assign_rhs_code (stmt) == MINUS_EXPR)
1513 alt_code_stmt = stmt;
1514 if (chain_stmts)
1515 chain_stmts->safe_push (stmt);
1516 for (unsigned opnum = 1; opnum <= 2; ++opnum)
1517 {
1518 tree op = gimple_op (stmt, opnum);
1519 vect_def_type dt;
1520 stmt_vec_info def_stmt_info;
1521 bool res = vect_is_simple_use (op, vinfo, &dt, &def_stmt_info);
1522 gcc_assert (res);
1523 if (dt == vect_internal_def
1524 && is_pattern_stmt_p (def_stmt_info))
1525 op = gimple_get_lhs (def_stmt_info->stmt);
1526 gimple *use_stmt;
1527 use_operand_p use_p;
1528 if (dt == vect_internal_def
1529 && single_imm_use (op, &use_p, &use_stmt)
1530 && is_gimple_assign (def_stmt_info->stmt)
1531 && (gimple_assign_rhs_code (def_stmt_info->stmt) == code
1532 || (code == PLUS_EXPR
1533 && (gimple_assign_rhs_code (def_stmt_info->stmt)
1534 == MINUS_EXPR))))
1535 {
1536 tree_code op_def_code = this_code;
1537 if (op_def_code == MINUS_EXPR && opnum == 1)
1538 op_def_code = PLUS_EXPR;
1539 if (in_code == MINUS_EXPR)
1540 op_def_code = op_def_code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR;
1541 worklist.safe_push (std::make_pair (op_def_code,
1542 def_stmt_info->stmt));
1543 }
1544 else
1545 {
1546 tree_code op_def_code = this_code;
1547 if (op_def_code == MINUS_EXPR && opnum == 1)
1548 op_def_code = PLUS_EXPR;
1549 if (in_code == MINUS_EXPR)
1550 op_def_code = op_def_code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR;
1551 chain.safe_push (chain_op_t (op_def_code, dt, op));
1552 }
1553 }
1554 }
1555 }
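
/* A worked example (a sketch): linearizing x = (a - b) + c with
   CODE == PLUS_EXPR, and assuming the intermediate a - b has a single
   use, pushes the MINUS_EXPR stmt onto the worklist and produces the
   chain { (+, c), (+, a), (-, b) }, i.e. the leaves together with
   their effective signs; CODE_STMT is set to the PLUS_EXPR stmt and
   ALT_CODE_STMT to the MINUS_EXPR stmt.  */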
1556
1557 typedef hash_map <vec <stmt_vec_info>, slp_tree,
1558 simple_hashmap_traits <bst_traits, slp_tree> >
1559 scalar_stmts_to_slp_tree_map_t;
1560
1561 static slp_tree
1562 vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
1563 vec<stmt_vec_info> stmts, unsigned int group_size,
1564 poly_uint64 *max_nunits,
1565 bool *matches, unsigned *limit, unsigned *tree_size,
1566 scalar_stmts_to_slp_tree_map_t *bst_map);
1567
1568 static slp_tree
1569 vect_build_slp_tree (vec_info *vinfo,
1570 vec<stmt_vec_info> stmts, unsigned int group_size,
1571 poly_uint64 *max_nunits,
1572 bool *matches, unsigned *limit, unsigned *tree_size,
1573 scalar_stmts_to_slp_tree_map_t *bst_map)
1574 {
1575 if (slp_tree *leader = bst_map->get (stmts))
1576 {
1577 if (dump_enabled_p ())
1578 dump_printf_loc (MSG_NOTE, vect_location, "re-using %sSLP tree %p\n",
1579 !(*leader)->failed ? "" : "failed ",
1580 (void *) *leader);
1581 if (!(*leader)->failed)
1582 {
1583 SLP_TREE_REF_COUNT (*leader)++;
1584 vect_update_max_nunits (max_nunits, (*leader)->max_nunits);
1585 stmts.release ();
1586 return *leader;
1587 }
1588 memcpy (matches, (*leader)->failed, sizeof (bool) * group_size);
1589 return NULL;
1590 }
1591
1592 /* Seed the bst_map with a stub node to be filled by vect_build_slp_tree_2
1593 so we can pick up backedge destinations during discovery. */
1594 slp_tree res = new _slp_tree;
1595 SLP_TREE_DEF_TYPE (res) = vect_internal_def;
1596 SLP_TREE_SCALAR_STMTS (res) = stmts;
1597 bst_map->put (stmts.copy (), res);
1598
1599 if (*limit == 0)
1600 {
1601 if (dump_enabled_p ())
1602 dump_printf_loc (MSG_NOTE, vect_location,
1603 "SLP discovery limit exceeded\n");
1604 /* Mark the node invalid so we can detect those when still in use
1605 as backedge destinations. */
1606 SLP_TREE_SCALAR_STMTS (res) = vNULL;
1607 SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def;
1608 res->failed = XNEWVEC (bool, group_size);
1609 memset (res->failed, 0, sizeof (bool) * group_size);
1610 memset (matches, 0, sizeof (bool) * group_size);
1611 return NULL;
1612 }
1613 --*limit;
1614
1615 if (dump_enabled_p ())
1616 dump_printf_loc (MSG_NOTE, vect_location,
1617 "starting SLP discovery for node %p\n", (void *) res);
1618
1619 poly_uint64 this_max_nunits = 1;
1620 slp_tree res_ = vect_build_slp_tree_2 (vinfo, res, stmts, group_size,
1621 &this_max_nunits,
1622 matches, limit, tree_size, bst_map);
1623 if (!res_)
1624 {
1625 if (dump_enabled_p ())
1626 dump_printf_loc (MSG_NOTE, vect_location,
1627 "SLP discovery for node %p failed\n", (void *) res);
1628 /* Mark the node invalid so we can detect those when still in use
1629 as backedge destinations. */
1630 SLP_TREE_SCALAR_STMTS (res) = vNULL;
1631 SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def;
1632 res->failed = XNEWVEC (bool, group_size);
1633 if (flag_checking)
1634 {
1635 unsigned i;
1636 for (i = 0; i < group_size; ++i)
1637 if (!matches[i])
1638 break;
1639 gcc_assert (i < group_size);
1640 }
1641 memcpy (res->failed, matches, sizeof (bool) * group_size);
1642 }
1643 else
1644 {
1645 if (dump_enabled_p ())
1646 dump_printf_loc (MSG_NOTE, vect_location,
1647 "SLP discovery for node %p succeeded\n",
1648 (void *) res);
1649 gcc_assert (res_ == res);
1650 res->max_nunits = this_max_nunits;
1651 vect_update_max_nunits (max_nunits, this_max_nunits);
1652 /* Keep a reference for the bst_map use. */
1653 SLP_TREE_REF_COUNT (res)++;
1654 }
1655 return res_;
1656 }
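
/* Usage sketch of the caching protocol above: the first discovery of a
   stmt set enters a stub node into BST_MAP before recursing so that
   backedge uses met during the recursion can resolve to it; a later
   query for the same stmts either re-uses the finished node (bumping
   its reference count) or, if discovery had failed, replays the
   recorded matches[] so the caller splits the group at the same lane
   again.  */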
1657
1658 /* Helper for building an associated SLP node chain. */
1659
1660 static void
1661 vect_slp_build_two_operator_nodes (slp_tree perm, tree vectype,
1662 slp_tree op0, slp_tree op1,
1663 stmt_vec_info oper1, stmt_vec_info oper2,
1664 vec<std::pair<unsigned, unsigned> > lperm)
1665 {
1666 unsigned group_size = SLP_TREE_LANES (op1);
1667
1668 slp_tree child1 = new _slp_tree;
1669 SLP_TREE_DEF_TYPE (child1) = vect_internal_def;
1670 SLP_TREE_VECTYPE (child1) = vectype;
1671 SLP_TREE_LANES (child1) = group_size;
1672 SLP_TREE_CHILDREN (child1).create (2);
1673 SLP_TREE_CHILDREN (child1).quick_push (op0);
1674 SLP_TREE_CHILDREN (child1).quick_push (op1);
1675 SLP_TREE_REPRESENTATIVE (child1) = oper1;
1676
1677 slp_tree child2 = new _slp_tree;
1678 SLP_TREE_DEF_TYPE (child2) = vect_internal_def;
1679 SLP_TREE_VECTYPE (child2) = vectype;
1680 SLP_TREE_LANES (child2) = group_size;
1681 SLP_TREE_CHILDREN (child2).create (2);
1682 SLP_TREE_CHILDREN (child2).quick_push (op0);
1683 SLP_TREE_REF_COUNT (op0)++;
1684 SLP_TREE_CHILDREN (child2).quick_push (op1);
1685 SLP_TREE_REF_COUNT (op1)++;
1686 SLP_TREE_REPRESENTATIVE (child2) = oper2;
1687
1688 SLP_TREE_DEF_TYPE (perm) = vect_internal_def;
1689 SLP_TREE_CODE (perm) = VEC_PERM_EXPR;
1690 SLP_TREE_VECTYPE (perm) = vectype;
1691 SLP_TREE_LANES (perm) = group_size;
1692 /* ??? We should set this NULL but that's not expected. */
1693 SLP_TREE_REPRESENTATIVE (perm) = oper1;
1694 SLP_TREE_LANE_PERMUTATION (perm) = lperm;
1695 SLP_TREE_CHILDREN (perm).quick_push (child1);
1696 SLP_TREE_CHILDREN (perm).quick_push (child2);
1697 }
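
/* For example (a sketch): given the two-operator group
   { a0 + b0, a1 - b1 } with OPER1 the PLUS_EXPR stmt and OPER2 the
   MINUS_EXPR stmt, CHILD1 computes both lanes with '+', CHILD2 both
   lanes with '-', and LPERM == { (0, 0), (1, 1) } makes the
   VEC_PERM_EXPR node PERM pick lane 0 from CHILD1 and lane 1 from
   CHILD2, yielding { a0 + b0, a1 - b1 }.  */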
1698
1699 /* Recursively build an SLP tree starting from NODE.
1700 Fail (and return NULL) if the def-stmts are not isomorphic,
1701 require data permutation or are of unsupported types of
1702 operation, and indicate in MATCHES which of the initial
1703 stmts caused the mismatch.
1704 Otherwise return the newly built SLP node. */
1705
1706 static slp_tree
1707 vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
1708 vec<stmt_vec_info> stmts, unsigned int group_size,
1709 poly_uint64 *max_nunits,
1710 bool *matches, unsigned *limit, unsigned *tree_size,
1711 scalar_stmts_to_slp_tree_map_t *bst_map)
1712 {
1713 unsigned nops, i, this_tree_size = 0;
1714 poly_uint64 this_max_nunits = *max_nunits;
1715
1716 matches[0] = false;
1717
1718 stmt_vec_info stmt_info = stmts[0];
1719 if (!is_a<gcall *> (stmt_info->stmt)
1720 && !is_a<gassign *> (stmt_info->stmt)
1721 && !is_a<gphi *> (stmt_info->stmt))
1722 return NULL;
1723
1724 nops = gimple_num_args (stmt_info->stmt);
1725 if (const int *map = vect_get_operand_map (stmt_info->stmt))
1726 nops = map[0];
1727
1728 /* If the SLP node is a PHI (induction or reduction), terminate
1729 the recursion. */
1730 bool *skip_args = XALLOCAVEC (bool, nops);
1731 memset (skip_args, 0, sizeof (bool) * nops);
1732 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
1733 if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
1734 {
1735 tree scalar_type = TREE_TYPE (PHI_RESULT (stmt));
1736 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
1737 group_size);
1738 if (!vect_record_max_nunits (vinfo, stmt_info, group_size, vectype,
1739 max_nunits))
1740 return NULL;
1741
1742 vect_def_type def_type = STMT_VINFO_DEF_TYPE (stmt_info);
1743 if (def_type == vect_induction_def)
1744 {
1745 /* Induction PHIs are not cycles but walk the initial
1746 value. Only for inner loops though, for outer loops
1747 we need to pick up the value from the actual PHIs
1748 to more easily support peeling and epilogue vectorization. */
1749 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1750 if (!nested_in_vect_loop_p (loop, stmt_info))
1751 skip_args[loop_preheader_edge (loop)->dest_idx] = true;
1752 else
1753 loop = loop->inner;
1754 skip_args[loop_latch_edge (loop)->dest_idx] = true;
1755 }
1756 else if (def_type == vect_reduction_def
1757 || def_type == vect_double_reduction_def
1758 || def_type == vect_nested_cycle
1759 || def_type == vect_first_order_recurrence)
1760 {
1761 /* Else def types have to match. */
1762 stmt_vec_info other_info;
1763 bool all_same = true;
1764 FOR_EACH_VEC_ELT (stmts, i, other_info)
1765 {
1766 if (STMT_VINFO_DEF_TYPE (other_info) != def_type)
1767 return NULL;
1768 if (other_info != stmt_info)
1769 all_same = false;
1770 }
1771 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1772 /* Reduction initial values are not explicitly represented. */
1773 if (def_type != vect_first_order_recurrence
1774 && !nested_in_vect_loop_p (loop, stmt_info))
1775 skip_args[loop_preheader_edge (loop)->dest_idx] = true;
1776 /* Reduction chain backedge defs are filled manually.
1777 ??? Need a better way to identify a SLP reduction chain PHI.
1778 Or a better overall way to SLP match those. */
1779 if (all_same && def_type == vect_reduction_def)
1780 skip_args[loop_latch_edge (loop)->dest_idx] = true;
1781 }
1782 else if (def_type != vect_internal_def)
1783 return NULL;
1784 }
1785
1786
1787 bool two_operators = false;
1788 unsigned char *swap = XALLOCAVEC (unsigned char, group_size);
1789 tree vectype = NULL_TREE;
1790 if (!vect_build_slp_tree_1 (vinfo, swap, stmts, group_size,
1791 &this_max_nunits, matches, &two_operators,
1792 &vectype))
1793 return NULL;
1794
1795 /* If the SLP node is a load, terminate the recursion unless masked. */
1796 if (STMT_VINFO_DATA_REF (stmt_info)
1797 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1798 {
1799 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
1800 gcc_assert (gimple_call_internal_p (stmt, IFN_MASK_LOAD)
1801 || gimple_call_internal_p (stmt, IFN_GATHER_LOAD)
1802 || gimple_call_internal_p (stmt, IFN_MASK_GATHER_LOAD));
1803 else
1804 {
1805 *max_nunits = this_max_nunits;
1806 (*tree_size)++;
1807 node = vect_create_new_slp_node (node, stmts, 0);
1808 SLP_TREE_VECTYPE (node) = vectype;
1809 /* And compute the load permutation. Whether it is actually
1810 a permutation depends on the unrolling factor which is
1811 decided later. */
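/* For illustration (hypothetical example): with a group of four loads
   from a[0..3] appearing in this node in the order a[2], a[0], a[3],
   a[1], the positions relative to the interleaving chain head a[0]
   yield the load permutation { 2, 0, 3, 1 }.  */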
1812 vec<unsigned> load_permutation;
1813 int j;
1814 stmt_vec_info load_info;
1815 load_permutation.create (group_size);
1816 stmt_vec_info first_stmt_info
1817 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
1818 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
1819 {
1820 int load_place;
1821 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1822 load_place = vect_get_place_in_interleaving_chain
1823 (load_info, first_stmt_info);
1824 else
1825 load_place = 0;
1826 gcc_assert (load_place != -1);
1827 load_permutation.safe_push (load_place);
1828 }
1829 SLP_TREE_LOAD_PERMUTATION (node) = load_permutation;
1830 return node;
1831 }
1832 }
1833 else if (gimple_assign_single_p (stmt_info->stmt)
1834 && !gimple_vuse (stmt_info->stmt)
1835 && gimple_assign_rhs_code (stmt_info->stmt) == BIT_FIELD_REF)
1836 {
1837 /* vect_build_slp_tree_1 determined all BIT_FIELD_REFs reference
1838 the same SSA name vector of a type compatible with vectype. */
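/* A sketch of the lane computation below (hypothetical example): for a
   V4SI vector VEC, the stmt _1 = BIT_FIELD_REF <VEC, 32, 64> extracts
   32 bits at bit offset 64, i.e. lane 64 / 32 == 2, so the lane
   permutation records the pair (0, 2) for that lane.  */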
1839 vec<std::pair<unsigned, unsigned> > lperm = vNULL;
1840 tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0);
1841 stmt_vec_info estmt_info;
1842 FOR_EACH_VEC_ELT (stmts, i, estmt_info)
1843 {
1844 gassign *estmt = as_a <gassign *> (estmt_info->stmt);
1845 tree bfref = gimple_assign_rhs1 (estmt);
1846 HOST_WIDE_INT lane;
1847 if (!known_eq (bit_field_size (bfref),
1848 tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (vectype))))
1849 || !constant_multiple_p (bit_field_offset (bfref),
1850 bit_field_size (bfref), &lane))
1851 {
1852 lperm.release ();
1853 matches[0] = false;
1854 return NULL;
1855 }
1856 lperm.safe_push (std::make_pair (0, (unsigned)lane));
1857 }
1858 slp_tree vnode = vect_create_new_slp_node (vNULL);
1859 if (operand_equal_p (TYPE_SIZE (vectype), TYPE_SIZE (TREE_TYPE (vec))))
1860 /* ??? We record vectype here but hide the eventually necessary
1861 punning, relying on code generation to materialize
1862 VIEW_CONVERT_EXPRs as necessary. We should instead make
1863 this explicit somehow. */
1864 SLP_TREE_VECTYPE (vnode) = vectype;
1865 else
1866 {
1867 /* For different size but compatible elements we can still
1868 use VEC_PERM_EXPR without punning. */
1869 gcc_assert (VECTOR_TYPE_P (TREE_TYPE (vec))
1870 && types_compatible_p (TREE_TYPE (vectype),
1871 TREE_TYPE (TREE_TYPE (vec))));
1872 SLP_TREE_VECTYPE (vnode) = TREE_TYPE (vec);
1873 }
1874 auto nunits = TYPE_VECTOR_SUBPARTS (SLP_TREE_VECTYPE (vnode));
1875 unsigned HOST_WIDE_INT const_nunits;
1876 if (nunits.is_constant (&const_nunits))
1877 SLP_TREE_LANES (vnode) = const_nunits;
1878 SLP_TREE_VEC_DEFS (vnode).safe_push (vec);
1879 /* We are always building a permutation node even if it is an identity
1880 permute to shield the rest of the vectorizer from the odd node
1881 representing an actual vector without any scalar ops.
1882 ??? We could hide it completely by making the permute node
1883 external? */
1884 node = vect_create_new_slp_node (node, stmts, 1);
1885 SLP_TREE_CODE (node) = VEC_PERM_EXPR;
1886 SLP_TREE_LANE_PERMUTATION (node) = lperm;
1887 SLP_TREE_VECTYPE (node) = vectype;
1888 SLP_TREE_CHILDREN (node).quick_push (vnode);
1889 return node;
1890 }
1891 /* When discovery reaches an associatable operation, see whether we can
1892 improve that to match up lanes in a way superior to the operand
1893 swapping code, which at most looks at two defs.
1894 ??? For BB vectorization we cannot do the brute-force search
1895 for matching as we can succeed by means of builds from scalars
1896 and have no good way to "cost" one build against another. */
1897 else if (is_a <loop_vec_info> (vinfo)
1898 /* ??? We don't handle !vect_internal_def defs below. */
1899 && STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
1900 && is_gimple_assign (stmt_info->stmt)
1901 && (associative_tree_code (gimple_assign_rhs_code (stmt_info->stmt))
1902 || gimple_assign_rhs_code (stmt_info->stmt) == MINUS_EXPR)
1903 && ((FLOAT_TYPE_P (vectype) && flag_associative_math)
1904 || (INTEGRAL_TYPE_P (TREE_TYPE (vectype))
1905 && TYPE_OVERFLOW_WRAPS (TREE_TYPE (vectype)))))
1906 {
1907 /* See if we have a chain of (mixed) adds or subtracts or other
1908 associatable ops. */
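/* Hypothetical illustration: a lane whose scalar code is
   _1 = a - b;  _2 = _1 + c;
   is linearized into the chain { (+, a), (-, b), (+, c) }, and the
   per-lane chains are then matched up entry by entry below.  */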
1909 enum tree_code code = gimple_assign_rhs_code (stmt_info->stmt);
1910 if (code == MINUS_EXPR)
1911 code = PLUS_EXPR;
1912 stmt_vec_info other_op_stmt_info = NULL;
1913 stmt_vec_info op_stmt_info = NULL;
1914 unsigned chain_len = 0;
1915 auto_vec<chain_op_t> chain;
1916 auto_vec<std::pair<tree_code, gimple *> > worklist;
1917 auto_vec<vec<chain_op_t> > chains (group_size);
1918 auto_vec<slp_tree, 4> children;
1919 bool hard_fail = true;
1920 for (unsigned lane = 0; lane < group_size; ++lane)
1921 {
1922 /* For each lane linearize the addition/subtraction (or other
1923 uniform associatable operation) expression tree. */
1924 gimple *op_stmt = NULL, *other_op_stmt = NULL;
1925 vect_slp_linearize_chain (vinfo, worklist, chain, code,
1926 stmts[lane]->stmt, op_stmt, other_op_stmt,
1927 NULL);
1928 if (!op_stmt_info && op_stmt)
1929 op_stmt_info = vinfo->lookup_stmt (op_stmt);
1930 if (!other_op_stmt_info && other_op_stmt)
1931 other_op_stmt_info = vinfo->lookup_stmt (other_op_stmt);
1932 if (chain.length () == 2)
1933 {
1934 /* In a chain of just two elements resort to the regular
1935 operand swapping scheme. If we run into a length
1936 mismatch still hard-FAIL. */
1937 if (chain_len == 0)
1938 hard_fail = false;
1939 else
1940 {
1941 matches[lane] = false;
1942 /* ??? We might want to process the other lanes, but
1943 make sure to not give false matching hints to the
1944 caller for lanes we did not process. */
1945 if (lane != group_size - 1)
1946 matches[0] = false;
1947 }
1948 break;
1949 }
1950 else if (chain_len == 0)
1951 chain_len = chain.length ();
1952 else if (chain.length () != chain_len)
1953 {
1954 /* ??? Here we could slip in magic to compensate with
1955 neutral operands. */
1956 matches[lane] = false;
1957 if (lane != group_size - 1)
1958 matches[0] = false;
1959 break;
1960 }
1961 chains.quick_push (chain.copy ());
1962 chain.truncate (0);
1963 }
1964 if (chains.length () == group_size)
1965 {
1966 /* We cannot yet use SLP_TREE_CODE to communicate the operation. */
1967 if (!op_stmt_info)
1968 {
1969 hard_fail = false;
1970 goto out;
1971 }
1972 /* Now we have a set of chains with the same length. */
1973 /* 1. pre-sort according to def_type and operation. */
1974 for (unsigned lane = 0; lane < group_size; ++lane)
1975 chains[lane].stablesort (dt_sort_cmp, vinfo);
1976 if (dump_enabled_p ())
1977 {
1978 dump_printf_loc (MSG_NOTE, vect_location,
1979 "pre-sorted chains of %s\n",
1980 get_tree_code_name (code));
1981 for (unsigned lane = 0; lane < group_size; ++lane)
1982 {
1983 for (unsigned opnum = 0; opnum < chain_len; ++opnum)
1984 dump_printf (MSG_NOTE, "%s %T ",
1985 get_tree_code_name (chains[lane][opnum].code),
1986 chains[lane][opnum].op);
1987 dump_printf (MSG_NOTE, "\n");
1988 }
1989 }
1990 /* 2. try to build children nodes, associating as necessary. */
1991 for (unsigned n = 0; n < chain_len; ++n)
1992 {
1993 vect_def_type dt = chains[0][n].dt;
1994 unsigned lane;
1995 for (lane = 0; lane < group_size; ++lane)
1996 if (chains[lane][n].dt != dt)
1997 {
1998 if (dt == vect_constant_def
1999 && chains[lane][n].dt == vect_external_def)
2000 dt = vect_external_def;
2001 else if (dt == vect_external_def
2002 && chains[lane][n].dt == vect_constant_def)
2003 ;
2004 else
2005 break;
2006 }
2007 if (lane != group_size)
2008 {
2009 if (dump_enabled_p ())
2010 dump_printf_loc (MSG_NOTE, vect_location,
2011 "giving up on chain due to mismatched "
2012 "def types\n");
2013 matches[lane] = false;
2014 if (lane != group_size - 1)
2015 matches[0] = false;
2016 goto out;
2017 }
2018 if (dt == vect_constant_def
2019 || dt == vect_external_def)
2020 {
2021 /* Check whether we can build the invariant. If we can't
2022 we never will be able to. */
2023 tree type = TREE_TYPE (chains[0][n].op);
2024 if (!GET_MODE_SIZE (vinfo->vector_mode).is_constant ()
2025 && (TREE_CODE (type) == BOOLEAN_TYPE
2026 || !can_duplicate_and_interleave_p (vinfo, group_size,
2027 type)))
2028 {
2029 matches[0] = false;
2030 goto out;
2031 }
2032 vec<tree> ops;
2033 ops.create (group_size);
2034 for (lane = 0; lane < group_size; ++lane)
2035 ops.quick_push (chains[lane][n].op);
2036 slp_tree child = vect_create_new_slp_node (ops);
2037 SLP_TREE_DEF_TYPE (child) = dt;
2038 children.safe_push (child);
2039 }
2040 else if (dt != vect_internal_def)
2041 {
2042 /* Not sure, we might need sth special.
2043 gcc.dg/vect/pr96854.c,
2044 gfortran.dg/vect/fast-math-pr37021.f90
2045 and gfortran.dg/vect/pr61171.f trigger. */
2046 /* Soft-fail for now. */
2047 hard_fail = false;
2048 goto out;
2049 }
2050 else
2051 {
2052 vec<stmt_vec_info> op_stmts;
2053 op_stmts.create (group_size);
2054 slp_tree child = NULL;
2055 /* Brute-force our way. We have to consider a lane
2056 failing after fixing an earlier fail up in the
2057 SLP discovery recursion. So track the current
2058 permute per lane. */
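/* E.g. if lane L fails to match at chain position N, the entries at
   positions N+1, N+2, ... are swapped into position N one at a time
   (tracked in PERMS[L]) and discovery is retried; when no later entry
   remains for a failing lane the search terminates.  */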
2059 unsigned *perms = XALLOCAVEC (unsigned, group_size);
2060 memset (perms, 0, sizeof (unsigned) * group_size);
2061 do
2062 {
2063 op_stmts.truncate (0);
2064 for (lane = 0; lane < group_size; ++lane)
2065 op_stmts.quick_push
2066 (vinfo->lookup_def (chains[lane][n].op));
2067 child = vect_build_slp_tree (vinfo, op_stmts,
2068 group_size, &this_max_nunits,
2069 matches, limit,
2070 &this_tree_size, bst_map);
2071 /* ??? We're likely getting too many fatal mismatches
2072 here so maybe we want to ignore them (but then we
2073 have no idea which lanes fatally mismatched). */
2074 if (child || !matches[0])
2075 break;
2076 /* Swap another lane we have not yet matched up into
2077 lanes that did not match. If we run out of
2078 permute possibilities for a lane terminate the
2079 search. */
2080 bool term = false;
2081 for (lane = 1; lane < group_size; ++lane)
2082 if (!matches[lane])
2083 {
2084 if (n + perms[lane] + 1 == chain_len)
2085 {
2086 term = true;
2087 break;
2088 }
2089 std::swap (chains[lane][n],
2090 chains[lane][n + perms[lane] + 1]);
2091 perms[lane]++;
2092 }
2093 if (term)
2094 break;
2095 }
2096 while (1);
2097 if (!child)
2098 {
2099 if (dump_enabled_p ())
2100 dump_printf_loc (MSG_NOTE, vect_location,
2101 "failed to match up op %d\n", n);
2102 op_stmts.release ();
2103 if (lane != group_size - 1)
2104 matches[0] = false;
2105 else
2106 matches[lane] = false;
2107 goto out;
2108 }
2109 if (dump_enabled_p ())
2110 {
2111 dump_printf_loc (MSG_NOTE, vect_location,
2112 "matched up op %d to\n", n);
2113 vect_print_slp_tree (MSG_NOTE, vect_location, child);
2114 }
2115 children.safe_push (child);
2116 }
2117 }
2118 /* 3. build SLP nodes to combine the chain. */
2119 for (unsigned lane = 0; lane < group_size; ++lane)
2120 if (chains[lane][0].code != code)
2121 {
2122 /* See if there's any alternate all-PLUS entry. */
2123 unsigned n;
2124 for (n = 1; n < chain_len; ++n)
2125 {
2126 for (lane = 0; lane < group_size; ++lane)
2127 if (chains[lane][n].code != code)
2128 break;
2129 if (lane == group_size)
2130 break;
2131 }
2132 if (n != chain_len)
2133 {
2134 /* Swap that in at first position. */
2135 std::swap (children[0], children[n]);
2136 for (lane = 0; lane < group_size; ++lane)
2137 std::swap (chains[lane][0], chains[lane][n]);
2138 }
2139 else
2140 {
2141 /* ??? When this triggers and we end up with two
2142 vect_constant/external_def up-front things break (ICE)
2143 spectacularly finding an insertion place for the
2144 all-constant op. We should have a fully
2145 vect_internal_def operand though(?) so we can swap
2146 that into first place and then prepend the all-zero
2147 constant. */
2148 if (dump_enabled_p ())
2149 dump_printf_loc (MSG_NOTE, vect_location,
2150 "inserting constant zero to compensate "
2151 "for (partially) negated first "
2152 "operand\n");
2153 chain_len++;
2154 for (lane = 0; lane < group_size; ++lane)
2155 chains[lane].safe_insert
2156 (0, chain_op_t (code, vect_constant_def, NULL_TREE));
2157 vec<tree> zero_ops;
2158 zero_ops.create (group_size);
2159 zero_ops.quick_push (build_zero_cst (TREE_TYPE (vectype)));
2160 for (lane = 1; lane < group_size; ++lane)
2161 zero_ops.quick_push (zero_ops[0]);
2162 slp_tree zero = vect_create_new_slp_node (zero_ops);
2163 SLP_TREE_DEF_TYPE (zero) = vect_constant_def;
2164 children.safe_insert (0, zero);
2165 }
2166 break;
2167 }
2168 for (unsigned i = 1; i < children.length (); ++i)
2169 {
2170 slp_tree op0 = children[i - 1];
2171 slp_tree op1 = children[i];
2172 bool this_two_op = false;
2173 for (unsigned lane = 0; lane < group_size; ++lane)
2174 if (chains[lane][i].code != chains[0][i].code)
2175 {
2176 this_two_op = true;
2177 break;
2178 }
2179 slp_tree child;
2180 if (i == children.length () - 1)
2181 child = vect_create_new_slp_node (node, stmts, 2);
2182 else
2183 child = vect_create_new_slp_node (2, ERROR_MARK);
2184 if (this_two_op)
2185 {
2186 vec<std::pair<unsigned, unsigned> > lperm;
2187 lperm.create (group_size);
2188 for (unsigned lane = 0; lane < group_size; ++lane)
2189 lperm.quick_push (std::make_pair
2190 (chains[lane][i].code != chains[0][i].code, lane));
2191 vect_slp_build_two_operator_nodes (child, vectype, op0, op1,
2192 (chains[0][i].code == code
2193 ? op_stmt_info
2194 : other_op_stmt_info),
2195 (chains[0][i].code == code
2196 ? other_op_stmt_info
2197 : op_stmt_info),
2198 lperm);
2199 }
2200 else
2201 {
2202 SLP_TREE_DEF_TYPE (child) = vect_internal_def;
2203 SLP_TREE_VECTYPE (child) = vectype;
2204 SLP_TREE_LANES (child) = group_size;
2205 SLP_TREE_CHILDREN (child).quick_push (op0);
2206 SLP_TREE_CHILDREN (child).quick_push (op1);
2207 SLP_TREE_REPRESENTATIVE (child)
2208 = (chains[0][i].code == code
2209 ? op_stmt_info : other_op_stmt_info);
2210 }
2211 children[i] = child;
2212 }
2213 *tree_size += this_tree_size + 1;
2214 *max_nunits = this_max_nunits;
2215 while (!chains.is_empty ())
2216 chains.pop ().release ();
2217 return node;
2218 }
2219 out:
2220 while (!children.is_empty ())
2221 vect_free_slp_tree (children.pop ());
2222 while (!chains.is_empty ())
2223 chains.pop ().release ();
2224 /* Hard-fail, otherwise we might run into quadratic processing of the
2225 chains starting one stmt into the chain again. */
2226 if (hard_fail)
2227 return NULL;
2228 /* Fall thru to normal processing. */
2229 }
2230
2231 /* Get at the operands, verifying they are compatible. */
2232 vec<slp_oprnd_info> oprnds_info = vect_create_oprnd_info (nops, group_size);
2233 slp_oprnd_info oprnd_info;
2234 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
2235 {
2236 int res = vect_get_and_check_slp_defs (vinfo, swap[i], skip_args,
2237 stmts, i, &oprnds_info);
2238 if (res != 0)
2239 matches[(res == -1) ? 0 : i] = false;
2240 if (!matches[0])
2241 break;
2242 }
2243 for (i = 0; i < group_size; ++i)
2244 if (!matches[i])
2245 {
2246 vect_free_oprnd_info (oprnds_info);
2247 return NULL;
2248 }
2249 swap = NULL;
2250
2251 auto_vec<slp_tree, 4> children;
2252
2253 stmt_info = stmts[0];
2254
2255 /* Create SLP_TREE nodes for the definition node/s. */
2256 FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
2257 {
2258 slp_tree child;
2259 unsigned int j;
2260
2261 /* We're skipping certain operands from processing, for example
2262 outer loop reduction initial defs. */
2263 if (skip_args[i])
2264 {
2265 children.safe_push (NULL);
2266 continue;
2267 }
2268
2269 if (oprnd_info->first_dt == vect_uninitialized_def)
2270 {
2271 /* COND_EXPRs may end up with one operand too many when the condition
2272 is an SSA name. */
2273 gcc_assert (i == 3 && nops == 4);
2274 continue;
2275 }
2276
2277 if (is_a <bb_vec_info> (vinfo)
2278 && oprnd_info->first_dt == vect_internal_def
2279 && !oprnd_info->any_pattern)
2280 {
2281 /* For BB vectorization, if all defs are the same do not
2282 bother to continue the build along the single-lane
2283 graph but use a splat of the scalar value. */
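/* Hypothetical example: if every lane uses the same SSA def _5 and its
   definition is not a load we may be able to CSE, the operand is
   demoted to vect_external_def below and code generation emits a splat
   of _5 instead of continuing the single-lane SLP build.  */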
2284 stmt_vec_info first_def = oprnd_info->def_stmts[0];
2285 for (j = 1; j < group_size; ++j)
2286 if (oprnd_info->def_stmts[j] != first_def)
2287 break;
2288 if (j == group_size
2289 /* But avoid doing this for loads where we may be
2290 able to CSE things, unless the stmt is not
2291 vectorizable. */
2292 && (!STMT_VINFO_VECTORIZABLE (first_def)
2293 || !gimple_vuse (first_def->stmt)))
2294 {
2295 if (dump_enabled_p ())
2296 dump_printf_loc (MSG_NOTE, vect_location,
2297 "Using a splat of the uniform operand %G",
2298 first_def->stmt);
2299 oprnd_info->first_dt = vect_external_def;
2300 }
2301 }
2302
2303 if (oprnd_info->first_dt == vect_external_def
2304 || oprnd_info->first_dt == vect_constant_def)
2305 {
2306 slp_tree invnode = vect_create_new_slp_node (oprnd_info->ops);
2307 SLP_TREE_DEF_TYPE (invnode) = oprnd_info->first_dt;
2308 oprnd_info->ops = vNULL;
2309 children.safe_push (invnode);
2310 continue;
2311 }
2312
2313 if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
2314 group_size, &this_max_nunits,
2315 matches, limit,
2316 &this_tree_size, bst_map)) != NULL)
2317 {
2318 oprnd_info->def_stmts = vNULL;
2319 children.safe_push (child);
2320 continue;
2321 }
2322
2323 /* If the SLP build for operand zero failed and operand zero
2324 and one can be commuted, try that for the scalar stmts
2325 that failed the match. */
2326 if (i == 0
2327 /* A first scalar stmt mismatch signals a fatal mismatch. */
2328 && matches[0]
2329 /* ??? For COND_EXPRs we can swap the comparison operands
2330 as well as the arms under some constraints. */
2331 && nops == 2
2332 && oprnds_info[1]->first_dt == vect_internal_def
2333 && is_gimple_assign (stmt_info->stmt)
2334 /* Swapping operands for reductions breaks assumptions later on. */
2335 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
2336 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
2337 {
2338 /* See whether we can swap the matching or the non-matching
2339 stmt operands. */
2340 bool swap_not_matching = true;
2341 do
2342 {
2343 for (j = 0; j < group_size; ++j)
2344 {
2345 if (matches[j] != !swap_not_matching)
2346 continue;
2347 stmt_vec_info stmt_info = stmts[j];
2348 /* Verify if we can swap operands of this stmt. */
2349 gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt);
2350 if (!stmt
2351 || !commutative_tree_code (gimple_assign_rhs_code (stmt)))
2352 {
2353 if (!swap_not_matching)
2354 goto fail;
2355 swap_not_matching = false;
2356 break;
2357 }
2358 }
2359 }
2360 while (j != group_size);
2361
2362 /* Swap mismatched definition stmts. */
2363 if (dump_enabled_p ())
2364 dump_printf_loc (MSG_NOTE, vect_location,
2365 "Re-trying with swapped operands of stmts ");
2366 for (j = 0; j < group_size; ++j)
2367 if (matches[j] == !swap_not_matching)
2368 {
2369 std::swap (oprnds_info[0]->def_stmts[j],
2370 oprnds_info[1]->def_stmts[j]);
2371 std::swap (oprnds_info[0]->ops[j],
2372 oprnds_info[1]->ops[j]);
2373 if (dump_enabled_p ())
2374 dump_printf (MSG_NOTE, "%d ", j);
2375 }
2376 if (dump_enabled_p ())
2377 dump_printf (MSG_NOTE, "\n");
2378 /* After swapping some operands we lost track of whether an
2379 operand has any pattern defs, so be conservative here. */
2380 if (oprnds_info[0]->any_pattern || oprnds_info[1]->any_pattern)
2381 oprnds_info[0]->any_pattern = oprnds_info[1]->any_pattern = true;
2382 /* And try again with scratch 'matches' ... */
2383 bool *tem = XALLOCAVEC (bool, group_size);
2384 if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
2385 group_size, &this_max_nunits,
2386 tem, limit,
2387 &this_tree_size, bst_map)) != NULL)
2388 {
2389 oprnd_info->def_stmts = vNULL;
2390 children.safe_push (child);
2391 continue;
2392 }
2393 }
2394 fail:
2395
2396 /* If the SLP build failed and we analyze a basic-block
2397 simply treat nodes we fail to build as externally defined
2398 (and thus build vectors from the scalar defs).
2399 The cost model will reject outright expensive cases.
2400 ??? This doesn't treat cases where permutation ultimately
2401 fails (or we don't try permutation below). Ideally we'd
2402 even compute a permutation that will end up with the maximum
2403 SLP tree size... */
2404 if (is_a <bb_vec_info> (vinfo)
2405 /* ??? Rejecting patterns this way doesn't work. We'd have to
2406 do extra work to cancel the pattern so the uses see the
2407 scalar version. */
2408 && !is_pattern_stmt_p (stmt_info)
2409 && !oprnd_info->any_pattern)
2410 {
2411 /* But if there's a leading vector sized set of matching stmts
2412 fail here so we can split the group. This matches the condition
2413 vect_analyze_slp_instance uses. */
2414 /* ??? We might want to split here and combine the results to support
2415 multiple vector sizes better. */
2416 for (j = 0; j < group_size; ++j)
2417 if (!matches[j])
2418 break;
2419 if (!known_ge (j, TYPE_VECTOR_SUBPARTS (vectype)))
2420 {
2421 if (dump_enabled_p ())
2422 dump_printf_loc (MSG_NOTE, vect_location,
2423 "Building vector operands from scalars\n");
2424 this_tree_size++;
2425 child = vect_create_new_slp_node (oprnd_info->ops);
2426 children.safe_push (child);
2427 oprnd_info->ops = vNULL;
2428 continue;
2429 }
2430 }
2431
2432 gcc_assert (child == NULL);
2433 FOR_EACH_VEC_ELT (children, j, child)
2434 if (child)
2435 vect_free_slp_tree (child);
2436 vect_free_oprnd_info (oprnds_info);
2437 return NULL;
2438 }
2439
2440 vect_free_oprnd_info (oprnds_info);
2441
2442 /* If all children of this node are built up from uniform scalars, or
2443 if building them requires more than one possibly expensive vector
2444 construction, just throw the node away, causing it to be built up from
2445 scalars instead. The exception is the SLP node for the vector store. */
2446 if (is_a <bb_vec_info> (vinfo)
2447 && !STMT_VINFO_GROUPED_ACCESS (stmt_info)
2448 /* ??? Rejecting patterns this way doesn't work. We'd have to
2449 do extra work to cancel the pattern so the uses see the
2450 scalar version. */
2451 && !is_pattern_stmt_p (stmt_info))
2452 {
2453 slp_tree child;
2454 unsigned j;
2455 bool all_uniform_p = true;
2456 unsigned n_vector_builds = 0;
2457 FOR_EACH_VEC_ELT (children, j, child)
2458 {
2459 if (!child)
2460 ;
2461 else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
2462 all_uniform_p = false;
2463 else if (!vect_slp_tree_uniform_p (child))
2464 {
2465 all_uniform_p = false;
2466 if (SLP_TREE_DEF_TYPE (child) == vect_external_def)
2467 n_vector_builds++;
2468 }
2469 }
2470 if (all_uniform_p
2471 || n_vector_builds > 1
2472 || (n_vector_builds == children.length ()
2473 && is_a <gphi *> (stmt_info->stmt)))
2474 {
2475 /* Roll back. */
2476 matches[0] = false;
2477 FOR_EACH_VEC_ELT (children, j, child)
2478 if (child)
2479 vect_free_slp_tree (child);
2480
2481 if (dump_enabled_p ())
2482 dump_printf_loc (MSG_NOTE, vect_location,
2483 "Building parent vector operands from "
2484 "scalars instead\n");
2485 return NULL;
2486 }
2487 }
2488
2489 *tree_size += this_tree_size + 1;
2490 *max_nunits = this_max_nunits;
2491
2492 if (two_operators)
2493 {
2494 /* ??? We'd likely want to either cache in bst_map sth like
2495 { a+b, NULL, a+b, NULL } and { NULL, a-b, NULL, a-b } or
2496 the true { a+b, a+b, a+b, a+b } ... but there we don't have
2497 explicit stmts to put in so the keying on 'stmts' doesn't
2498 work (but we have the same issue with nodes that use 'ops'). */
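/* Hypothetical example: for a two-operator group
   { a0 + b0, a1 - b1, a2 + b2, a3 - b3 } node ONE gets the all-PLUS
   variant and node TWO the all-MINUS variant over the same children;
   the VEC_PERM node built below blends them using the lane
   permutation { (0,0), (1,1), (0,2), (1,3) }.  */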
2499 slp_tree one = new _slp_tree;
2500 slp_tree two = new _slp_tree;
2501 SLP_TREE_DEF_TYPE (one) = vect_internal_def;
2502 SLP_TREE_DEF_TYPE (two) = vect_internal_def;
2503 SLP_TREE_VECTYPE (one) = vectype;
2504 SLP_TREE_VECTYPE (two) = vectype;
2505 SLP_TREE_CHILDREN (one).safe_splice (children);
2506 SLP_TREE_CHILDREN (two).safe_splice (children);
2507 slp_tree child;
2508 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (two), i, child)
2509 SLP_TREE_REF_COUNT (child)++;
2510
2511 /* Here we record the original defs since this
2512 node represents the final lane configuration. */
2513 node = vect_create_new_slp_node (node, stmts, 2);
2514 SLP_TREE_VECTYPE (node) = vectype;
2515 SLP_TREE_CODE (node) = VEC_PERM_EXPR;
2516 SLP_TREE_CHILDREN (node).quick_push (one);
2517 SLP_TREE_CHILDREN (node).quick_push (two);
2518 gassign *stmt = as_a <gassign *> (stmts[0]->stmt);
2519 enum tree_code code0 = gimple_assign_rhs_code (stmt);
2520 enum tree_code ocode = ERROR_MARK;
2521 stmt_vec_info ostmt_info;
2522 unsigned j = 0;
2523 FOR_EACH_VEC_ELT (stmts, i, ostmt_info)
2524 {
2525 gassign *ostmt = as_a <gassign *> (ostmt_info->stmt);
2526 if (gimple_assign_rhs_code (ostmt) != code0)
2527 {
2528 SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (1, i));
2529 ocode = gimple_assign_rhs_code (ostmt);
2530 j = i;
2531 }
2532 else
2533 SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (0, i));
2534 }
2535 SLP_TREE_CODE (one) = code0;
2536 SLP_TREE_CODE (two) = ocode;
2537 SLP_TREE_LANES (one) = stmts.length ();
2538 SLP_TREE_LANES (two) = stmts.length ();
2539 SLP_TREE_REPRESENTATIVE (one) = stmts[0];
2540 SLP_TREE_REPRESENTATIVE (two) = stmts[j];
2541 return node;
2542 }
2543
2544 node = vect_create_new_slp_node (node, stmts, nops);
2545 SLP_TREE_VECTYPE (node) = vectype;
2546 SLP_TREE_CHILDREN (node).splice (children);
2547 return node;
2548 }
2549
2550 /* Dump a single SLP tree NODE. */
2551
2552 static void
2553 vect_print_slp_tree (dump_flags_t dump_kind, dump_location_t loc,
2554 slp_tree node)
2555 {
2556 unsigned i, j;
2557 slp_tree child;
2558 stmt_vec_info stmt_info;
2559 tree op;
2560
2561 dump_metadata_t metadata (dump_kind, loc.get_impl_location ());
2562 dump_user_location_t user_loc = loc.get_user_location ();
2563 dump_printf_loc (metadata, user_loc,
2564 "node%s %p (max_nunits=" HOST_WIDE_INT_PRINT_UNSIGNED
2565 ", refcnt=%u)",
2566 SLP_TREE_DEF_TYPE (node) == vect_external_def
2567 ? " (external)"
2568 : (SLP_TREE_DEF_TYPE (node) == vect_constant_def
2569 ? " (constant)"
2570 : ""), (void *) node,
2571 estimated_poly_value (node->max_nunits),
2572 SLP_TREE_REF_COUNT (node));
2573 if (SLP_TREE_VECTYPE (node))
2574 dump_printf (metadata, " %T", SLP_TREE_VECTYPE (node));
2575 dump_printf (metadata, "\n");
2576 if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
2577 {
2578 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
2579 dump_printf_loc (metadata, user_loc, "op: VEC_PERM_EXPR\n");
2580 else
2581 dump_printf_loc (metadata, user_loc, "op template: %G",
2582 SLP_TREE_REPRESENTATIVE (node)->stmt);
2583 }
2584 if (SLP_TREE_SCALAR_STMTS (node).exists ())
2585 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2586 dump_printf_loc (metadata, user_loc, "\tstmt %u %G", i, stmt_info->stmt);
2587 else
2588 {
2589 dump_printf_loc (metadata, user_loc, "\t{ ");
2590 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
2591 dump_printf (metadata, "%T%s ", op,
2592 i < SLP_TREE_SCALAR_OPS (node).length () - 1 ? "," : "");
2593 dump_printf (metadata, "}\n");
2594 }
2595 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2596 {
2597 dump_printf_loc (metadata, user_loc, "\tload permutation {");
2598 FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (node), i, j)
2599 dump_printf (dump_kind, " %u", j);
2600 dump_printf (dump_kind, " }\n");
2601 }
2602 if (SLP_TREE_LANE_PERMUTATION (node).exists ())
2603 {
2604 dump_printf_loc (metadata, user_loc, "\tlane permutation {");
2605 for (i = 0; i < SLP_TREE_LANE_PERMUTATION (node).length (); ++i)
2606 dump_printf (dump_kind, " %u[%u]",
2607 SLP_TREE_LANE_PERMUTATION (node)[i].first,
2608 SLP_TREE_LANE_PERMUTATION (node)[i].second);
2609 dump_printf (dump_kind, " }\n");
2610 }
2611 if (SLP_TREE_CHILDREN (node).is_empty ())
2612 return;
2613 dump_printf_loc (metadata, user_loc, "\tchildren");
2614 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2615 dump_printf (dump_kind, " %p", (void *)child);
2616 dump_printf (dump_kind, "\n");
2617 }
2618
2619 DEBUG_FUNCTION void
2620 debug (slp_tree node)
2621 {
2622 debug_dump_context ctx;
2623 vect_print_slp_tree (MSG_NOTE,
2624 dump_location_t::from_location_t (UNKNOWN_LOCATION),
2625 node);
2626 }
2627
2628 /* Recursive helper for the dot producer below. */
2629
2630 static void
2631 dot_slp_tree (FILE *f, slp_tree node, hash_set<slp_tree> &visited)
2632 {
2633 if (visited.add (node))
2634 return;
2635
2636 fprintf (f, "\"%p\" [label=\"", (void *)node);
2637 vect_print_slp_tree (MSG_NOTE,
2638 dump_location_t::from_location_t (UNKNOWN_LOCATION),
2639 node);
2640 fprintf (f, "\"];\n");
2641
2642
2643 for (slp_tree child : SLP_TREE_CHILDREN (node))
2644 fprintf (f, "\"%p\" -> \"%p\";", (void *)node, (void *)child);
2645
2646 for (slp_tree child : SLP_TREE_CHILDREN (node))
2647 if (child)
2648 dot_slp_tree (f, child, visited);
2649 }
2650
2651 DEBUG_FUNCTION void
2652 dot_slp_tree (const char *fname, slp_tree node)
2653 {
2654 FILE *f = fopen (fname, "w");
2655 fprintf (f, "digraph {\n");
2656 fflush (f);
2657 {
2658 debug_dump_context ctx (f);
2659 hash_set<slp_tree> visited;
2660 dot_slp_tree (f, node, visited);
2661 }
2662 fflush (f);
2663 fprintf (f, "}\n");
2664 fclose (f);
2665 }
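/* Typical use is from a debugger, e.g.
     (gdb) call dot_slp_tree ("/tmp/slp.dot", node)
   followed by rendering the dump with graphviz, e.g.
     dot -Tsvg /tmp/slp.dot -o slp.svg  */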
2666
2667 /* Dump a slp tree NODE using flags specified in DUMP_KIND. */
2668
2669 static void
2670 vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
2671 slp_tree node, hash_set<slp_tree> &visited)
2672 {
2673 unsigned i;
2674 slp_tree child;
2675
2676 if (visited.add (node))
2677 return;
2678
2679 vect_print_slp_tree (dump_kind, loc, node);
2680
2681 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2682 if (child)
2683 vect_print_slp_graph (dump_kind, loc, child, visited);
2684 }
2685
2686 static void
2687 vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
2688 slp_tree entry)
2689 {
2690 hash_set<slp_tree> visited;
2691 vect_print_slp_graph (dump_kind, loc, entry, visited);
2692 }
2693
2694 /* Mark the tree rooted at NODE with PURE_SLP. */
2695
2696 static void
2697 vect_mark_slp_stmts (slp_tree node, hash_set<slp_tree> &visited)
2698 {
2699 int i;
2700 stmt_vec_info stmt_info;
2701 slp_tree child;
2702
2703 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2704 return;
2705
2706 if (visited.add (node))
2707 return;
2708
2709 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2710 STMT_SLP_TYPE (stmt_info) = pure_slp;
2711
2712 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2713 if (child)
2714 vect_mark_slp_stmts (child, visited);
2715 }
2716
2717 static void
2718 vect_mark_slp_stmts (slp_tree node)
2719 {
2720 hash_set<slp_tree> visited;
2721 vect_mark_slp_stmts (node, visited);
2722 }
2723
2724 /* Mark the statements of the tree rooted at NODE as relevant (vect_used). */
2725
2726 static void
2727 vect_mark_slp_stmts_relevant (slp_tree node, hash_set<slp_tree> &visited)
2728 {
2729 int i;
2730 stmt_vec_info stmt_info;
2731 slp_tree child;
2732
2733 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2734 return;
2735
2736 if (visited.add (node))
2737 return;
2738
2739 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2740 {
2741 gcc_assert (!STMT_VINFO_RELEVANT (stmt_info)
2742 || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope);
2743 STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope;
2744 }
2745
2746 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2747 if (child)
2748 vect_mark_slp_stmts_relevant (child, visited);
2749 }
2750
2751 static void
2752 vect_mark_slp_stmts_relevant (slp_tree node)
2753 {
2754 hash_set<slp_tree> visited;
2755 vect_mark_slp_stmts_relevant (node, visited);
2756 }
2757
2758
2759 /* Gather loads in the SLP graph rooted at NODE and populate the LOADS array. */
2760
2761 static void
2762 vect_gather_slp_loads (vec<slp_tree> &loads, slp_tree node,
2763 hash_set<slp_tree> &visited)
2764 {
2765 if (!node || visited.add (node))
2766 return;
2767
2768 if (SLP_TREE_CHILDREN (node).length () == 0)
2769 {
2770 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2771 return;
2772 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2773 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2774 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
2775 loads.safe_push (node);
2776 }
2777 else
2778 {
2779 unsigned i;
2780 slp_tree child;
2781 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2782 vect_gather_slp_loads (loads, child, visited);
2783 }
2784 }
2785
2786
2787 /* Find the last scalar stmt in NODE. */
2788
2789 stmt_vec_info
2790 vect_find_last_scalar_stmt_in_slp (slp_tree node)
2791 {
2792 stmt_vec_info last = NULL;
2793 stmt_vec_info stmt_vinfo;
2794
2795 for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
2796 {
2797 stmt_vinfo = vect_orig_stmt (stmt_vinfo);
2798 last = last ? get_later_stmt (stmt_vinfo, last) : stmt_vinfo;
2799 }
2800
2801 return last;
2802 }
2803
2804 /* Find the first stmt in NODE. */
2805
2806 stmt_vec_info
2807 vect_find_first_scalar_stmt_in_slp (slp_tree node)
2808 {
2809 stmt_vec_info first = NULL;
2810 stmt_vec_info stmt_vinfo;
2811
2812 for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
2813 {
2814 stmt_vinfo = vect_orig_stmt (stmt_vinfo);
2815 if (!first
2816 || get_later_stmt (stmt_vinfo, first) == first)
2817 first = stmt_vinfo;
2818 }
2819
2820 return first;
2821 }
2822
2823 /* Splits a group of stores, currently beginning at FIRST_VINFO, into
2824 two groups: one (still beginning at FIRST_VINFO) of size GROUP1_SIZE
2825 (also containing the first GROUP1_SIZE stmts, since stores are
2826 consecutive), the second containing the remainder.
2827 Return the first stmt in the second group. */
2828
2829 static stmt_vec_info
2830 vect_split_slp_store_group (stmt_vec_info first_vinfo, unsigned group1_size)
2831 {
2832 gcc_assert (DR_GROUP_FIRST_ELEMENT (first_vinfo) == first_vinfo);
2833 gcc_assert (group1_size > 0);
2834 int group2_size = DR_GROUP_SIZE (first_vinfo) - group1_size;
2835 gcc_assert (group2_size > 0);
2836 DR_GROUP_SIZE (first_vinfo) = group1_size;
2837
2838 stmt_vec_info stmt_info = first_vinfo;
2839 for (unsigned i = group1_size; i > 1; i--)
2840 {
2841 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info);
2842 gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
2843 }
2844 /* STMT is now the last element of the first group. */
2845 stmt_vec_info group2 = DR_GROUP_NEXT_ELEMENT (stmt_info);
2846 DR_GROUP_NEXT_ELEMENT (stmt_info) = 0;
2847
2848 DR_GROUP_SIZE (group2) = group2_size;
2849 for (stmt_info = group2; stmt_info;
2850 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info))
2851 {
2852 DR_GROUP_FIRST_ELEMENT (stmt_info) = group2;
2853 gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
2854 }
2855
2856 /* For the second group, the DR_GROUP_GAP is that before the original group,
2857 plus skipping over the first vector. */
2858 DR_GROUP_GAP (group2) = DR_GROUP_GAP (first_vinfo) + group1_size;
2859
2860 /* DR_GROUP_GAP of the first group now has to skip over the second group too. */
2861 DR_GROUP_GAP (first_vinfo) += group2_size;
2862
2863 if (dump_enabled_p ())
2864 dump_printf_loc (MSG_NOTE, vect_location, "Split group into %d and %d\n",
2865 group1_size, group2_size);
2866
2867 return group2;
2868 }
2869
2870 /* Calculate the unrolling factor for an SLP instance with GROUP_SIZE
2871 statements and a vector of NUNITS elements. */
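/* For example, with NUNITS == 4 and GROUP_SIZE == 6 the common multiple
   is 12, giving an unrolling factor of 12 / 6 == 2: two copies of the
   scalar group exactly fill three vectors.  */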
2872
2873 static poly_uint64
2874 calculate_unrolling_factor (poly_uint64 nunits, unsigned int group_size)
2875 {
2876 return exact_div (common_multiple (nunits, group_size), group_size);
2877 }
2878
2879 /* Helper that checks to see if a node is a load node. */
2880
2881 static inline bool
2882 vect_is_slp_load_node (slp_tree root)
2883 {
2884 return SLP_TREE_DEF_TYPE (root) == vect_internal_def
2885 && STMT_VINFO_GROUPED_ACCESS (SLP_TREE_REPRESENTATIVE (root))
2886 && DR_IS_READ (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (root)));
2887 }
2888
2889
2890 /* Helper function of optimize_load_redistribution that performs the operation
2891 recursively. */
2892
2893 static slp_tree
2894 optimize_load_redistribution_1 (scalar_stmts_to_slp_tree_map_t *bst_map,
2895 vec_info *vinfo, unsigned int group_size,
2896 hash_map<slp_tree, slp_tree> *load_map,
2897 slp_tree root)
2898 {
2899 if (slp_tree *leader = load_map->get (root))
2900 return *leader;
2901
2902 slp_tree node;
2903 unsigned i;
2904
2905 /* For now, we don't know anything about externals so do not do anything. */
2906 if (!root || SLP_TREE_DEF_TYPE (root) != vect_internal_def)
2907 return NULL;
2908 else if (SLP_TREE_CODE (root) == VEC_PERM_EXPR)
2909 {
2910 /* First convert this node into a load node and add it to the leaves
2911 list and flatten the permute from a lane to a load one. If it's
2912 unneeded it will be elided later. */
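/* Hypothetical sketch: a VEC_PERM node with lane permutation
   { (0,1), (1,0) } over two single-DR load children is rebuilt as one
   load node over the scalar stmts those pairs select; any reordering
   then shows up as that node's load permutation instead.  */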
2913 vec<stmt_vec_info> stmts;
2914 stmts.create (SLP_TREE_LANES (root));
2915 lane_permutation_t lane_perm = SLP_TREE_LANE_PERMUTATION (root);
2916 for (unsigned j = 0; j < lane_perm.length (); j++)
2917 {
2918 std::pair<unsigned, unsigned> perm = lane_perm[j];
2919 node = SLP_TREE_CHILDREN (root)[perm.first];
2920
2921 if (!vect_is_slp_load_node (node)
2922 || SLP_TREE_CHILDREN (node).exists ())
2923 {
2924 stmts.release ();
2925 goto next;
2926 }
2927
2928 stmts.quick_push (SLP_TREE_SCALAR_STMTS (node)[perm.second]);
2929 }
2930
2931 if (dump_enabled_p ())
2932 dump_printf_loc (MSG_NOTE, vect_location,
2933 "converting stmts on permute node %p\n",
2934 (void *) root);
2935
2936 bool *matches = XALLOCAVEC (bool, group_size);
2937 poly_uint64 max_nunits = 1;
2938 unsigned tree_size = 0, limit = 1;
2939 node = vect_build_slp_tree (vinfo, stmts, group_size, &max_nunits,
2940 matches, &limit, &tree_size, bst_map);
2941 if (!node)
2942 stmts.release ();
2943
2944 load_map->put (root, node);
2945 return node;
2946 }
2947
2948 next:
2949 load_map->put (root, NULL);
2950
2951 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
2952 {
2953 slp_tree value
2954 = optimize_load_redistribution_1 (bst_map, vinfo, group_size, load_map,
2955 node);
2956 if (value)
2957 {
2958 SLP_TREE_REF_COUNT (value)++;
2959 SLP_TREE_CHILDREN (root)[i] = value;
2960 /* ??? We know the original leaves of the replaced nodes will
2961 be referenced by bst_map, only the permutes created by
2962 pattern matching are not. */
2963 if (SLP_TREE_REF_COUNT (node) == 1)
2964 load_map->remove (node);
2965 vect_free_slp_tree (node);
2966 }
2967 }
2968
2969 return NULL;
2970 }
2971
2972 /* Temporary workaround for loads not being CSEd during SLP build. This
2973 function will traverse the SLP tree rooted in ROOT and find
2974 VEC_PERM nodes that blend vectors from multiple nodes that all read from the
2975 same DR such that the final operation is equal to a permuted load. Such
2976 NODES are then directly converted into LOADS themselves. The nodes are
2977 CSEd using BST_MAP. */
2978
2979 static void
2980 optimize_load_redistribution (scalar_stmts_to_slp_tree_map_t *bst_map,
2981 vec_info *vinfo, unsigned int group_size,
2982 hash_map<slp_tree, slp_tree> *load_map,
2983 slp_tree root)
2984 {
2985 slp_tree node;
2986 unsigned i;
2987
2988 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
2989 {
2990 slp_tree value
2991 = optimize_load_redistribution_1 (bst_map, vinfo, group_size, load_map,
2992 node);
2993 if (value)
2994 {
2995 SLP_TREE_REF_COUNT (value)++;
2996 SLP_TREE_CHILDREN (root)[i] = value;
2997 /* ??? We know the original leaves of the replaced nodes will
2998 be referenced by bst_map, only the permutes created by
2999 pattern matching are not. */
3000 if (SLP_TREE_REF_COUNT (node) == 1)
3001 load_map->remove (node);
3002 vect_free_slp_tree (node);
3003 }
3004 }
3005 }
3006
3007 /* Helper function of vect_match_slp_patterns.
3008
3009 Attempts to match patterns against the slp tree rooted in REF_NODE using
3010 VINFO. Patterns are matched in post-order traversal.
3011
3012 Returns true if any pattern matched in the subtree, in which case the
3013 matched nodes (possibly including REF_NODE itself) were replaced in place. */
3014
3015 static bool
3016 vect_match_slp_patterns_2 (slp_tree *ref_node, vec_info *vinfo,
3017 slp_tree_to_load_perm_map_t *perm_cache,
3018 slp_compat_nodes_map_t *compat_cache,
3019 hash_set<slp_tree> *visited)
3020 {
3021 unsigned i;
3022 slp_tree node = *ref_node;
3023 bool found_p = false;
3024 if (!node || visited->add (node))
3025 return false;
3026
3027 slp_tree child;
3028 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3029 found_p |= vect_match_slp_patterns_2 (&SLP_TREE_CHILDREN (node)[i],
3030 vinfo, perm_cache, compat_cache,
3031 visited);
3032
3033 for (unsigned x = 0; x < num__slp_patterns; x++)
3034 {
3035 vect_pattern *pattern
3036 = slp_patterns[x] (perm_cache, compat_cache, ref_node);
3037 if (pattern)
3038 {
3039 pattern->build (vinfo);
3040 delete pattern;
3041 found_p = true;
3042 }
3043 }
3044
3045 return found_p;
3046 }
3047
3048 /* Applies pattern matching to the SLP tree rooted at INSTANCE using
3049 vec_info VINFO.
3050
3051 The tree is modified in place and true is returned if any pattern
3052 matched. Patterns are tried in order and multiple patterns may match. */
3053
3054 static bool
3055 vect_match_slp_patterns (slp_instance instance, vec_info *vinfo,
3056 hash_set<slp_tree> *visited,
3057 slp_tree_to_load_perm_map_t *perm_cache,
3058 slp_compat_nodes_map_t *compat_cache)
3059 {
3060 DUMP_VECT_SCOPE ("vect_match_slp_patterns");
3061 slp_tree *ref_node = &SLP_INSTANCE_TREE (instance);
3062
3063 if (dump_enabled_p ())
3064 dump_printf_loc (MSG_NOTE, vect_location,
3065 "Analyzing SLP tree %p for patterns\n",
3066 (void *) SLP_INSTANCE_TREE (instance));
3067
3068 return vect_match_slp_patterns_2 (ref_node, vinfo, perm_cache, compat_cache,
3069 visited);
3070 }
3071
3072 /* STMT_INFO is a store group of size GROUP_SIZE that we are considering
3073 splitting into two, with the first split group having size NEW_GROUP_SIZE.
3074 Return true if we could use IFN_STORE_LANES instead and if that appears
3075 to be the better approach. */
3076
3077 static bool
3078 vect_slp_prefer_store_lanes_p (vec_info *vinfo, stmt_vec_info stmt_info,
3079 unsigned int group_size,
3080 unsigned int new_group_size)
3081 {
3082 tree scalar_type = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
3083 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
3084 if (!vectype)
3085 return false;
3086 /* Allow the split if one of the two new groups would operate on full
3087 vectors *within* rather than across one scalar loop iteration.
3088 This is purely a heuristic, but it should work well for group
3089 sizes of 3 and 4, where the possible splits are:
3090
3091 3->2+1: OK if the vector has exactly two elements
3092 4->2+2: Likewise
3093 4->3+1: Less clear-cut. */
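/* Concretely: splitting a group of 3 with NEW_GROUP_SIZE == 2 and a
   2-element vector makes NEW_GROUP_SIZE a multiple of the vector
   subparts, so we return false and let the split happen; with a
   4-element vector neither part fills a vector and we prefer
   IFN_STORE_LANES if the target supports it.  */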
3094 if (multiple_p (group_size - new_group_size, TYPE_VECTOR_SUBPARTS (vectype))
3095 || multiple_p (new_group_size, TYPE_VECTOR_SUBPARTS (vectype)))
3096 return false;
3097 return vect_store_lanes_supported (vectype, group_size, false) != IFN_LAST;
3098 }
3099
3100 /* Analyze an SLP instance starting from a group of grouped stores. Call
3101 vect_build_slp_tree to build a tree of packed stmts if possible.
3102 Return FALSE if it's impossible to SLP any stmt in the loop. */
3103
3104 static bool
3105 vect_analyze_slp_instance (vec_info *vinfo,
3106 scalar_stmts_to_slp_tree_map_t *bst_map,
3107 stmt_vec_info stmt_info, slp_instance_kind kind,
3108 unsigned max_tree_size, unsigned *limit);
3109
3110 /* Analyze an SLP instance starting from SCALAR_STMTS, which form a group
3111 of kind KIND. Return true if successful. */
3112
3113 static bool
3114 vect_build_slp_instance (vec_info *vinfo,
3115 slp_instance_kind kind,
3116 vec<stmt_vec_info> &scalar_stmts,
3117 vec<stmt_vec_info> &root_stmt_infos,
3118 vec<tree> &remain,
3119 unsigned max_tree_size, unsigned *limit,
3120 scalar_stmts_to_slp_tree_map_t *bst_map,
3121 /* ??? We need stmt_info for group splitting. */
3122 stmt_vec_info stmt_info_)
3123 {
3124 if (kind == slp_inst_kind_ctor)
3125 {
3126 if (dump_enabled_p ())
3127 dump_printf_loc (MSG_NOTE, vect_location,
3128 "Analyzing vectorizable constructor: %G\n",
3129 root_stmt_infos[0]->stmt);
3130 }
3131
3132 if (dump_enabled_p ())
3133 {
3134 dump_printf_loc (MSG_NOTE, vect_location,
3135 "Starting SLP discovery for\n");
3136 for (unsigned i = 0; i < scalar_stmts.length (); ++i)
3137 dump_printf_loc (MSG_NOTE, vect_location,
3138 " %G", scalar_stmts[i]->stmt);
3139 }
3140
3141 /* When a BB reduction doesn't have an even number of lanes
3142 strip it down, treating the remaining lane as scalar.
3143 ??? Selecting the optimal set of lanes to vectorize would be nice
3144 but SLP build for all lanes will fail quickly because we think
3145 we're going to need unrolling. */
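/* E.g. a five-lane BB reduction is stripped down to four SLP lanes here
   and the leftover scalar def is recorded in REMAIN, to be folded into
   the reduction result separately when the root stmt is generated.  */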
3146 if (kind == slp_inst_kind_bb_reduc
3147 && (scalar_stmts.length () & 1))
3148 remain.safe_insert (0, gimple_get_lhs (scalar_stmts.pop ()->stmt));
3149
3150 /* Build the tree for the SLP instance. */
3151 unsigned int group_size = scalar_stmts.length ();
3152 bool *matches = XALLOCAVEC (bool, group_size);
3153 poly_uint64 max_nunits = 1;
3154 unsigned tree_size = 0;
3155 unsigned i;
3156 slp_tree node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
3157 &max_nunits, matches, limit,
3158 &tree_size, bst_map);
3159 if (node != NULL)
3160 {
3161 /* Calculate the unrolling factor based on the smallest type. */
3162 poly_uint64 unrolling_factor
3163 = calculate_unrolling_factor (max_nunits, group_size);
3164
3165 if (maybe_ne (unrolling_factor, 1U)
3166 && is_a <bb_vec_info> (vinfo))
3167 {
3168 unsigned HOST_WIDE_INT const_max_nunits;
3169 if (!max_nunits.is_constant (&const_max_nunits)
3170 || const_max_nunits > group_size)
3171 {
3172 if (dump_enabled_p ())
3173 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3174 "Build SLP failed: store group "
3175 "size not a multiple of the vector size "
3176 "in basic block SLP\n");
3177 vect_free_slp_tree (node);
3178 return false;
3179 }
3180 /* Fatal mismatch. */
3181 if (dump_enabled_p ())
3182 dump_printf_loc (MSG_NOTE, vect_location,
3183 "SLP discovery succeeded but node needs "
3184 "splitting\n");
3185 memset (matches, true, group_size);
3186 matches[group_size / const_max_nunits * const_max_nunits] = false;
3187 vect_free_slp_tree (node);
3188 }
3189 else
3190 {
3191 /* Create a new SLP instance. */
3192 slp_instance new_instance = XNEW (class _slp_instance);
3193 SLP_INSTANCE_TREE (new_instance) = node;
3194 SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
3195 SLP_INSTANCE_LOADS (new_instance) = vNULL;
3196 SLP_INSTANCE_ROOT_STMTS (new_instance) = root_stmt_infos;
3197 SLP_INSTANCE_REMAIN_DEFS (new_instance) = remain;
3198 SLP_INSTANCE_KIND (new_instance) = kind;
3199 new_instance->reduc_phis = NULL;
3200 new_instance->cost_vec = vNULL;
3201 new_instance->subgraph_entries = vNULL;
3202
3203 if (dump_enabled_p ())
3204 dump_printf_loc (MSG_NOTE, vect_location,
3205 "SLP size %u vs. limit %u.\n",
3206 tree_size, max_tree_size);
3207
3208 /* Fixup SLP reduction chains. */
3209 if (kind == slp_inst_kind_reduc_chain)
3210 {
3211 /* If this is a reduction chain with a conversion in front
3212 amend the SLP tree with a node for that. */
3213 gimple *scalar_def
3214 = vect_orig_stmt (scalar_stmts[group_size - 1])->stmt;
3215 if (STMT_VINFO_DEF_TYPE (scalar_stmts[0]) != vect_reduction_def)
3216 {
3217 /* Get at the conversion stmt - we know it's the single use
3218 of the last stmt of the reduction chain. */
3219 use_operand_p use_p;
3220 bool r = single_imm_use (gimple_assign_lhs (scalar_def),
3221 &use_p, &scalar_def);
3222 gcc_assert (r);
3223 stmt_vec_info next_info = vinfo->lookup_stmt (scalar_def);
3224 next_info = vect_stmt_to_vectorize (next_info);
3225 scalar_stmts = vNULL;
3226 scalar_stmts.create (group_size);
3227 for (unsigned i = 0; i < group_size; ++i)
3228 scalar_stmts.quick_push (next_info);
3229 slp_tree conv = vect_create_new_slp_node (scalar_stmts, 1);
3230 SLP_TREE_VECTYPE (conv) = STMT_VINFO_VECTYPE (next_info);
3231 SLP_TREE_CHILDREN (conv).quick_push (node);
3232 SLP_INSTANCE_TREE (new_instance) = conv;
3233 /* We also have to fake this conversion stmt as SLP reduction
3234 group so we don't have to mess with too much code
3235 elsewhere. */
3236 REDUC_GROUP_FIRST_ELEMENT (next_info) = next_info;
3237 REDUC_GROUP_NEXT_ELEMENT (next_info) = NULL;
3238 }
3239 /* Fill the backedge child of the PHI SLP node. The
3240 general matching code cannot find it because the
3241 scalar code does not reflect how we vectorize the
3242 reduction. */
3243 use_operand_p use_p;
3244 imm_use_iterator imm_iter;
3245 class loop *loop = LOOP_VINFO_LOOP (as_a <loop_vec_info> (vinfo));
3246 FOR_EACH_IMM_USE_FAST (use_p, imm_iter,
3247 gimple_get_lhs (scalar_def))
3248 /* There are exactly two non-debug uses, the reduction
3249 PHI and the loop-closed PHI node. */
3250 if (!is_gimple_debug (USE_STMT (use_p))
3251 && gimple_bb (USE_STMT (use_p)) == loop->header)
3252 {
3253 auto_vec<stmt_vec_info, 64> phis (group_size);
3254 stmt_vec_info phi_info
3255 = vinfo->lookup_stmt (USE_STMT (use_p));
3256 for (unsigned i = 0; i < group_size; ++i)
3257 phis.quick_push (phi_info);
3258 slp_tree *phi_node = bst_map->get (phis);
3259 unsigned dest_idx = loop_latch_edge (loop)->dest_idx;
3260 SLP_TREE_CHILDREN (*phi_node)[dest_idx]
3261 = SLP_INSTANCE_TREE (new_instance);
3262 SLP_INSTANCE_TREE (new_instance)->refcnt++;
3263 }
3264 }
3265
3266 vinfo->slp_instances.safe_push (new_instance);
3267
3268 /* ??? We've replaced the old SLP_INSTANCE_GROUP_SIZE with
3269 the number of scalar stmts in the root in a few places.
3270 Verify that assumption holds. */
3271 gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
3272 .length () == group_size);
3273
3274 if (dump_enabled_p ())
3275 {
3276 dump_printf_loc (MSG_NOTE, vect_location,
3277 "Final SLP tree for instance %p:\n",
3278 (void *) new_instance);
3279 vect_print_slp_graph (MSG_NOTE, vect_location,
3280 SLP_INSTANCE_TREE (new_instance));
3281 }
3282
3283 return true;
3284 }
3285 }
3286 else
3287 {
3288 /* Failed to SLP. */
3289 /* Free the allocated memory. */
3290 scalar_stmts.release ();
3291 }
3292
3293 stmt_vec_info stmt_info = stmt_info_;
3294 /* Try to break the group up into pieces. */
3295 if (kind == slp_inst_kind_store)
3296 {
3297 /* ??? We could delay all the actual splitting of store-groups
3298 until after SLP discovery of the original group completed.
3299 Then we can recurse to vect_build_slp_instance directly. */
3300 for (i = 0; i < group_size; i++)
3301 if (!matches[i])
3302 break;
3303
3304 /* For basic block SLP, try to break the group up into multiples of
3305 a vector size. */
3306 if (is_a <bb_vec_info> (vinfo)
3307 && (i > 1 && i < group_size))
3308 {
3309 tree scalar_type
3310 = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
3311 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
3312 1 << floor_log2 (i));
3313 unsigned HOST_WIDE_INT const_nunits;
3314 if (vectype
3315 && TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits))
3316 {
3317 /* Split into two groups at the first vector boundary. */
3318 gcc_assert ((const_nunits & (const_nunits - 1)) == 0);
3319 unsigned group1_size = i & ~(const_nunits - 1);
3320
3321 if (dump_enabled_p ())
3322 dump_printf_loc (MSG_NOTE, vect_location,
3323 "Splitting SLP group at stmt %u\n", i);
3324 stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
3325 group1_size);
3326 bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
3327 kind, max_tree_size,
3328 limit);
3329 /* Split the rest at the failure point and possibly
3330 re-analyze the remaining matching part if it has
3331 at least two lanes. */
3332 if (group1_size < i
3333 && (i + 1 < group_size
3334 || i - group1_size > 1))
3335 {
3336 stmt_vec_info rest2 = rest;
3337 rest = vect_split_slp_store_group (rest, i - group1_size);
3338 if (i - group1_size > 1)
3339 res |= vect_analyze_slp_instance (vinfo, bst_map, rest2,
3340 kind, max_tree_size,
3341 limit);
3342 }
3343 /* Re-analyze the non-matching tail if it has at least
3344 two lanes. */
3345 if (i + 1 < group_size)
3346 res |= vect_analyze_slp_instance (vinfo, bst_map,
3347 rest, kind, max_tree_size,
3348 limit);
3349 return res;
3350 }
3351 }
3352
3353 /* For loop vectorization split into arbitrary pieces of size > 1. */
3354 if (is_a <loop_vec_info> (vinfo)
3355 && (i > 1 && i < group_size)
3356 && !vect_slp_prefer_store_lanes_p (vinfo, stmt_info, group_size, i))
3357 {
3358 unsigned group1_size = i;
3359
3360 if (dump_enabled_p ())
3361 dump_printf_loc (MSG_NOTE, vect_location,
3362 "Splitting SLP group at stmt %u\n", i);
3363
3364 stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
3365 group1_size);
3366 /* Loop vectorization cannot handle gaps in stores, make sure
3367 the split group appears as strided. */
3368 STMT_VINFO_STRIDED_P (rest) = 1;
3369 DR_GROUP_GAP (rest) = 0;
3370 STMT_VINFO_STRIDED_P (stmt_info) = 1;
3371 DR_GROUP_GAP (stmt_info) = 0;
3372
3373 bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
3374 kind, max_tree_size, limit);
3375 if (i + 1 < group_size)
3376 res |= vect_analyze_slp_instance (vinfo, bst_map,
3377 rest, kind, max_tree_size, limit);
3378
3379 return res;
3380 }
3381
3382 /* Even though the first vector did not all match, we might be able to SLP
3383 (some) of the remainder. FORNOW ignore this possibility. */
3384 }
3385
3386 /* Failed to SLP. */
3387 if (dump_enabled_p ())
3388 dump_printf_loc (MSG_NOTE, vect_location, "SLP discovery failed\n");
3389 return false;
3390 }
3391
3392
3393 /* Analyze an SLP instance starting from a group of grouped stores. Call
3394 vect_build_slp_tree to build a tree of packed stmts if possible.
3395 Return FALSE if it's impossible to SLP any stmt in the loop. */
3396
3397 static bool
3398 vect_analyze_slp_instance (vec_info *vinfo,
3399 scalar_stmts_to_slp_tree_map_t *bst_map,
3400 stmt_vec_info stmt_info,
3401 slp_instance_kind kind,
3402 unsigned max_tree_size, unsigned *limit)
3403 {
3404 unsigned int i;
3405 vec<stmt_vec_info> scalar_stmts;
3406
3407 if (is_a <bb_vec_info> (vinfo))
3408 vect_location = stmt_info->stmt;
3409
3410 stmt_vec_info next_info = stmt_info;
3411 if (kind == slp_inst_kind_store)
3412 {
3413 /* Collect the stores and store them in scalar_stmts. */
3414 scalar_stmts.create (DR_GROUP_SIZE (stmt_info));
3415 while (next_info)
3416 {
3417 scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info));
3418 next_info = DR_GROUP_NEXT_ELEMENT (next_info);
3419 }
3420 }
3421 else if (kind == slp_inst_kind_reduc_chain)
3422 {
3423 /* Collect the reduction stmts and store them in scalar_stmts. */
3424 scalar_stmts.create (REDUC_GROUP_SIZE (stmt_info));
3425 while (next_info)
3426 {
3427 scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info));
3428 next_info = REDUC_GROUP_NEXT_ELEMENT (next_info);
3429 }
3430 /* Mark the first element of the reduction chain as reduction to properly
3431 transform the node. In the reduction analysis phase only the last
3432 element of the chain is marked as reduction. */
3433 STMT_VINFO_DEF_TYPE (stmt_info)
3434 = STMT_VINFO_DEF_TYPE (scalar_stmts.last ());
3435 STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))
3436 = STMT_VINFO_REDUC_DEF (vect_orig_stmt (scalar_stmts.last ()));
3437 }
3438 else if (kind == slp_inst_kind_reduc_group)
3439 {
3440 /* Collect reduction statements. */
3441 const vec<stmt_vec_info> &reductions
3442 = as_a <loop_vec_info> (vinfo)->reductions;
3443 scalar_stmts.create (reductions.length ());
3444 for (i = 0; reductions.iterate (i, &next_info); i++)
3445 if ((STMT_VINFO_RELEVANT_P (next_info)
3446 || STMT_VINFO_LIVE_P (next_info))
3447 /* ??? Make sure we didn't skip a conversion around a reduction
3448 path. In that case we'd have to reverse engineer that conversion
3449 stmt following the chain using reduc_idx and from the PHI
3450 using reduc_def. */
3451 && STMT_VINFO_DEF_TYPE (next_info) == vect_reduction_def)
3452 scalar_stmts.quick_push (next_info);
3453 /* If fewer than two were relevant/live there's nothing to SLP. */
3454 if (scalar_stmts.length () < 2)
3455 return false;
3456 }
3457 else
3458 gcc_unreachable ();
3459
3460 vec<stmt_vec_info> roots = vNULL;
3461 vec<tree> remain = vNULL;
3462 /* Build the tree for the SLP instance. */
3463 bool res = vect_build_slp_instance (vinfo, kind, scalar_stmts,
3464 roots, remain,
3465 max_tree_size, limit, bst_map,
3466 kind == slp_inst_kind_store
3467 ? stmt_info : NULL);
3468
3469 /* ??? If this is slp_inst_kind_store and the above succeeded here's
3470 where we should do store group splitting. */
3471
3472 return res;
3473 }
3474
3475 /* Check if there are stmts in the loop that can be vectorized using SLP. Build SLP
3476 trees of packed scalar stmts if SLP is possible. */
3477
3478 opt_result
3479 vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
3480 {
3481 unsigned int i;
3482 stmt_vec_info first_element;
3483 slp_instance instance;
3484
3485 DUMP_VECT_SCOPE ("vect_analyze_slp");
3486
3487 unsigned limit = max_tree_size;
3488
3489 scalar_stmts_to_slp_tree_map_t *bst_map
3490 = new scalar_stmts_to_slp_tree_map_t ();
3491
3492 /* Find SLP sequences starting from groups of grouped stores. */
3493 FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
3494 vect_analyze_slp_instance (vinfo, bst_map, first_element,
3495 slp_inst_kind_store, max_tree_size, &limit);
3496
3497 if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
3498 {
3499 for (unsigned i = 0; i < bb_vinfo->roots.length (); ++i)
3500 {
3501 vect_location = bb_vinfo->roots[i].roots[0]->stmt;
3502 if (vect_build_slp_instance (bb_vinfo, bb_vinfo->roots[i].kind,
3503 bb_vinfo->roots[i].stmts,
3504 bb_vinfo->roots[i].roots,
3505 bb_vinfo->roots[i].remain,
3506 max_tree_size, &limit, bst_map, NULL))
3507 {
3508 bb_vinfo->roots[i].stmts = vNULL;
3509 bb_vinfo->roots[i].roots = vNULL;
3510 bb_vinfo->roots[i].remain = vNULL;
3511 }
3512 }
3513 }
3514
3515 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3516 {
3517 /* Find SLP sequences starting from reduction chains. */
3518 FOR_EACH_VEC_ELT (loop_vinfo->reduction_chains, i, first_element)
3519 if (! STMT_VINFO_RELEVANT_P (first_element)
3520 && ! STMT_VINFO_LIVE_P (first_element))
3521 ;
3522 else if (! vect_analyze_slp_instance (vinfo, bst_map, first_element,
3523 slp_inst_kind_reduc_chain,
3524 max_tree_size, &limit))
3525 {
3526 /* Dissolve reduction chain group. */
3527 stmt_vec_info vinfo = first_element;
3528 stmt_vec_info last = NULL;
3529 while (vinfo)
3530 {
3531 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
3532 REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
3533 REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
3534 last = vinfo;
3535 vinfo = next;
3536 }
3537 STMT_VINFO_DEF_TYPE (first_element) = vect_internal_def;
3538 /* It can still be vectorized as part of an SLP reduction. */
3539 loop_vinfo->reductions.safe_push (last);
3540 }
3541
3542 /* Find SLP sequences starting from groups of reductions. */
3543 if (loop_vinfo->reductions.length () > 1)
3544 vect_analyze_slp_instance (vinfo, bst_map, loop_vinfo->reductions[0],
3545 slp_inst_kind_reduc_group, max_tree_size,
3546 &limit);
3547 }
3548
3549 hash_set<slp_tree> visited_patterns;
3550 slp_tree_to_load_perm_map_t perm_cache;
3551 slp_compat_nodes_map_t compat_cache;
3552
3553 /* See if any patterns can be found in the SLP tree. */
3554 bool pattern_found = false;
3555 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
3556 pattern_found |= vect_match_slp_patterns (instance, vinfo,
3557 &visited_patterns, &perm_cache,
3558 &compat_cache);
3559
3560 /* If any were found, optimize permutations of loads. */
3561 if (pattern_found)
3562 {
3563 hash_map<slp_tree, slp_tree> load_map;
3564 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
3565 {
3566 slp_tree root = SLP_INSTANCE_TREE (instance);
3567 optimize_load_redistribution (bst_map, vinfo, SLP_TREE_LANES (root),
3568 &load_map, root);
3569 }
3570 }
3571
3572
3573
3574 /* The map keeps a reference to the SLP nodes built; release that. */
3575 for (scalar_stmts_to_slp_tree_map_t::iterator it = bst_map->begin ();
3576 it != bst_map->end (); ++it)
3577 if ((*it).second)
3578 vect_free_slp_tree ((*it).second);
3579 delete bst_map;
3580
3581 if (pattern_found && dump_enabled_p ())
3582 {
3583 dump_printf_loc (MSG_NOTE, vect_location,
3584 "Pattern matched SLP tree\n");
3585 hash_set<slp_tree> visited;
3586 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
3587 vect_print_slp_graph (MSG_NOTE, vect_location,
3588 SLP_INSTANCE_TREE (instance), visited);
3589 }
3590
3591 return opt_result::success ();
3592 }
3593
3594 /* Estimates the cost of inserting layout changes into the SLP graph.
3595 It can also say that the insertion is impossible. */
3596
3597 struct slpg_layout_cost
3598 {
3599 slpg_layout_cost () = default;
3600 slpg_layout_cost (sreal, bool);
3601
3602 static slpg_layout_cost impossible () { return { sreal::max (), 0 }; }
3603 bool is_possible () const { return depth != sreal::max (); }
3604
3605 bool operator== (const slpg_layout_cost &) const;
3606 bool operator!= (const slpg_layout_cost &) const;
3607
3608 bool is_better_than (const slpg_layout_cost &, bool) const;
3609
3610 void add_parallel_cost (const slpg_layout_cost &);
3611 void add_serial_cost (const slpg_layout_cost &);
3612 void split (unsigned int);
3613
3614 /* The longest sequence of layout changes needed during any traversal
3615 of the partition dag, weighted by execution frequency.
3616
3617 This is the most important metric when optimizing for speed, since
3618 it helps to ensure that we keep the number of operations on
3619 critical paths to a minimum. */
3620 sreal depth = 0;
3621
3622 /* An estimate of the total number of operations needed. It is weighted by
3623 execution frequency when optimizing for speed but not when optimizing for
3624 size. In order to avoid double-counting, a node with a fanout of N will
3625 distribute 1/N of its total cost to each successor.
3626
3627 This is the most important metric when optimizing for size, since
3628 it helps to keep the total number of operations to a minimum. */
3629 sreal total = 0;
3630 };
3631
3632 /* Construct costs for a node with weight WEIGHT. A higher weight
3633 indicates more frequent execution. IS_FOR_SIZE is true if we are
3634 optimizing for size rather than speed. */
3635
3636 slpg_layout_cost::slpg_layout_cost (sreal weight, bool is_for_size)
3637 : depth (weight), total (is_for_size && weight > 0 ? 1 : weight)
3638 {
3639 }
3640
3641 bool
3642 slpg_layout_cost::operator== (const slpg_layout_cost &other) const
3643 {
3644 return depth == other.depth && total == other.total;
3645 }
3646
3647 bool
3648 slpg_layout_cost::operator!= (const slpg_layout_cost &other) const
3649 {
3650 return !operator== (other);
3651 }
3652
3653 /* Return true if these costs are better than OTHER. IS_FOR_SIZE is
3654 true if we are optimizing for size rather than speed. */
3655
3656 bool
3657 slpg_layout_cost::is_better_than (const slpg_layout_cost &other,
3658 bool is_for_size) const
3659 {
3660 if (is_for_size)
3661 {
3662 if (total != other.total)
3663 return total < other.total;
3664 return depth < other.depth;
3665 }
3666 else
3667 {
3668 if (depth != other.depth)
3669 return depth < other.depth;
3670 return total < other.total;
3671 }
3672 }
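/* For example, when optimizing for speed a cost of { depth 2, total 5 }
   is better than { depth 3, total 1 }, since depth is compared first;
   when optimizing for size the ordering of those two costs reverses,
   since total is compared first. */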
3673
3674 /* Increase the costs to account for something with cost INPUT_COST
3675 happening in parallel with the current costs. */
3676
3677 void
3678 slpg_layout_cost::add_parallel_cost (const slpg_layout_cost &input_cost)
3679 {
3680 depth = std::max (depth, input_cost.depth);
3681 total += input_cost.total;
3682 }
3683
3684 /* Increase the costs to account for something with cost INPUT_COST
3685 happening in series with the current costs. */
3686
3687 void
3688 slpg_layout_cost::add_serial_cost (const slpg_layout_cost &other)
3689 {
3690 depth += other.depth;
3691 total += other.total;
3692 }
3693
3694 /* Split the total cost among TIMES successors or predecessors. */
3695
3696 void
3697 slpg_layout_cost::split (unsigned int times)
3698 {
3699 if (times > 1)
3700 total /= times;
3701 }
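/* For example, accumulating two parallel inputs that each cost
   { depth 2, total 1 } with add_parallel_cost gives { depth 2, total 2 };
   following that with add_serial_cost ({ depth 1, total 1 }) gives
   { depth 3, total 3 }, and split (2) then halves the total to 1.5
   while leaving the depth at 3. */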
3702
3703 /* Information about one node in the SLP graph, for use during
3704 vect_optimize_slp_pass. */
3705
3706 struct slpg_vertex
3707 {
3708 slpg_vertex (slp_tree node_) : node (node_) {}
3709
3710 /* The node itself. */
3711 slp_tree node;
3712
3713 /* Which partition the node belongs to, or -1 if none. Nodes outside of
3714 partitions are flexible; they can have whichever layout consumers
3715 want them to have. */
3716 int partition = -1;
3717
3718 /* The number of nodes that directly use the result of this one
3719 (i.e. the number of nodes that count this one as a child). */
3720 unsigned int out_degree = 0;
3721
3722 /* The execution frequency of the node. */
3723 sreal weight = 0;
3724
3725 /* The total execution frequency of all nodes that directly use the
3726 result of this one. */
3727 sreal out_weight = 0;
3728 };
3729
3730 /* Information about one partition of the SLP graph, for use during
3731 vect_optimize_slp_pass. */
3732
3733 struct slpg_partition_info
3734 {
3735 /* The nodes in the partition occupy indices [NODE_BEGIN, NODE_END)
3736 of m_partitioned_nodes. */
3737 unsigned int node_begin = 0;
3738 unsigned int node_end = 0;
3739
3740 /* Which layout we've chosen to use for this partition, or -1 if
3741 we haven't picked one yet. */
3742 int layout = -1;
3743
3744 /* The number of predecessors and successors in the partition dag.
3745 The predecessors always have lower partition numbers and the
3746 successors always have higher partition numbers.
3747
3748 Note that the directions of these edges are not necessarily the
3749 same as in the data flow graph. For example, if an SCC has separate
3750 partitions for an inner loop and an outer loop, the inner loop's
3751 partition will have at least two incoming edges from the outer loop's
3752 partition: one for a live-in value and one for a live-out value.
3753 In data flow terms, one of these edges would also be from the outer loop
3754 to the inner loop, but the other would be in the opposite direction. */
3755 unsigned int in_degree = 0;
3756 unsigned int out_degree = 0;
3757 };
3758
3759 /* Information about the costs of using a particular layout for a
3760 particular partition. It can also say that the combination is
3761 impossible. */
3762
3763 struct slpg_partition_layout_costs
3764 {
3765 bool is_possible () const { return internal_cost.is_possible (); }
3766 void mark_impossible () { internal_cost = slpg_layout_cost::impossible (); }
3767
3768 /* The costs inherited from predecessor partitions. */
3769 slpg_layout_cost in_cost;
3770
3771 /* The inherent cost of the layout within the node itself. For example,
3772 this is nonzero for a load if choosing a particular layout would require
3773 the load to permute the loaded elements. It is nonzero for a
3774 VEC_PERM_EXPR if the permutation cannot be eliminated or converted
3775 to full-vector moves. */
3776 slpg_layout_cost internal_cost;
3777
3778 /* The costs inherited from successor partitions. */
3779 slpg_layout_cost out_cost;
3780 };
3781
3782 /* This class tries to optimize the layout of vectors in order to avoid
3783 unnecessary shuffling. At the moment, the set of possible layouts are
3784 restricted to bijective permutations.
3785
3786 The goal of the pass depends on whether we're optimizing for size or
3787 for speed. When optimizing for size, the goal is to reduce the overall
3788 number of layout changes (including layout changes implied by things
3789 like load permutations). When optimizing for speed, the goal is to
3790 reduce the maximum latency attributable to layout changes on any
3791 non-cyclical path through the data flow graph.
3792
3793 For example, when optimizing a loop nest for speed, we will prefer
3794 to make layout changes outside of a loop rather than inside of a loop,
3795 and will prefer to make layout changes in parallel rather than serially,
3796 even if that increases the overall number of layout changes.
3797
3798 The high-level procedure is:
3799
3800 (1) Build a graph in which edges go from uses (parents) to definitions
3801 (children).
3802
3803 (2) Divide the graph into a dag of strongly-connected components (SCCs).
3804
3805 (3) When optimizing for speed, partition the nodes in each SCC based
3806 on their containing cfg loop. When optimizing for size, treat
3807 each SCC as a single partition.
3808
3809 This gives us a dag of partitions. The goal is now to assign a
3810 layout to each partition.
3811
3812 (4) Construct a set of vector layouts that are worth considering.
3813 Record which nodes must keep their current layout.
3814
3815 (5) Perform a forward walk over the partition dag (from loads to stores)
3816 accumulating the "forward" cost of using each layout. When visiting
3817 each partition, assign a tentative choice of layout to the partition
3818 and use that choice when calculating the cost of using a different
3819 layout in successor partitions.
3820
3821 (6) Perform a backward walk over the partition dag (from stores to loads),
3822 accumulating the "backward" cost of using each layout. When visiting
3823 each partition, make a final choice of layout for that partition based
3824 on the accumulated forward costs (from (5)) and backward costs
3825 (from (6)).
3826
3827 (7) Apply the chosen layouts to the SLP graph.
3828
3829 For example, consider the SLP statements:
3830
3831 S1: a_1 = load
3832 loop:
3833 S2: a_2 = PHI<a_1, a_3>
3834 S3: b_1 = load
3835 S4: a_3 = a_2 + b_1
3836 exit:
3837 S5: a_4 = PHI<a_3>
3838 S6: store a_4
3839
3840 S2 and S4 form an SCC and are part of the same loop. Every other
3841 statement is in a singleton SCC. In this example there is a one-to-one
3842 mapping between SCCs and partitions, and the partition dag looks like this:
3843
3844   S1    S3
3845     \   /
3846      S2+S4
3847        |
3848       S5
3849        |
3850       S6
3851
3852 S2, S3 and S4 will have a higher execution frequency than the other
3853 statements, so when optimizing for speed, the goal is to avoid any
3854 layout changes:
3855
3856 - within S3
3857 - within S2+S4
3858 - on the S3->S2+S4 edge
3859
3860 For example, if S3 was originally a reversing load, the goal of the
3861 pass is to make it an unreversed load and change the layout on the
3862 S1->S2+S4 and S2+S4->S5 edges to compensate. (Changing the layout
3863 on S1->S2+S4 and S5->S6 would also be acceptable.)
3864
3865 The difference between SCCs and partitions becomes important if we
3866 add an outer loop:
3867
3868 S1: a_1 = ...
3869 loop1:
3870 S2: a_2 = PHI<a_1, a_6>
3871 S3: b_1 = load
3872 S4: a_3 = a_2 + b_1
3873 loop2:
3874 S5: a_4 = PHI<a_3, a_5>
3875 S6: c_1 = load
3876 S7: a_5 = a_4 + c_1
3877 exit2:
3878 S8: a_6 = PHI<a_5>
3879 S9: store a_6
3880 exit1:
3881
3882 Here, S2, S4, S5, S7 and S8 form a single SCC. However, when optimizing
3883 for speed, we usually do not want restrictions in the outer loop to "infect"
3884 the decision for the inner loop. For example, if an outer-loop node
3885 in the SCC contains a statement with a fixed layout, that should not
3886 prevent the inner loop from using a different layout. Conversely,
3887 the inner loop should not dictate a layout to the outer loop: if the
3888 outer loop does a lot of computation, then it may not be efficient to
3889 do all of that computation in the inner loop's preferred layout.
3890
3891 So when optimizing for speed, we partition the SCC into S2+S4+S8 (outer)
3892 and S5+S7 (inner). We also try to arrange partitions so that:
3893
3894 - the partition for an outer loop comes before the partition for
3895 an inner loop
3896
3897 - if a sibling loop A dominates a sibling loop B, A's partition
3898 comes before B's
3899
3900 This gives the following partition dag for the example above:
3901
3902   S1        S3
3903     \       /
3904    S2+S4+S8   S6
3905     |   \\    /
3906     |    S5+S7
3907     |
3908    S9
3909
3910 There are two edges from S2+S4+S8 to S5+S7: one for the edge S4->S5 and
3911 one for a reversal of the edge S7->S8.
3912
3913 The backward walk picks a layout for S5+S7 before S2+S4+S8. The choice
3914 for S2+S4+S8 therefore has to balance the cost of using the outer loop's
3915 preferred layout against the cost of changing the layout on entry to the
3916 inner loop (S4->S5) and on exit from the inner loop (S7->S8 reversed).
3917
3918 Although this works well when optimizing for speed, it has the downside
3919 when optimizing for size that the choice of layout for S5+S7 is completely
3920 independent of S9, which lessens the chance of reducing the overall number
3921 of permutations. We therefore do not partition SCCs when optimizing
3922 for size.
3923
3924 To give a concrete example of the difference between optimizing
3925 for size and speed, consider:
3926
3927 a[0] = (b[1] << c[3]) - d[1];
3928 a[1] = (b[0] << c[2]) - d[0];
3929 a[2] = (b[3] << c[1]) - d[3];
3930 a[3] = (b[2] << c[0]) - d[2];
3931
3932 There are three different layouts here: one for a, one for b and d,
3933 and one for c. When optimizing for speed it is better to permute each
3934 of b, c and d into the order required by a, since those permutations
3935 happen in parallel. But when optimizing for size, it is better to:
3936
3937 - permute c into the same order as b
3938 - do the arithmetic
3939 - permute the result into the order required by a
3940
3941 This gives 2 permutations rather than 3. */
3942
3943 class vect_optimize_slp_pass
3944 {
3945 public:
3946 vect_optimize_slp_pass (vec_info *vinfo) : m_vinfo (vinfo) {}
3947 void run ();
3948
3949 private:
3950 /* Graph building. */
3951 struct loop *containing_loop (slp_tree);
3952 bool is_cfg_latch_edge (graph_edge *);
3953 void build_vertices (hash_set<slp_tree> &, slp_tree);
3954 void build_vertices ();
3955 void build_graph ();
3956
3957 /* Partitioning. */
3958 void create_partitions ();
3959 template<typename T> void for_each_partition_edge (unsigned int, T);
3960
3961 /* Layout selection. */
3962 bool is_compatible_layout (slp_tree, unsigned int);
3963 int change_layout_cost (slp_tree, unsigned int, unsigned int);
3964 slpg_partition_layout_costs &partition_layout_costs (unsigned int,
3965 unsigned int);
3966 void change_vec_perm_layout (slp_tree, lane_permutation_t &,
3967 int, unsigned int);
3968 int internal_node_cost (slp_tree, int, unsigned int);
3969 void start_choosing_layouts ();
3970
3971 /* Cost propagation. */
3972 slpg_layout_cost edge_layout_cost (graph_edge *, unsigned int,
3973 unsigned int, unsigned int);
3974 slpg_layout_cost total_in_cost (unsigned int);
3975 slpg_layout_cost forward_cost (graph_edge *, unsigned int, unsigned int);
3976 slpg_layout_cost backward_cost (graph_edge *, unsigned int, unsigned int);
3977 void forward_pass ();
3978 void backward_pass ();
3979
3980 /* Rematerialization. */
3981 slp_tree get_result_with_layout (slp_tree, unsigned int);
3982 void materialize ();
3983
3984 /* Clean-up. */
3985 void remove_redundant_permutations ();
3986
3987 void dump ();
3988
3989 vec_info *m_vinfo;
3990
3991 /* True if we should optimize the graph for size, false if we should
3992 optimize it for speed. (It wouldn't be easy to make this decision
3993 more locally.) */
3994 bool m_optimize_size;
3995
3996 /* A graph of all SLP nodes, with edges leading from uses to definitions.
3997 In other words, a node's predecessors are its slp_tree parents and
3998 a node's successors are its slp_tree children. */
3999 graph *m_slpg = nullptr;
4000
4001 /* The vertices of M_SLPG, indexed by slp_tree::vertex. */
4002 auto_vec<slpg_vertex> m_vertices;
4003
4004 /* The list of all leaves of M_SLPG, such as external definitions, constants,
4005 and loads. */
4006 auto_vec<int> m_leafs;
4007
4008 /* This array has one entry for every vector layout that we're considering.
4009 Element 0 is null and indicates "no change". Other entries describe
4010 permutations that are inherent in the current graph and that we would
4011 like to reverse if possible.
4012
4013 For example, a permutation { 1, 2, 3, 0 } means that something has
4014 effectively been permuted in that way, such as a load group
4015 { a[1], a[2], a[3], a[0] } (viewed as a permutation of a[0:3]).
4016 We'd then like to apply the reverse permutation { 3, 0, 1, 2 }
4017 in order to put things "back" in order. */
4018 auto_vec<vec<unsigned> > m_perms;
4019
4020 /* A partitioning of the nodes for which a layout must be chosen.
4021 Each partition represents an <SCC, cfg loop> pair; that is,
4022 nodes in different SCCs belong to different partitions, and nodes
4023 within an SCC can be further partitioned according to a containing
4024 cfg loop. Partition <SCC1, L1> comes before <SCC2, L2> if:
4025
4026 - SCC1 != SCC2 and SCC1 is a predecessor of SCC2 in a forward walk
4027 from leaves (such as loads) to roots (such as stores).
4028
4029 - SCC1 == SCC2 and L1's header strictly dominates L2's header. */
4030 auto_vec<slpg_partition_info> m_partitions;
4031
4032 /* The list of all nodes for which a layout must be chosen. Nodes for
4033 partition P come before the nodes for partition P+1. Nodes within a
4034 partition are in reverse postorder. */
4035 auto_vec<unsigned int> m_partitioned_nodes;
4036
4037 /* Index P * num-layouts + L contains the cost of using layout L
4038 for partition P. */
4039 auto_vec<slpg_partition_layout_costs> m_partition_layout_costs;
4040
4041 /* Index N * num-layouts + L, if nonnull, is a node that provides the
4042 original output of node N adjusted to have layout L. */
4043 auto_vec<slp_tree> m_node_layouts;
4044 };
4045
4046 /* Fill the vertices and leafs vector with all nodes in the SLP graph.
4047 Also record whether we should optimize anything for speed rather
4048 than size. */
4049
4050 void
4051 vect_optimize_slp_pass::build_vertices (hash_set<slp_tree> &visited,
4052 slp_tree node)
4053 {
4054 unsigned i;
4055 slp_tree child;
4056
4057 if (visited.add (node))
4058 return;
4059
4060 if (stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node))
4061 {
4062 basic_block bb = gimple_bb (vect_orig_stmt (rep)->stmt);
4063 if (optimize_bb_for_speed_p (bb))
4064 m_optimize_size = false;
4065 }
4066
4067 node->vertex = m_vertices.length ();
4068 m_vertices.safe_push (slpg_vertex (node));
4069
4070 bool leaf = true;
4071 bool force_leaf = false;
4072 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4073 if (child)
4074 {
4075 leaf = false;
4076 build_vertices (visited, child);
4077 }
4078 else
4079 force_leaf = true;
4080 /* Since SLP discovery works along use-def edges all cycles have an
4081 entry - but there's the exception of cycles where we do not handle
4082 the entry explicitly (but with a NULL SLP node), like some reductions
4083 and inductions. Force those SLP PHIs to act as leafs to make them
4084 backwards reachable. */
4085 if (leaf || force_leaf)
4086 m_leafs.safe_push (node->vertex);
4087 }
4088
4089 /* Fill the vertices and leafs vector with all nodes in the SLP graph. */
4090
4091 void
4092 vect_optimize_slp_pass::build_vertices ()
4093 {
4094 hash_set<slp_tree> visited;
4095 unsigned i;
4096 slp_instance instance;
4097 FOR_EACH_VEC_ELT (m_vinfo->slp_instances, i, instance)
4098 build_vertices (visited, SLP_INSTANCE_TREE (instance));
4099 }
4100
4101 /* Apply (reverse) bijective PERM to VEC. */
4102
4103 template <class T>
4104 static void
4105 vect_slp_permute (vec<unsigned> perm,
4106 vec<T> &vec, bool reverse)
4107 {
4108 auto_vec<T, 64> saved;
4109 saved.create (vec.length ());
4110 for (unsigned i = 0; i < vec.length (); ++i)
4111 saved.quick_push (vec[i]);
4112
4113 if (reverse)
4114 {
4115 for (unsigned i = 0; i < vec.length (); ++i)
4116 vec[perm[i]] = saved[i];
4117 for (unsigned i = 0; i < vec.length (); ++i)
4118 gcc_assert (vec[perm[i]] == saved[i]);
4119 }
4120 else
4121 {
4122 for (unsigned i = 0; i < vec.length (); ++i)
4123 vec[i] = saved[perm[i]];
4124 for (unsigned i = 0; i < vec.length (); ++i)
4125 gcc_assert (vec[i] == saved[perm[i]]);
4126 }
4127 }
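/* For example, with PERM = { 1, 2, 3, 0 } and VEC = { a, b, c, d }, a
   forward application (REVERSE false) computes vec[i] = saved[perm[i]]
   and yields { b, c, d, a }, whereas a reverse application (REVERSE true)
   computes vec[perm[i]] = saved[i] and yields { d, a, b, c }, undoing
   the forward application. */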
4128
4129 /* Return the cfg loop that contains NODE. */
4130
4131 struct loop *
4132 vect_optimize_slp_pass::containing_loop (slp_tree node)
4133 {
4134 stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node);
4135 if (!rep)
4136 return ENTRY_BLOCK_PTR_FOR_FN (cfun)->loop_father;
4137 return gimple_bb (vect_orig_stmt (rep)->stmt)->loop_father;
4138 }
4139
4140 /* Return true if UD (an edge from a use to a definition) is associated
4141 with a loop latch edge in the cfg. */
4142
4143 bool
4144 vect_optimize_slp_pass::is_cfg_latch_edge (graph_edge *ud)
4145 {
4146 slp_tree use = m_vertices[ud->src].node;
4147 slp_tree def = m_vertices[ud->dest].node;
4148 if (SLP_TREE_DEF_TYPE (use) != vect_internal_def
4149 || SLP_TREE_DEF_TYPE (def) != vect_internal_def)
4150 return false;
4151
4152 stmt_vec_info use_rep = vect_orig_stmt (SLP_TREE_REPRESENTATIVE (use));
4153 return (is_a<gphi *> (use_rep->stmt)
4154 && bb_loop_header_p (gimple_bb (use_rep->stmt))
4155 && containing_loop (def) == containing_loop (use));
4156 }
4157
4158 /* Build the graph. Mark edges that correspond to cfg loop latch edges with
4159 a nonnull data field. */
4160
4161 void
4162 vect_optimize_slp_pass::build_graph ()
4163 {
4164 m_optimize_size = true;
4165 build_vertices ();
4166
4167 m_slpg = new_graph (m_vertices.length ());
4168 for (slpg_vertex &v : m_vertices)
4169 for (slp_tree child : SLP_TREE_CHILDREN (v.node))
4170 if (child)
4171 {
4172 graph_edge *ud = add_edge (m_slpg, v.node->vertex, child->vertex);
4173 if (is_cfg_latch_edge (ud))
4174 ud->data = this;
4175 }
4176 }
4177
4178 /* Return true if E corresponds to a loop latch edge in the cfg. */
4179
4180 static bool
4181 skip_cfg_latch_edges (graph_edge *e)
4182 {
4183 return e->data;
4184 }
4185
4186 /* Create the node partitions. */
4187
4188 void
4189 vect_optimize_slp_pass::create_partitions ()
4190 {
4191 /* Calculate a postorder of the graph, ignoring edges that correspond
4192 to natural latch edges in the cfg. Reading the vector from the end
4193 to the beginning gives the reverse postorder. */
4194 auto_vec<int> initial_rpo;
4195 graphds_dfs (m_slpg, &m_leafs[0], m_leafs.length (), &initial_rpo,
4196 false, NULL, skip_cfg_latch_edges);
4197 gcc_assert (initial_rpo.length () == m_vertices.length ());
4198
4199 /* Calculate the strongly connected components of the graph. */
4200 auto_vec<int> scc_grouping;
4201 unsigned int num_sccs = graphds_scc (m_slpg, NULL, NULL, &scc_grouping);
4202
4203 /* Create a new index order in which all nodes from the same SCC are
4204 consecutive. Use scc_pos to record the index of the first node in
4205 each SCC. */
4206 auto_vec<unsigned int> scc_pos (num_sccs);
4207 int last_component = -1;
4208 unsigned int node_count = 0;
4209 for (unsigned int node_i : scc_grouping)
4210 {
4211 if (last_component != m_slpg->vertices[node_i].component)
4212 {
4213 last_component = m_slpg->vertices[node_i].component;
4214 gcc_assert (last_component == int (scc_pos.length ()));
4215 scc_pos.quick_push (node_count);
4216 }
4217 node_count += 1;
4218 }
4219 gcc_assert (node_count == initial_rpo.length ()
4220 && last_component + 1 == int (num_sccs));
4221
4222 /* Use m_partitioned_nodes to group nodes into SCC order, with the nodes
4223 inside each SCC following the RPO we calculated above. The fact that
4224 we ignored natural latch edges when calculating the RPO should ensure
4225 that, for natural loop nests:
4226
4227 - the first node that we encounter in a cfg loop is the loop header phi
4228 - the loop header phis are in dominance order
4229
4230 Arranging for this is an optimization (see below) rather than a
4231 correctness issue. Unnatural loops with a tangled mess of backedges
4232 will still work correctly, but might give poorer results.
4233
4234 Also update scc_pos so that it gives 1 + the index of the last node
4235 in the SCC. */
4236 m_partitioned_nodes.safe_grow (node_count);
4237 for (unsigned int old_i = initial_rpo.length (); old_i-- > 0;)
4238 {
4239 unsigned int node_i = initial_rpo[old_i];
4240 unsigned int new_i = scc_pos[m_slpg->vertices[node_i].component]++;
4241 m_partitioned_nodes[new_i] = node_i;
4242 }
4243
4244 /* When optimizing for speed, partition each SCC based on the containing
4245 cfg loop. The order we constructed above should ensure that, for natural
4246 cfg loops, we'll create sub-SCC partitions for outer loops before
4247 the corresponding sub-SCC partitions for inner loops. Similarly,
4248 when one sibling loop A dominates another sibling loop B, we should
4249 create a sub-SCC partition for A before a sub-SCC partition for B.
4250
4251 As above, nothing depends for correctness on whether this achieves
4252 a natural nesting, but we should get better results when it does. */
4253 m_partitions.reserve (m_vertices.length ());
4254 unsigned int next_partition_i = 0;
4255 hash_map<struct loop *, int> loop_partitions;
4256 unsigned int rpo_begin = 0;
4257 unsigned int num_partitioned_nodes = 0;
4258 for (unsigned int rpo_end : scc_pos)
4259 {
4260 loop_partitions.empty ();
4261 unsigned int partition_i = next_partition_i;
4262 for (unsigned int rpo_i = rpo_begin; rpo_i < rpo_end; ++rpo_i)
4263 {
4264 /* Handle externals and constants optimistically throughout.
4265 But treat existing vectors as fixed since we do not handle
4266 permuting them. */
4267 unsigned int node_i = m_partitioned_nodes[rpo_i];
4268 auto &vertex = m_vertices[node_i];
4269 if ((SLP_TREE_DEF_TYPE (vertex.node) == vect_external_def
4270 && !SLP_TREE_VEC_DEFS (vertex.node).exists ())
4271 || SLP_TREE_DEF_TYPE (vertex.node) == vect_constant_def)
4272 vertex.partition = -1;
4273 else
4274 {
4275 bool existed;
4276 if (m_optimize_size)
4277 existed = next_partition_i > partition_i;
4278 else
4279 {
4280 struct loop *loop = containing_loop (vertex.node);
4281 auto &entry = loop_partitions.get_or_insert (loop, &existed);
4282 if (!existed)
4283 entry = next_partition_i;
4284 partition_i = entry;
4285 }
4286 if (!existed)
4287 {
4288 m_partitions.quick_push (slpg_partition_info ());
4289 next_partition_i += 1;
4290 }
4291 vertex.partition = partition_i;
4292 num_partitioned_nodes += 1;
4293 m_partitions[partition_i].node_end += 1;
4294 }
4295 }
4296 rpo_begin = rpo_end;
4297 }
4298
4299 /* Assign ranges of consecutive node indices to each partition,
4300 in partition order. Start with node_end being the same as
4301 node_begin so that the next loop can use it as a counter. */
4302 unsigned int node_begin = 0;
4303 for (auto &partition : m_partitions)
4304 {
4305 partition.node_begin = node_begin;
4306 node_begin += partition.node_end;
4307 partition.node_end = partition.node_begin;
4308 }
4309 gcc_assert (node_begin == num_partitioned_nodes);
4310
4311 /* Finally build the list of nodes in partition order. */
4312 m_partitioned_nodes.truncate (num_partitioned_nodes);
4313 for (unsigned int node_i = 0; node_i < m_vertices.length (); ++node_i)
4314 {
4315 int partition_i = m_vertices[node_i].partition;
4316 if (partition_i >= 0)
4317 {
4318 unsigned int order_i = m_partitions[partition_i].node_end++;
4319 m_partitioned_nodes[order_i] = node_i;
4320 }
4321 }
4322 }
4323
4324 /* Look for edges from earlier partitions into node NODE_I and edges from
4325 node NODE_I into later partitions. Call:
4326
4327 FN (ud, other_node_i)
4328
4329 for each such use-to-def edge ud, where other_node_i is the node at the
4330 other end of the edge. */
4331
4332 template<typename T>
4333 void
4334 vect_optimize_slp_pass::for_each_partition_edge (unsigned int node_i, T fn)
4335 {
4336 int partition_i = m_vertices[node_i].partition;
4337 for (graph_edge *pred = m_slpg->vertices[node_i].pred;
4338 pred; pred = pred->pred_next)
4339 {
4340 int src_partition_i = m_vertices[pred->src].partition;
4341 if (src_partition_i >= 0 && src_partition_i != partition_i)
4342 fn (pred, pred->src);
4343 }
4344 for (graph_edge *succ = m_slpg->vertices[node_i].succ;
4345 succ; succ = succ->succ_next)
4346 {
4347 int dest_partition_i = m_vertices[succ->dest].partition;
4348 if (dest_partition_i >= 0 && dest_partition_i != partition_i)
4349 fn (succ, succ->dest);
4350 }
4351 }
4352
4353 /* Return true if layout LAYOUT_I is compatible with the number of SLP lanes
4354 that NODE would operate on. This test is independent of NODE's actual
4355 operation. */
4356
4357 bool
4358 vect_optimize_slp_pass::is_compatible_layout (slp_tree node,
4359 unsigned int layout_i)
4360 {
4361 if (layout_i == 0)
4362 return true;
4363
4364 if (SLP_TREE_LANES (node) != m_perms[layout_i].length ())
4365 return false;
4366
4367 return true;
4368 }
4369
4370 /* Return the cost (in arbitrary units) of going from layout FROM_LAYOUT_I
4371 to layout TO_LAYOUT_I for a node like NODE. Return -1 if either of the
4372 layouts is incompatible with NODE or if the change is not possible for
4373 some other reason.
4374
4375 The properties taken from NODE include the number of lanes and the
4376 vector type. The actual operation doesn't matter. */
4377
4378 int
4379 vect_optimize_slp_pass::change_layout_cost (slp_tree node,
4380 unsigned int from_layout_i,
4381 unsigned int to_layout_i)
4382 {
4383 if (!is_compatible_layout (node, from_layout_i)
4384 || !is_compatible_layout (node, to_layout_i))
4385 return -1;
4386
4387 if (from_layout_i == to_layout_i)
4388 return 0;
4389
4390 auto_vec<slp_tree, 1> children (1);
4391 children.quick_push (node);
4392 auto_lane_permutation_t perm (SLP_TREE_LANES (node));
4393 if (from_layout_i > 0)
4394 for (unsigned int i : m_perms[from_layout_i])
4395 perm.quick_push ({ 0, i });
4396 else
4397 for (unsigned int i = 0; i < SLP_TREE_LANES (node); ++i)
4398 perm.quick_push ({ 0, i });
4399 if (to_layout_i > 0)
4400 vect_slp_permute (m_perms[to_layout_i], perm, true);
4401 auto count = vectorizable_slp_permutation_1 (m_vinfo, nullptr, node, perm,
4402 children, false);
4403 if (count >= 0)
4404 return MAX (count, 1);
4405
4406 /* ??? In principle we could try changing via layout 0, giving two
4407 layout changes rather than 1. Doing that would require
4408 corresponding support in get_result_with_layout. */
4409 return -1;
4410 }
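/* For example, with a 4-lane NODE, FROM_LAYOUT_I == 0 and
   m_perms[TO_LAYOUT_I] == { 1, 0, 3, 2 }, PERM starts out as the identity
   selection { (0,0), (0,1), (0,2), (0,3) } and the reverse application of
   { 1, 0, 3, 2 } turns it into { (0,1), (0,0), (0,3), (0,2) }, which is
   the single permutation that vectorizable_slp_permutation_1 is asked
   to price. */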
4411
4412 /* Return the costs of assigning layout LAYOUT_I to partition PARTITION_I. */
4413
4414 inline slpg_partition_layout_costs &
4415 vect_optimize_slp_pass::partition_layout_costs (unsigned int partition_i,
4416 unsigned int layout_i)
4417 {
4418 return m_partition_layout_costs[partition_i * m_perms.length () + layout_i];
4419 }
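/* For example, with three layouts in M_PERMS, the costs for partition 2
   occupy indices 6, 7 and 8 of m_partition_layout_costs. */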
4420
4421 /* Change PERM in one of two ways:
4422
4423 - if IN_LAYOUT_I < 0, accept input operand I in the layout that has been
4424 chosen for child I of NODE.
4425
4426 - if IN_LAYOUT_I >= 0, accept all input operands with that layout.
4427
4428 In both cases, arrange for the output to have layout OUT_LAYOUT_I. */
4429
4430 void
4431 vect_optimize_slp_pass::
4432 change_vec_perm_layout (slp_tree node, lane_permutation_t &perm,
4433 int in_layout_i, unsigned int out_layout_i)
4434 {
4435 for (auto &entry : perm)
4436 {
4437 int this_in_layout_i = in_layout_i;
4438 if (this_in_layout_i < 0)
4439 {
4440 slp_tree in_node = SLP_TREE_CHILDREN (node)[entry.first];
4441 unsigned int in_partition_i = m_vertices[in_node->vertex].partition;
4442 this_in_layout_i = m_partitions[in_partition_i].layout;
4443 }
4444 if (this_in_layout_i > 0)
4445 entry.second = m_perms[this_in_layout_i][entry.second];
4446 }
4447 if (out_layout_i > 0)
4448 vect_slp_permute (m_perms[out_layout_i], perm, true);
4449 }
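/* For example, if NODE has a single child whose partition chose layout 1
   with m_perms[1] == { 1, 0 }, then with IN_LAYOUT_I == -1 and
   OUT_LAYOUT_I == 0 the identity selection { (0,0), (0,1) } is rewritten
   to { (0,1), (0,0) }: each lane index is looked up through m_perms[1],
   so the permutation now selects from the child's permuted layout. */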
4450
4451 /* Check whether the target allows NODE to be rearranged so that the node's
4452 output has layout OUT_LAYOUT_I. Return the cost of the change if so,
4453 in the same arbitrary units as for change_layout_cost. Return -1 otherwise.
4454
4455 If NODE is a VEC_PERM_EXPR and IN_LAYOUT_I < 0, also check whether
4456 NODE can adapt to the layout changes that have (perhaps provisionally)
4457 been chosen for NODE's children, so that no extra permutations are
4458 needed on either the input or the output of NODE.
4459
4460 If NODE is a VEC_PERM_EXPR and IN_LAYOUT_I >= 0, instead assume
4461 that all inputs will be forced into layout IN_LAYOUT_I beforehand.
4462
4463 IN_LAYOUT_I has no meaning for other types of node.
4464
4465 Keeping the node as-is is always valid. If the target doesn't appear
4466 to support the node as-is, but might realistically support other layouts,
4467 then layout 0 instead has the cost of a worst-case permutation. On the
4468 one hand, this ensures that every node has at least one valid layout,
4469 avoiding what would otherwise be an awkward special case. On the other,
4470 it still encourages the pass to change an invalid pre-existing layout
4471 choice into a valid one. */
4472
4473 int
4474 vect_optimize_slp_pass::internal_node_cost (slp_tree node, int in_layout_i,
4475 unsigned int out_layout_i)
4476 {
4477 const int fallback_cost = 1;
4478
4479 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
4480 {
4481 auto_lane_permutation_t tmp_perm;
4482 tmp_perm.safe_splice (SLP_TREE_LANE_PERMUTATION (node));
4483
4484 /* Check that the child nodes support the chosen layout. Checking
4485 the first child is enough, since any second child would have the
4486 same shape. */
4487 auto first_child = SLP_TREE_CHILDREN (node)[0];
4488 if (in_layout_i > 0
4489 && !is_compatible_layout (first_child, in_layout_i))
4490 return -1;
4491
4492 change_vec_perm_layout (node, tmp_perm, in_layout_i, out_layout_i);
4493 int count = vectorizable_slp_permutation_1 (m_vinfo, nullptr,
4494 node, tmp_perm,
4495 SLP_TREE_CHILDREN (node),
4496 false);
4497 if (count < 0)
4498 {
4499 if (in_layout_i == 0 && out_layout_i == 0)
4500 {
4501 /* Use the fallback cost if the node could in principle support
4502 some nonzero layout for both the inputs and the outputs.
4503 Otherwise assume that the node will be rejected later
4504 and rebuilt from scalars. */
4505 if (SLP_TREE_LANES (node) == SLP_TREE_LANES (first_child))
4506 return fallback_cost;
4507 return 0;
4508 }
4509 return -1;
4510 }
4511
4512 /* We currently have no way of telling whether the new layout is cheaper
4513 or more expensive than the old one. But at least in principle,
4514 it should be worth making zero permutations (whole-vector shuffles)
4515 cheaper than real permutations, in case the pass is able to remove
4516 the latter. */
4517 return count == 0 ? 0 : 1;
4518 }
4519
4520 stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node);
4521 if (rep
4522 && STMT_VINFO_DATA_REF (rep)
4523 && DR_IS_READ (STMT_VINFO_DATA_REF (rep))
4524 && SLP_TREE_LOAD_PERMUTATION (node).exists ())
4525 {
4526 auto_load_permutation_t tmp_perm;
4527 tmp_perm.safe_splice (SLP_TREE_LOAD_PERMUTATION (node));
4528 if (out_layout_i > 0)
4529 vect_slp_permute (m_perms[out_layout_i], tmp_perm, true);
4530
4531 poly_uint64 vf = 1;
4532 if (auto loop_vinfo = dyn_cast<loop_vec_info> (m_vinfo))
4533 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
4534 unsigned int n_perms;
4535 if (!vect_transform_slp_perm_load_1 (m_vinfo, node, tmp_perm, vNULL,
4536 nullptr, vf, true, false, &n_perms))
4537 {
4538 auto rep = SLP_TREE_REPRESENTATIVE (node);
4539 if (out_layout_i == 0)
4540 {
4541 /* Use the fallback cost if the load is an N-to-N permutation.
4542 Otherwise assume that the node will be rejected later
4543 and rebuilt from scalars. */
4544 if (STMT_VINFO_GROUPED_ACCESS (rep)
4545 && (DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (rep))
4546 == SLP_TREE_LANES (node)))
4547 return fallback_cost;
4548 return 0;
4549 }
4550 return -1;
4551 }
4552
4553 /* See the comment above the corresponding VEC_PERM_EXPR handling. */
4554 return n_perms == 0 ? 0 : 1;
4555 }
4556
4557 return 0;
4558 }
4559
4560 /* Decide which element layouts we should consider using. Calculate the
4561 weights associated with inserting layout changes on partition edges.
4562 Also mark partitions that cannot change layout, by setting their
4563 layout to zero. */
4564
4565 void
4566 vect_optimize_slp_pass::start_choosing_layouts ()
4567 {
4568 /* Used to assign unique permutation indices. */
4569 using perm_hash = unbounded_hashmap_traits<
4570 vec_free_hash_base<int_hash_base<unsigned>>,
4571 int_hash<int, -1, -2>
4572 >;
4573 hash_map<vec<unsigned>, int, perm_hash> layout_ids;
4574
4575 /* Layout 0 is "no change". */
4576 m_perms.safe_push (vNULL);
4577
4578 /* Create layouts from existing permutations. */
4579 auto_load_permutation_t tmp_perm;
4580 for (unsigned int node_i : m_partitioned_nodes)
4581 {
4582 /* Leafs also double as entries to the reverse graph. Allow the
4583 layout of those to be changed. */
4584 auto &vertex = m_vertices[node_i];
4585 auto &partition = m_partitions[vertex.partition];
4586 if (!m_slpg->vertices[node_i].succ)
4587 partition.layout = 0;
4588
4589 /* Loads and VEC_PERM_EXPRs are the only things generating permutes. */
4590 slp_tree node = vertex.node;
4591 stmt_vec_info dr_stmt = SLP_TREE_REPRESENTATIVE (node);
4592 slp_tree child;
4593 unsigned HOST_WIDE_INT imin, imax = 0;
4594 bool any_permute = false;
4595 tmp_perm.truncate (0);
4596 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
4597 {
4598 /* If splitting out a SLP_TREE_LANE_PERMUTATION can make the node
4599 unpermuted, record a layout that reverses this permutation.
4600
4601 We would need more work to cope with loads that are internally
4602 permuted and also have inputs (such as masks for
4603 IFN_MASK_LOADs). */
4604 gcc_assert (partition.layout == 0 && !m_slpg->vertices[node_i].succ);
4605 if (!STMT_VINFO_GROUPED_ACCESS (dr_stmt))
4606 {
4607 partition.layout = -1;
4608 continue;
4609 }
4610 dr_stmt = DR_GROUP_FIRST_ELEMENT (dr_stmt);
4611 imin = DR_GROUP_SIZE (dr_stmt) + 1;
4612 tmp_perm.safe_splice (SLP_TREE_LOAD_PERMUTATION (node));
4613 }
4614 else if (SLP_TREE_CODE (node) == VEC_PERM_EXPR
4615 && SLP_TREE_CHILDREN (node).length () == 1
4616 && (child = SLP_TREE_CHILDREN (node)[0])
4617 && (TYPE_VECTOR_SUBPARTS (SLP_TREE_VECTYPE (child))
4618 .is_constant (&imin)))
4619 {
4620 /* If the child has the same vector size as this node,
4621 reversing the permutation can make the permutation a no-op.
4622 In other cases it can change a true permutation into a
4623 full-vector extract. */
4624 tmp_perm.reserve (SLP_TREE_LANES (node));
4625 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
4626 tmp_perm.quick_push (SLP_TREE_LANE_PERMUTATION (node)[j].second);
4627 }
4628 else
4629 continue;
4630
4631 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
4632 {
4633 unsigned idx = tmp_perm[j];
4634 imin = MIN (imin, idx);
4635 imax = MAX (imax, idx);
4636 if (idx - tmp_perm[0] != j)
4637 any_permute = true;
4638 }
4639 /* If the span doesn't match we'd disrupt VF computation; avoid
4640 that for now. */
4641 if (imax - imin + 1 != SLP_TREE_LANES (node))
4642 continue;
4643 /* If there's no permute there's no need to split one out. In this case
4644 we can consider turning a load into a permuted load, if that
4645 turns out to be cheaper than alternatives. */
4646 if (!any_permute)
4647 {
4648 partition.layout = -1;
4649 continue;
4650 }
4651
4652 /* For now only handle true permutes, like
4653 vect_attempt_slp_rearrange_stmts did. This allows us to be lazy
4654 when permuting constants and invariants, keeping the permute
4655 bijective. */
4656 auto_sbitmap load_index (SLP_TREE_LANES (node));
4657 bitmap_clear (load_index);
4658 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
4659 bitmap_set_bit (load_index, tmp_perm[j] - imin);
4660 unsigned j;
4661 for (j = 0; j < SLP_TREE_LANES (node); ++j)
4662 if (!bitmap_bit_p (load_index, j))
4663 break;
4664 if (j != SLP_TREE_LANES (node))
4665 continue;
4666
4667 vec<unsigned> perm = vNULL;
4668 perm.safe_grow (SLP_TREE_LANES (node), true);
4669 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
4670 perm[j] = tmp_perm[j] - imin;
4671
4672 if (int (m_perms.length ()) >= param_vect_max_layout_candidates)
4673 {
4674 /* Continue to use existing layouts, but don't add any more. */
4675 int *entry = layout_ids.get (perm);
4676 partition.layout = entry ? *entry : 0;
4677 perm.release ();
4678 }
4679 else
4680 {
4681 bool existed;
4682 int &layout_i = layout_ids.get_or_insert (perm, &existed);
4683 if (existed)
4684 perm.release ();
4685 else
4686 {
4687 layout_i = m_perms.length ();
4688 m_perms.safe_push (perm);
4689 }
4690 partition.layout = layout_i;
4691 }
4692 }
4693
4694 /* Initially assume that every layout is possible and has zero cost
4695 in every partition. */
4696 m_partition_layout_costs.safe_grow_cleared (m_partitions.length ()
4697 * m_perms.length ());
4698
4699 /* We have to mark outgoing permutations facing non-associating-reduction
4700 graph entries that are not represented, as needing to be materialized.
4701 slp_inst_kind_bb_reduc currently only covers associatable reductions. */
4702 for (slp_instance instance : m_vinfo->slp_instances)
4703 if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_ctor)
4704 {
4705 unsigned int node_i = SLP_INSTANCE_TREE (instance)->vertex;
4706 m_partitions[m_vertices[node_i].partition].layout = 0;
4707 }
4708 else if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_reduc_chain)
4709 {
4710 stmt_vec_info stmt_info
4711 = SLP_TREE_REPRESENTATIVE (SLP_INSTANCE_TREE (instance));
4712 stmt_vec_info reduc_info = info_for_reduction (m_vinfo, stmt_info);
4713 if (needs_fold_left_reduction_p (TREE_TYPE
4714 (gimple_get_lhs (stmt_info->stmt)),
4715 STMT_VINFO_REDUC_CODE (reduc_info)))
4716 {
4717 unsigned int node_i = SLP_INSTANCE_TREE (instance)->vertex;
4718 m_partitions[m_vertices[node_i].partition].layout = 0;
4719 }
4720 }
4721
4722 /* Check which layouts each node and partition can handle. Calculate the
4723 weights associated with inserting layout changes on edges. */
4724 for (unsigned int node_i : m_partitioned_nodes)
4725 {
4726 auto &vertex = m_vertices[node_i];
4727 auto &partition = m_partitions[vertex.partition];
4728 slp_tree node = vertex.node;
4729
4730 if (stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node))
4731 {
4732 vertex.weight = vect_slp_node_weight (node);
4733
4734 /* We do not handle stores with a permutation, so all
4735 incoming permutations must have been materialized.
4736
4737 We also don't handle masked grouped loads, which lack a
4738 permutation vector. In this case the memory locations
4739 form an implicit second input to the loads, on top of the
4740 explicit mask input, and the memory input's layout cannot
4741 be changed.
4742
4743 On the other hand, we do support permuting gather loads and
4744 masked gather loads, where each scalar load is independent
4745 of the others. This can be useful if the address/index input
4746 benefits from permutation. */
4747 if (STMT_VINFO_DATA_REF (rep)
4748 && STMT_VINFO_GROUPED_ACCESS (rep)
4749 && !SLP_TREE_LOAD_PERMUTATION (node).exists ())
4750 partition.layout = 0;
4751
4752 /* We cannot change the layout of an operation that is
4753 not independent of lanes. Note this is an explicit
4754 negative list since that's much shorter than the respective
4755 positive one, but it's critical to keep maintaining it. */
4756 if (is_gimple_call (STMT_VINFO_STMT (rep)))
4757 switch (gimple_call_combined_fn (STMT_VINFO_STMT (rep)))
4758 {
4759 case CFN_COMPLEX_ADD_ROT90:
4760 case CFN_COMPLEX_ADD_ROT270:
4761 case CFN_COMPLEX_MUL:
4762 case CFN_COMPLEX_MUL_CONJ:
4763 case CFN_VEC_ADDSUB:
4764 case CFN_VEC_FMADDSUB:
4765 case CFN_VEC_FMSUBADD:
4766 partition.layout = 0;
4767 default:;
4768 }
4769 }
4770
4771 auto process_edge = [&](graph_edge *ud, unsigned int other_node_i)
4772 {
4773 auto &other_vertex = m_vertices[other_node_i];
4774
4775 /* Count the number of edges from earlier partitions and the number
4776 of edges to later partitions. */
4777 if (other_vertex.partition < vertex.partition)
4778 partition.in_degree += 1;
4779 else
4780 partition.out_degree += 1;
4781
4782 /* If the current node uses the result of OTHER_NODE_I, accumulate
4783 the effects of that. */
4784 if (ud->src == int (node_i))
4785 {
4786 other_vertex.out_weight += vertex.weight;
4787 other_vertex.out_degree += 1;
4788 }
4789 };
4790 for_each_partition_edge (node_i, process_edge);
4791 }
4792 }
4793
4794 /* Return the incoming costs for node NODE_I, assuming that each input keeps
4795 its current (provisional) choice of layout. The inputs do not necessarily
4796 have the same layout as each other. */
4797
4798 slpg_layout_cost
4799 vect_optimize_slp_pass::total_in_cost (unsigned int node_i)
4800 {
4801 auto &vertex = m_vertices[node_i];
4802 slpg_layout_cost cost;
4803 auto add_cost = [&](graph_edge *, unsigned int other_node_i)
4804 {
4805 auto &other_vertex = m_vertices[other_node_i];
4806 if (other_vertex.partition < vertex.partition)
4807 {
4808 auto &other_partition = m_partitions[other_vertex.partition];
4809 auto &other_costs = partition_layout_costs (other_vertex.partition,
4810 other_partition.layout);
4811 slpg_layout_cost this_cost = other_costs.in_cost;
4812 this_cost.add_serial_cost (other_costs.internal_cost);
4813 this_cost.split (other_partition.out_degree);
4814 cost.add_parallel_cost (this_cost);
4815 }
4816 };
4817 for_each_partition_edge (node_i, add_cost);
4818 return cost;
4819 }
4820
4821 /* Return the cost of switching between layout LAYOUT1_I (at node NODE1_I)
4822 and layout LAYOUT2_I on cross-partition use-to-def edge UD. Return
4823 slpg_layout_cost::impossible () if the change isn't possible. */
4824
4825 slpg_layout_cost
4826 vect_optimize_slp_pass::
4827 edge_layout_cost (graph_edge *ud, unsigned int node1_i, unsigned int layout1_i,
4828 unsigned int layout2_i)
4829 {
4830 auto &def_vertex = m_vertices[ud->dest];
4831 auto &use_vertex = m_vertices[ud->src];
4832 auto def_layout_i = ud->dest == int (node1_i) ? layout1_i : layout2_i;
4833 auto use_layout_i = ud->dest == int (node1_i) ? layout2_i : layout1_i;
4834 auto factor = change_layout_cost (def_vertex.node, def_layout_i,
4835 use_layout_i);
4836 if (factor < 0)
4837 return slpg_layout_cost::impossible ();
4838
4839 /* We have a choice of putting the layout change at the site of the
4840 definition or at the site of the use. Prefer the former when
4841 optimizing for size or when the execution frequency of the
4842 definition is no greater than the combined execution frequencies of
4843 the uses. When putting the layout change at the site of the definition,
4844 divvy up the cost among all consumers. */
4845 if (m_optimize_size || def_vertex.weight <= def_vertex.out_weight)
4846 {
4847 slpg_layout_cost cost = { def_vertex.weight * factor, m_optimize_size };
4848 cost.split (def_vertex.out_degree);
4849 return cost;
4850 }
4851 return { use_vertex.weight * factor, m_optimize_size };
4852 }
4853
4854 /* UD represents a use-def link between FROM_NODE_I and a node in a later
4855 partition; FROM_NODE_I could be the definition node or the use node.
4856 The node at the other end of the link wants to use layout TO_LAYOUT_I.
4857 Return the cost of any necessary fix-ups on edge UD, or return
4858 slpg_layout_cost::impossible () if the change isn't possible.
4859
4860 At this point, FROM_NODE_I's partition has chosen the cheapest
4861 layout based on the information available so far, but this choice
4862 is only provisional. */
4863
4864 slpg_layout_cost
4865 vect_optimize_slp_pass::forward_cost (graph_edge *ud, unsigned int from_node_i,
4866 unsigned int to_layout_i)
4867 {
4868 auto &from_vertex = m_vertices[from_node_i];
4869 unsigned int from_partition_i = from_vertex.partition;
4870 slpg_partition_info &from_partition = m_partitions[from_partition_i];
4871 gcc_assert (from_partition.layout >= 0);
4872
4873 /* First calculate the cost on the assumption that FROM_PARTITION sticks
4874 with its current layout preference. */
4875 slpg_layout_cost cost = slpg_layout_cost::impossible ();
4876 auto edge_cost = edge_layout_cost (ud, from_node_i,
4877 from_partition.layout, to_layout_i);
4878 if (edge_cost.is_possible ())
4879 {
4880 auto &from_costs = partition_layout_costs (from_partition_i,
4881 from_partition.layout);
4882 cost = from_costs.in_cost;
4883 cost.add_serial_cost (from_costs.internal_cost);
4884 cost.split (from_partition.out_degree);
4885 cost.add_serial_cost (edge_cost);
4886 }
4887
4888 /* Take the minimum of that cost and the cost that applies if
4889 FROM_PARTITION instead switches to TO_LAYOUT_I. */
4890 auto &direct_layout_costs = partition_layout_costs (from_partition_i,
4891 to_layout_i);
4892 if (direct_layout_costs.is_possible ())
4893 {
4894 slpg_layout_cost direct_cost = direct_layout_costs.in_cost;
4895 direct_cost.add_serial_cost (direct_layout_costs.internal_cost);
4896 direct_cost.split (from_partition.out_degree);
4897 if (!cost.is_possible ()
4898 || direct_cost.is_better_than (cost, m_optimize_size))
4899 cost = direct_cost;
4900 }
4901
4902 return cost;
4903 }
4904
4905 /* UD represents a use-def link between TO_NODE_I and a node in an earlier
4906 partition; TO_NODE_I could be the definition node or the use node.
4907 The node at the other end of the link wants to use layout FROM_LAYOUT_I;
4908 return the cost of any necessary fix-ups on edge UD, or
4909 slpg_layout_cost::impossible () if the choice cannot be made.
4910
4911 At this point, TO_NODE_I's partition has a fixed choice of layout. */
4912
4913 slpg_layout_cost
4914 vect_optimize_slp_pass::backward_cost (graph_edge *ud, unsigned int to_node_i,
4915 unsigned int from_layout_i)
4916 {
4917 auto &to_vertex = m_vertices[to_node_i];
4918 unsigned int to_partition_i = to_vertex.partition;
4919 slpg_partition_info &to_partition = m_partitions[to_partition_i];
4920 gcc_assert (to_partition.layout >= 0);
4921
4922 /* If TO_NODE_I is a VEC_PERM_EXPR consumer, see whether it can be
4923 adjusted for this input having layout FROM_LAYOUT_I. Assume that
4924 any other inputs keep their current choice of layout. */
4925 auto &to_costs = partition_layout_costs (to_partition_i,
4926 to_partition.layout);
4927 if (ud->src == int (to_node_i)
4928 && SLP_TREE_CODE (to_vertex.node) == VEC_PERM_EXPR)
4929 {
4930 auto &from_partition = m_partitions[m_vertices[ud->dest].partition];
4931 auto old_layout = from_partition.layout;
4932 from_partition.layout = from_layout_i;
4933 int factor = internal_node_cost (to_vertex.node, -1,
4934 to_partition.layout);
4935 from_partition.layout = old_layout;
4936 if (factor >= 0)
4937 {
4938 slpg_layout_cost cost = to_costs.out_cost;
4939 cost.add_serial_cost ({ to_vertex.weight * factor,
4940 m_optimize_size });
4941 cost.split (to_partition.in_degree);
4942 return cost;
4943 }
4944 }
4945
4946 /* Compute the cost if we insert any necessary layout change on edge UD. */
4947 auto edge_cost = edge_layout_cost (ud, to_node_i,
4948 to_partition.layout, from_layout_i);
4949 if (edge_cost.is_possible ())
4950 {
4951 slpg_layout_cost cost = to_costs.out_cost;
4952 cost.add_serial_cost (to_costs.internal_cost);
4953 cost.split (to_partition.in_degree);
4954 cost.add_serial_cost (edge_cost);
4955 return cost;
4956 }
4957
4958 return slpg_layout_cost::impossible ();
4959 }
4960
4961 /* Make a forward pass through the partitions, accumulating input costs.
4962 Make a tentative (provisional) choice of layout for each partition,
4963 ensuring that this choice still allows later partitions to keep
4964 their original layout. */
4965
4966 void
4967 vect_optimize_slp_pass::forward_pass ()
4968 {
4969 for (unsigned int partition_i = 0; partition_i < m_partitions.length ();
4970 ++partition_i)
4971 {
4972 auto &partition = m_partitions[partition_i];
4973
4974 /* If the partition consists of a single VEC_PERM_EXPR, precompute
4975 the incoming cost that would apply if every predecessor partition
4976 keeps its current layout. This is used within the loop below. */
4977 slpg_layout_cost in_cost;
4978 slp_tree single_node = nullptr;
4979 if (partition.node_end == partition.node_begin + 1)
4980 {
4981 unsigned int node_i = m_partitioned_nodes[partition.node_begin];
4982 single_node = m_vertices[node_i].node;
4983 if (SLP_TREE_CODE (single_node) == VEC_PERM_EXPR)
4984 in_cost = total_in_cost (node_i);
4985 }
4986
4987 /* Go through the possible layouts. Decide which ones are valid
4988 for this partition and record which of the valid layouts has
4989 the lowest cost. */
4990 unsigned int min_layout_i = 0;
4991 slpg_layout_cost min_layout_cost = slpg_layout_cost::impossible ();
4992 for (unsigned int layout_i = 0; layout_i < m_perms.length (); ++layout_i)
4993 {
4994 auto &layout_costs = partition_layout_costs (partition_i, layout_i);
4995 if (!layout_costs.is_possible ())
4996 continue;
4997
4998 /* If the recorded layout is already 0 then the layout cannot
4999 change. */
5000 if (partition.layout == 0 && layout_i != 0)
5001 {
5002 layout_costs.mark_impossible ();
5003 continue;
5004 }
5005
5006 bool is_possible = true;
5007 for (unsigned int order_i = partition.node_begin;
5008 order_i < partition.node_end; ++order_i)
5009 {
5010 unsigned int node_i = m_partitioned_nodes[order_i];
5011 auto &vertex = m_vertices[node_i];
5012
5013 /* Reject the layout if it is individually incompatible
5014 with any node in the partition. */
5015 if (!is_compatible_layout (vertex.node, layout_i))
5016 {
5017 is_possible = false;
5018 break;
5019 }
5020
5021 auto add_cost = [&](graph_edge *ud, unsigned int other_node_i)
5022 {
5023 auto &other_vertex = m_vertices[other_node_i];
5024 if (other_vertex.partition < vertex.partition)
5025 {
5026 /* Accumulate the incoming costs from earlier
5027 partitions, plus the cost of any layout changes
5028 on UD itself. */
5029 auto cost = forward_cost (ud, other_node_i, layout_i);
5030 if (!cost.is_possible ())
5031 is_possible = false;
5032 else
5033 layout_costs.in_cost.add_parallel_cost (cost);
5034 }
5035 else
5036 /* Reject the layout if it would make layout 0 impossible
5037 for later partitions. This amounts to testing that the
5038 target supports reversing the layout change on edges
5039 to later partitions.
5040
5041 In principle, it might be possible to push a layout
5042 change all the way down a graph, so that it never
5043 needs to be reversed and so that the target doesn't
5044 need to support the reverse operation. But it would
5045 be awkward to bail out if we hit a partition that
5046 does not support the new layout, especially since
5047 we are not dealing with a lattice. */
5048 is_possible &= edge_layout_cost (ud, other_node_i, 0,
5049 layout_i).is_possible ();
5050 };
5051 for_each_partition_edge (node_i, add_cost);
5052
5053 /* Accumulate the cost of using LAYOUT_I within NODE,
5054 both for the inputs and the outputs. */
5055 int factor = internal_node_cost (vertex.node, layout_i,
5056 layout_i);
5057 if (factor < 0)
5058 {
5059 is_possible = false;
5060 break;
5061 }
5062 else if (factor)
5063 layout_costs.internal_cost.add_serial_cost
5064 ({ vertex.weight * factor, m_optimize_size });
5065 }
5066 if (!is_possible)
5067 {
5068 layout_costs.mark_impossible ();
5069 continue;
5070 }
5071
5072 /* Combine the incoming and partition-internal costs. */
5073 slpg_layout_cost combined_cost = layout_costs.in_cost;
5074 combined_cost.add_serial_cost (layout_costs.internal_cost);
5075
5076 /* If this partition consists of a single VEC_PERM_EXPR, see
5077 if the VEC_PERM_EXPR can be changed to support output layout
5078 LAYOUT_I while keeping all the provisional choices of input
5079 layout. */
5080 if (single_node
5081 && SLP_TREE_CODE (single_node) == VEC_PERM_EXPR)
5082 {
5083 int factor = internal_node_cost (single_node, -1, layout_i);
5084 if (factor >= 0)
5085 {
5086 auto weight = m_vertices[single_node->vertex].weight;
5087 slpg_layout_cost internal_cost
5088 = { weight * factor, m_optimize_size };
5089
5090 slpg_layout_cost alt_cost = in_cost;
5091 alt_cost.add_serial_cost (internal_cost);
5092 if (alt_cost.is_better_than (combined_cost, m_optimize_size))
5093 {
5094 combined_cost = alt_cost;
5095 layout_costs.in_cost = in_cost;
5096 layout_costs.internal_cost = internal_cost;
5097 }
5098 }
5099 }
5100
5101 /* Record the layout with the lowest cost. Prefer layout 0 in
5102 the event of a tie between it and another layout. */
5103 if (!min_layout_cost.is_possible ()
5104 || combined_cost.is_better_than (min_layout_cost,
5105 m_optimize_size))
5106 {
5107 min_layout_i = layout_i;
5108 min_layout_cost = combined_cost;
5109 }
5110 }
5111
5112 /* This loop's handling of earlier partitions should ensure that
5113 choosing the original layout for the current partition is no
5114 less valid than it was in the original graph, even with the
5115 provisional layout choices for those earlier partitions. */
5116 gcc_assert (min_layout_cost.is_possible ());
5117 partition.layout = min_layout_i;
5118 }
5119 }
5120
5121 /* Make a backward pass through the partitions, accumulating output costs.
5122 Make a final choice of layout for each partition. */
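/* Together with the forward pass this forms a two-sweep scheme over the
   partition DAG: in_cost is accumulated forwards, out_cost backwards, and
   the final choice minimizes in_cost + internal_cost + out_cost for each
   partition while guaranteeing that earlier partitions can keep the layout
   they chose in the forward pass.  */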
5123
5124 void
5125 vect_optimize_slp_pass::backward_pass ()
5126 {
5127 for (unsigned int partition_i = m_partitions.length (); partition_i-- > 0;)
5128 {
5129 auto &partition = m_partitions[partition_i];
5130
5131 unsigned int min_layout_i = 0;
5132 slpg_layout_cost min_layout_cost = slpg_layout_cost::impossible ();
5133 for (unsigned int layout_i = 0; layout_i < m_perms.length (); ++layout_i)
5134 {
5135 auto &layout_costs = partition_layout_costs (partition_i, layout_i);
5136 if (!layout_costs.is_possible ())
5137 continue;
5138
5139 /* Accumulate the costs from successor partitions. */
5140 bool is_possible = true;
5141 for (unsigned int order_i = partition.node_begin;
5142 order_i < partition.node_end; ++order_i)
5143 {
5144 unsigned int node_i = m_partitioned_nodes[order_i];
5145 auto &vertex = m_vertices[node_i];
5146 auto add_cost = [&](graph_edge *ud, unsigned int other_node_i)
5147 {
5148 auto &other_vertex = m_vertices[other_node_i];
5149 auto &other_partition = m_partitions[other_vertex.partition];
5150 if (other_vertex.partition > vertex.partition)
5151 {
5152 /* Accumulate the incoming costs from later
5153 partitions, plus the cost of any layout changes
5154 on UD itself. */
5155 auto cost = backward_cost (ud, other_node_i, layout_i);
5156 if (!cost.is_possible ())
5157 is_possible = false;
5158 else
5159 layout_costs.out_cost.add_parallel_cost (cost);
5160 }
5161 else
5162 /* Make sure that earlier partitions can (if necessary
5163 or beneficial) keep the layout that they chose in
5164 the forward pass. This ensures that there is at
5165 least one valid choice of layout. */
5166 is_possible &= edge_layout_cost (ud, other_node_i,
5167 other_partition.layout,
5168 layout_i).is_possible ();
5169 };
5170 for_each_partition_edge (node_i, add_cost);
5171 }
5172 if (!is_possible)
5173 {
5174 layout_costs.mark_impossible ();
5175 continue;
5176 }
5177
5178 /* Locally combine the costs from the forward and backward passes.
5179 (This combined cost is not passed on, since that would lead
5180 to double counting.) */
5181 slpg_layout_cost combined_cost = layout_costs.in_cost;
5182 combined_cost.add_serial_cost (layout_costs.internal_cost);
5183 combined_cost.add_serial_cost (layout_costs.out_cost);
5184
5185 /* Record the layout with the lowest cost. Prefer layout 0 in
5186 the event of a tie between it and another layout. */
5187 if (!min_layout_cost.is_possible ()
5188 || combined_cost.is_better_than (min_layout_cost,
5189 m_optimize_size))
5190 {
5191 min_layout_i = layout_i;
5192 min_layout_cost = combined_cost;
5193 }
5194 }
5195
5196 gcc_assert (min_layout_cost.is_possible ());
5197 partition.layout = min_layout_i;
5198 }
5199 }
5200
5201 /* Return a node that applies layout TO_LAYOUT_I to the original form of NODE.
5202 NODE already has the layout that was selected for its partition. */
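/* In outline, the cases handled below are: constant or external nodes,
   whose scalar operands can simply be reordered; nodes whose partition
   already uses TO_LAYOUT_I, which are returned unchanged; VEC_PERM_EXPR
   nodes, which may absorb the layout change into a duplicated permutation
   node; and all remaining nodes, in front of which a single-input
   VEC_PERM_EXPR is inserted.  Results are cached in m_node_layouts.  */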
5203
5204 slp_tree
5205 vect_optimize_slp_pass::get_result_with_layout (slp_tree node,
5206 unsigned int to_layout_i)
5207 {
5208 unsigned int result_i = node->vertex * m_perms.length () + to_layout_i;
5209 slp_tree result = m_node_layouts[result_i];
5210 if (result)
5211 return result;
5212
5213 if (SLP_TREE_DEF_TYPE (node) == vect_constant_def
5214 || (SLP_TREE_DEF_TYPE (node) == vect_external_def
5215 /* We can't permute vector defs in place. */
5216 && SLP_TREE_VEC_DEFS (node).is_empty ()))
5217 {
5218 /* If the vector is uniform or unchanged, there's nothing to do. */
5219 if (to_layout_i == 0 || vect_slp_tree_uniform_p (node))
5220 result = node;
5221 else
5222 {
5223 auto scalar_ops = SLP_TREE_SCALAR_OPS (node).copy ();
5224 result = vect_create_new_slp_node (scalar_ops);
5225 vect_slp_permute (m_perms[to_layout_i], scalar_ops, true);
5226 }
5227 }
5228 else
5229 {
5230 unsigned int partition_i = m_vertices[node->vertex].partition;
5231 unsigned int from_layout_i = m_partitions[partition_i].layout;
5232 if (from_layout_i == to_layout_i)
5233 return node;
5234
5235 /* If NODE is itself a VEC_PERM_EXPR, try to create a parallel
5236 permutation instead of a serial one. Leave the new permutation
5237 in TMP_PERM on success. */
5238 auto_lane_permutation_t tmp_perm;
5239 unsigned int num_inputs = 1;
5240 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
5241 {
5242 tmp_perm.safe_splice (SLP_TREE_LANE_PERMUTATION (node));
5243 if (from_layout_i != 0)
5244 vect_slp_permute (m_perms[from_layout_i], tmp_perm, false);
5245 if (to_layout_i != 0)
5246 vect_slp_permute (m_perms[to_layout_i], tmp_perm, true);
5247 if (vectorizable_slp_permutation_1 (m_vinfo, nullptr, node,
5248 tmp_perm,
5249 SLP_TREE_CHILDREN (node),
5250 false) >= 0)
5251 num_inputs = SLP_TREE_CHILDREN (node).length ();
5252 else
5253 tmp_perm.truncate (0);
5254 }
5255
5256 if (dump_enabled_p ())
5257 {
5258 if (tmp_perm.length () > 0)
5259 dump_printf_loc (MSG_NOTE, vect_location,
5260 "duplicating permutation node %p with"
5261 " layout %d\n",
5262 (void *) node, to_layout_i);
5263 else
5264 dump_printf_loc (MSG_NOTE, vect_location,
5265 "inserting permutation node in place of %p\n",
5266 (void *) node);
5267 }
5268
5269 unsigned int num_lanes = SLP_TREE_LANES (node);
5270 result = vect_create_new_slp_node (num_inputs, VEC_PERM_EXPR);
5271 if (SLP_TREE_SCALAR_STMTS (node).length ())
5272 {
5273 auto &stmts = SLP_TREE_SCALAR_STMTS (result);
5274 stmts.safe_splice (SLP_TREE_SCALAR_STMTS (node));
5275 if (from_layout_i != 0)
5276 vect_slp_permute (m_perms[from_layout_i], stmts, false);
5277 if (to_layout_i != 0)
5278 vect_slp_permute (m_perms[to_layout_i], stmts, true);
5279 }
5280 SLP_TREE_REPRESENTATIVE (result) = SLP_TREE_REPRESENTATIVE (node);
5281 SLP_TREE_LANES (result) = num_lanes;
5282 SLP_TREE_VECTYPE (result) = SLP_TREE_VECTYPE (node);
5283 result->vertex = -1;
5284
5285 auto &lane_perm = SLP_TREE_LANE_PERMUTATION (result);
5286 if (tmp_perm.length ())
5287 {
5288 lane_perm.safe_splice (tmp_perm);
5289 SLP_TREE_CHILDREN (result).safe_splice (SLP_TREE_CHILDREN (node));
5290 }
5291 else
5292 {
5293 lane_perm.create (num_lanes);
5294 for (unsigned j = 0; j < num_lanes; ++j)
5295 lane_perm.quick_push ({ 0, j });
5296 if (from_layout_i != 0)
5297 vect_slp_permute (m_perms[from_layout_i], lane_perm, false);
5298 if (to_layout_i != 0)
5299 vect_slp_permute (m_perms[to_layout_i], lane_perm, true);
5300 SLP_TREE_CHILDREN (result).safe_push (node);
5301 }
5302 for (slp_tree child : SLP_TREE_CHILDREN (result))
5303 child->refcnt++;
5304 }
5305 m_node_layouts[result_i] = result;
5306 return result;
5307 }
5308
5309 /* Apply the chosen vector layouts to the SLP graph. */
5310
5311 void
5312 vect_optimize_slp_pass::materialize ()
5313 {
5314 /* We no longer need the costs, so avoid having two O(N * P) arrays
5315 live at the same time. */
5316 m_partition_layout_costs.release ();
5317 m_node_layouts.safe_grow_cleared (m_vertices.length () * m_perms.length ());
5318
5319 auto_sbitmap fully_folded (m_vertices.length ());
5320 bitmap_clear (fully_folded);
5321 for (unsigned int node_i : m_partitioned_nodes)
5322 {
5323 auto &vertex = m_vertices[node_i];
5324 slp_tree node = vertex.node;
5325 int layout_i = m_partitions[vertex.partition].layout;
5326 gcc_assert (layout_i >= 0);
5327
5328 /* Rearrange the scalar statements to match the chosen layout. */
5329 if (layout_i > 0)
5330 vect_slp_permute (m_perms[layout_i],
5331 SLP_TREE_SCALAR_STMTS (node), true);
5332
5333 /* Update load and lane permutations. */
5334 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
5335 {
5336 /* First try to absorb the input vector layouts. If that fails,
5337 force the inputs to have layout LAYOUT_I too. We checked that
5338 that was possible before deciding to use nonzero output layouts.
5339 (Note that at this stage we don't really have any guarantee that
5340 the target supports the original VEC_PERM_EXPR.) */
5341 auto &perm = SLP_TREE_LANE_PERMUTATION (node);
5342 auto_lane_permutation_t tmp_perm;
5343 tmp_perm.safe_splice (perm);
5344 change_vec_perm_layout (node, tmp_perm, -1, layout_i);
5345 if (vectorizable_slp_permutation_1 (m_vinfo, nullptr, node,
5346 tmp_perm,
5347 SLP_TREE_CHILDREN (node),
5348 false) >= 0)
5349 {
5350 if (dump_enabled_p ()
5351 && !std::equal (tmp_perm.begin (), tmp_perm.end (),
5352 perm.begin ()))
5353 dump_printf_loc (MSG_NOTE, vect_location,
5354 "absorbing input layouts into %p\n",
5355 (void *) node);
5356 std::copy (tmp_perm.begin (), tmp_perm.end (), perm.begin ());
5357 bitmap_set_bit (fully_folded, node_i);
5358 }
5359 else
5360 {
5361 /* Not MSG_MISSED because it would make no sense to users. */
5362 if (dump_enabled_p ())
5363 dump_printf_loc (MSG_NOTE, vect_location,
5364 "failed to absorb input layouts into %p\n",
5365 (void *) node);
5366 change_vec_perm_layout (nullptr, perm, layout_i, layout_i);
5367 }
5368 }
5369 else
5370 {
5371 gcc_assert (!SLP_TREE_LANE_PERMUTATION (node).exists ());
5372 auto &load_perm = SLP_TREE_LOAD_PERMUTATION (node);
5373 if (layout_i > 0)
5374 /* ??? When we handle non-bijective permutes the idea
5375 is that we can force the load-permutation to be
5376 { min, min + 1, min + 2, ... max }. But then the
5377 scalar defs might no longer match the lane content
5378 which means wrong-code with live lane vectorization.
5379 So we possibly have to have NULL entries for those. */
5380 vect_slp_permute (m_perms[layout_i], load_perm, true);
5381 }
5382 }
5383
5384 /* Do this before any nodes disappear, since it involves a walk
5385 over the leaves. */
5386 remove_redundant_permutations ();
5387
5388 /* Replace each child with a correctly laid-out version. */
5389 for (unsigned int node_i : m_partitioned_nodes)
5390 {
5391 /* Skip nodes that have already been handled above. */
5392 if (bitmap_bit_p (fully_folded, node_i))
5393 continue;
5394
5395 auto &vertex = m_vertices[node_i];
5396 int in_layout_i = m_partitions[vertex.partition].layout;
5397 gcc_assert (in_layout_i >= 0);
5398
5399 unsigned j;
5400 slp_tree child;
5401 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (vertex.node), j, child)
5402 {
5403 if (!child)
5404 continue;
5405
5406 slp_tree new_child = get_result_with_layout (child, in_layout_i);
5407 if (new_child != child)
5408 {
5409 vect_free_slp_tree (child);
5410 SLP_TREE_CHILDREN (vertex.node)[j] = new_child;
5411 new_child->refcnt += 1;
5412 }
5413 }
5414 }
5415 }
5416
5417 /* Elide load permutations that are not necessary. Such permutations might
5418 be pre-existing, rather than created by the layout optimizations. */
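/* For example (a sketch): a load node whose load permutation is the
   identity over a gap-free group that it covers completely needs no
   run-time permutation, so the permutation vector can simply be
   released.  */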
5419
5420 void
5421 vect_optimize_slp_pass::remove_redundant_permutations ()
5422 {
5423 for (unsigned int node_i : m_leafs)
5424 {
5425 slp_tree node = m_vertices[node_i].node;
5426 if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
5427 continue;
5428
5429 /* In basic block vectorization we allow any subchain of an interleaving
5430 chain.
5431 FORNOW: not in loop SLP because of realignment complications. */
5432 if (is_a <bb_vec_info> (m_vinfo))
5433 {
5434 bool subchain_p = true;
5435 stmt_vec_info next_load_info = NULL;
5436 stmt_vec_info load_info;
5437 unsigned j;
5438 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
5439 {
5440 if (j != 0
5441 && (next_load_info != load_info
5442 || DR_GROUP_GAP (load_info) != 1))
5443 {
5444 subchain_p = false;
5445 break;
5446 }
5447 next_load_info = DR_GROUP_NEXT_ELEMENT (load_info);
5448 }
5449 if (subchain_p)
5450 {
5451 SLP_TREE_LOAD_PERMUTATION (node).release ();
5452 continue;
5453 }
5454 }
5455 else
5456 {
5457 loop_vec_info loop_vinfo = as_a<loop_vec_info> (m_vinfo);
5458 stmt_vec_info load_info;
5459 bool this_load_permuted = false;
5460 unsigned j;
5461 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
5462 if (SLP_TREE_LOAD_PERMUTATION (node)[j] != j)
5463 {
5464 this_load_permuted = true;
5465 break;
5466 }
5467 /* When this isn't a grouped access we know it's a single element
5468 and contiguous. */
5469 if (!STMT_VINFO_GROUPED_ACCESS (SLP_TREE_SCALAR_STMTS (node)[0]))
5470 {
5471 if (!this_load_permuted
5472 && (known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U)
5473 || SLP_TREE_LANES (node) == 1))
5474 SLP_TREE_LOAD_PERMUTATION (node).release ();
5475 continue;
5476 }
5477 stmt_vec_info first_stmt_info
5478 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
5479 if (!this_load_permuted
5480 /* The load requires permutation when unrolling exposes
5481 a gap either because the group is larger than the SLP
5482 group-size or because there is a gap between the groups. */
5483 && (known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U)
5484 || ((SLP_TREE_LANES (node) == DR_GROUP_SIZE (first_stmt_info))
5485 && DR_GROUP_GAP (first_stmt_info) == 0)))
5486 {
5487 SLP_TREE_LOAD_PERMUTATION (node).release ();
5488 continue;
5489 }
5490 }
5491 }
5492 }
5493
5494 /* Print the partition graph and layout information to the dump file. */
5495
5496 void
5497 vect_optimize_slp_pass::dump ()
5498 {
5499 dump_printf_loc (MSG_NOTE, vect_location,
5500 "SLP optimize permutations:\n");
5501 for (unsigned int layout_i = 1; layout_i < m_perms.length (); ++layout_i)
5502 {
5503 dump_printf_loc (MSG_NOTE, vect_location, " %d: { ", layout_i);
5504 const char *sep = "";
5505 for (unsigned int idx : m_perms[layout_i])
5506 {
5507 dump_printf (MSG_NOTE, "%s%d", sep, idx);
5508 sep = ", ";
5509 }
5510 dump_printf (MSG_NOTE, " }\n");
5511 }
5512 dump_printf_loc (MSG_NOTE, vect_location,
5513 "SLP optimize partitions:\n");
5514 for (unsigned int partition_i = 0; partition_i < m_partitions.length ();
5515 ++partition_i)
5516 {
5517 auto &partition = m_partitions[partition_i];
5518 dump_printf_loc (MSG_NOTE, vect_location, " -------------\n");
5519 dump_printf_loc (MSG_NOTE, vect_location,
5520 " partition %d (layout %d):\n",
5521 partition_i, partition.layout);
5522 dump_printf_loc (MSG_NOTE, vect_location, " nodes:\n");
5523 for (unsigned int order_i = partition.node_begin;
5524 order_i < partition.node_end; ++order_i)
5525 {
5526 auto &vertex = m_vertices[m_partitioned_nodes[order_i]];
5527 dump_printf_loc (MSG_NOTE, vect_location, " - %p:\n",
5528 (void *) vertex.node);
5529 dump_printf_loc (MSG_NOTE, vect_location,
5530 " weight: %f\n",
5531 vertex.weight.to_double ());
5532 if (vertex.out_degree)
5533 dump_printf_loc (MSG_NOTE, vect_location,
5534 " out weight: %f (degree %d)\n",
5535 vertex.out_weight.to_double (),
5536 vertex.out_degree);
5537 if (SLP_TREE_CODE (vertex.node) == VEC_PERM_EXPR)
5538 dump_printf_loc (MSG_NOTE, vect_location,
5539 " op: VEC_PERM_EXPR\n");
5540 else if (auto rep = SLP_TREE_REPRESENTATIVE (vertex.node))
5541 dump_printf_loc (MSG_NOTE, vect_location,
5542 " op template: %G", rep->stmt);
5543 }
5544 dump_printf_loc (MSG_NOTE, vect_location, " edges:\n");
5545 for (unsigned int order_i = partition.node_begin;
5546 order_i < partition.node_end; ++order_i)
5547 {
5548 unsigned int node_i = m_partitioned_nodes[order_i];
5549 auto &vertex = m_vertices[node_i];
5550 auto print_edge = [&](graph_edge *, unsigned int other_node_i)
5551 {
5552 auto &other_vertex = m_vertices[other_node_i];
5553 if (other_vertex.partition < vertex.partition)
5554 dump_printf_loc (MSG_NOTE, vect_location,
5555 " - %p [%d] --> %p\n",
5556 (void *) other_vertex.node,
5557 other_vertex.partition,
5558 (void *) vertex.node);
5559 else
5560 dump_printf_loc (MSG_NOTE, vect_location,
5561 " - %p --> [%d] %p\n",
5562 (void *) vertex.node,
5563 other_vertex.partition,
5564 (void *) other_vertex.node);
5565 };
5566 for_each_partition_edge (node_i, print_edge);
5567 }
5568
5569 for (unsigned int layout_i = 0; layout_i < m_perms.length (); ++layout_i)
5570 {
5571 auto &layout_costs = partition_layout_costs (partition_i, layout_i);
5572 if (layout_costs.is_possible ())
5573 {
5574 dump_printf_loc (MSG_NOTE, vect_location,
5575 " layout %d:%s\n", layout_i,
5576 partition.layout == int (layout_i)
5577 ? " (*)" : "");
5578 slpg_layout_cost combined_cost = layout_costs.in_cost;
5579 combined_cost.add_serial_cost (layout_costs.internal_cost);
5580 combined_cost.add_serial_cost (layout_costs.out_cost);
5581 #define TEMPLATE "{depth: %f, total: %f}"
5582 dump_printf_loc (MSG_NOTE, vect_location,
5583 " " TEMPLATE "\n",
5584 layout_costs.in_cost.depth.to_double (),
5585 layout_costs.in_cost.total.to_double ());
5586 dump_printf_loc (MSG_NOTE, vect_location,
5587 " + " TEMPLATE "\n",
5588 layout_costs.internal_cost.depth.to_double (),
5589 layout_costs.internal_cost.total.to_double ());
5590 dump_printf_loc (MSG_NOTE, vect_location,
5591 " + " TEMPLATE "\n",
5592 layout_costs.out_cost.depth.to_double (),
5593 layout_costs.out_cost.total.to_double ());
5594 dump_printf_loc (MSG_NOTE, vect_location,
5595 " = " TEMPLATE "\n",
5596 combined_cost.depth.to_double (),
5597 combined_cost.total.to_double ());
5598 #undef TEMPLATE
5599 }
5600 else
5601 dump_printf_loc (MSG_NOTE, vect_location,
5602 " layout %d: rejected\n", layout_i);
5603 }
5604 }
5605 }
5606
5607 /* Main entry point for the SLP graph optimization pass. */
5608
5609 void
5610 vect_optimize_slp_pass::run ()
5611 {
5612 build_graph ();
5613 create_partitions ();
5614 start_choosing_layouts ();
5615 if (m_perms.length () > 1)
5616 {
5617 forward_pass ();
5618 backward_pass ();
5619 if (dump_enabled_p ())
5620 dump ();
5621 materialize ();
5622 while (!m_perms.is_empty ())
5623 m_perms.pop ().release ();
5624 }
5625 else
5626 remove_redundant_permutations ();
5627 free_graph (m_slpg);
5628 }
5629
5630 /* Optimize the SLP graph of VINFO. */
5631
5632 void
5633 vect_optimize_slp (vec_info *vinfo)
5634 {
5635 if (vinfo->slp_instances.is_empty ())
5636 return;
5637 vect_optimize_slp_pass (vinfo).run ();
5638 }
5639
5640 /* Gather loads reachable from the individual SLP graph entries. */
5641
5642 void
5643 vect_gather_slp_loads (vec_info *vinfo)
5644 {
5645 unsigned i;
5646 slp_instance instance;
5647 FOR_EACH_VEC_ELT (vinfo->slp_instances, i, instance)
5648 {
5649 hash_set<slp_tree> visited;
5650 vect_gather_slp_loads (SLP_INSTANCE_LOADS (instance),
5651 SLP_INSTANCE_TREE (instance), visited);
5652 }
5653 }
5654
5655
5656 /* For each possible SLP instance decide whether to SLP it and calculate overall
5657 unrolling factor needed to SLP the loop. Return TRUE if decided to SLP at
5658 least one instance. */
5659
5660 bool
5661 vect_make_slp_decision (loop_vec_info loop_vinfo)
5662 {
5663 unsigned int i;
5664 poly_uint64 unrolling_factor = 1;
5665 const vec<slp_instance> &slp_instances
5666 = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
5667 slp_instance instance;
5668 int decided_to_slp = 0;
5669
5670 DUMP_VECT_SCOPE ("vect_make_slp_decision");
5671
5672 FOR_EACH_VEC_ELT (slp_instances, i, instance)
5673 {
5674 /* FORNOW: SLP if you can. */
5675 /* All unroll factors have the form:
5676
5677 GET_MODE_SIZE (vinfo->vector_mode) * X
5678
5679 for some rational X, so they must have a common multiple. */
5680 unrolling_factor
5681 = force_common_multiple (unrolling_factor,
5682 SLP_INSTANCE_UNROLLING_FACTOR (instance));
5683
5684 /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts. Later we
5685 call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
5686 loop-based vectorization. Such stmts will be marked as HYBRID. */
5687 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
5688 decided_to_slp++;
5689 }
5690
5691 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = unrolling_factor;
5692
5693 if (decided_to_slp && dump_enabled_p ())
5694 {
5695 dump_printf_loc (MSG_NOTE, vect_location,
5696 "Decided to SLP %d instances. Unrolling factor ",
5697 decided_to_slp);
5698 dump_dec (MSG_NOTE, unrolling_factor);
5699 dump_printf (MSG_NOTE, "\n");
5700 }
5701
5702 return (decided_to_slp > 0);
5703 }
5704
5705 /* Private data for vect_detect_hybrid_slp. */
5706 struct vdhs_data
5707 {
5708 loop_vec_info loop_vinfo;
5709 vec<stmt_vec_info> *worklist;
5710 };
5711
5712 /* Walker for walk_gimple_op. */
5713
5714 static tree
5715 vect_detect_hybrid_slp (tree *tp, int *, void *data)
5716 {
5717 walk_stmt_info *wi = (walk_stmt_info *)data;
5718 vdhs_data *dat = (vdhs_data *)wi->info;
5719
5720 if (wi->is_lhs)
5721 return NULL_TREE;
5722
5723 stmt_vec_info def_stmt_info = dat->loop_vinfo->lookup_def (*tp);
5724 if (!def_stmt_info)
5725 return NULL_TREE;
5726 def_stmt_info = vect_stmt_to_vectorize (def_stmt_info);
5727 if (PURE_SLP_STMT (def_stmt_info))
5728 {
5729 if (dump_enabled_p ())
5730 dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: %G",
5731 def_stmt_info->stmt);
5732 STMT_SLP_TYPE (def_stmt_info) = hybrid;
5733 dat->worklist->safe_push (def_stmt_info);
5734 }
5735
5736 return NULL_TREE;
5737 }
5738
5739 /* Check whether all uses of STMT_INFO's defs are consumed by SLP and mark
5740 it pure_slp if so; otherwise push it to WORKLIST. */
5741
5742 static void
5743 maybe_push_to_hybrid_worklist (vec_info *vinfo,
5744 vec<stmt_vec_info> &worklist,
5745 stmt_vec_info stmt_info)
5746 {
5747 if (dump_enabled_p ())
5748 dump_printf_loc (MSG_NOTE, vect_location,
5749 "Processing hybrid candidate : %G", stmt_info->stmt);
5750 stmt_vec_info orig_info = vect_orig_stmt (stmt_info);
5751 imm_use_iterator iter2;
5752 ssa_op_iter iter1;
5753 use_operand_p use_p;
5754 def_operand_p def_p;
5755 bool any_def = false;
5756 FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_info->stmt, iter1, SSA_OP_DEF)
5757 {
5758 any_def = true;
5759 FOR_EACH_IMM_USE_FAST (use_p, iter2, DEF_FROM_PTR (def_p))
5760 {
5761 if (is_gimple_debug (USE_STMT (use_p)))
5762 continue;
5763 stmt_vec_info use_info = vinfo->lookup_stmt (USE_STMT (use_p));
5764 /* An out-of-loop use means this is a loop_vect sink. */
5765 if (!use_info)
5766 {
5767 if (dump_enabled_p ())
5768 dump_printf_loc (MSG_NOTE, vect_location,
5769 "Found loop_vect sink: %G", stmt_info->stmt);
5770 worklist.safe_push (stmt_info);
5771 return;
5772 }
5773 else if (!STMT_SLP_TYPE (vect_stmt_to_vectorize (use_info)))
5774 {
5775 if (dump_enabled_p ())
5776 dump_printf_loc (MSG_NOTE, vect_location,
5777 "Found loop_vect use: %G", use_info->stmt);
5778 worklist.safe_push (stmt_info);
5779 return;
5780 }
5781 }
5782 }
5783 /* No def means this is a loop_vect sink. */
5784 if (!any_def)
5785 {
5786 if (dump_enabled_p ())
5787 dump_printf_loc (MSG_NOTE, vect_location,
5788 "Found loop_vect sink: %G", stmt_info->stmt);
5789 worklist.safe_push (stmt_info);
5790 return;
5791 }
5792 if (dump_enabled_p ())
5793 dump_printf_loc (MSG_NOTE, vect_location,
5794 "Marked SLP consumed stmt pure: %G", stmt_info->stmt);
5795 STMT_SLP_TYPE (stmt_info) = pure_slp;
5796 }
5797
5798 /* Find stmts that must be both vectorized and SLPed. */
5799
5800 void
5801 vect_detect_hybrid_slp (loop_vec_info loop_vinfo)
5802 {
5803 DUMP_VECT_SCOPE ("vect_detect_hybrid_slp");
5804
5805 /* All stmts participating in SLP are marked pure_slp; all other
5806 stmts are loop_vect.
5807 First collect all loop_vect stmts into a worklist.
5808 SLP patterns cause not all original scalar stmts to appear in
5809 SLP_TREE_SCALAR_STMTS, so not all of them are marked pure_slp.
5810 Rectify this here: do a backward walk over the IL, consider a stmt
5811 loop_vect only when it is used by a loop_vect stmt, and otherwise
5812 mark it as pure_slp. */
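/* A hypothetical example (not from any particular testcase):

     a[2*i]   = b[2*i]   + 1;   // pure_slp, part of an SLP store group
     a[2*i+1] = b[2*i+1] + 1;   // pure_slp
     sum += b[2*i];             // reduction left to loop vectorization

   the load of b[2*i] is pure_slp but also feeds the loop_vect reduction,
   so the walk below marks it hybrid.  */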
5813 auto_vec<stmt_vec_info> worklist;
5814 for (int i = LOOP_VINFO_LOOP (loop_vinfo)->num_nodes - 1; i >= 0; --i)
5815 {
5816 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
5817 for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
5818 gsi_next (&gsi))
5819 {
5820 gphi *phi = gsi.phi ();
5821 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (phi);
5822 if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
5823 maybe_push_to_hybrid_worklist (loop_vinfo,
5824 worklist, stmt_info);
5825 }
5826 for (gimple_stmt_iterator gsi = gsi_last_bb (bb); !gsi_end_p (gsi);
5827 gsi_prev (&gsi))
5828 {
5829 gimple *stmt = gsi_stmt (gsi);
5830 if (is_gimple_debug (stmt))
5831 continue;
5832 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
5833 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
5834 {
5835 for (gimple_stmt_iterator gsi2
5836 = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info));
5837 !gsi_end_p (gsi2); gsi_next (&gsi2))
5838 {
5839 stmt_vec_info patt_info
5840 = loop_vinfo->lookup_stmt (gsi_stmt (gsi2));
5841 if (!STMT_SLP_TYPE (patt_info)
5842 && STMT_VINFO_RELEVANT (patt_info))
5843 maybe_push_to_hybrid_worklist (loop_vinfo,
5844 worklist, patt_info);
5845 }
5846 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
5847 }
5848 if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
5849 maybe_push_to_hybrid_worklist (loop_vinfo,
5850 worklist, stmt_info);
5851 }
5852 }
5853
5854 /* Now we have a worklist of non-SLP stmts, follow use->def chains and
5855 mark any SLP vectorized stmt as hybrid.
5856 ??? We're visiting def stmts N times (once for each non-SLP and
5857 once for each hybrid-SLP use). */
5858 walk_stmt_info wi;
5859 vdhs_data dat;
5860 dat.worklist = &worklist;
5861 dat.loop_vinfo = loop_vinfo;
5862 memset (&wi, 0, sizeof (wi));
5863 wi.info = (void *)&dat;
5864 while (!worklist.is_empty ())
5865 {
5866 stmt_vec_info stmt_info = worklist.pop ();
5867 /* Since SSA operands are not set up for pattern stmts we need
5868 to use walk_gimple_op. */
5869 wi.is_lhs = 0;
5870 walk_gimple_op (stmt_info->stmt, vect_detect_hybrid_slp, &wi);
5871 /* For gather/scatter make sure to walk the offset operand, that
5872 can be a scaling and conversion away. */
5873 gather_scatter_info gs_info;
5874 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
5875 && vect_check_gather_scatter (stmt_info, loop_vinfo, &gs_info))
5876 {
5877 int dummy;
5878 vect_detect_hybrid_slp (&gs_info.offset, &dummy, &wi);
5879 }
5880 }
5881 }
5882
5883
5884 /* Initialize a bb_vec_info struct for the statements in BBS basic blocks. */
5885
5886 _bb_vec_info::_bb_vec_info (vec<basic_block> _bbs, vec_info_shared *shared)
5887 : vec_info (vec_info::bb, shared),
5888 bbs (_bbs),
5889 roots (vNULL)
5890 {
5891 for (unsigned i = 0; i < bbs.length (); ++i)
5892 {
5893 if (i != 0)
5894 for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
5895 gsi_next (&si))
5896 {
5897 gphi *phi = si.phi ();
5898 gimple_set_uid (phi, 0);
5899 add_stmt (phi);
5900 }
5901 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
5902 !gsi_end_p (gsi); gsi_next (&gsi))
5903 {
5904 gimple *stmt = gsi_stmt (gsi);
5905 gimple_set_uid (stmt, 0);
5906 if (is_gimple_debug (stmt))
5907 continue;
5908 add_stmt (stmt);
5909 }
5910 }
5911 }
5912
5913
5914 /* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
5915 stmts in the basic block. */
5916
5917 _bb_vec_info::~_bb_vec_info ()
5918 {
5919 /* Reset region marker. */
5920 for (unsigned i = 0; i < bbs.length (); ++i)
5921 {
5922 if (i != 0)
5923 for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
5924 gsi_next (&si))
5925 {
5926 gphi *phi = si.phi ();
5927 gimple_set_uid (phi, -1);
5928 }
5929 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
5930 !gsi_end_p (gsi); gsi_next (&gsi))
5931 {
5932 gimple *stmt = gsi_stmt (gsi);
5933 gimple_set_uid (stmt, -1);
5934 }
5935 }
5936
5937 for (unsigned i = 0; i < roots.length (); ++i)
5938 {
5939 roots[i].stmts.release ();
5940 roots[i].roots.release ();
5941 roots[i].remain.release ();
5942 }
5943 roots.release ();
5944 }
5945
5946 /* Subroutine of vect_slp_analyze_node_operations. Handle the root of NODE,
5947 given that child nodes have already been processed, and that
5948 their def types currently match their SLP node's def type. */
5949
5950 static bool
5951 vect_slp_analyze_node_operations_1 (vec_info *vinfo, slp_tree node,
5952 slp_instance node_instance,
5953 stmt_vector_for_cost *cost_vec)
5954 {
5955 stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
5956
5957 /* Calculate the number of vector statements to be created for the
5958 scalar stmts in this node. For SLP reductions it is equal to the
5959 number of vector statements in the children (which has already been
5960 calculated by the recursive call). Otherwise it is the number of
5961 scalar elements in one scalar iteration (DR_GROUP_SIZE) multiplied by
5962 VF divided by the number of elements in a vector. */
5963 if (SLP_TREE_CODE (node) != VEC_PERM_EXPR
5964 && !STMT_VINFO_DATA_REF (stmt_info)
5965 && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
5966 {
5967 for (unsigned i = 0; i < SLP_TREE_CHILDREN (node).length (); ++i)
5968 if (SLP_TREE_DEF_TYPE (SLP_TREE_CHILDREN (node)[i]) == vect_internal_def)
5969 {
5970 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
5971 = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_CHILDREN (node)[i]);
5972 break;
5973 }
5974 }
5975 else
5976 {
5977 poly_uint64 vf;
5978 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
5979 vf = loop_vinfo->vectorization_factor;
5980 else
5981 vf = 1;
5982 unsigned int group_size = SLP_TREE_LANES (node);
5983 tree vectype = SLP_TREE_VECTYPE (node);
5984 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
5985 = vect_get_num_vectors (vf * group_size, vectype);
5986 }
5987
5988 /* Handle purely internal nodes. */
5989 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
5990 {
5991 if (!vectorizable_slp_permutation (vinfo, NULL, node, cost_vec))
5992 return false;
5993
5994 stmt_vec_info slp_stmt_info;
5995 unsigned int i;
5996 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, slp_stmt_info)
5997 {
5998 if (STMT_VINFO_LIVE_P (slp_stmt_info)
5999 && !vectorizable_live_operation (vinfo, slp_stmt_info, node,
6000 node_instance, i,
6001 false, cost_vec))
6002 return false;
6003 }
6004 return true;
6005 }
6006
6007 bool dummy;
6008 return vect_analyze_stmt (vinfo, stmt_info, &dummy,
6009 node, node_instance, cost_vec);
6010 }
6011
6012 /* Try to build NODE from scalars, returning true on success.
6013 NODE_INSTANCE is the SLP instance that contains NODE. */
6014
6015 static bool
6016 vect_slp_convert_to_external (vec_info *vinfo, slp_tree node,
6017 slp_instance node_instance)
6018 {
6019 stmt_vec_info stmt_info;
6020 unsigned int i;
6021
6022 if (!is_a <bb_vec_info> (vinfo)
6023 || node == SLP_INSTANCE_TREE (node_instance)
6024 || !SLP_TREE_SCALAR_STMTS (node).exists ()
6025 || vect_contains_pattern_stmt_p (SLP_TREE_SCALAR_STMTS (node))
6026 /* Force the mask use to be built from scalars instead. */
6027 || VECTOR_BOOLEAN_TYPE_P (SLP_TREE_VECTYPE (node)))
6028 return false;
6029
6030 if (dump_enabled_p ())
6031 dump_printf_loc (MSG_NOTE, vect_location,
6032 "Building vector operands of %p from scalars instead\n",
6033 (void *) node);
6034
6035 /* Don't remove and free the child nodes here, since they could be
6036 referenced by other structures. The analysis and scheduling phases
6037 (need to) ignore child nodes of anything that isn't vect_internal_def. */
6038 unsigned int group_size = SLP_TREE_LANES (node);
6039 SLP_TREE_DEF_TYPE (node) = vect_external_def;
6040 /* Invariants get their vector type from the uses. */
6041 SLP_TREE_VECTYPE (node) = NULL_TREE;
6042 SLP_TREE_SCALAR_OPS (node).safe_grow (group_size, true);
6043 SLP_TREE_LOAD_PERMUTATION (node).release ();
6044 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
6045 {
6046 tree lhs = gimple_get_lhs (vect_orig_stmt (stmt_info)->stmt);
6047 SLP_TREE_SCALAR_OPS (node)[i] = lhs;
6048 }
6049 return true;
6050 }
6051
6052 /* Return true if all elements of the slice are the same. */
6053 bool
6054 vect_scalar_ops_slice::all_same_p () const
6055 {
6056 for (unsigned int i = 1; i < length; ++i)
6057 if (!operand_equal_p (op (0), op (i)))
6058 return false;
6059 return true;
6060 }
6061
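/* Hash a slice by combining the hash values of its scalar operands.  */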
6062 hashval_t
6063 vect_scalar_ops_slice_hash::hash (const value_type &s)
6064 {
6065 hashval_t hash = 0;
6066 for (unsigned i = 0; i < s.length; ++i)
6067 hash = iterative_hash_expr (s.op (i), hash);
6068 return hash;
6069 }
6070
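/* Return true if slices S1 and S2 have the same length and pairwise
   equal operands.  */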
6071 bool
6072 vect_scalar_ops_slice_hash::equal (const value_type &s1,
6073 const compare_type &s2)
6074 {
6075 if (s1.length != s2.length)
6076 return false;
6077 for (unsigned i = 0; i < s1.length; ++i)
6078 if (!operand_equal_p (s1.op (i), s2.op (i)))
6079 return false;
6080 return true;
6081 }
6082
6083 /* Compute the prologue cost for invariant or constant operands represented
6084 by NODE. */
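/* For example (a sketch): with scalar ops { x, y, x, y } and two-element
   vectors, both vector stmts are built from the same { x, y } slice, so
   the slice hash set below leads to only one construction cost being
   recorded.  */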
6085
6086 static void
6087 vect_prologue_cost_for_slp (slp_tree node,
6088 stmt_vector_for_cost *cost_vec)
6089 {
6090 /* There's a special case of an existing vector, which costs nothing. */
6091 if (SLP_TREE_SCALAR_OPS (node).length () == 0
6092 && !SLP_TREE_VEC_DEFS (node).is_empty ())
6093 return;
6094 /* Without looking at the actual initializer a vector of
6095 constants can be implemented as a load from the constant pool.
6096 When all elements are the same we can use a splat. */
6097 tree vectype = SLP_TREE_VECTYPE (node);
6098 unsigned group_size = SLP_TREE_SCALAR_OPS (node).length ();
6099 unsigned HOST_WIDE_INT const_nunits;
6100 unsigned nelt_limit;
6101 auto ops = &SLP_TREE_SCALAR_OPS (node);
6102 auto_vec<unsigned int> starts (SLP_TREE_NUMBER_OF_VEC_STMTS (node));
6103 if (TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits)
6104 && ! multiple_p (const_nunits, group_size))
6105 {
6106 nelt_limit = const_nunits;
6107 hash_set<vect_scalar_ops_slice_hash> vector_ops;
6108 for (unsigned int i = 0; i < SLP_TREE_NUMBER_OF_VEC_STMTS (node); ++i)
6109 if (!vector_ops.add ({ ops, i * const_nunits, const_nunits }))
6110 starts.quick_push (i * const_nunits);
6111 }
6112 else
6113 {
6114 /* If either the vector has variable length or the vectors
6115 are composed of repeated whole groups we only need to
6116 cost construction once. All vectors will be the same. */
6117 nelt_limit = group_size;
6118 starts.quick_push (0);
6119 }
6120 /* ??? We're just tracking whether vectors in a single node are the same.
6121 Ideally we'd do something more global. */
6122 bool passed = false;
6123 for (unsigned int start : starts)
6124 {
6125 vect_cost_for_stmt kind;
6126 if (SLP_TREE_DEF_TYPE (node) == vect_constant_def)
6127 kind = vector_load;
6128 else if (vect_scalar_ops_slice { ops, start, nelt_limit }.all_same_p ())
6129 kind = scalar_to_vec;
6130 else
6131 kind = vec_construct;
6132 /* The target cost hook has no idea which part of the SLP node
6133 we are costing so avoid passing it down more than once. Pass
6134 it to the first vec_construct or scalar_to_vec part since for those
6135 the x86 backend tries to account for GPR to XMM register moves. */
6136 record_stmt_cost (cost_vec, 1, kind,
6137 (kind != vector_load && !passed) ? node : nullptr,
6138 vectype, 0, vect_prologue);
6139 if (kind != vector_load)
6140 passed = true;
6141 }
6142 }
6143
6144 /* Analyze statements contained in SLP tree NODE after recursively analyzing
6145 the subtree. NODE_INSTANCE contains NODE and VINFO contains NODE_INSTANCE.
6146
6147 Return true if the operations are supported. */
6148
6149 static bool
6150 vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node,
6151 slp_instance node_instance,
6152 hash_set<slp_tree> &visited_set,
6153 vec<slp_tree> &visited_vec,
6154 stmt_vector_for_cost *cost_vec)
6155 {
6156 int i, j;
6157 slp_tree child;
6158
6159 /* Assume we can code-generate all invariants. */
6160 if (!node
6161 || SLP_TREE_DEF_TYPE (node) == vect_constant_def
6162 || SLP_TREE_DEF_TYPE (node) == vect_external_def)
6163 return true;
6164
6165 if (SLP_TREE_DEF_TYPE (node) == vect_uninitialized_def)
6166 {
6167 if (dump_enabled_p ())
6168 dump_printf_loc (MSG_NOTE, vect_location,
6169 "Failed cyclic SLP reference in %p\n", (void *) node);
6170 return false;
6171 }
6172 gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_internal_def);
6173
6174 /* If we already analyzed the exact same set of scalar stmts we're done.
6175 We share the generated vector stmts for those. */
6176 if (visited_set.add (node))
6177 return true;
6178 visited_vec.safe_push (node);
6179
6180 bool res = true;
6181 unsigned visited_rec_start = visited_vec.length ();
6182 unsigned cost_vec_rec_start = cost_vec->length ();
6183 bool seen_non_constant_child = false;
6184 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6185 {
6186 res = vect_slp_analyze_node_operations (vinfo, child, node_instance,
6187 visited_set, visited_vec,
6188 cost_vec);
6189 if (!res)
6190 break;
6191 if (child && SLP_TREE_DEF_TYPE (child) != vect_constant_def)
6192 seen_non_constant_child = true;
6193 }
6194 /* We're having difficulties scheduling nodes with just constant
6195 operands and no scalar stmts since we then cannot compute a stmt
6196 insertion place. */
6197 if (!seen_non_constant_child && SLP_TREE_SCALAR_STMTS (node).is_empty ())
6198 {
6199 if (dump_enabled_p ())
6200 dump_printf_loc (MSG_NOTE, vect_location,
6201 "Cannot vectorize all-constant op node %p\n",
6202 (void *) node);
6203 res = false;
6204 }
6205
6206 if (res)
6207 res = vect_slp_analyze_node_operations_1 (vinfo, node, node_instance,
6208 cost_vec);
6209 /* If analysis failed we have to pop all recursive visited nodes
6210 plus ourselves. */
6211 if (!res)
6212 {
6213 while (visited_vec.length () >= visited_rec_start)
6214 visited_set.remove (visited_vec.pop ());
6215 cost_vec->truncate (cost_vec_rec_start);
6216 }
6217
6218 /* When the node can be vectorized, cost the invariant nodes it references.
6219 This is not done in DFS order to allow the referring node's
6220 vectorizable_* calls to nail down the invariant node's vector type
6221 and possibly unshare it if it needs a different vector type than
6222 other referrers. */
6223 if (res)
6224 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
6225 if (child
6226 && (SLP_TREE_DEF_TYPE (child) == vect_constant_def
6227 || SLP_TREE_DEF_TYPE (child) == vect_external_def)
6228 /* Perform the usual caching; note that code generation still
6229 code-gens these nodes multiple times, but we expect
6230 to CSE them later. */
6231 && !visited_set.add (child))
6232 {
6233 visited_vec.safe_push (child);
6234 /* ??? After auditing more code paths make a "default"
6235 and push the vector type from NODE to all children
6236 if it is not already set. */
6237 /* Compute the number of vectors to be generated. */
6238 tree vector_type = SLP_TREE_VECTYPE (child);
6239 if (!vector_type)
6240 {
6241 /* For shifts with a scalar argument we don't need
6242 to cost or code-generate anything.
6243 ??? Represent this more explicitly. */
6244 gcc_assert ((STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (node))
6245 == shift_vec_info_type)
6246 && j == 1);
6247 continue;
6248 }
6249 unsigned group_size = SLP_TREE_LANES (child);
6250 poly_uint64 vf = 1;
6251 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
6252 vf = loop_vinfo->vectorization_factor;
6253 SLP_TREE_NUMBER_OF_VEC_STMTS (child)
6254 = vect_get_num_vectors (vf * group_size, vector_type);
6255 /* And cost them. */
6256 vect_prologue_cost_for_slp (child, cost_vec);
6257 }
6258
6259 /* If this node or any of its children can't be vectorized, try pruning
6260 the tree here rather than felling the whole thing. */
6261 if (!res && vect_slp_convert_to_external (vinfo, node, node_instance))
6262 {
6263 /* We'll need to revisit this for invariant costing and number
6264 of vectorized stmt setting. */
6265 res = true;
6266 }
6267
6268 return res;
6269 }
6270
6271 /* Mark lanes of NODE that are live outside of the basic-block vectorized
6272 region and that can be vectorized using vectorizable_live_operation
6273 with STMT_VINFO_LIVE_P. Live operations that cannot be handled cause
6274 the scalar code computing them to be retained. */
6275
6276 static void
6277 vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo, slp_tree node,
6278 slp_instance instance,
6279 stmt_vector_for_cost *cost_vec,
6280 hash_set<stmt_vec_info> &svisited,
6281 hash_set<slp_tree> &visited)
6282 {
6283 if (visited.add (node))
6284 return;
6285
6286 unsigned i;
6287 stmt_vec_info stmt_info;
6288 stmt_vec_info last_stmt = vect_find_last_scalar_stmt_in_slp (node);
6289 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
6290 {
6291 if (svisited.contains (stmt_info))
6292 continue;
6293 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
6294 if (STMT_VINFO_IN_PATTERN_P (orig_stmt_info)
6295 && STMT_VINFO_RELATED_STMT (orig_stmt_info) != stmt_info)
6296 /* Only the pattern root stmt computes the original scalar value. */
6297 continue;
6298 bool mark_visited = true;
6299 gimple *orig_stmt = orig_stmt_info->stmt;
6300 ssa_op_iter op_iter;
6301 def_operand_p def_p;
6302 FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_stmt, op_iter, SSA_OP_DEF)
6303 {
6304 imm_use_iterator use_iter;
6305 gimple *use_stmt;
6306 stmt_vec_info use_stmt_info;
6307 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
6308 if (!is_gimple_debug (use_stmt))
6309 {
6310 use_stmt_info = bb_vinfo->lookup_stmt (use_stmt);
6311 if (!use_stmt_info
6312 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
6313 {
6314 STMT_VINFO_LIVE_P (stmt_info) = true;
6315 if (vectorizable_live_operation (bb_vinfo, stmt_info,
6316 node, instance, i,
6317 false, cost_vec))
6318 /* ??? So we know we can vectorize the live stmt
6319 from one SLP node. If we cannot do so from all
6320 or none consistently we'd have to record which
6321 SLP node (and lane) we want to use for the live
6322 operation. So make sure we can code-generate
6323 from all nodes. */
6324 mark_visited = false;
6325 else
6326 STMT_VINFO_LIVE_P (stmt_info) = false;
6327 break;
6328 }
6329 }
6330 /* We have to verify whether we can insert the lane extract
6331 before all uses. The following is a conservative approximation.
6332 We cannot put this into vectorizable_live_operation because
6333 iterating over all use stmts from inside a FOR_EACH_IMM_USE_STMT
6334 doesn't work.
6335 Note that while emitting code for loads at the first load
6336 should make this a non-problem, leaves we construct
6337 from scalars are vectorized after the last scalar def.
6338 ??? If we'd actually compute the insert location during
6339 analysis we could use sth less conservative than the last
6340 scalar stmt in the node for the dominance check. */
6341 /* ??? What remains is "live" uses in vector CTORs in the same
6342 SLP graph which is where those uses can end up code-generated
6343 right after their definition instead of close to their original
6344 use. But that would restrict us to code-generate lane-extracts
6345 from the latest stmt in a node. So we compensate for this
6346 during code-generation, simply not replacing uses for those
6347 hopefully rare cases. */
6348 if (STMT_VINFO_LIVE_P (stmt_info))
6349 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
6350 if (!is_gimple_debug (use_stmt)
6351 && (!(use_stmt_info = bb_vinfo->lookup_stmt (use_stmt))
6352 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
6353 && !vect_stmt_dominates_stmt_p (last_stmt->stmt, use_stmt))
6354 {
6355 if (dump_enabled_p ())
6356 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6357 "Cannot determine insertion place for "
6358 "lane extract\n");
6359 STMT_VINFO_LIVE_P (stmt_info) = false;
6360 mark_visited = true;
6361 }
6362 }
6363 if (mark_visited)
6364 svisited.add (stmt_info);
6365 }
6366
6367 slp_tree child;
6368 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6369 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
6370 vect_bb_slp_mark_live_stmts (bb_vinfo, child, instance,
6371 cost_vec, svisited, visited);
6372 }
6373
6374 /* Determine whether we can vectorize the reduction epilogue for INSTANCE. */
6375
6376 static bool
6377 vectorizable_bb_reduc_epilogue (slp_instance instance,
6378 stmt_vector_for_cost *cost_vec)
6379 {
6380 gassign *stmt = as_a <gassign *> (instance->root_stmts[0]->stmt);
6381 enum tree_code reduc_code = gimple_assign_rhs_code (stmt);
6382 if (reduc_code == MINUS_EXPR)
6383 reduc_code = PLUS_EXPR;
6384 internal_fn reduc_fn;
6385 tree vectype = SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (instance));
6386 if (!vectype
6387 || !reduction_fn_for_scalar_code (reduc_code, &reduc_fn)
6388 || reduc_fn == IFN_LAST
6389 || !direct_internal_fn_supported_p (reduc_fn, vectype, OPTIMIZE_FOR_BOTH)
6390 || !useless_type_conversion_p (TREE_TYPE (gimple_assign_lhs (stmt)),
6391 TREE_TYPE (vectype)))
6392 {
6393 if (dump_enabled_p ())
6394 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6395 "not vectorized: basic block reduction epilogue "
6396 "operation unsupported.\n");
6397 return false;
6398 }
6399
6400 /* There's no way to cost a horizontal vector reduction via REDUC_FN so
6401 cost log2 vector operations plus shuffles and one extraction. */
6402 unsigned steps = floor_log2 (vect_nunits_for_cost (vectype));
6403 record_stmt_cost (cost_vec, steps, vector_stmt, instance->root_stmts[0],
6404 vectype, 0, vect_body);
6405 record_stmt_cost (cost_vec, steps, vec_perm, instance->root_stmts[0],
6406 vectype, 0, vect_body);
6407 record_stmt_cost (cost_vec, 1, vec_to_scalar, instance->root_stmts[0],
6408 vectype, 0, vect_body);
6409 return true;
6410 }
6411
6412 /* Prune from ROOTS all stmts that are computed as part of lanes of NODE
6413 and recurse to children. */
6414
6415 static void
6416 vect_slp_prune_covered_roots (slp_tree node, hash_set<stmt_vec_info> &roots,
6417 hash_set<slp_tree> &visited)
6418 {
6419 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def
6420 || visited.add (node))
6421 return;
6422
6423 stmt_vec_info stmt;
6424 unsigned i;
6425 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
6426 roots.remove (vect_orig_stmt (stmt));
6427
6428 slp_tree child;
6429 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6430 if (child)
6431 vect_slp_prune_covered_roots (child, roots, visited);
6432 }
6433
6434 /* Analyze statements in SLP instances of VINFO. Return true if the
6435 operations are supported. */
6436
6437 bool
6438 vect_slp_analyze_operations (vec_info *vinfo)
6439 {
6440 slp_instance instance;
6441 int i;
6442
6443 DUMP_VECT_SCOPE ("vect_slp_analyze_operations");
6444
6445 hash_set<slp_tree> visited;
6446 for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
6447 {
6448 auto_vec<slp_tree> visited_vec;
6449 stmt_vector_for_cost cost_vec;
6450 cost_vec.create (2);
6451 if (is_a <bb_vec_info> (vinfo))
6452 vect_location = instance->location ();
6453 if (!vect_slp_analyze_node_operations (vinfo,
6454 SLP_INSTANCE_TREE (instance),
6455 instance, visited, visited_vec,
6456 &cost_vec)
6457 /* CTOR instances require vectorized defs for the SLP tree root. */
6458 || (SLP_INSTANCE_KIND (instance) == slp_inst_kind_ctor
6459 && (SLP_TREE_DEF_TYPE (SLP_INSTANCE_TREE (instance))
6460 != vect_internal_def
6461 /* Make sure we vectorized with the expected type. */
6462 || !useless_type_conversion_p
6463 (TREE_TYPE (TREE_TYPE (gimple_assign_rhs1
6464 (instance->root_stmts[0]->stmt))),
6465 TREE_TYPE (SLP_TREE_VECTYPE
6466 (SLP_INSTANCE_TREE (instance))))))
6467 /* Check we can vectorize the reduction. */
6468 || (SLP_INSTANCE_KIND (instance) == slp_inst_kind_bb_reduc
6469 && !vectorizable_bb_reduc_epilogue (instance, &cost_vec)))
6470 {
6471 slp_tree node = SLP_INSTANCE_TREE (instance);
6472 stmt_vec_info stmt_info;
6473 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
6474 stmt_info = SLP_INSTANCE_ROOT_STMTS (instance)[0];
6475 else
6476 stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
6477 if (dump_enabled_p ())
6478 dump_printf_loc (MSG_NOTE, vect_location,
6479 "removing SLP instance operations starting from: %G",
6480 stmt_info->stmt);
6481 vect_free_slp_instance (instance);
6482 vinfo->slp_instances.ordered_remove (i);
6483 cost_vec.release ();
6484 while (!visited_vec.is_empty ())
6485 visited.remove (visited_vec.pop ());
6486 }
6487 else
6488 {
6489 i++;
6490 if (loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (vinfo))
6491 {
6492 add_stmt_costs (loop_vinfo->vector_costs, &cost_vec);
6493 cost_vec.release ();
6494 }
6495 else
6496 /* For BB vectorization remember the SLP graph entry
6497 cost for later. */
6498 instance->cost_vec = cost_vec;
6499 }
6500 }
6501
6502 /* Now look for SLP instances with a root that are covered by other
6503 instances and remove them. */
6504 hash_set<stmt_vec_info> roots;
6505 for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
6506 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
6507 roots.add (SLP_INSTANCE_ROOT_STMTS (instance)[0]);
6508 if (!roots.is_empty ())
6509 {
6510 visited.empty ();
6511 for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
6512 vect_slp_prune_covered_roots (SLP_INSTANCE_TREE (instance), roots,
6513 visited);
6514 for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
6515 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ()
6516 && !roots.contains (SLP_INSTANCE_ROOT_STMTS (instance)[0]))
6517 {
6518 stmt_vec_info root = SLP_INSTANCE_ROOT_STMTS (instance)[0];
6519 if (dump_enabled_p ())
6520 dump_printf_loc (MSG_NOTE, vect_location,
6521 "removing SLP instance operations starting "
6522 "from: %G", root->stmt);
6523 vect_free_slp_instance (instance);
6524 vinfo->slp_instances.ordered_remove (i);
6525 }
6526 else
6527 ++i;
6528 }
6529
6530 /* Compute vectorizable live stmts. */
6531 if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
6532 {
6533 hash_set<stmt_vec_info> svisited;
6534 hash_set<slp_tree> visited;
6535 for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
6536 {
6537 vect_location = instance->location ();
6538 vect_bb_slp_mark_live_stmts (bb_vinfo, SLP_INSTANCE_TREE (instance),
6539 instance, &instance->cost_vec, svisited,
6540 visited);
6541 }
6542 }
6543
6544 return !vinfo->slp_instances.is_empty ();
6545 }
6546
6547 /* Get the ultimate SLP instance leader of INSTANCE from INSTANCE_LEADER,
6548 transitively closing (and thereby compressing) any chain along the way. */
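/* For example, if INSTANCE_LEADER maps A -> B, B -> C and C -> C, calling
   this on A returns C and rewrites the entries for A and B to point
   directly at C, keeping later lookups cheap.  */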
6549
6550 static slp_instance
6551 get_ultimate_leader (slp_instance instance,
6552 hash_map<slp_instance, slp_instance> &instance_leader)
6553 {
6554 auto_vec<slp_instance *, 8> chain;
6555 slp_instance *tem;
6556 while (*(tem = instance_leader.get (instance)) != instance)
6557 {
6558 chain.safe_push (tem);
6559 instance = *tem;
6560 }
6561 while (!chain.is_empty ())
6562 *chain.pop () = instance;
6563 return instance;
6564 }
6565
6566 namespace {
6567 /* Subroutine of vect_bb_partition_graph_r. Map KEY to INSTANCE in
6568 KEY_TO_INSTANCE, making INSTANCE the leader of any previous mapping
6569 for KEY. Return true if KEY was already in KEY_TO_INSTANCE.
6570
6571 INSTANCE_LEADER is as for get_ultimate_leader. */
6572
6573 template<typename T>
6574 bool
6575 vect_map_to_instance (slp_instance instance, T key,
6576 hash_map<T, slp_instance> &key_to_instance,
6577 hash_map<slp_instance, slp_instance> &instance_leader)
6578 {
6579 bool existed_p;
6580 slp_instance &key_instance = key_to_instance.get_or_insert (key, &existed_p);
6581 if (!existed_p)
6582 ;
6583 else if (key_instance != instance)
6584 {
6585 /* If we run into a previously marked key, make the current instance
6586 the leader of the key's current ultimate leader. This keeps the
6587 leader chain acyclic and works even when the current instance
6588 connects two previously independent graph parts. */
6589 slp_instance key_leader
6590 = get_ultimate_leader (key_instance, instance_leader);
6591 if (key_leader != instance)
6592 instance_leader.put (key_leader, instance);
6593 }
6594 key_instance = instance;
6595 return existed_p;
6596 }
6597 }
6598
6599 /* Worker of vect_bb_partition_graph, recurse on NODE. */
6600
6601 static void
6602 vect_bb_partition_graph_r (bb_vec_info bb_vinfo,
6603 slp_instance instance, slp_tree node,
6604 hash_map<stmt_vec_info, slp_instance> &stmt_to_instance,
6605 hash_map<slp_tree, slp_instance> &node_to_instance,
6606 hash_map<slp_instance, slp_instance> &instance_leader)
6607 {
6608 stmt_vec_info stmt_info;
6609 unsigned i;
6610
6611 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
6612 vect_map_to_instance (instance, stmt_info, stmt_to_instance,
6613 instance_leader);
6614
6615 if (vect_map_to_instance (instance, node, node_to_instance,
6616 instance_leader))
6617 return;
6618
6619 slp_tree child;
6620 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6621 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
6622 vect_bb_partition_graph_r (bb_vinfo, instance, child, stmt_to_instance,
6623 node_to_instance, instance_leader);
6624 }
6625
6626 /* Partition the SLP graph into pieces that can be costed independently. */
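/* Instances that share a scalar stmt (or an SLP node) are merged into one
   subgraph via the leader map, so each connected component of the SLP
   graph is costed as a unit.  */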
6627
6628 static void
6629 vect_bb_partition_graph (bb_vec_info bb_vinfo)
6630 {
6631 DUMP_VECT_SCOPE ("vect_bb_partition_graph");
6632
6633 /* First walk the SLP graph assigning each involved scalar stmt a
6634 corresponding SLP graph entry and, upon visiting a previously
6635 marked stmt, make the stmt's leader the current SLP graph entry. */
6636 hash_map<stmt_vec_info, slp_instance> stmt_to_instance;
6637 hash_map<slp_tree, slp_instance> node_to_instance;
6638 hash_map<slp_instance, slp_instance> instance_leader;
6639 slp_instance instance;
6640 for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
6641 {
6642 instance_leader.put (instance, instance);
6643 vect_bb_partition_graph_r (bb_vinfo,
6644 instance, SLP_INSTANCE_TREE (instance),
6645 stmt_to_instance, node_to_instance,
6646 instance_leader);
6647 }
6648
6649 /* Then collect entries to each independent subgraph. */
6650 for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
6651 {
6652 slp_instance leader = get_ultimate_leader (instance, instance_leader);
6653 leader->subgraph_entries.safe_push (instance);
6654 if (dump_enabled_p ()
6655 && leader != instance)
6656 dump_printf_loc (MSG_NOTE, vect_location,
6657 "instance %p is leader of %p\n",
6658 (void *) leader, (void *) instance);
6659 }
6660 }
6661
6662 /* Compute the set of scalar stmts participating in internal and external
6663 nodes. */
6664
6665 static void
6666 vect_slp_gather_vectorized_scalar_stmts (vec_info *vinfo, slp_tree node,
6667 hash_set<slp_tree> &visited,
6668 hash_set<stmt_vec_info> &vstmts,
6669 hash_set<stmt_vec_info> &estmts)
6670 {
6671 int i;
6672 stmt_vec_info stmt_info;
6673 slp_tree child;
6674
6675 if (visited.add (node))
6676 return;
6677
6678 if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
6679 {
6680 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
6681 vstmts.add (stmt_info);
6682
6683 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6684 if (child)
6685 vect_slp_gather_vectorized_scalar_stmts (vinfo, child, visited,
6686 vstmts, estmts);
6687 }
6688 else
6689 for (tree def : SLP_TREE_SCALAR_OPS (node))
6690 {
6691 stmt_vec_info def_stmt = vinfo->lookup_def (def);
6692 if (def_stmt)
6693 estmts.add (def_stmt);
6694 }
6695 }
6696
6697
6698 /* Compute the scalar cost of the SLP node NODE and its children
6699 and record it in COST_VEC. Do not account defs that are marked in
6700 LIFE and update LIFE according to uses of NODE. */
6701
6702 static void
6703 vect_bb_slp_scalar_cost (vec_info *vinfo,
6704 slp_tree node, vec<bool, va_heap> *life,
6705 stmt_vector_for_cost *cost_vec,
6706 hash_set<stmt_vec_info> &vectorized_scalar_stmts,
6707 hash_set<slp_tree> &visited)
6708 {
6709 unsigned i;
6710 stmt_vec_info stmt_info;
6711 slp_tree child;
6712
6713 if (visited.add (node))
6714 return;
6715
6716 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
6717 {
6718 ssa_op_iter op_iter;
6719 def_operand_p def_p;
6720
6721 if ((*life)[i])
6722 continue;
6723
6724 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
6725 gimple *orig_stmt = orig_stmt_info->stmt;
6726
6727 /* If there is a non-vectorized use of the defs then the scalar
6728 stmt is kept live in which case we do not account it or any
6729 required defs in the SLP children in the scalar cost. This
6730 way we make the vectorization more costly when compared to
6731 the scalar cost. */
6732 if (!STMT_VINFO_LIVE_P (stmt_info))
6733 {
6734 auto_vec<gimple *, 8> worklist;
6735 hash_set<gimple *> *worklist_visited = NULL;
6736 worklist.quick_push (orig_stmt);
6737 do
6738 {
6739 gimple *work_stmt = worklist.pop ();
6740 FOR_EACH_PHI_OR_STMT_DEF (def_p, work_stmt, op_iter, SSA_OP_DEF)
6741 {
6742 imm_use_iterator use_iter;
6743 gimple *use_stmt;
6744 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter,
6745 DEF_FROM_PTR (def_p))
6746 if (!is_gimple_debug (use_stmt))
6747 {
6748 stmt_vec_info use_stmt_info
6749 = vinfo->lookup_stmt (use_stmt);
6750 if (!use_stmt_info
6751 || !vectorized_scalar_stmts.contains (use_stmt_info))
6752 {
6753 if (use_stmt_info
6754 && STMT_VINFO_IN_PATTERN_P (use_stmt_info))
6755 {
6756 /* For stmts participating in patterns we have
6757 to check their uses recursively. */
6758 if (!worklist_visited)
6759 worklist_visited = new hash_set<gimple *> ();
6760 if (!worklist_visited->add (use_stmt))
6761 worklist.safe_push (use_stmt);
6762 continue;
6763 }
6764 (*life)[i] = true;
6765 goto next_lane;
6766 }
6767 }
6768 }
6769 }
6770 while (!worklist.is_empty ());
6771 next_lane:
6772 if (worklist_visited)
6773 delete worklist_visited;
6774 if ((*life)[i])
6775 continue;
6776 }
6777
6778 /* Count scalar stmts only once. */
6779 if (gimple_visited_p (orig_stmt))
6780 continue;
6781 gimple_set_visited (orig_stmt, true);
6782
6783 vect_cost_for_stmt kind;
6784 if (STMT_VINFO_DATA_REF (orig_stmt_info))
6785 {
6786 if (DR_IS_READ (STMT_VINFO_DATA_REF (orig_stmt_info)))
6787 kind = scalar_load;
6788 else
6789 kind = scalar_store;
6790 }
6791 else if (vect_nop_conversion_p (orig_stmt_info))
6792 continue;
6793 /* For single-argument PHIs assume coalescing which means zero cost
6794 for the scalar and the vector PHIs. This avoids artificially
6795 favoring the vector path (but may pessimize it in some cases). */
6796 else if (is_a <gphi *> (orig_stmt_info->stmt)
6797 && gimple_phi_num_args
6798 (as_a <gphi *> (orig_stmt_info->stmt)) == 1)
6799 continue;
6800 else
6801 kind = scalar_stmt;
6802 record_stmt_cost (cost_vec, 1, kind, orig_stmt_info,
6803 SLP_TREE_VECTYPE (node), 0, vect_body);
6804 }
6805
6806 auto_vec<bool, 20> subtree_life;
6807 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6808 {
6809 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
6810 {
6811 /* Do not directly pass LIFE to the recursive call, copy it to
6812 confine changes in the callee to the current child/subtree. */
6813 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
6814 {
6815 subtree_life.safe_grow_cleared (SLP_TREE_LANES (child), true);
6816 for (unsigned j = 0;
6817 j < SLP_TREE_LANE_PERMUTATION (node).length (); ++j)
6818 {
6819 auto perm = SLP_TREE_LANE_PERMUTATION (node)[j];
6820 if (perm.first == i)
6821 subtree_life[perm.second] = (*life)[j];
6822 }
6823 }
6824 else
6825 {
6826 gcc_assert (SLP_TREE_LANES (node) == SLP_TREE_LANES (child));
6827 subtree_life.safe_splice (*life);
6828 }
6829 vect_bb_slp_scalar_cost (vinfo, child, &subtree_life, cost_vec,
6830 vectorized_scalar_stmts, visited);
6831 subtree_life.truncate (0);
6832 }
6833 }
6834 }
6835
6836 /* Comparator for the loop-index sorted cost vectors. */
6837
6838 static int
6839 li_cost_vec_cmp (const void *a_, const void *b_)
6840 {
6841 auto *a = (const std::pair<unsigned, stmt_info_for_cost *> *)a_;
6842 auto *b = (const std::pair<unsigned, stmt_info_for_cost *> *)b_;
6843 if (a->first < b->first)
6844 return -1;
6845 else if (a->first == b->first)
6846 return 0;
6847 return 1;
6848 }
6849
6850 /* Check if vectorization of the basic block is profitable for the
6851 subgraph denoted by SLP_INSTANCES. */
6852
6853 static bool
6854 vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo,
6855 vec<slp_instance> slp_instances,
6856 loop_p orig_loop)
6857 {
6858 slp_instance instance;
6859 int i;
6860 unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
6861 unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;
6862
6863 if (dump_enabled_p ())
6864 {
6865 dump_printf_loc (MSG_NOTE, vect_location, "Costing subgraph: \n");
6866 hash_set<slp_tree> visited;
6867 FOR_EACH_VEC_ELT (slp_instances, i, instance)
6868 vect_print_slp_graph (MSG_NOTE, vect_location,
6869 SLP_INSTANCE_TREE (instance), visited);
6870 }
6871
6872 /* Compute the set of scalar stmts we know will go away 'locally' when
6873 vectorizing. This used to be tracked with just PURE_SLP_STMT but that's
6874 not accurate for nodes promoted extern late or for scalar stmts that
6875 are used both in extern defs and in vectorized defs. */
6876 hash_set<stmt_vec_info> vectorized_scalar_stmts;
6877 hash_set<stmt_vec_info> scalar_stmts_in_externs;
6878 hash_set<slp_tree> visited;
6879 FOR_EACH_VEC_ELT (slp_instances, i, instance)
6880 {
6881 vect_slp_gather_vectorized_scalar_stmts (bb_vinfo,
6882 SLP_INSTANCE_TREE (instance),
6883 visited,
6884 vectorized_scalar_stmts,
6885 scalar_stmts_in_externs);
6886 for (stmt_vec_info rstmt : SLP_INSTANCE_ROOT_STMTS (instance))
6887 vectorized_scalar_stmts.add (rstmt);
6888 }
6889 /* Scalar stmts used as defs in external nodes need to be preserved, so
6890 remove them from vectorized_scalar_stmts. */
6891 for (stmt_vec_info stmt : scalar_stmts_in_externs)
6892 vectorized_scalar_stmts.remove (stmt);
6893
6894 /* Calculate scalar cost and sum the cost for the vector stmts
6895 previously collected. */
6896 stmt_vector_for_cost scalar_costs = vNULL;
6897 stmt_vector_for_cost vector_costs = vNULL;
6898 visited.empty ();
6899 FOR_EACH_VEC_ELT (slp_instances, i, instance)
6900 {
6901 auto_vec<bool, 20> life;
6902 life.safe_grow_cleared (SLP_TREE_LANES (SLP_INSTANCE_TREE (instance)),
6903 true);
6904 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
6905 record_stmt_cost (&scalar_costs,
6906 SLP_INSTANCE_ROOT_STMTS (instance).length (),
6907 scalar_stmt,
6908 SLP_INSTANCE_ROOT_STMTS (instance)[0], 0, vect_body);
6909 vect_bb_slp_scalar_cost (bb_vinfo,
6910 SLP_INSTANCE_TREE (instance),
6911 &life, &scalar_costs, vectorized_scalar_stmts,
6912 visited);
6913 vector_costs.safe_splice (instance->cost_vec);
6914 instance->cost_vec.release ();
6915 }
6916
6917 if (dump_enabled_p ())
6918 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
6919
6920 /* When costing non-loop vectorization we need to consider each covered
6921 loop independently and make sure vectorization is profitable. For
6922 now we assume a loop may not be entered or may execute an arbitrary
6923 number of iterations (??? static information can provide more
6924 precise info here), which means we can simply cost each containing
6925 loop's stmts separately. */
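/* For example, if a subgraph spans stmts in an outer loop as well as stmts
in a loop nested within it, each loop's stmts are costed and compared
against their scalar counterparts independently; a single unprofitable
part makes the whole subgraph unprofitable. */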
6926
6927 /* First produce cost vectors sorted by loop index. */
6928 auto_vec<std::pair<unsigned, stmt_info_for_cost *> >
6929 li_scalar_costs (scalar_costs.length ());
6930 auto_vec<std::pair<unsigned, stmt_info_for_cost *> >
6931 li_vector_costs (vector_costs.length ());
6932 stmt_info_for_cost *cost;
6933 FOR_EACH_VEC_ELT (scalar_costs, i, cost)
6934 {
6935 unsigned l = gimple_bb (cost->stmt_info->stmt)->loop_father->num;
6936 li_scalar_costs.quick_push (std::make_pair (l, cost));
6937 }
6938 /* Use an arbitrary used loop as fallback in case the first vector_costs
6939 entry does not have a stmt_info associated with it. */
6940 unsigned l = li_scalar_costs[0].first;
6941 FOR_EACH_VEC_ELT (vector_costs, i, cost)
6942 {
6943 /* We inherit the loop from the previous COST; invariants, externals and
6944 extracts immediately follow the cost for the related stmt. */
6945 if (cost->stmt_info)
6946 l = gimple_bb (cost->stmt_info->stmt)->loop_father->num;
6947 li_vector_costs.quick_push (std::make_pair (l, cost));
6948 }
6949 li_scalar_costs.qsort (li_cost_vec_cmp);
6950 li_vector_costs.qsort (li_cost_vec_cmp);
6951
6952 /* Now cost the portions individually. */
6953 unsigned vi = 0;
6954 unsigned si = 0;
6955 bool profitable = true;
6956 while (si < li_scalar_costs.length ()
6957 && vi < li_vector_costs.length ())
6958 {
6959 unsigned sl = li_scalar_costs[si].first;
6960 unsigned vl = li_vector_costs[vi].first;
6961 if (sl != vl)
6962 {
6963 if (dump_enabled_p ())
6964 dump_printf_loc (MSG_NOTE, vect_location,
6965 "Scalar %d and vector %d loop part do not "
6966 "match up, skipping scalar part\n", sl, vl);
6967 /* Skip the scalar part, assuming zero cost on the vector side. */
6968 do
6969 {
6970 si++;
6971 }
6972 while (si < li_scalar_costs.length ()
6973 && li_scalar_costs[si].first == sl);
6974 continue;
6975 }
6976
6977 class vector_costs *scalar_target_cost_data = init_cost (bb_vinfo, true);
6978 do
6979 {
6980 add_stmt_cost (scalar_target_cost_data, li_scalar_costs[si].second);
6981 si++;
6982 }
6983 while (si < li_scalar_costs.length ()
6984 && li_scalar_costs[si].first == sl);
6985 unsigned dummy;
6986 finish_cost (scalar_target_cost_data, nullptr,
6987 &dummy, &scalar_cost, &dummy);
6988
6989 /* Complete the target-specific vector cost calculation. */
6990 class vector_costs *vect_target_cost_data = init_cost (bb_vinfo, false);
6991 do
6992 {
6993 add_stmt_cost (vect_target_cost_data, li_vector_costs[vi].second);
6994 vi++;
6995 }
6996 while (vi < li_vector_costs.length ()
6997 && li_vector_costs[vi].first == vl);
6998 finish_cost (vect_target_cost_data, scalar_target_cost_data,
6999 &vec_prologue_cost, &vec_inside_cost, &vec_epilogue_cost);
7000 delete scalar_target_cost_data;
7001 delete vect_target_cost_data;
7002
7003 vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;
7004
7005 if (dump_enabled_p ())
7006 {
7007 dump_printf_loc (MSG_NOTE, vect_location,
7008 "Cost model analysis for part in loop %d:\n", sl);
7009 dump_printf (MSG_NOTE, " Vector cost: %d\n",
7010 vec_inside_cost + vec_outside_cost);
7011 dump_printf (MSG_NOTE, " Scalar cost: %d\n", scalar_cost);
7012 }
7013
7014 /* Vectorization is profitable if its cost is no more than the cost of the
7015 scalar version. Note that we err on the vector side for equal cost because
7016 the cost estimate is otherwise quite pessimistic (constant uses are
7017 free on the scalar side but cost a load on the vector side for
7018 example). */
7019 if (vec_outside_cost + vec_inside_cost > scalar_cost)
7020 {
7021 profitable = false;
7022 break;
7023 }
7024 }
7025 if (profitable && vi < li_vector_costs.length ())
7026 {
7027 if (dump_enabled_p ())
7028 dump_printf_loc (MSG_NOTE, vect_location,
7029 "Excess vector cost for part in loop %d:\n",
7030 li_vector_costs[vi].first);
7031 profitable = false;
7032 }
7033
7034 /* Unset visited flag. This is delayed when the subgraph is profitable
7035 and we process the loop for remaining unvectorized if-converted code. */
7036 if (!orig_loop || !profitable)
7037 FOR_EACH_VEC_ELT (scalar_costs, i, cost)
7038 gimple_set_visited (cost->stmt_info->stmt, false);
7039
7040 scalar_costs.release ();
7041 vector_costs.release ();
7042
7043 return profitable;
7044 }
7045
7046 /* qsort comparator for lane defs. */
7047
7048 static int
7049 vld_cmp (const void *a_, const void *b_)
7050 {
7051 auto *a = (const std::pair<unsigned, tree> *)a_;
7052 auto *b = (const std::pair<unsigned, tree> *)b_;
7053 return a->first - b->first;
7054 }
7055
7056 /* Return true if USE_STMT is a vector lane insert into VEC and set
7057 *THIS_LANE to the lane number that is set. */
7058
7059 static bool
7060 vect_slp_is_lane_insert (gimple *use_stmt, tree vec, unsigned *this_lane)
7061 {
7062 gassign *use_ass = dyn_cast <gassign *> (use_stmt);
7063 if (!use_ass
7064 || gimple_assign_rhs_code (use_ass) != BIT_INSERT_EXPR
7065 || (vec
7066 ? gimple_assign_rhs1 (use_ass) != vec
7067 : ((vec = gimple_assign_rhs1 (use_ass)), false))
7068 || !useless_type_conversion_p (TREE_TYPE (TREE_TYPE (vec)),
7069 TREE_TYPE (gimple_assign_rhs2 (use_ass)))
7070 || !constant_multiple_p
7071 (tree_to_poly_uint64 (gimple_assign_rhs3 (use_ass)),
7072 tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (TREE_TYPE (vec)))),
7073 this_lane))
7074 return false;
7075 return true;
7076 }
7077
7078 /* Find any vectorizable constructors, lane-insert chains and reduction
7079 chains in the region and register them as SLP roots in BB_VINFO. */
7080
7081 static void
7082 vect_slp_check_for_roots (bb_vec_info bb_vinfo)
7083 {
7084 for (unsigned i = 0; i < bb_vinfo->bbs.length (); ++i)
7085 for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[i]);
7086 !gsi_end_p (gsi); gsi_next (&gsi))
7087 {
7088 gassign *assign = dyn_cast<gassign *> (gsi_stmt (gsi));
7089 if (!assign)
7090 continue;
7091
7092 tree rhs = gimple_assign_rhs1 (assign);
7093 enum tree_code code = gimple_assign_rhs_code (assign);
7094 use_operand_p use_p;
7095 gimple *use_stmt;
7096 if (code == CONSTRUCTOR)
7097 {
7098 if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
7099 || maybe_ne (TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)),
7100 CONSTRUCTOR_NELTS (rhs))
7101 || VECTOR_TYPE_P (TREE_TYPE (CONSTRUCTOR_ELT (rhs, 0)->value))
7102 || uniform_vector_p (rhs))
7103 continue;
7104
7105 unsigned j;
7106 tree val;
7107 FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), j, val)
7108 if (TREE_CODE (val) != SSA_NAME
7109 || !bb_vinfo->lookup_def (val))
7110 break;
7111 if (j != CONSTRUCTOR_NELTS (rhs))
7112 continue;
7113
7114 vec<stmt_vec_info> roots = vNULL;
7115 roots.safe_push (bb_vinfo->lookup_stmt (assign));
7116 vec<stmt_vec_info> stmts;
7117 stmts.create (CONSTRUCTOR_NELTS (rhs));
7118 FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), j, val)
7119 stmts.quick_push
7120 (vect_stmt_to_vectorize (bb_vinfo->lookup_def (val)));
7121 bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_ctor,
7122 stmts, roots));
7123 }
7124 else if (code == BIT_INSERT_EXPR
7125 && VECTOR_TYPE_P (TREE_TYPE (rhs))
7126 && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).is_constant ()
7127 && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).to_constant () > 1
7128 && integer_zerop (gimple_assign_rhs3 (assign))
7129 && useless_type_conversion_p
7130 (TREE_TYPE (TREE_TYPE (rhs)),
7131 TREE_TYPE (gimple_assign_rhs2 (assign)))
7132 && bb_vinfo->lookup_def (gimple_assign_rhs2 (assign)))
7133 {
7134 /* We start matching at an insert to lane zero, but since the
7135 inserts need not be ordered we have to search both
7136 the def and the use chains. */
7137 tree vectype = TREE_TYPE (rhs);
7138 unsigned nlanes = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
7139 auto_vec<std::pair<unsigned, tree> > lane_defs (nlanes);
7140 auto_sbitmap lanes (nlanes);
7141 bitmap_clear (lanes);
7142 bitmap_set_bit (lanes, 0);
7143 tree def = gimple_assign_lhs (assign);
7144 lane_defs.quick_push
7145 (std::make_pair (0, gimple_assign_rhs2 (assign)));
7146 unsigned lanes_found = 1;
7147 /* Start with the use chains; the last stmt will be the root. */
7148 stmt_vec_info last = bb_vinfo->lookup_stmt (assign);
7149 vec<stmt_vec_info> roots = vNULL;
7150 roots.safe_push (last);
7151 do
7152 {
7153 use_operand_p use_p;
7154 gimple *use_stmt;
7155 if (!single_imm_use (def, &use_p, &use_stmt))
7156 break;
7157 unsigned this_lane;
7158 if (!bb_vinfo->lookup_stmt (use_stmt)
7159 || !vect_slp_is_lane_insert (use_stmt, def, &this_lane)
7160 || !bb_vinfo->lookup_def (gimple_assign_rhs2 (use_stmt)))
7161 break;
7162 if (bitmap_bit_p (lanes, this_lane))
7163 break;
7164 lanes_found++;
7165 bitmap_set_bit (lanes, this_lane);
7166 gassign *use_ass = as_a <gassign *> (use_stmt);
7167 lane_defs.quick_push (std::make_pair
7168 (this_lane, gimple_assign_rhs2 (use_ass)));
7169 last = bb_vinfo->lookup_stmt (use_ass);
7170 roots.safe_push (last);
7171 def = gimple_assign_lhs (use_ass);
7172 }
7173 while (lanes_found < nlanes);
7174 if (roots.length () > 1)
7175 std::swap(roots[0], roots[roots.length () - 1]);
7176 if (lanes_found < nlanes)
7177 {
7178 /* Now search the def chain. */
7179 def = gimple_assign_rhs1 (assign);
7180 do
7181 {
7182 if (TREE_CODE (def) != SSA_NAME
7183 || !has_single_use (def))
7184 break;
7185 gimple *def_stmt = SSA_NAME_DEF_STMT (def);
7186 unsigned this_lane;
7187 if (!bb_vinfo->lookup_stmt (def_stmt)
7188 || !vect_slp_is_lane_insert (def_stmt,
7189 NULL_TREE, &this_lane)
7190 || !bb_vinfo->lookup_def (gimple_assign_rhs2 (def_stmt)))
7191 break;
7192 if (bitmap_bit_p (lanes, this_lane))
7193 break;
7194 lanes_found++;
7195 bitmap_set_bit (lanes, this_lane);
7196 lane_defs.quick_push (std::make_pair
7197 (this_lane,
7198 gimple_assign_rhs2 (def_stmt)));
7199 roots.safe_push (bb_vinfo->lookup_stmt (def_stmt));
7200 def = gimple_assign_rhs1 (def_stmt);
7201 }
7202 while (lanes_found < nlanes);
7203 }
7204 if (lanes_found == nlanes)
7205 {
7206 /* Sort lane_defs by lane index and register the root. */
7207 lane_defs.qsort (vld_cmp);
7208 vec<stmt_vec_info> stmts;
7209 stmts.create (nlanes);
7210 for (unsigned i = 0; i < nlanes; ++i)
7211 stmts.quick_push (bb_vinfo->lookup_def (lane_defs[i].second));
7212 bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_ctor,
7213 stmts, roots));
7214 }
7215 else
7216 roots.release ();
7217 }
7218 else if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
7219 && (associative_tree_code (code) || code == MINUS_EXPR)
7220 /* ??? This pessimizes a two-element reduction. PR54400.
7221 ??? In-order reduction could be handled if we only
7222 traverse one operand chain in vect_slp_linearize_chain. */
7223 && !needs_fold_left_reduction_p (TREE_TYPE (rhs), code)
7224 /* Ops with constants at the tail can be stripped here. */
7225 && TREE_CODE (rhs) == SSA_NAME
7226 && TREE_CODE (gimple_assign_rhs2 (assign)) == SSA_NAME
7227 /* Should be the chain end. */
7228 && (!single_imm_use (gimple_assign_lhs (assign),
7229 &use_p, &use_stmt)
7230 || !is_gimple_assign (use_stmt)
7231 || (gimple_assign_rhs_code (use_stmt) != code
7232 && ((code != PLUS_EXPR && code != MINUS_EXPR)
7233 || (gimple_assign_rhs_code (use_stmt)
7234 != (code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR))))))
7235 {
7236 /* We start the match at the end of a possible association
7237 chain. */
7238 auto_vec<chain_op_t> chain;
7239 auto_vec<std::pair<tree_code, gimple *> > worklist;
7240 auto_vec<gimple *> chain_stmts;
7241 gimple *code_stmt = NULL, *alt_code_stmt = NULL;
7242 if (code == MINUS_EXPR)
7243 code = PLUS_EXPR;
7244 internal_fn reduc_fn;
7245 if (!reduction_fn_for_scalar_code (code, &reduc_fn)
7246 || reduc_fn == IFN_LAST)
7247 continue;
7248 vect_slp_linearize_chain (bb_vinfo, worklist, chain, code, assign,
7249 /* ??? */
7250 code_stmt, alt_code_stmt, &chain_stmts);
7251 if (chain.length () > 1)
7252 {
7253 /* Sort the chain according to def_type and operation. */
7254 chain.sort (dt_sort_cmp, bb_vinfo);
7255 /* ??? Now we'd want to strip externals and constants
7256 but record those to be handled in the epilogue. */
7257 /* ??? For now do not allow mixing ops or externs/constants. */
7258 bool invalid = false;
7259 unsigned remain_cnt = 0;
7260 for (unsigned i = 0; i < chain.length (); ++i)
7261 {
7262 if (chain[i].code != code)
7263 {
7264 invalid = true;
7265 break;
7266 }
7267 if (chain[i].dt != vect_internal_def)
7268 remain_cnt++;
7269 }
7270 if (!invalid && chain.length () - remain_cnt > 1)
7271 {
7272 vec<stmt_vec_info> stmts;
7273 vec<tree> remain = vNULL;
7274 stmts.create (chain.length ());
7275 if (remain_cnt > 0)
7276 remain.create (remain_cnt);
7277 for (unsigned i = 0; i < chain.length (); ++i)
7278 {
7279 if (chain[i].dt == vect_internal_def)
7280 stmts.quick_push (bb_vinfo->lookup_def (chain[i].op));
7281 else
7282 remain.quick_push (chain[i].op);
7283 }
7284 vec<stmt_vec_info> roots;
7285 roots.create (chain_stmts.length ());
7286 for (unsigned i = 0; i < chain_stmts.length (); ++i)
7287 roots.quick_push (bb_vinfo->lookup_stmt (chain_stmts[i]));
7288 bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_bb_reduc,
7289 stmts, roots, remain));
7290 }
7291 }
7292 }
7293 }
7294 }
7295
7296 /* Walk the grouped store chains and replace entries with their
7297 pattern variant if any. */
7298
7299 static void
7300 vect_fixup_store_groups_with_patterns (vec_info *vinfo)
7301 {
7302 stmt_vec_info first_element;
7303 unsigned i;
7304
7305 FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
7306 {
7307 /* We also have CTORs in this array. */
7308 if (!STMT_VINFO_GROUPED_ACCESS (first_element))
7309 continue;
7310 if (STMT_VINFO_IN_PATTERN_P (first_element))
7311 {
7312 stmt_vec_info orig = first_element;
7313 first_element = STMT_VINFO_RELATED_STMT (first_element);
7314 DR_GROUP_FIRST_ELEMENT (first_element) = first_element;
7315 DR_GROUP_SIZE (first_element) = DR_GROUP_SIZE (orig);
7316 DR_GROUP_GAP (first_element) = DR_GROUP_GAP (orig);
7317 DR_GROUP_NEXT_ELEMENT (first_element) = DR_GROUP_NEXT_ELEMENT (orig);
7318 vinfo->grouped_stores[i] = first_element;
7319 }
7320 stmt_vec_info prev = first_element;
7321 while (DR_GROUP_NEXT_ELEMENT (prev))
7322 {
7323 stmt_vec_info elt = DR_GROUP_NEXT_ELEMENT (prev);
7324 if (STMT_VINFO_IN_PATTERN_P (elt))
7325 {
7326 stmt_vec_info orig = elt;
7327 elt = STMT_VINFO_RELATED_STMT (elt);
7328 DR_GROUP_NEXT_ELEMENT (prev) = elt;
7329 DR_GROUP_GAP (elt) = DR_GROUP_GAP (orig);
7330 DR_GROUP_NEXT_ELEMENT (elt) = DR_GROUP_NEXT_ELEMENT (orig);
7331 }
7332 DR_GROUP_FIRST_ELEMENT (elt) = first_element;
7333 prev = elt;
7334 }
7335 }
7336 }
7337
7338 /* Check if the region described by BB_VINFO can be vectorized, returning
7339 true if so. When returning false, set FATAL to true if the same failure
7340 would prevent vectorization at other vector sizes, false if it is still
7341 worth trying other sizes. N_STMTS is the number of statements in the
7342 region. */
7343
7344 static bool
7345 vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal,
7346 vec<int> *dataref_groups)
7347 {
7348 DUMP_VECT_SCOPE ("vect_slp_analyze_bb");
7349
7350 slp_instance instance;
7351 int i;
7352 poly_uint64 min_vf = 2;
7353
7354 /* The first group of checks is independent of the vector size. */
7355 fatal = true;
7356
7357 /* Analyze the data references. */
7358
7359 if (!vect_analyze_data_refs (bb_vinfo, &min_vf, NULL))
7360 {
7361 if (dump_enabled_p ())
7362 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7363 "not vectorized: unhandled data-ref in basic "
7364 "block.\n");
7365 return false;
7366 }
7367
7368 if (!vect_analyze_data_ref_accesses (bb_vinfo, dataref_groups))
7369 {
7370 if (dump_enabled_p ())
7371 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7372 "not vectorized: unhandled data access in "
7373 "basic block.\n");
7374 return false;
7375 }
7376
7377 vect_slp_check_for_roots (bb_vinfo);
7378
7379 /* If there are no grouped stores and no constructors in the region
7380 there is no need to continue with pattern recog as vect_analyze_slp
7381 will fail anyway. */
7382 if (bb_vinfo->grouped_stores.is_empty ()
7383 && bb_vinfo->roots.is_empty ())
7384 {
7385 if (dump_enabled_p ())
7386 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7387 "not vectorized: no grouped stores in "
7388 "basic block.\n");
7389 return false;
7390 }
7391
7392 /* The rest of the analysis below depends on the vector size in some way. */
7393 fatal = false;
7394
7395 vect_pattern_recog (bb_vinfo);
7396
7397 /* Update store groups from pattern processing. */
7398 vect_fixup_store_groups_with_patterns (bb_vinfo);
7399
7400 /* Check the SLP opportunities in the basic block, analyze and build SLP
7401 trees. */
7402 if (!vect_analyze_slp (bb_vinfo, n_stmts))
7403 {
7404 if (dump_enabled_p ())
7405 {
7406 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7407 "Failed to SLP the basic block.\n");
7408 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7409 "not vectorized: failed to find SLP opportunities "
7410 "in basic block.\n");
7411 }
7412 return false;
7413 }
7414
7415 /* Optimize permutations. */
7416 vect_optimize_slp (bb_vinfo);
7417
7418 /* Gather the loads reachable from the SLP graph entries. */
7419 vect_gather_slp_loads (bb_vinfo);
7420
7421 vect_record_base_alignments (bb_vinfo);
7422
7423 /* Analyze and verify the alignment of data references and the
7424 dependence in the SLP instances. */
7425 for (i = 0; BB_VINFO_SLP_INSTANCES (bb_vinfo).iterate (i, &instance); )
7426 {
7427 vect_location = instance->location ();
7428 if (! vect_slp_analyze_instance_alignment (bb_vinfo, instance)
7429 || ! vect_slp_analyze_instance_dependence (bb_vinfo, instance))
7430 {
7431 slp_tree node = SLP_INSTANCE_TREE (instance);
7432 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
7433 if (dump_enabled_p ())
7434 dump_printf_loc (MSG_NOTE, vect_location,
7435 "removing SLP instance operations starting from: %G",
7436 stmt_info->stmt);
7437 vect_free_slp_instance (instance);
7438 BB_VINFO_SLP_INSTANCES (bb_vinfo).ordered_remove (i);
7439 continue;
7440 }
7441
7442 /* Mark all the statements that we want to vectorize as pure SLP and
7443 relevant. */
7444 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
7445 vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));
7446 unsigned j;
7447 stmt_vec_info root;
7448 /* Likewise consider instance root stmts as vectorized. */
7449 FOR_EACH_VEC_ELT (SLP_INSTANCE_ROOT_STMTS (instance), j, root)
7450 STMT_SLP_TYPE (root) = pure_slp;
7451
7452 i++;
7453 }
7454 if (! BB_VINFO_SLP_INSTANCES (bb_vinfo).length ())
7455 return false;
7456
7457 if (!vect_slp_analyze_operations (bb_vinfo))
7458 {
7459 if (dump_enabled_p ())
7460 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7461 "not vectorized: bad operation in basic block.\n");
7462 return false;
7463 }
7464
7465 vect_bb_partition_graph (bb_vinfo);
7466
7467 return true;
7468 }
7469
7470 /* Subroutine of vect_slp_bb. Try to vectorize the statements for all
7471 basic blocks in BBS, returning true on success.
7472 The region has N_STMTS statements and has the datarefs given by DATAREFS. */
7473
7474 static bool
7475 vect_slp_region (vec<basic_block> bbs, vec<data_reference_p> datarefs,
7476 vec<int> *dataref_groups, unsigned int n_stmts,
7477 loop_p orig_loop)
7478 {
7479 bb_vec_info bb_vinfo;
7480 auto_vector_modes vector_modes;
7481
7482 /* Autodetect first vector size we try. */
7483 machine_mode next_vector_mode = VOIDmode;
7484 targetm.vectorize.autovectorize_vector_modes (&vector_modes, false);
7485 unsigned int mode_i = 0;
7486
7487 vec_info_shared shared;
7488
7489 machine_mode autodetected_vector_mode = VOIDmode;
7490 while (1)
7491 {
7492 bool vectorized = false;
7493 bool fatal = false;
7494 bb_vinfo = new _bb_vec_info (bbs, &shared);
7495
7496 bool first_time_p = shared.datarefs.is_empty ();
7497 BB_VINFO_DATAREFS (bb_vinfo) = datarefs;
7498 if (first_time_p)
7499 bb_vinfo->shared->save_datarefs ();
7500 else
7501 bb_vinfo->shared->check_datarefs ();
7502 bb_vinfo->vector_mode = next_vector_mode;
7503
7504 if (vect_slp_analyze_bb_1 (bb_vinfo, n_stmts, fatal, dataref_groups))
7505 {
7506 if (dump_enabled_p ())
7507 {
7508 dump_printf_loc (MSG_NOTE, vect_location,
7509 "***** Analysis succeeded with vector mode"
7510 " %s\n", GET_MODE_NAME (bb_vinfo->vector_mode));
7511 dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB part\n");
7512 }
7513
7514 bb_vinfo->shared->check_datarefs ();
7515
7516 auto_vec<slp_instance> profitable_subgraphs;
7517 for (slp_instance instance : BB_VINFO_SLP_INSTANCES (bb_vinfo))
7518 {
7519 if (instance->subgraph_entries.is_empty ())
7520 continue;
7521
7522 dump_user_location_t saved_vect_location = vect_location;
7523 vect_location = instance->location ();
7524 if (!unlimited_cost_model (NULL)
7525 && !vect_bb_vectorization_profitable_p
7526 (bb_vinfo, instance->subgraph_entries, orig_loop))
7527 {
7528 if (dump_enabled_p ())
7529 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7530 "not vectorized: vectorization is not "
7531 "profitable.\n");
7532 vect_location = saved_vect_location;
7533 continue;
7534 }
7535
7536 vect_location = saved_vect_location;
7537 if (!dbg_cnt (vect_slp))
7538 continue;
7539
7540 profitable_subgraphs.safe_push (instance);
7541 }
7542
7543 /* When we're vectorizing an if-converted loop body make sure
7544 we vectorized all if-converted code. */
7545 if (!profitable_subgraphs.is_empty ()
7546 && orig_loop)
7547 {
7548 gcc_assert (bb_vinfo->bbs.length () == 1);
7549 for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[0]);
7550 !gsi_end_p (gsi); gsi_next (&gsi))
7551 {
7552 /* The costing above left us with DCEable vectorized scalar
7553 stmts having the visited flag set on profitable
7554 subgraphs. Do the delayed clearing of the flag here. */
7555 if (gimple_visited_p (gsi_stmt (gsi)))
7556 {
7557 gimple_set_visited (gsi_stmt (gsi), false);
7558 continue;
7559 }
7560 if (flag_vect_cost_model == VECT_COST_MODEL_UNLIMITED)
7561 continue;
7562
7563 if (gassign *ass = dyn_cast <gassign *> (gsi_stmt (gsi)))
7564 if (gimple_assign_rhs_code (ass) == COND_EXPR)
7565 {
7566 if (!profitable_subgraphs.is_empty ()
7567 && dump_enabled_p ())
7568 dump_printf_loc (MSG_NOTE, vect_location,
7569 "not profitable because of "
7570 "unprofitable if-converted scalar "
7571 "code\n");
7572 profitable_subgraphs.truncate (0);
7573 }
7574 }
7575 }
7576
7577 /* Finally schedule the profitable subgraphs. */
7578 for (slp_instance instance : profitable_subgraphs)
7579 {
7580 if (!vectorized && dump_enabled_p ())
7581 dump_printf_loc (MSG_NOTE, vect_location,
7582 "Basic block will be vectorized "
7583 "using SLP\n");
7584 vectorized = true;
7585
7586 /* Dump before scheduling as store vectorization will remove
7587 the original stores and mess with the instance tree
7588 so querying its location will eventually ICE. */
7589 if (flag_checking)
7590 for (slp_instance sub : instance->subgraph_entries)
7591 gcc_assert (SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (sub)));
7592 unsigned HOST_WIDE_INT bytes;
7593 if (dump_enabled_p ())
7594 for (slp_instance sub : instance->subgraph_entries)
7595 {
7596 tree vtype = SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (sub));
7597 if (GET_MODE_SIZE (TYPE_MODE (vtype)).is_constant (&bytes))
7598 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS,
7599 sub->location (),
7600 "basic block part vectorized using %wu "
7601 "byte vectors\n", bytes);
7602 else
7603 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS,
7604 sub->location (),
7605 "basic block part vectorized using "
7606 "variable length vectors\n");
7607 }
7608
7609 dump_user_location_t saved_vect_location = vect_location;
7610 vect_location = instance->location ();
7611
7612 vect_schedule_slp (bb_vinfo, instance->subgraph_entries);
7613
7614 vect_location = saved_vect_location;
7615 }
7616 }
7617 else
7618 {
7619 if (dump_enabled_p ())
7620 dump_printf_loc (MSG_NOTE, vect_location,
7621 "***** Analysis failed with vector mode %s\n",
7622 GET_MODE_NAME (bb_vinfo->vector_mode));
7623 }
7624
7625 if (mode_i == 0)
7626 autodetected_vector_mode = bb_vinfo->vector_mode;
7627
7628 if (!fatal)
7629 while (mode_i < vector_modes.length ()
7630 && vect_chooses_same_modes_p (bb_vinfo, vector_modes[mode_i]))
7631 {
7632 if (dump_enabled_p ())
7633 dump_printf_loc (MSG_NOTE, vect_location,
7634 "***** The result for vector mode %s would"
7635 " be the same\n",
7636 GET_MODE_NAME (vector_modes[mode_i]));
7637 mode_i += 1;
7638 }
7639
7640 delete bb_vinfo;
7641
7642 if (mode_i < vector_modes.length ()
7643 && VECTOR_MODE_P (autodetected_vector_mode)
7644 && (related_vector_mode (vector_modes[mode_i],
7645 GET_MODE_INNER (autodetected_vector_mode))
7646 == autodetected_vector_mode)
7647 && (related_vector_mode (autodetected_vector_mode,
7648 GET_MODE_INNER (vector_modes[mode_i]))
7649 == vector_modes[mode_i]))
7650 {
7651 if (dump_enabled_p ())
7652 dump_printf_loc (MSG_NOTE, vect_location,
7653 "***** Skipping vector mode %s, which would"
7654 " repeat the analysis for %s\n",
7655 GET_MODE_NAME (vector_modes[mode_i]),
7656 GET_MODE_NAME (autodetected_vector_mode));
7657 mode_i += 1;
7658 }
7659
7660 if (vectorized
7661 || mode_i == vector_modes.length ()
7662 || autodetected_vector_mode == VOIDmode
7663 /* If vect_slp_analyze_bb_1 signaled that analysis for all
7664 vector sizes will fail do not bother iterating. */
7665 || fatal)
7666 return vectorized;
7667
7668 /* Try the next biggest vector size. */
7669 next_vector_mode = vector_modes[mode_i++];
7670 if (dump_enabled_p ())
7671 dump_printf_loc (MSG_NOTE, vect_location,
7672 "***** Re-trying analysis with vector mode %s\n",
7673 GET_MODE_NAME (next_vector_mode));
7674 }
7675 }
7676
7677
7678 /* Main entry for the BB vectorizer. Analyze and transform BBS, returning
7679 true if anything in the basic blocks was vectorized. */
7680
7681 static bool
7682 vect_slp_bbs (const vec<basic_block> &bbs, loop_p orig_loop)
7683 {
7684 vec<data_reference_p> datarefs = vNULL;
7685 auto_vec<int> dataref_groups;
7686 int insns = 0;
7687 int current_group = 0;
7688
7689 for (unsigned i = 0; i < bbs.length (); i++)
7690 {
7691 basic_block bb = bbs[i];
7692 for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (gsi);
7693 gsi_next (&gsi))
7694 {
7695 gimple *stmt = gsi_stmt (gsi);
7696 if (is_gimple_debug (stmt))
7697 continue;
7698
7699 insns++;
7700
7701 if (gimple_location (stmt) != UNKNOWN_LOCATION)
7702 vect_location = stmt;
7703
7704 if (!vect_find_stmt_data_reference (NULL, stmt, &datarefs,
7705 &dataref_groups, current_group))
7706 ++current_group;
7707 }
7708 /* New BBs always start a new DR group. */
7709 ++current_group;
7710 }
7711
7712 return vect_slp_region (bbs, datarefs, &dataref_groups, insns, orig_loop);
7713 }
7714
7715 /* Special entry for the BB vectorizer. Analyze and transform a single
7716 if-converted BB, with ORIG_LOOP's body being the not-if-converted
7717 representation. Returns true if anything in the basic-block was
7718 vectorized. */
7719
7720 bool
7721 vect_slp_if_converted_bb (basic_block bb, loop_p orig_loop)
7722 {
7723 auto_vec<basic_block> bbs;
7724 bbs.safe_push (bb);
7725 return vect_slp_bbs (bbs, orig_loop);
7726 }
7727
7728 /* Main entry for the BB vectorizer. Analyze and transform the basic
7729 blocks of FUN, returning true if anything in them was vectorized. */
7730
7731 bool
7732 vect_slp_function (function *fun)
7733 {
7734 bool r = false;
7735 int *rpo = XNEWVEC (int, n_basic_blocks_for_fn (fun));
7736 unsigned n = pre_and_rev_post_order_compute_fn (fun, NULL, rpo, false);
7737
7738 /* For the moment split the function into pieces to avoid making
7739 the iteration on the vector mode moot. Split at points we know
7740 we do not handle well, namely CFG merges (SLP discovery doesn't
7741 handle non-loop-header PHIs) and loop exits. Since pattern
7742 recog requires reverse iteration to visit uses before defs,
7743 simply chop the RPO into pieces. */
7744 auto_vec<basic_block> bbs;
7745 for (unsigned i = 0; i < n; i++)
7746 {
7747 basic_block bb = BASIC_BLOCK_FOR_FN (fun, rpo[i]);
7748 bool split = false;
7749
7750 /* Split when a BB is not dominated by the first block. */
7751 if (!bbs.is_empty ()
7752 && !dominated_by_p (CDI_DOMINATORS, bb, bbs[0]))
7753 {
7754 if (dump_enabled_p ())
7755 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7756 "splitting region at dominance boundary bb%d\n",
7757 bb->index);
7758 split = true;
7759 }
7760 /* Split when the loop determined by the first block
7761 is exited. This is because we eventually insert
7762 invariants at the region's start. */
7763 else if (!bbs.is_empty ()
7764 && bbs[0]->loop_father != bb->loop_father
7765 && !flow_loop_nested_p (bbs[0]->loop_father, bb->loop_father))
7766 {
7767 if (dump_enabled_p ())
7768 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7769 "splitting region at loop %d exit at bb%d\n",
7770 bbs[0]->loop_father->num, bb->index);
7771 split = true;
7772 }
7773
7774 if (split && !bbs.is_empty ())
7775 {
7776 r |= vect_slp_bbs (bbs, NULL);
7777 bbs.truncate (0);
7778 }
7779
7780 /* We need to be able to insert at the head of the region, which
7781 we cannot for a region starting with a returns-twice call. */
7782 if (bbs.is_empty ())
7783 if (gcall *first = safe_dyn_cast <gcall *> (first_stmt (bb)))
7784 if (gimple_call_flags (first) & ECF_RETURNS_TWICE)
7785 {
7786 if (dump_enabled_p ())
7787 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7788 "skipping bb%d as start of region as it "
7789 "starts with returns-twice call\n",
7790 bb->index);
7791 continue;
7792 }
7793
7794 bbs.safe_push (bb);
7795
7796 /* When a stmt ends this block and defines a value, inserting
7797 a vector containing its definition after it would require
7798 inserting on edges. Avoid this for now. */
7799 if (gimple *last = *gsi_last_bb (bb))
7800 if (gimple_get_lhs (last)
7801 && is_ctrl_altering_stmt (last))
7802 {
7803 if (dump_enabled_p ())
7804 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7805 "splitting region at control altering "
7806 "definition %G", last);
7807 r |= vect_slp_bbs (bbs, NULL);
7808 bbs.truncate (0);
7809 }
7810 }
7811
7812 if (!bbs.is_empty ())
7813 r |= vect_slp_bbs (bbs, NULL);
7814
7815 free (rpo);
7816
7817 return r;
7818 }
7819
7820 /* Build a variable-length vector in which the elements in ELTS are repeated
7821 to fill NRESULTS vectors of type VECTOR_TYPE. Store the vectors in
7822 RESULTS and add any new instructions to SEQ.
7823
7824 The approach we use is:
7825
7826 (1) Find a vector mode VM with integer elements of mode IM.
7827
7828 (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
7829 ELTS' has mode IM. This involves creating NELTS' VIEW_CONVERT_EXPRs
7830 from small vectors to IM.
7831
7832 (3) Duplicate each ELTS'[I] into a vector of mode VM.
7833
7834 (4) Use a tree of interleaving VEC_PERM_EXPRs to create VMs with the
7835 correct byte contents.
7836
7837 (5) Use VIEW_CONVERT_EXPR to cast the final VMs to the required type.
7838
7839 We try to find the largest IM for which this sequence works, in order
7840 to cut down on the number of interleaves. */
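/* As an illustrative sketch (assuming the target picks a 64-bit integer
element mode IM): for four 32-bit elements {a, b, c, d} this gives
NVECTORS == 2. Steps (2) and (3) view-convert {a, b} and {c, d} to
64-bit scalars X and Y and broadcast them into {X, X, ...} and
{Y, Y, ...}. A single interleaving VEC_PERM_EXPR in step (4) then
produces {X, Y, X, Y, ...}, which step (5) view-converts back to the
required type as {a, b, c, d, a, b, c, d, ...}. */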
7841
7842 void
7843 duplicate_and_interleave (vec_info *vinfo, gimple_seq *seq, tree vector_type,
7844 const vec<tree> &elts, unsigned int nresults,
7845 vec<tree> &results)
7846 {
7847 unsigned int nelts = elts.length ();
7848 tree element_type = TREE_TYPE (vector_type);
7849
7850 /* (1) Find a vector mode VM with integer elements of mode IM. */
7851 unsigned int nvectors = 1;
7852 tree new_vector_type;
7853 tree permutes[2];
7854 if (!can_duplicate_and_interleave_p (vinfo, nelts, element_type,
7855 &nvectors, &new_vector_type,
7856 permutes))
7857 gcc_unreachable ();
7858
7859 /* Get a vector type that holds ELTS[0:NELTS/NELTS']. */
7860 unsigned int partial_nelts = nelts / nvectors;
7861 tree partial_vector_type = build_vector_type (element_type, partial_nelts);
7862
7863 tree_vector_builder partial_elts;
7864 auto_vec<tree, 32> pieces (nvectors * 2);
7865 pieces.quick_grow_cleared (nvectors * 2);
7866 for (unsigned int i = 0; i < nvectors; ++i)
7867 {
7868 /* (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
7869 ELTS' has mode IM. */
7870 partial_elts.new_vector (partial_vector_type, partial_nelts, 1);
7871 for (unsigned int j = 0; j < partial_nelts; ++j)
7872 partial_elts.quick_push (elts[i * partial_nelts + j]);
7873 tree t = gimple_build_vector (seq, &partial_elts);
7874 t = gimple_build (seq, VIEW_CONVERT_EXPR,
7875 TREE_TYPE (new_vector_type), t);
7876
7877 /* (3) Duplicate each ELTS'[I] into a vector of mode VM. */
7878 pieces[i] = gimple_build_vector_from_val (seq, new_vector_type, t);
7879 }
7880
7881 /* (4) Use a tree of VEC_PERM_EXPRs to create a single VM with the
7882 correct byte contents.
7883
7884 Conceptually, we need to repeat the following operation log2(nvectors)
7885 times, where hi_start = nvectors / 2:
7886
7887 out[i * 2] = VEC_PERM_EXPR (in[i], in[i + hi_start], lo_permute);
7888 out[i * 2 + 1] = VEC_PERM_EXPR (in[i], in[i + hi_start], hi_permute);
7889
7890 However, if each input repeats every N elements and the VF is
7891 a multiple of N * 2, the HI result is the same as the LO result.
7892 This will be true for the first N1 iterations of the outer loop,
7893 followed by N2 iterations for which both the LO and HI results
7894 are needed. I.e.:
7895
7896 N1 + N2 = log2(nvectors)
7897
7898 Each "N1 iteration" doubles the number of redundant vectors and the
7899 effect of the process as a whole is to have a sequence of nvectors/2**N1
7900 vectors that repeats 2**N1 times. Rather than generate these redundant
7901 vectors, we halve the number of vectors for each N1 iteration. */
7902 unsigned int in_start = 0;
7903 unsigned int out_start = nvectors;
7904 unsigned int new_nvectors = nvectors;
7905 for (unsigned int in_repeat = 1; in_repeat < nvectors; in_repeat *= 2)
7906 {
7907 unsigned int hi_start = new_nvectors / 2;
7908 unsigned int out_i = 0;
7909 for (unsigned int in_i = 0; in_i < new_nvectors; ++in_i)
7910 {
7911 if ((in_i & 1) != 0
7912 && multiple_p (TYPE_VECTOR_SUBPARTS (new_vector_type),
7913 2 * in_repeat))
7914 continue;
7915
7916 tree output = make_ssa_name (new_vector_type);
7917 tree input1 = pieces[in_start + (in_i / 2)];
7918 tree input2 = pieces[in_start + (in_i / 2) + hi_start];
7919 gassign *stmt = gimple_build_assign (output, VEC_PERM_EXPR,
7920 input1, input2,
7921 permutes[in_i & 1]);
7922 gimple_seq_add_stmt (seq, stmt);
7923 pieces[out_start + out_i] = output;
7924 out_i += 1;
7925 }
7926 std::swap (in_start, out_start);
7927 new_nvectors = out_i;
7928 }
7929
7930 /* (5) Use VIEW_CONVERT_EXPR to cast the final VM to the required type. */
7931 results.reserve (nresults);
7932 for (unsigned int i = 0; i < nresults; ++i)
7933 if (i < new_nvectors)
7934 results.quick_push (gimple_build (seq, VIEW_CONVERT_EXPR, vector_type,
7935 pieces[in_start + i]));
7936 else
7937 results.quick_push (results[i - new_nvectors]);
7938 }
7939
7940
7941 /* For constant and loop invariant defs in OP_NODE this function creates
7942 vector defs that will be used in the vectorized stmts and stores them
7943 to SLP_TREE_VEC_DEFS of OP_NODE. */
7944
7945 static void
7946 vect_create_constant_vectors (vec_info *vinfo, slp_tree op_node)
7947 {
7948 unsigned HOST_WIDE_INT nunits;
7949 tree vec_cst;
7950 unsigned j, number_of_places_left_in_vector;
7951 tree vector_type;
7952 tree vop;
7953 int group_size = op_node->ops.length ();
7954 unsigned int vec_num, i;
7955 unsigned number_of_copies = 1;
7956 bool constant_p;
7957 gimple_seq ctor_seq = NULL;
7958 auto_vec<tree, 16> permute_results;
7959
7960 /* We always want SLP_TREE_VECTYPE (op_node) here correctly set. */
7961 vector_type = SLP_TREE_VECTYPE (op_node);
7962
7963 unsigned int number_of_vectors = SLP_TREE_NUMBER_OF_VEC_STMTS (op_node);
7964 SLP_TREE_VEC_DEFS (op_node).create (number_of_vectors);
7965 auto_vec<tree> voprnds (number_of_vectors);
7966
7967 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
7968 created vectors. It is greater than 1 if unrolling is performed.
7969
7970 For example, we have two scalar operands, s1 and s2 (e.g., group of
7971 strided accesses of size two), while NUNITS is four (i.e., four scalars
7972 of this type can be packed in a vector). The output vector will contain
7973 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
7974 will be 2).
7975
7976 If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
7977 containing the operands.
7978
7979 For example, NUNITS is four as before, and the group size is 8
7980 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
7981 {s5, s6, s7, s8}. */
7982
7983 /* When using duplicate_and_interleave, we just need one element for
7984 each scalar statement. */
7985 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
7986 nunits = group_size;
7987
7988 number_of_copies = nunits * number_of_vectors / group_size;
7989
7990 number_of_places_left_in_vector = nunits;
7991 constant_p = true;
7992 tree_vector_builder elts (vector_type, nunits, 1);
7993 elts.quick_grow (nunits);
7994 stmt_vec_info insert_after = NULL;
7995 for (j = 0; j < number_of_copies; j++)
7996 {
7997 tree op;
7998 for (i = group_size - 1; op_node->ops.iterate (i, &op); i--)
7999 {
8000 /* Create 'vect_ = {op0,op1,...,opn}'. */
8001 number_of_places_left_in_vector--;
8002 tree orig_op = op;
8003 if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op)))
8004 {
8005 if (CONSTANT_CLASS_P (op))
8006 {
8007 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
8008 {
8009 /* Can't use VIEW_CONVERT_EXPR for booleans because
8010 of possibly different sizes of scalar value and
8011 vector element. */
8012 if (integer_zerop (op))
8013 op = build_int_cst (TREE_TYPE (vector_type), 0);
8014 else if (integer_onep (op))
8015 op = build_all_ones_cst (TREE_TYPE (vector_type));
8016 else
8017 gcc_unreachable ();
8018 }
8019 else
8020 op = fold_unary (VIEW_CONVERT_EXPR,
8021 TREE_TYPE (vector_type), op);
8022 gcc_assert (op && CONSTANT_CLASS_P (op));
8023 }
8024 else
8025 {
8026 tree new_temp = make_ssa_name (TREE_TYPE (vector_type));
8027 gimple *init_stmt;
8028 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
8029 {
8030 tree true_val
8031 = build_all_ones_cst (TREE_TYPE (vector_type));
8032 tree false_val
8033 = build_zero_cst (TREE_TYPE (vector_type));
8034 gcc_assert (INTEGRAL_TYPE_P (TREE_TYPE (op)));
8035 init_stmt = gimple_build_assign (new_temp, COND_EXPR,
8036 op, true_val,
8037 false_val);
8038 }
8039 else
8040 {
8041 op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type),
8042 op);
8043 init_stmt
8044 = gimple_build_assign (new_temp, VIEW_CONVERT_EXPR,
8045 op);
8046 }
8047 gimple_seq_add_stmt (&ctor_seq, init_stmt);
8048 op = new_temp;
8049 }
8050 }
8051 elts[number_of_places_left_in_vector] = op;
8052 if (!CONSTANT_CLASS_P (op))
8053 constant_p = false;
8054 /* For BB vectorization we have to compute an insert location
8055 when a def is inside the analyzed region since we cannot
8056 simply insert at the BB start in this case. */
8057 stmt_vec_info opdef;
8058 if (TREE_CODE (orig_op) == SSA_NAME
8059 && !SSA_NAME_IS_DEFAULT_DEF (orig_op)
8060 && is_a <bb_vec_info> (vinfo)
8061 && (opdef = vinfo->lookup_def (orig_op)))
8062 {
8063 if (!insert_after)
8064 insert_after = opdef;
8065 else
8066 insert_after = get_later_stmt (insert_after, opdef);
8067 }
8068
8069 if (number_of_places_left_in_vector == 0)
8070 {
8071 if (constant_p
8072 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
8073 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
8074 vec_cst = gimple_build_vector (&ctor_seq, &elts);
8075 else
8076 {
8077 if (permute_results.is_empty ())
8078 duplicate_and_interleave (vinfo, &ctor_seq, vector_type,
8079 elts, number_of_vectors,
8080 permute_results);
8081 vec_cst = permute_results[number_of_vectors - j - 1];
8082 }
8083 if (!gimple_seq_empty_p (ctor_seq))
8084 {
8085 if (insert_after)
8086 {
8087 gimple_stmt_iterator gsi;
8088 if (gimple_code (insert_after->stmt) == GIMPLE_PHI)
8089 {
8090 gsi = gsi_after_labels (gimple_bb (insert_after->stmt));
8091 gsi_insert_seq_before (&gsi, ctor_seq,
8092 GSI_CONTINUE_LINKING);
8093 }
8094 else if (!stmt_ends_bb_p (insert_after->stmt))
8095 {
8096 gsi = gsi_for_stmt (insert_after->stmt);
8097 gsi_insert_seq_after (&gsi, ctor_seq,
8098 GSI_CONTINUE_LINKING);
8099 }
8100 else
8101 {
8102 /* When we want to insert after a def whose
8103 defining stmt throws, insert on the fallthru
8104 edge. */
8105 edge e = find_fallthru_edge
8106 (gimple_bb (insert_after->stmt)->succs);
8107 basic_block new_bb
8108 = gsi_insert_seq_on_edge_immediate (e, ctor_seq);
8109 gcc_assert (!new_bb);
8110 }
8111 }
8112 else
8113 vinfo->insert_seq_on_entry (NULL, ctor_seq);
8114 ctor_seq = NULL;
8115 }
8116 voprnds.quick_push (vec_cst);
8117 insert_after = NULL;
8118 number_of_places_left_in_vector = nunits;
8119 constant_p = true;
8120 elts.new_vector (vector_type, nunits, 1);
8121 elts.quick_grow (nunits);
8122 }
8123 }
8124 }
8125
8126 /* Since the vectors are created in reverse order, we should reverse
8127 them. */
8128 vec_num = voprnds.length ();
8129 for (j = vec_num; j != 0; j--)
8130 {
8131 vop = voprnds[j - 1];
8132 SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
8133 }
8134
8135 /* In case the VF is greater than the unrolling factor needed for the SLP
8136 group of stmts, NUMBER_OF_VECTORS to be created is greater than
8137 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
8138 to replicate the vectors. */
8139 while (number_of_vectors > SLP_TREE_VEC_DEFS (op_node).length ())
8140 for (i = 0; SLP_TREE_VEC_DEFS (op_node).iterate (i, &vop) && i < vec_num;
8141 i++)
8142 SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
8143 }
8144
8145 /* Get the Ith vectorized definition from SLP_NODE. */
8146
8147 tree
8148 vect_get_slp_vect_def (slp_tree slp_node, unsigned i)
8149 {
8150 return SLP_TREE_VEC_DEFS (slp_node)[i];
8151 }
8152
8153 /* Get the vectorized definitions of SLP_NODE in *VEC_DEFS. */
8154
8155 void
8156 vect_get_slp_defs (slp_tree slp_node, vec<tree> *vec_defs)
8157 {
8158 vec_defs->create (SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node));
8159 vec_defs->splice (SLP_TREE_VEC_DEFS (slp_node));
8160 }
8161
8162 /* Get N vectorized definitions for SLP_NODE. */
8163
8164 void
8165 vect_get_slp_defs (vec_info *,
8166 slp_tree slp_node, vec<vec<tree> > *vec_oprnds, unsigned n)
8167 {
8168 if (n == -1U)
8169 n = SLP_TREE_CHILDREN (slp_node).length ();
8170
8171 for (unsigned i = 0; i < n; ++i)
8172 {
8173 slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
8174 vec<tree> vec_defs = vNULL;
8175 vect_get_slp_defs (child, &vec_defs);
8176 vec_oprnds->quick_push (vec_defs);
8177 }
8178 }
8179
8180 /* A subroutine of vect_transform_slp_perm_load with two extra arguments:
8181 - PERM gives the permutation that the caller wants to use for NODE,
8182 which might be different from SLP_LOAD_PERMUTATION.
8183 - DUMP_P controls whether the function dumps information. */
8184
8185 static bool
8186 vect_transform_slp_perm_load_1 (vec_info *vinfo, slp_tree node,
8187 load_permutation_t &perm,
8188 const vec<tree> &dr_chain,
8189 gimple_stmt_iterator *gsi, poly_uint64 vf,
8190 bool analyze_only, bool dump_p,
8191 unsigned *n_perms, unsigned int *n_loads,
8192 bool dce_chain)
8193 {
8194 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
8195 int vec_index = 0;
8196 tree vectype = SLP_TREE_VECTYPE (node);
8197 unsigned int group_size = SLP_TREE_SCALAR_STMTS (node).length ();
8198 unsigned int mask_element;
8199 unsigned dr_group_size;
8200 machine_mode mode;
8201
8202 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
8203 dr_group_size = 1;
8204 else
8205 {
8206 stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
8207 dr_group_size = DR_GROUP_SIZE (stmt_info);
8208 }
8209
8210 mode = TYPE_MODE (vectype);
8211 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8212 unsigned int nstmts = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
8213
8214 /* Initialize the vect stmts of NODE to properly insert the generated
8215 stmts later. */
8216 if (! analyze_only)
8217 for (unsigned i = SLP_TREE_VEC_DEFS (node).length (); i < nstmts; i++)
8218 SLP_TREE_VEC_DEFS (node).quick_push (NULL_TREE);
8219
8220 /* Generate permutation masks for every NODE. Number of masks for each NODE
8221 is equal to GROUP_SIZE.
8222 E.g., we have a group of three nodes with three loads from the same
8223 location in each node, and the vector size is 4. I.e., we have an
8224 a0b0c0a1b1c1... sequence and we need to create the following vectors:
8225 for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
8226 for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
8227 ...
8228
8229 The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
8230 The last mask is illegal since we assume two operands for the permute
8231 operation, and the mask element values can't be outside that range.
8232 Hence, the last mask must be converted into {2,5,5,5}.
8233 For the first two permutations we need the first and the second input
8234 vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
8235 we need the second and the third vectors: {b1,c1,a2,b2} and
8236 {c2,a3,b3,c3}. */
8237
8238 int vect_stmts_counter = 0;
8239 unsigned int index = 0;
8240 int first_vec_index = -1;
8241 int second_vec_index = -1;
8242 bool noop_p = true;
8243 *n_perms = 0;
8244
8245 vec_perm_builder mask;
8246 unsigned int nelts_to_build;
8247 unsigned int nvectors_per_build;
8248 unsigned int in_nlanes;
8249 bool repeating_p = (group_size == dr_group_size
8250 && multiple_p (nunits, group_size));
8251 if (repeating_p)
8252 {
8253 /* A single vector contains a whole number of copies of the node, so:
8254 (a) all permutes can use the same mask; and
8255 (b) the permutes only need a single vector input. */
8256 mask.new_vector (nunits, group_size, 3);
8257 nelts_to_build = mask.encoded_nelts ();
8258 /* It's possible to obtain zero nstmts during analyze_only, so make
8259 it at least one to ensure the later computation for n_perms
8260 proceeds. */
8261 nvectors_per_build = nstmts > 0 ? nstmts : 1;
8262 in_nlanes = dr_group_size * 3;
8263 }
8264 else
8265 {
8266 /* We need to construct a separate mask for each vector statement. */
8267 unsigned HOST_WIDE_INT const_nunits, const_vf;
8268 if (!nunits.is_constant (&const_nunits)
8269 || !vf.is_constant (&const_vf))
8270 return false;
8271 mask.new_vector (const_nunits, const_nunits, 1);
8272 nelts_to_build = const_vf * group_size;
8273 nvectors_per_build = 1;
8274 in_nlanes = const_vf * dr_group_size;
8275 }
8276 auto_sbitmap used_in_lanes (in_nlanes);
8277 bitmap_clear (used_in_lanes);
8278 auto_bitmap used_defs;
8279
8280 unsigned int count = mask.encoded_nelts ();
8281 mask.quick_grow (count);
8282 vec_perm_indices indices;
8283
8284 for (unsigned int j = 0; j < nelts_to_build; j++)
8285 {
8286 unsigned int iter_num = j / group_size;
8287 unsigned int stmt_num = j % group_size;
8288 unsigned int i = (iter_num * dr_group_size + perm[stmt_num]);
8289 bitmap_set_bit (used_in_lanes, i);
8290 if (repeating_p)
8291 {
8292 first_vec_index = 0;
8293 mask_element = i;
8294 }
8295 else
8296 {
8297 /* Enforced before the loop when !repeating_p. */
8298 unsigned int const_nunits = nunits.to_constant ();
8299 vec_index = i / const_nunits;
8300 mask_element = i % const_nunits;
8301 if (vec_index == first_vec_index
8302 || first_vec_index == -1)
8303 {
8304 first_vec_index = vec_index;
8305 }
8306 else if (vec_index == second_vec_index
8307 || second_vec_index == -1)
8308 {
8309 second_vec_index = vec_index;
8310 mask_element += const_nunits;
8311 }
8312 else
8313 {
8314 if (dump_p)
8315 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8316 "permutation requires at "
8317 "least three vectors %G",
8318 stmt_info->stmt);
8319 gcc_assert (analyze_only);
8320 return false;
8321 }
8322
8323 gcc_assert (mask_element < 2 * const_nunits);
8324 }
8325
8326 if (mask_element != index)
8327 noop_p = false;
8328 mask[index++] = mask_element;
8329
8330 if (index == count)
8331 {
8332 if (!noop_p)
8333 {
8334 indices.new_vector (mask, second_vec_index == -1 ? 1 : 2, nunits);
8335 if (!can_vec_perm_const_p (mode, mode, indices))
8336 {
8337 if (dump_p)
8338 {
8339 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8340 "unsupported vect permute { ");
8341 for (i = 0; i < count; ++i)
8342 {
8343 dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
8344 dump_printf (MSG_MISSED_OPTIMIZATION, " ");
8345 }
8346 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
8347 }
8348 gcc_assert (analyze_only);
8349 return false;
8350 }
8351
8352 tree mask_vec = NULL_TREE;
8353 if (!analyze_only)
8354 mask_vec = vect_gen_perm_mask_checked (vectype, indices);
8355
8356 if (second_vec_index == -1)
8357 second_vec_index = first_vec_index;
8358
8359 for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
8360 {
8361 ++*n_perms;
8362 if (analyze_only)
8363 continue;
8364 /* Generate the permute statement if necessary. */
8365 tree first_vec = dr_chain[first_vec_index + ri];
8366 tree second_vec = dr_chain[second_vec_index + ri];
8367 gassign *stmt = as_a<gassign *> (stmt_info->stmt);
8368 tree perm_dest
8369 = vect_create_destination_var (gimple_assign_lhs (stmt),
8370 vectype);
8371 perm_dest = make_ssa_name (perm_dest);
8372 gimple *perm_stmt
8373 = gimple_build_assign (perm_dest, VEC_PERM_EXPR, first_vec,
8374 second_vec, mask_vec);
8375 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt,
8376 gsi);
8377 if (dce_chain)
8378 {
8379 bitmap_set_bit (used_defs, first_vec_index + ri);
8380 bitmap_set_bit (used_defs, second_vec_index + ri);
8381 }
8382
8383 /* Store the vector statement in NODE. */
8384 SLP_TREE_VEC_DEFS (node)[vect_stmts_counter++] = perm_dest;
8385 }
8386 }
8387 else if (!analyze_only)
8388 {
8389 for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
8390 {
8391 tree first_vec = dr_chain[first_vec_index + ri];
8392 /* If mask was NULL_TREE generate the requested
8393 identity transform. */
8394 if (dce_chain)
8395 bitmap_set_bit (used_defs, first_vec_index + ri);
8396
8397 /* Store the vector statement in NODE. */
8398 SLP_TREE_VEC_DEFS (node)[vect_stmts_counter++] = first_vec;
8399 }
8400 }
8401
8402 index = 0;
8403 first_vec_index = -1;
8404 second_vec_index = -1;
8405 noop_p = true;
8406 }
8407 }
8408
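/* Count how many input vectors are actually referenced. For the a/b/c
   example above (illustrative) only lanes 0, 3, 6 and 9 of the 12 input
   lanes are used, but they touch all three input vectors, so *N_LOADS
   becomes 3. */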
8409 if (n_loads)
8410 {
8411 if (repeating_p)
8412 *n_loads = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
8413 else
8414 {
8415 /* Enforced above when !repeating_p. */
8416 unsigned int const_nunits = nunits.to_constant ();
8417 *n_loads = 0;
8418 bool load_seen = false;
8419 for (unsigned i = 0; i < in_nlanes; ++i)
8420 {
8421 if (i % const_nunits == 0)
8422 {
8423 if (load_seen)
8424 *n_loads += 1;
8425 load_seen = false;
8426 }
8427 if (bitmap_bit_p (used_in_lanes, i))
8428 load_seen = true;
8429 }
8430 if (load_seen)
8431 *n_loads += 1;
8432 }
8433 }
8434
8435 if (dce_chain)
8436 for (unsigned i = 0; i < dr_chain.length (); ++i)
8437 if (!bitmap_bit_p (used_defs, i))
8438 {
8439 gimple *stmt = SSA_NAME_DEF_STMT (dr_chain[i]);
8440 gimple_stmt_iterator rgsi = gsi_for_stmt (stmt);
8441 gsi_remove (&rgsi, true);
8442 release_defs (stmt);
8443 }
8444
8445 return true;
8446 }
8447
8448 /* Generate vector permute statements from a list of loads in DR_CHAIN.
8449 If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
8450 permute statements for the SLP node NODE. Store the number of vector
8451 permute instructions in *N_PERMS and the number of vector load
8452 instructions in *N_LOADS. If DCE_CHAIN is true, remove all definitions
8453 that were not needed. */
8454
8455 bool
8456 vect_transform_slp_perm_load (vec_info *vinfo,
8457 slp_tree node, const vec<tree> &dr_chain,
8458 gimple_stmt_iterator *gsi, poly_uint64 vf,
8459 bool analyze_only, unsigned *n_perms,
8460 unsigned int *n_loads, bool dce_chain)
8461 {
8462 return vect_transform_slp_perm_load_1 (vinfo, node,
8463 SLP_TREE_LOAD_PERMUTATION (node),
8464 dr_chain, gsi, vf, analyze_only,
8465 dump_enabled_p (), n_perms, n_loads,
8466 dce_chain);
8467 }
8468
8469 /* Produce the next vector result for SLP permutation NODE by adding a vector
8470 statement at GSI. If MASK_VEC is nonnull, add:
8471
8472 <new SSA name> = VEC_PERM_EXPR <FIRST_DEF, SECOND_DEF, MASK_VEC>
8473
8474 otherwise add:
8475
8476 <new SSA name> = FIRST_DEF. */
8477
8478 static void
8479 vect_add_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
8480 slp_tree node, tree first_def, tree second_def,
8481 tree mask_vec, poly_uint64 identity_offset)
8482 {
8483 tree vectype = SLP_TREE_VECTYPE (node);
8484
8485 /* ??? We SLP match existing vector element extracts but
8486 allow punning which we need to re-instantiate at uses
8487 but have no good way of explicitly representing. */
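/* For example (illustrative), a V4SI def feeding a node with V4SF
   vectype has the same size but an incompatible type and is therefore
   view-converted to V4SF here before being used as a permute operand. */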
8488 if (operand_equal_p (TYPE_SIZE (TREE_TYPE (first_def)), TYPE_SIZE (vectype))
8489 && !types_compatible_p (TREE_TYPE (first_def), vectype))
8490 {
8491 gassign *conv_stmt
8492 = gimple_build_assign (make_ssa_name (vectype),
8493 build1 (VIEW_CONVERT_EXPR, vectype, first_def));
8494 vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi);
8495 first_def = gimple_assign_lhs (conv_stmt);
8496 }
8497 gassign *perm_stmt;
8498 tree perm_dest = make_ssa_name (vectype);
8499 if (mask_vec)
8500 {
8501 if (operand_equal_p (TYPE_SIZE (TREE_TYPE (second_def)),
8502 TYPE_SIZE (vectype))
8503 && !types_compatible_p (TREE_TYPE (second_def), vectype))
8504 {
8505 gassign *conv_stmt
8506 = gimple_build_assign (make_ssa_name (vectype),
8507 build1 (VIEW_CONVERT_EXPR,
8508 vectype, second_def));
8509 vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi);
8510 second_def = gimple_assign_lhs (conv_stmt);
8511 }
8512 perm_stmt = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
8513 first_def, second_def,
8514 mask_vec);
8515 }
8516 else if (!types_compatible_p (TREE_TYPE (first_def), vectype))
8517 {
8518 /* For identity permutes we still need to handle the case
8519 of offsetted extracts or concats. */
8520 unsigned HOST_WIDE_INT c;
8521 auto first_def_nunits
8522 = TYPE_VECTOR_SUBPARTS (TREE_TYPE (first_def));
8523 if (known_le (TYPE_VECTOR_SUBPARTS (vectype), first_def_nunits))
8524 {
8525 unsigned HOST_WIDE_INT elsz
8526 = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (TREE_TYPE (first_def))));
8527 tree lowpart = build3 (BIT_FIELD_REF, vectype, first_def,
8528 TYPE_SIZE (vectype),
8529 bitsize_int (identity_offset * elsz));
8530 perm_stmt = gimple_build_assign (perm_dest, lowpart);
8531 }
8532 else if (constant_multiple_p (TYPE_VECTOR_SUBPARTS (vectype),
8533 first_def_nunits, &c) && c == 2)
8534 {
8535 tree ctor = build_constructor_va (vectype, 2, NULL_TREE, first_def,
8536 NULL_TREE, second_def);
8537 perm_stmt = gimple_build_assign (perm_dest, ctor);
8538 }
8539 else
8540 gcc_unreachable ();
8541 }
8542 else
8543 {
8544 /* We need a copy here in case the def was external. */
8545 perm_stmt = gimple_build_assign (perm_dest, first_def);
8546 }
8547 vect_finish_stmt_generation (vinfo, NULL, perm_stmt, gsi);
8548 /* Store the vector statement in NODE. */
8549 node->push_vec_def (perm_stmt);
8550 }
8551
8552 /* Subroutine of vectorizable_slp_permutation. Check whether the target
8553 can perform permutation PERM on the (1 or 2) input nodes in CHILDREN.
8554 If GSI is nonnull, emit the permutation there.
8555
8556 When GSI is null, the only purpose of NODE is to give properties
8557 of the result, such as the vector type and number of SLP lanes.
8558 The node does not need to be a VEC_PERM_EXPR.
8559
8560 If the target supports the operation, return the number of individual
8561 VEC_PERM_EXPRs needed, otherwise return -1. Print information to the
8562 dump file if DUMP_P is true. */
8563
8564 static int
8565 vectorizable_slp_permutation_1 (vec_info *vinfo, gimple_stmt_iterator *gsi,
8566 slp_tree node, lane_permutation_t &perm,
8567 vec<slp_tree> &children, bool dump_p)
8568 {
8569 tree vectype = SLP_TREE_VECTYPE (node);
8570
8571 /* ??? We currently only support the case where all vector input
8572 types are the same, while the SLP IL should really do a concat +
8573 select and thus accept arbitrary mismatches. */
8574 slp_tree child;
8575 unsigned i;
8576 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8577 bool repeating_p = multiple_p (nunits, SLP_TREE_LANES (node));
8578 tree op_vectype = NULL_TREE;
8579 FOR_EACH_VEC_ELT (children, i, child)
8580 if (SLP_TREE_VECTYPE (child))
8581 {
8582 op_vectype = SLP_TREE_VECTYPE (child);
8583 break;
8584 }
8585 if (!op_vectype)
8586 op_vectype = vectype;
8587 FOR_EACH_VEC_ELT (children, i, child)
8588 {
8589 if ((SLP_TREE_DEF_TYPE (child) != vect_internal_def
8590 && !vect_maybe_update_slp_op_vectype (child, op_vectype))
8591 || !types_compatible_p (SLP_TREE_VECTYPE (child), op_vectype)
8592 || !types_compatible_p (TREE_TYPE (vectype), TREE_TYPE (op_vectype)))
8593 {
8594 if (dump_p)
8595 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8596 "Unsupported vector types in lane permutation\n");
8597 return -1;
8598 }
8599 if (SLP_TREE_LANES (child) != SLP_TREE_LANES (node))
8600 repeating_p = false;
8601 }
8602
8603 gcc_assert (perm.length () == SLP_TREE_LANES (node));
8604 if (dump_p)
8605 {
8606 dump_printf_loc (MSG_NOTE, vect_location,
8607 "vectorizing permutation");
8608 for (unsigned i = 0; i < perm.length (); ++i)
8609 dump_printf (MSG_NOTE, " op%u[%u]", perm[i].first, perm[i].second);
8610 if (repeating_p)
8611 dump_printf (MSG_NOTE, " (repeat %d)\n", SLP_TREE_LANES (node));
8612 dump_printf (MSG_NOTE, "\n");
8613 }
8614
8615 /* REPEATING_P is true if every output vector is guaranteed to use the
8616 same permute vector. We can handle that case for both variable-length
8617 and constant-length vectors, but we only handle other cases for
8618 constant-length vectors.
8619
8620 Set:
8621
8622 - NPATTERNS and NELTS_PER_PATTERN to the encoding of the permute
8623 mask vector that we want to build.
8624
8625 - NCOPIES to the number of copies of PERM that we need in order
8626 to build the necessary permute mask vectors.
8627
8628 - NOUTPUTS_PER_MASK to the number of output vectors we want to create
8629 for each permute mask vector. This is only relevant when GSI is
8630 nonnull. */
8631 uint64_t npatterns;
8632 unsigned nelts_per_pattern;
8633 uint64_t ncopies;
8634 unsigned noutputs_per_mask;
8635 if (repeating_p)
8636 {
8637 /* We need a single permute mask vector that has the form:
8638
8639 { X1, ..., Xn, X1 + n, ..., Xn + n, X1 + 2n, ..., Xn + 2n, ... }
8640
8641 In other words, the original n-element permute in PERM is
8642 "unrolled" to fill a full vector. The stepped vector encoding
8643 that we use for permutes requires 3n elements. */
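/* For example (illustrative), a two-lane swap { 1, 0 } on V4SI is
   encoded as { 1, 0, 3, 2, 5, 4 } (NPATTERNS == 2,
   NELTS_PER_PATTERN == 3), which materializes as the full-vector
   mask { 1, 0, 3, 2 }. */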
8644 npatterns = SLP_TREE_LANES (node);
8645 nelts_per_pattern = ncopies = 3;
8646 noutputs_per_mask = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
8647 }
8648 else
8649 {
8650 /* Calculate every element of every permute mask vector explicitly,
8651 instead of relying on the pattern described above. */
8652 if (!nunits.is_constant (&npatterns))
8653 return -1;
8654 nelts_per_pattern = ncopies = 1;
8655 if (loop_vec_info linfo = dyn_cast <loop_vec_info> (vinfo))
8656 if (!LOOP_VINFO_VECT_FACTOR (linfo).is_constant (&ncopies))
8657 return -1;
8658 noutputs_per_mask = 1;
8659 }
8660 unsigned olanes = ncopies * SLP_TREE_LANES (node);
8661 gcc_assert (repeating_p || multiple_p (olanes, nunits));
8662
8663 /* Compute the { { SLP operand, vector index}, lane } permutation sequence
8664 from the { SLP operand, scalar lane } permutation as recorded in the
8665 SLP node as an intermediate step. This part should already work
8666 with SLP children with an arbitrary number of lanes. */
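/* For example (illustrative), the blend
   [ { 0, 0 }, { 1, 1 }, { 0, 2 }, { 1, 3 } ]
   of two four-lane V4SI children (REPEATING_P, NCOPIES == 3) produces
   vops0[0][0] vops1[0][1] vops0[0][2] vops1[0][3] for the first group,
   with the two following groups continuing at lanes 4..7 and 8..11. */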
8667 auto_vec<std::pair<std::pair<unsigned, unsigned>, unsigned> > vperm;
8668 auto_vec<unsigned> active_lane;
8669 vperm.create (olanes);
8670 active_lane.safe_grow_cleared (children.length (), true);
8671 for (unsigned i = 0; i < ncopies; ++i)
8672 {
8673 for (unsigned pi = 0; pi < perm.length (); ++pi)
8674 {
8675 std::pair<unsigned, unsigned> p = perm[pi];
8676 tree vtype = SLP_TREE_VECTYPE (children[p.first]);
8677 if (repeating_p)
8678 vperm.quick_push ({{p.first, 0}, p.second + active_lane[p.first]});
8679 else
8680 {
8681 /* We checked above that the vectors are constant-length. */
8682 unsigned vnunits = TYPE_VECTOR_SUBPARTS (vtype).to_constant ();
8683 unsigned vi = (active_lane[p.first] + p.second) / vnunits;
8684 unsigned vl = (active_lane[p.first] + p.second) % vnunits;
8685 vperm.quick_push ({{p.first, vi}, vl});
8686 }
8687 }
8688 /* Advance to the next group. */
8689 for (unsigned j = 0; j < children.length (); ++j)
8690 active_lane[j] += SLP_TREE_LANES (children[j]);
8691 }
8692
8693 if (dump_p)
8694 {
8695 dump_printf_loc (MSG_NOTE, vect_location,
8696 "vectorizing permutation");
8697 for (unsigned i = 0; i < perm.length (); ++i)
8698 dump_printf (MSG_NOTE, " op%u[%u]", perm[i].first, perm[i].second);
8699 if (repeating_p)
8700 dump_printf (MSG_NOTE, " (repeat %d)\n", SLP_TREE_LANES (node));
8701 dump_printf (MSG_NOTE, "\n");
8702 dump_printf_loc (MSG_NOTE, vect_location, "as");
8703 for (unsigned i = 0; i < vperm.length (); ++i)
8704 {
8705 if (i != 0
8706 && (repeating_p
8707 ? multiple_p (i, npatterns)
8708 : multiple_p (i, TYPE_VECTOR_SUBPARTS (vectype))))
8709 dump_printf (MSG_NOTE, ",");
8710 dump_printf (MSG_NOTE, " vops%u[%u][%u]",
8711 vperm[i].first.first, vperm[i].first.second,
8712 vperm[i].second);
8713 }
8714 dump_printf (MSG_NOTE, "\n");
8715 }
8716
8717 /* We can only handle two-vector permutes; everything else should
8718 be lowered on the SLP level. The following is closely inspired
8719 by vect_transform_slp_perm_load and is supposed to eventually
8720 replace it.
8721 ??? As intermediate step do code-gen in the SLP tree representation
8722 somehow? */
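/* For the blend example above the first distinct { operand, vector }
   pair becomes the first permute input, the second pair the second
   input, and the materialized two-operand V4SI mask is { 0, 5, 2, 7 }
   (illustrative). */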
8723 std::pair<unsigned, unsigned> first_vec = std::make_pair (-1U, -1U);
8724 std::pair<unsigned, unsigned> second_vec = std::make_pair (-1U, -1U);
8725 unsigned int index = 0;
8726 poly_uint64 mask_element;
8727 vec_perm_builder mask;
8728 mask.new_vector (nunits, npatterns, nelts_per_pattern);
8729 unsigned int count = mask.encoded_nelts ();
8730 mask.quick_grow (count);
8731 vec_perm_indices indices;
8732 unsigned nperms = 0;
8733 for (unsigned i = 0; i < vperm.length (); ++i)
8734 {
8735 mask_element = vperm[i].second;
8736 if (first_vec.first == -1U
8737 || first_vec == vperm[i].first)
8738 first_vec = vperm[i].first;
8739 else if (second_vec.first == -1U
8740 || second_vec == vperm[i].first)
8741 {
8742 second_vec = vperm[i].first;
8743 mask_element += nunits;
8744 }
8745 else
8746 {
8747 if (dump_p)
8748 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8749 "permutation requires at "
8750 "least three vectors\n");
8751 gcc_assert (!gsi);
8752 return -1;
8753 }
8754
8755 mask[index++] = mask_element;
8756
8757 if (index == count)
8758 {
8759 indices.new_vector (mask, second_vec.first == -1U ? 1 : 2,
8760 TYPE_VECTOR_SUBPARTS (op_vectype));
8761 bool identity_p = (indices.series_p (0, 1, mask[0], 1)
8762 && constant_multiple_p (mask[0], nunits));
8763 machine_mode vmode = TYPE_MODE (vectype);
8764 machine_mode op_vmode = TYPE_MODE (op_vectype);
8765 unsigned HOST_WIDE_INT c;
8766 if ((!identity_p
8767 && !can_vec_perm_const_p (vmode, op_vmode, indices))
8768 || (identity_p
8769 && !known_le (nunits,
8770 TYPE_VECTOR_SUBPARTS (op_vectype))
8771 && (!constant_multiple_p (nunits,
8772 TYPE_VECTOR_SUBPARTS (op_vectype),
8773 &c) || c != 2)))
8774 {
8775 if (dump_p)
8776 {
8777 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
8778 vect_location,
8779 "unsupported vect permute { ");
8780 for (i = 0; i < count; ++i)
8781 {
8782 dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
8783 dump_printf (MSG_MISSED_OPTIMIZATION, " ");
8784 }
8785 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
8786 }
8787 gcc_assert (!gsi);
8788 return -1;
8789 }
8790
8791 if (!identity_p)
8792 nperms++;
8793 if (gsi)
8794 {
8795 if (second_vec.first == -1U)
8796 second_vec = first_vec;
8797
8798 slp_tree
8799 first_node = children[first_vec.first],
8800 second_node = children[second_vec.first];
8801
8802 tree mask_vec = NULL_TREE;
8803 if (!identity_p)
8804 mask_vec = vect_gen_perm_mask_checked (vectype, indices);
8805
8806 for (unsigned int vi = 0; vi < noutputs_per_mask; ++vi)
8807 {
8808 tree first_def
8809 = vect_get_slp_vect_def (first_node,
8810 first_vec.second + vi);
8811 tree second_def
8812 = vect_get_slp_vect_def (second_node,
8813 second_vec.second + vi);
8814 vect_add_slp_permutation (vinfo, gsi, node, first_def,
8815 second_def, mask_vec, mask[0]);
8816 }
8817 }
8818
8819 index = 0;
8820 first_vec = std::make_pair (-1U, -1U);
8821 second_vec = std::make_pair (-1U, -1U);
8822 }
8823 }
8824
8825 return nperms;
8826 }
8827
8828 /* Vectorize the SLP permutations in NODE as specified
8829 in SLP_TREE_LANE_PERMUTATION which is a vector of pairs of SLP
8830 child number and lane number.
8831 Interleaving of two two-lane two-child SLP subtrees (not supported):
8832 [ { 0, 0 }, { 1, 0 }, { 0, 1 }, { 1, 1 } ]
8833 A blend of two four-lane two-child SLP subtrees:
8834 [ { 0, 0 }, { 1, 1 }, { 0, 2 }, { 1, 3 } ]
8835 Highpart of a four-lane one-child SLP subtree (not supported):
8836 [ { 0, 2 }, { 0, 3 } ]
8837 Currently only a subset of these is supported by the code generation below. */
8838
8839 static bool
8840 vectorizable_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
8841 slp_tree node, stmt_vector_for_cost *cost_vec)
8842 {
8843 tree vectype = SLP_TREE_VECTYPE (node);
8844 lane_permutation_t &perm = SLP_TREE_LANE_PERMUTATION (node);
8845 int nperms = vectorizable_slp_permutation_1 (vinfo, gsi, node, perm,
8846 SLP_TREE_CHILDREN (node),
8847 dump_enabled_p ());
8848 if (nperms < 0)
8849 return false;
8850
8851 if (!gsi)
8852 record_stmt_cost (cost_vec, nperms, vec_perm, node, vectype, 0, vect_body);
8853
8854 return true;
8855 }
8856
8857 /* Vectorize SLP NODE. */
8858
8859 static void
8860 vect_schedule_slp_node (vec_info *vinfo,
8861 slp_tree node, slp_instance instance)
8862 {
8863 gimple_stmt_iterator si;
8864 int i;
8865 slp_tree child;
8866
8867 /* For existing vectors there's nothing to do. */
8868 if (SLP_TREE_DEF_TYPE (node) == vect_external_def
8869 && SLP_TREE_VEC_DEFS (node).exists ())
8870 return;
8871
8872 gcc_assert (SLP_TREE_VEC_DEFS (node).is_empty ());
8873
8874 /* Vectorize externals and constants. */
8875 if (SLP_TREE_DEF_TYPE (node) == vect_constant_def
8876 || SLP_TREE_DEF_TYPE (node) == vect_external_def)
8877 {
8878 /* ??? vectorizable_shift can end up using a scalar operand which is
8879 currently denoted as !SLP_TREE_VECTYPE. No need to vectorize the
8880 node in this case. */
8881 if (!SLP_TREE_VECTYPE (node))
8882 return;
8883
8884 vect_create_constant_vectors (vinfo, node);
8885 return;
8886 }
8887
8888 stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
8889
8890 gcc_assert (SLP_TREE_NUMBER_OF_VEC_STMTS (node) != 0);
8891 SLP_TREE_VEC_DEFS (node).create (SLP_TREE_NUMBER_OF_VEC_STMTS (node));
8892
8893 if (dump_enabled_p ())
8894 dump_printf_loc (MSG_NOTE, vect_location,
8895 "------>vectorizing SLP node starting from: %G",
8896 stmt_info->stmt);
8897
8898 if (STMT_VINFO_DATA_REF (stmt_info)
8899 && SLP_TREE_CODE (node) != VEC_PERM_EXPR)
8900 {
8901 /* Vectorized loads go before the first scalar load to make it
8902 ready early; vectorized stores go before the last scalar
8903 stmt, which is where all uses are ready. */
8904 stmt_vec_info last_stmt_info = NULL;
8905 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
8906 last_stmt_info = vect_find_first_scalar_stmt_in_slp (node);
8907 else /* DR_IS_WRITE */
8908 last_stmt_info = vect_find_last_scalar_stmt_in_slp (node);
8909 si = gsi_for_stmt (last_stmt_info->stmt);
8910 }
8911 else if ((STMT_VINFO_TYPE (stmt_info) == cycle_phi_info_type
8912 || STMT_VINFO_TYPE (stmt_info) == induc_vec_info_type
8913 || STMT_VINFO_TYPE (stmt_info) == phi_info_type)
8914 && SLP_TREE_CODE (node) != VEC_PERM_EXPR)
8915 {
8916 /* For PHI node vectorization we do not use the insertion iterator. */
8917 si = gsi_none ();
8918 }
8919 else
8920 {
8921 /* Emit other stmts after the children's vectorized defs, which is
8922 the earliest position possible. */
8923 gimple *last_stmt = NULL;
8924 bool seen_vector_def = false;
8925 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
8926 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
8927 {
8928 /* For fold-left reductions we are retaining the scalar
8929 reduction PHI but we still have SLP_TREE_NUM_VEC_STMTS
8930 set so the representation isn't perfect. Resort to the
8931 last scalar def here. */
8932 if (SLP_TREE_VEC_DEFS (child).is_empty ())
8933 {
8934 gcc_assert (STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (child))
8935 == cycle_phi_info_type);
8936 gphi *phi = as_a <gphi *>
8937 (vect_find_last_scalar_stmt_in_slp (child)->stmt);
8938 if (!last_stmt
8939 || vect_stmt_dominates_stmt_p (last_stmt, phi))
8940 last_stmt = phi;
8941 }
8942 /* We are emitting all vectorized stmts in the same place, so
8943 the last one emitted is the last.
8944 ??? Unless we have a load permutation applied and that
8945 happens to re-use an earlier generated load. */
8946 unsigned j;
8947 tree vdef;
8948 FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (child), j, vdef)
8949 {
8950 gimple *vstmt = SSA_NAME_DEF_STMT (vdef);
8951 if (!last_stmt
8952 || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
8953 last_stmt = vstmt;
8954 }
8955 }
8956 else if (!SLP_TREE_VECTYPE (child))
8957 {
8958 /* For externals that remain unvectorized we look at all their scalar defs. */
8959 unsigned j;
8960 tree def;
8961 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (child), j, def)
8962 if (TREE_CODE (def) == SSA_NAME
8963 && !SSA_NAME_IS_DEFAULT_DEF (def))
8964 {
8965 gimple *stmt = SSA_NAME_DEF_STMT (def);
8966 if (!last_stmt
8967 || vect_stmt_dominates_stmt_p (last_stmt, stmt))
8968 last_stmt = stmt;
8969 }
8970 }
8971 else
8972 {
8973 /* For externals we have to look at all defs since their
8974 insertion place is decided per vector. But beware
8975 of pre-existing vectors where we need to make sure
8976 we do not insert before the region boundary. */
8977 if (SLP_TREE_SCALAR_OPS (child).is_empty ()
8978 && !vinfo->lookup_def (SLP_TREE_VEC_DEFS (child)[0]))
8979 seen_vector_def = true;
8980 else
8981 {
8982 unsigned j;
8983 tree vdef;
8984 FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (child), j, vdef)
8985 if (TREE_CODE (vdef) == SSA_NAME
8986 && !SSA_NAME_IS_DEFAULT_DEF (vdef))
8987 {
8988 gimple *vstmt = SSA_NAME_DEF_STMT (vdef);
8989 if (!last_stmt
8990 || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
8991 last_stmt = vstmt;
8992 }
8993 }
8994 }
8995 /* This can happen when all children are pre-existing vectors or
8996 constants. */
8997 if (!last_stmt)
8998 last_stmt = vect_find_first_scalar_stmt_in_slp (node)->stmt;
8999 if (!last_stmt)
9000 {
9001 gcc_assert (seen_vector_def);
9002 si = gsi_after_labels (as_a <bb_vec_info> (vinfo)->bbs[0]);
9003 }
9004 else if (is_ctrl_altering_stmt (last_stmt))
9005 {
9006 /* We split regions to vectorize at control altering stmts
9007 with a definition so this must be an external which
9008 we can insert at the start of the region. */
9009 si = gsi_after_labels (as_a <bb_vec_info> (vinfo)->bbs[0]);
9010 }
9011 else if (is_a <bb_vec_info> (vinfo)
9012 && gimple_bb (last_stmt) != gimple_bb (stmt_info->stmt)
9013 && gimple_could_trap_p (stmt_info->stmt))
9014 {
9015 /* We've constrained possibly trapping operations to all come
9016 from the same basic-block; even if vectorized defs would allow
9017 earlier scheduling, we still force vectorized stmts into the
9018 original block. This is only necessary for BB vectorization since
9019 for loop vect all operations are in a single BB and scalar stmt
9020 based placement doesn't play well with epilogue vectorization. */
9021 gcc_assert (dominated_by_p (CDI_DOMINATORS,
9022 gimple_bb (stmt_info->stmt),
9023 gimple_bb (last_stmt)));
9024 si = gsi_after_labels (gimple_bb (stmt_info->stmt));
9025 }
9026 else if (is_a <gphi *> (last_stmt))
9027 si = gsi_after_labels (gimple_bb (last_stmt));
9028 else
9029 {
9030 si = gsi_for_stmt (last_stmt);
9031 gsi_next (&si);
9032 }
9033 }
9034
9035 /* Handle purely internal nodes. */
9036 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
9037 {
9038 /* ??? The transform kind is stored in STMT_VINFO_TYPE which might
9039 be shared with different SLP nodes (but usually it's the same
9040 operation apart from the case where the stmt is only there to denote
9041 the actual scalar lane defs ...). So do not call vect_transform_stmt
9042 but open-code it here (partly). */
9043 bool done = vectorizable_slp_permutation (vinfo, &si, node, NULL);
9044 gcc_assert (done);
9045 stmt_vec_info slp_stmt_info;
9046 unsigned int i;
9047 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, slp_stmt_info)
9048 if (STMT_VINFO_LIVE_P (slp_stmt_info))
9049 {
9050 done = vectorizable_live_operation (vinfo, slp_stmt_info, node,
9051 instance, i, true, NULL);
9052 gcc_assert (done);
9053 }
9054 }
9055 else
9056 vect_transform_stmt (vinfo, stmt_info, &si, node, instance);
9057 }
9058
9059 /* Replace scalar calls from SLP node NODE with setting of their lhs to zero.
9060 For loop vectorization this is done in vectorizable_call, but for SLP
9061 it needs to be deferred until the end of vect_schedule_slp, because multiple
9062 SLP instances may refer to the same scalar stmt. */
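/* For instance (illustrative), a scalar call "x_1 = sqrtf (a_2);" whose
   result is only used vectorized is rewritten to "x_1 = 0.0;", keeping
   the SSA definition of x_1 while dropping the call itself. */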
9063
9064 static void
9065 vect_remove_slp_scalar_calls (vec_info *vinfo,
9066 slp_tree node, hash_set<slp_tree> &visited)
9067 {
9068 gimple *new_stmt;
9069 gimple_stmt_iterator gsi;
9070 int i;
9071 slp_tree child;
9072 tree lhs;
9073 stmt_vec_info stmt_info;
9074
9075 if (!node || SLP_TREE_DEF_TYPE (node) != vect_internal_def)
9076 return;
9077
9078 if (visited.add (node))
9079 return;
9080
9081 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
9082 vect_remove_slp_scalar_calls (vinfo, child, visited);
9083
9084 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
9085 {
9086 gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt);
9087 if (!stmt || gimple_bb (stmt) == NULL)
9088 continue;
9089 if (is_pattern_stmt_p (stmt_info)
9090 || !PURE_SLP_STMT (stmt_info))
9091 continue;
9092 lhs = gimple_call_lhs (stmt);
9093 new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
9094 gsi = gsi_for_stmt (stmt);
9095 vinfo->replace_stmt (&gsi, stmt_info, new_stmt);
9096 SSA_NAME_DEF_STMT (gimple_assign_lhs (new_stmt)) = new_stmt;
9097 }
9098 }
9099
9100 static void
9101 vect_remove_slp_scalar_calls (vec_info *vinfo, slp_tree node)
9102 {
9103 hash_set<slp_tree> visited;
9104 vect_remove_slp_scalar_calls (vinfo, node, visited);
9105 }
9106
9107 /* Vectorize the instance root. */
9108
9109 void
9110 vectorize_slp_instance_root_stmt (slp_tree node, slp_instance instance)
9111 {
9112 gassign *rstmt = NULL;
9113
9114 if (instance->kind == slp_inst_kind_ctor)
9115 {
9116 if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) == 1)
9117 {
9118 tree vect_lhs = SLP_TREE_VEC_DEFS (node)[0];
9119 tree root_lhs = gimple_get_lhs (instance->root_stmts[0]->stmt);
9120 if (!useless_type_conversion_p (TREE_TYPE (root_lhs),
9121 TREE_TYPE (vect_lhs)))
9122 vect_lhs = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (root_lhs),
9123 vect_lhs);
9124 rstmt = gimple_build_assign (root_lhs, vect_lhs);
9125 }
9126 else if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) > 1)
9127 {
9128 int nelts = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
9129 tree child_def;
9130 int j;
9131 vec<constructor_elt, va_gc> *v;
9132 vec_alloc (v, nelts);
9133
9134 /* A CTOR can handle V16HI composition from VNx8HI so we
9135 do not need to convert vector elements if the types
9136 do not match. */
9137 FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (node), j, child_def)
9138 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, child_def);
9139 tree lhs = gimple_get_lhs (instance->root_stmts[0]->stmt);
9140 tree rtype
9141 = TREE_TYPE (gimple_assign_rhs1 (instance->root_stmts[0]->stmt));
9142 tree r_constructor = build_constructor (rtype, v);
9143 rstmt = gimple_build_assign (lhs, r_constructor);
9144 }
9145 }
9146 else if (instance->kind == slp_inst_kind_bb_reduc)
9147 {
9148 /* Largely inspired by reduction chain epilogue handling in
9149 vect_create_epilog_for_reduction. */
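/* Roughly (illustrative), for a signed integer PLUS reduction of two
   vector defs v0 and v1 this emits
   u0 = VIEW_CONVERT <unsigned vectype> (v0);
   u1 = VIEW_CONVERT <unsigned vectype> (v1);
   s_1 = u0 + u1;
   s_2 = .REDUC_PLUS (s_1);
   result = (signed element type) s_2;
   punning to unsigned to avoid introducing undefined overflow. */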
9150 vec<tree> vec_defs = vNULL;
9151 vect_get_slp_defs (node, &vec_defs);
9152 enum tree_code reduc_code
9153 = gimple_assign_rhs_code (instance->root_stmts[0]->stmt);
9154 /* ??? We actually have to reflect signs somewhere. */
9155 if (reduc_code == MINUS_EXPR)
9156 reduc_code = PLUS_EXPR;
9157 gimple_seq epilogue = NULL;
9158 /* We may end up with more than one vector result; reduce them
9159 to one vector. */
9160 tree vec_def = vec_defs[0];
9161 tree vectype = TREE_TYPE (vec_def);
9162 tree compute_vectype = vectype;
9163 bool pun_for_overflow_p = (ANY_INTEGRAL_TYPE_P (vectype)
9164 && TYPE_OVERFLOW_UNDEFINED (vectype)
9165 && operation_can_overflow (reduc_code));
9166 if (pun_for_overflow_p)
9167 {
9168 compute_vectype = unsigned_type_for (vectype);
9169 vec_def = gimple_build (&epilogue, VIEW_CONVERT_EXPR,
9170 compute_vectype, vec_def);
9171 }
9172 for (unsigned i = 1; i < vec_defs.length (); ++i)
9173 {
9174 tree def = vec_defs[i];
9175 if (pun_for_overflow_p)
9176 def = gimple_build (&epilogue, VIEW_CONVERT_EXPR,
9177 compute_vectype, def);
9178 vec_def = gimple_build (&epilogue, reduc_code, compute_vectype,
9179 vec_def, def);
9180 }
9181 vec_defs.release ();
9182 /* ??? Support other schemes than direct internal fn. */
9183 internal_fn reduc_fn;
9184 if (!reduction_fn_for_scalar_code (reduc_code, &reduc_fn)
9185 || reduc_fn == IFN_LAST)
9186 gcc_unreachable ();
9187 tree scalar_def = gimple_build (&epilogue, as_combined_fn (reduc_fn),
9188 TREE_TYPE (compute_vectype), vec_def);
9189 if (!SLP_INSTANCE_REMAIN_DEFS (instance).is_empty ())
9190 {
9191 tree rem_def = NULL_TREE;
9192 for (auto def : SLP_INSTANCE_REMAIN_DEFS (instance))
9193 {
9194 def = gimple_convert (&epilogue, TREE_TYPE (scalar_def), def);
9195 if (!rem_def)
9196 rem_def = def;
9197 else
9198 rem_def = gimple_build (&epilogue, reduc_code,
9199 TREE_TYPE (scalar_def),
9200 rem_def, def);
9201 }
9202 scalar_def = gimple_build (&epilogue, reduc_code,
9203 TREE_TYPE (scalar_def),
9204 scalar_def, rem_def);
9205 }
9206 scalar_def = gimple_convert (&epilogue,
9207 TREE_TYPE (vectype), scalar_def);
9208 gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmts[0]->stmt);
9209 gsi_insert_seq_before (&rgsi, epilogue, GSI_SAME_STMT);
9210 gimple_assign_set_rhs_from_tree (&rgsi, scalar_def);
9211 update_stmt (gsi_stmt (rgsi));
9212 return;
9213 }
9214 else
9215 gcc_unreachable ();
9216
9217 gcc_assert (rstmt);
9218
9219 gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmts[0]->stmt);
9220 gsi_replace (&rgsi, rstmt, true);
9221 }
9222
9223 struct slp_scc_info
9224 {
9225 bool on_stack;
9226 int dfs;
9227 int lowlink;
9228 };
9229
9230 /* Schedule the SLP INSTANCE doing a DFS walk and collecting SCCs. */
9231
9232 static void
9233 vect_schedule_scc (vec_info *vinfo, slp_tree node, slp_instance instance,
9234 hash_map<slp_tree, slp_scc_info> &scc_info,
9235 int &maxdfs, vec<slp_tree> &stack)
9236 {
9237 bool existed_p;
9238 slp_scc_info *info = &scc_info.get_or_insert (node, &existed_p);
9239 gcc_assert (!existed_p);
9240 info->dfs = maxdfs;
9241 info->lowlink = maxdfs;
9242 maxdfs++;
9243
9244 /* Leaf. */
9245 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
9246 {
9247 info->on_stack = false;
9248 vect_schedule_slp_node (vinfo, node, instance);
9249 return;
9250 }
9251
9252 info->on_stack = true;
9253 stack.safe_push (node);
9254
9255 unsigned i;
9256 slp_tree child;
9257 /* DFS recurse. */
9258 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
9259 {
9260 if (!child)
9261 continue;
9262 slp_scc_info *child_info = scc_info.get (child);
9263 if (!child_info)
9264 {
9265 vect_schedule_scc (vinfo, child, instance, scc_info, maxdfs, stack);
9266 /* Recursion might have re-allocated the hash map entries, invalidating INFO. */
9267 info = scc_info.get (node);
9268 child_info = scc_info.get (child);
9269 info->lowlink = MIN (info->lowlink, child_info->lowlink);
9270 }
9271 else if (child_info->on_stack)
9272 info->lowlink = MIN (info->lowlink, child_info->dfs);
9273 }
9274 if (info->lowlink != info->dfs)
9275 return;
9276
9277 auto_vec<slp_tree, 4> phis_to_fixup;
9278
9279 /* Singleton. */
9280 if (stack.last () == node)
9281 {
9282 stack.pop ();
9283 info->on_stack = false;
9284 vect_schedule_slp_node (vinfo, node, instance);
9285 if (SLP_TREE_CODE (node) != VEC_PERM_EXPR
9286 && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (node)->stmt))
9287 phis_to_fixup.quick_push (node);
9288 }
9289 else
9290 {
9291 /* SCC. */
9292 int last_idx = stack.length () - 1;
9293 while (stack[last_idx] != node)
9294 last_idx--;
9295 /* We can break the cycle at PHIs which have at least one child
9296 already code generated. Then we could re-start the DFS walk until
9297 all nodes in the SCC are covered (we might have new entries
9298 for only back-reachable nodes). But it's simpler to just
9299 iterate and schedule those that are ready. */
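/* For example (illustrative), in a reduction cycle PHI <-> add the PHI
   is ready first because its preheader child is already scheduled and
   thus off the stack; once the PHI is code generated and popped the add
   becomes ready, and the PHI's backedge argument is filled in by the
   fixup below. */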
9300 unsigned todo = stack.length () - last_idx;
9301 do
9302 {
9303 for (int idx = stack.length () - 1; idx >= last_idx; --idx)
9304 {
9305 slp_tree entry = stack[idx];
9306 if (!entry)
9307 continue;
9308 bool phi = (SLP_TREE_CODE (entry) != VEC_PERM_EXPR
9309 && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (entry)->stmt));
9310 bool ready = !phi;
9311 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (entry), i, child)
9312 if (!child)
9313 {
9314 gcc_assert (phi);
9315 ready = true;
9316 break;
9317 }
9318 else if (scc_info.get (child)->on_stack)
9319 {
9320 if (!phi)
9321 {
9322 ready = false;
9323 break;
9324 }
9325 }
9326 else
9327 {
9328 if (phi)
9329 {
9330 ready = true;
9331 break;
9332 }
9333 }
9334 if (ready)
9335 {
9336 vect_schedule_slp_node (vinfo, entry, instance);
9337 scc_info.get (entry)->on_stack = false;
9338 stack[idx] = NULL;
9339 todo--;
9340 if (phi)
9341 phis_to_fixup.safe_push (entry);
9342 }
9343 }
9344 }
9345 while (todo != 0);
9346
9347 /* Pop the SCC. */
9348 stack.truncate (last_idx);
9349 }
9350
9351 /* Now fixup the backedge def of the vectorized PHIs in this SCC. */
9352 slp_tree phi_node;
9353 FOR_EACH_VEC_ELT (phis_to_fixup, i, phi_node)
9354 {
9355 gphi *phi = as_a <gphi *> (SLP_TREE_REPRESENTATIVE (phi_node)->stmt);
9356 edge_iterator ei;
9357 edge e;
9358 FOR_EACH_EDGE (e, ei, gimple_bb (phi)->preds)
9359 {
9360 unsigned dest_idx = e->dest_idx;
9361 child = SLP_TREE_CHILDREN (phi_node)[dest_idx];
9362 if (!child || SLP_TREE_DEF_TYPE (child) != vect_internal_def)
9363 continue;
9364 unsigned n = SLP_TREE_VEC_DEFS (phi_node).length ();
9365 /* Simply fill all args. */
9366 if (STMT_VINFO_DEF_TYPE (SLP_TREE_REPRESENTATIVE (phi_node))
9367 != vect_first_order_recurrence)
9368 for (unsigned i = 0; i < n; ++i)
9369 {
9370 tree phidef = SLP_TREE_VEC_DEFS (phi_node)[i];
9371 gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (phidef));
9372 add_phi_arg (phi, vect_get_slp_vect_def (child, i),
9373 e, gimple_phi_arg_location (phi, dest_idx));
9374 }
9375 else
9376 {
9377 /* Unless it is a first-order recurrence, which needs
9378 args filled in for both the PHI node and the permutes. */
9379 gimple *perm
9380 = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (phi_node)[0]);
9381 gimple *rphi = SSA_NAME_DEF_STMT (gimple_assign_rhs1 (perm));
9382 add_phi_arg (as_a <gphi *> (rphi),
9383 vect_get_slp_vect_def (child, n - 1),
9384 e, gimple_phi_arg_location (phi, dest_idx));
9385 for (unsigned i = 0; i < n; ++i)
9386 {
9387 gimple *perm
9388 = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (phi_node)[i]);
9389 if (i > 0)
9390 gimple_assign_set_rhs1 (perm,
9391 vect_get_slp_vect_def (child, i - 1));
9392 gimple_assign_set_rhs2 (perm,
9393 vect_get_slp_vect_def (child, i));
9394 update_stmt (perm);
9395 }
9396 }
9397 }
9398 }
9399 }
9400
9401 /* Generate vector code for SLP_INSTANCES in the loop/basic block. */
9402
9403 void
9404 vect_schedule_slp (vec_info *vinfo, const vec<slp_instance> &slp_instances)
9405 {
9406 slp_instance instance;
9407 unsigned int i;
9408
9409 hash_map<slp_tree, slp_scc_info> scc_info;
9410 int maxdfs = 0;
9411 FOR_EACH_VEC_ELT (slp_instances, i, instance)
9412 {
9413 slp_tree node = SLP_INSTANCE_TREE (instance);
9414 if (dump_enabled_p ())
9415 {
9416 dump_printf_loc (MSG_NOTE, vect_location,
9417 "Vectorizing SLP tree:\n");
9418 /* ??? Dump all? */
9419 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
9420 dump_printf_loc (MSG_NOTE, vect_location, "Root stmt: %G",
9421 SLP_INSTANCE_ROOT_STMTS (instance)[0]->stmt);
9422 vect_print_slp_graph (MSG_NOTE, vect_location,
9423 SLP_INSTANCE_TREE (instance));
9424 }
9425 /* Schedule the tree of INSTANCE, scheduling SCCs in a way that
9426 makes a PHI the node breaking the cycle. */
9427 auto_vec<slp_tree> stack;
9428 if (!scc_info.get (node))
9429 vect_schedule_scc (vinfo, node, instance, scc_info, maxdfs, stack);
9430
9431 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
9432 vectorize_slp_instance_root_stmt (node, instance);
9433
9434 if (dump_enabled_p ())
9435 dump_printf_loc (MSG_NOTE, vect_location,
9436 "vectorizing stmts using SLP.\n");
9437 }
9438
9439 FOR_EACH_VEC_ELT (slp_instances, i, instance)
9440 {
9441 slp_tree root = SLP_INSTANCE_TREE (instance);
9442 stmt_vec_info store_info;
9443 unsigned int j;
9444
9445 /* Remove scalar call stmts. Do not do this for basic-block
9446 vectorization as not all uses may be vectorized.
9447 ??? Why should this be necessary? DCE should be able to
9448 remove the stmts itself.
9449 ??? For BB vectorization we can as well remove scalar
9450 stmts starting from the SLP tree root if they have no
9451 uses. */
9452 if (is_a <loop_vec_info> (vinfo))
9453 vect_remove_slp_scalar_calls (vinfo, root);
9454
9455 /* Remove the original scalar stmts of vectorized stores. */
9456 for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store_info); j++)
9457 {
9458 if (!STMT_VINFO_DATA_REF (store_info)
9459 || !DR_IS_WRITE (STMT_VINFO_DATA_REF (store_info)))
9460 break;
9461
9462 store_info = vect_orig_stmt (store_info);
9463 /* Free the attached stmt_vec_info and remove the stmt. */
9464 vinfo->remove_stmt (store_info);
9465
9466 /* Invalidate SLP_TREE_REPRESENTATIVE in case we released it
9467 so that we do not crash in vect_free_slp_tree later. */
9468 if (SLP_TREE_REPRESENTATIVE (root) == store_info)
9469 SLP_TREE_REPRESENTATIVE (root) = NULL;
9470 }
9471 }
9472 }