gcc.gnu.org Git - gcc.git/blob - gcc/tree-vect-slp.c
tree-optimization/105437 - BB vect with extern defs of throwing stmts
1 /* SLP - Basic Block Vectorization
2 Copyright (C) 2007-2021 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "tree-pass.h"
31 #include "ssa.h"
32 #include "optabs-tree.h"
33 #include "insn-config.h"
34 #include "recog.h" /* FIXME: for insn_data */
35 #include "fold-const.h"
36 #include "stor-layout.h"
37 #include "gimple-iterator.h"
38 #include "cfgloop.h"
39 #include "tree-vectorizer.h"
40 #include "langhooks.h"
41 #include "gimple-walk.h"
42 #include "dbgcnt.h"
43 #include "tree-vector-builder.h"
44 #include "vec-perm-indices.h"
45 #include "gimple-fold.h"
46 #include "internal-fn.h"
47 #include "dump-context.h"
48 #include "cfganal.h"
49 #include "tree-eh.h"
50 #include "tree-cfg.h"
51 #include "alloc-pool.h"
52
53 static bool vectorizable_slp_permutation (vec_info *, gimple_stmt_iterator *,
54 slp_tree, stmt_vector_for_cost *);
55
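/* All SLP nodes are allocated from the object_allocator pool below and,
   in addition, chained into a list headed by slp_first_node (see the
   _slp_tree constructor and destructor).  vect_slp_fini walks that list
   so any nodes still live when vectorization finishes are reclaimed
   before the pool itself is deleted.  */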
56 static object_allocator<_slp_tree> *slp_tree_pool;
57 static slp_tree slp_first_node;
58
59 void
60 vect_slp_init (void)
61 {
62 slp_tree_pool = new object_allocator<_slp_tree> ("SLP nodes");
63 }
64
65 void
66 vect_slp_fini (void)
67 {
68 while (slp_first_node)
69 delete slp_first_node;
70 delete slp_tree_pool;
71 slp_tree_pool = NULL;
72 }
73
74 void *
75 _slp_tree::operator new (size_t n)
76 {
77 gcc_assert (n == sizeof (_slp_tree));
78 return slp_tree_pool->allocate_raw ();
79 }
80
81 void
82 _slp_tree::operator delete (void *node, size_t n)
83 {
84 gcc_assert (n == sizeof (_slp_tree));
85 slp_tree_pool->remove_raw (node);
86 }
87
88
89 /* Initialize a SLP node. */
90
91 _slp_tree::_slp_tree ()
92 {
93 this->prev_node = NULL;
94 if (slp_first_node)
95 slp_first_node->prev_node = this;
96 this->next_node = slp_first_node;
97 slp_first_node = this;
98 SLP_TREE_SCALAR_STMTS (this) = vNULL;
99 SLP_TREE_SCALAR_OPS (this) = vNULL;
100 SLP_TREE_VEC_STMTS (this) = vNULL;
101 SLP_TREE_VEC_DEFS (this) = vNULL;
102 SLP_TREE_NUMBER_OF_VEC_STMTS (this) = 0;
103 SLP_TREE_CHILDREN (this) = vNULL;
104 SLP_TREE_LOAD_PERMUTATION (this) = vNULL;
105 SLP_TREE_LANE_PERMUTATION (this) = vNULL;
106 SLP_TREE_DEF_TYPE (this) = vect_uninitialized_def;
107 SLP_TREE_CODE (this) = ERROR_MARK;
108 SLP_TREE_VECTYPE (this) = NULL_TREE;
109 SLP_TREE_REPRESENTATIVE (this) = NULL;
110 SLP_TREE_REF_COUNT (this) = 1;
111 this->max_nunits = 1;
112 this->lanes = 0;
113 }
114
115 /* Tear down a SLP node. */
116
117 _slp_tree::~_slp_tree ()
118 {
119 if (this->prev_node)
120 this->prev_node->next_node = this->next_node;
121 else
122 slp_first_node = this->next_node;
123 if (this->next_node)
124 this->next_node->prev_node = this->prev_node;
125 SLP_TREE_CHILDREN (this).release ();
126 SLP_TREE_SCALAR_STMTS (this).release ();
127 SLP_TREE_SCALAR_OPS (this).release ();
128 SLP_TREE_VEC_STMTS (this).release ();
129 SLP_TREE_VEC_DEFS (this).release ();
130 SLP_TREE_LOAD_PERMUTATION (this).release ();
131 SLP_TREE_LANE_PERMUTATION (this).release ();
132 }
133
134 /* Recursively free the memory allocated for the SLP tree rooted at NODE. */
135
136 void
137 vect_free_slp_tree (slp_tree node)
138 {
139 int i;
140 slp_tree child;
141
142 if (--SLP_TREE_REF_COUNT (node) != 0)
143 return;
144
145 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
146 if (child)
147 vect_free_slp_tree (child);
148
149 /* If the node defines any SLP only patterns then those patterns are no
150 longer valid and should be removed. */
151 stmt_vec_info rep_stmt_info = SLP_TREE_REPRESENTATIVE (node);
152 if (rep_stmt_info && STMT_VINFO_SLP_VECT_ONLY_PATTERN (rep_stmt_info))
153 {
154 stmt_vec_info stmt_info = vect_orig_stmt (rep_stmt_info);
155 STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
156 STMT_SLP_TYPE (stmt_info) = STMT_SLP_TYPE (rep_stmt_info);
157 }
158
159 delete node;
160 }
161
162 /* Return a location suitable for dumps related to the SLP instance. */
163
164 dump_user_location_t
165 _slp_instance::location () const
166 {
167 if (root_stmt)
168 return root_stmt->stmt;
169 else
170 return SLP_TREE_SCALAR_STMTS (root)[0]->stmt;
171 }
172
173
174 /* Free the memory allocated for the SLP instance. */
175
176 void
177 vect_free_slp_instance (slp_instance instance)
178 {
179 vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
180 SLP_INSTANCE_LOADS (instance).release ();
181 instance->subgraph_entries.release ();
182 instance->cost_vec.release ();
183 free (instance);
184 }
185
186
187 /* Create an SLP node with operation code CODE and room for NOPS children. */
188
189 slp_tree
190 vect_create_new_slp_node (unsigned nops, tree_code code)
191 {
192 slp_tree node = new _slp_tree;
193 SLP_TREE_SCALAR_STMTS (node) = vNULL;
194 SLP_TREE_CHILDREN (node).create (nops);
195 SLP_TREE_DEF_TYPE (node) = vect_internal_def;
196 SLP_TREE_CODE (node) = code;
197 return node;
198 }
199 /* Create an SLP node for SCALAR_STMTS. */
200
201 static slp_tree
202 vect_create_new_slp_node (slp_tree node,
203 vec<stmt_vec_info> scalar_stmts, unsigned nops)
204 {
205 SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
206 SLP_TREE_CHILDREN (node).create (nops);
207 SLP_TREE_DEF_TYPE (node) = vect_internal_def;
208 SLP_TREE_REPRESENTATIVE (node) = scalar_stmts[0];
209 SLP_TREE_LANES (node) = scalar_stmts.length ();
210 return node;
211 }
212
213 /* Create an SLP node for SCALAR_STMTS. */
214
215 static slp_tree
216 vect_create_new_slp_node (vec<stmt_vec_info> scalar_stmts, unsigned nops)
217 {
218 return vect_create_new_slp_node (new _slp_tree, scalar_stmts, nops);
219 }
220
221 /* Create an SLP node for OPS. */
222
223 static slp_tree
224 vect_create_new_slp_node (slp_tree node, vec<tree> ops)
225 {
226 SLP_TREE_SCALAR_OPS (node) = ops;
227 SLP_TREE_DEF_TYPE (node) = vect_external_def;
228 SLP_TREE_LANES (node) = ops.length ();
229 return node;
230 }
231
232 /* Create an SLP node for OPS. */
233
234 static slp_tree
235 vect_create_new_slp_node (vec<tree> ops)
236 {
237 return vect_create_new_slp_node (new _slp_tree, ops);
238 }
239
240
241 /* This structure is used in creation of an SLP tree. Each instance
242 corresponds to the same operand in a group of scalar stmts in an SLP
243 node. */
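/* For example, for the two-lane group
     a0 = b0 + c0;
     a1 = b1 + c1;
   discovery builds two slp_oprnd_info instances, one collecting the defs
   of the first operand, {b0, b1}, and one collecting the defs of the
   second operand, {c0, c1}.  */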
244 typedef struct _slp_oprnd_info
245 {
246 /* Def-stmts for the operands. */
247 vec<stmt_vec_info> def_stmts;
248 /* Operands. */
249 vec<tree> ops;
250 /* Information about the first statement, its vector def-type, type, the
251 operand itself in case it's constant, and an indication if it's a pattern
252 stmt. */
253 tree first_op_type;
254 enum vect_def_type first_dt;
255 bool any_pattern;
256 } *slp_oprnd_info;
257
258
259 /* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
260 operand. */
261 static vec<slp_oprnd_info>
262 vect_create_oprnd_info (int nops, int group_size)
263 {
264 int i;
265 slp_oprnd_info oprnd_info;
266 vec<slp_oprnd_info> oprnds_info;
267
268 oprnds_info.create (nops);
269 for (i = 0; i < nops; i++)
270 {
271 oprnd_info = XNEW (struct _slp_oprnd_info);
272 oprnd_info->def_stmts.create (group_size);
273 oprnd_info->ops.create (group_size);
274 oprnd_info->first_dt = vect_uninitialized_def;
275 oprnd_info->first_op_type = NULL_TREE;
276 oprnd_info->any_pattern = false;
277 oprnds_info.quick_push (oprnd_info);
278 }
279
280 return oprnds_info;
281 }
282
283
284 /* Free operands info. */
285
286 static void
287 vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
288 {
289 int i;
290 slp_oprnd_info oprnd_info;
291
292 FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
293 {
294 oprnd_info->def_stmts.release ();
295 oprnd_info->ops.release ();
296 XDELETE (oprnd_info);
297 }
298
299 oprnds_info.release ();
300 }
301
302
303 /* Return true if STMTS contains a pattern statement. */
304
305 static bool
306 vect_contains_pattern_stmt_p (vec<stmt_vec_info> stmts)
307 {
308 stmt_vec_info stmt_info;
309 unsigned int i;
310 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
311 if (is_pattern_stmt_p (stmt_info))
312 return true;
313 return false;
314 }
315
316 /* Return true when all lanes in the external or constant NODE have
317 the same value. */
318
319 static bool
320 vect_slp_tree_uniform_p (slp_tree node)
321 {
322 gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_constant_def
323 || SLP_TREE_DEF_TYPE (node) == vect_external_def);
324
325 /* Pre-existing vectors. */
326 if (SLP_TREE_SCALAR_OPS (node).is_empty ())
327 return false;
328
329 unsigned i;
330 tree op, first = NULL_TREE;
331 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
332 if (!first)
333 first = op;
334 else if (!operand_equal_p (first, op, 0))
335 return false;
336
337 return true;
338 }
339
340 /* Find the place of the data-ref in STMT_INFO in the interleaving chain
341 that starts from FIRST_STMT_INFO. Return -1 if the data-ref is not a part
342 of the chain. */
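/* For a chain of consecutive loads from a[0], a[1], a[2] and a[3] the
   places returned are 0, 1, 2 and 3; a gap in the chain shows up in
   DR_GROUP_GAP of the member following it and increases that member's
   place accordingly.  */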
343
344 int
345 vect_get_place_in_interleaving_chain (stmt_vec_info stmt_info,
346 stmt_vec_info first_stmt_info)
347 {
348 stmt_vec_info next_stmt_info = first_stmt_info;
349 int result = 0;
350
351 if (first_stmt_info != DR_GROUP_FIRST_ELEMENT (stmt_info))
352 return -1;
353
354 do
355 {
356 if (next_stmt_info == stmt_info)
357 return result;
358 next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
359 if (next_stmt_info)
360 result += DR_GROUP_GAP (next_stmt_info);
361 }
362 while (next_stmt_info);
363
364 return -1;
365 }
366
367 /* Check whether it is possible to load COUNT elements of type ELT_TYPE
368 using the method implemented by duplicate_and_interleave. Return true
369 if so, returning the number of intermediate vectors in *NVECTORS_OUT
370 (if nonnull) and the type of each intermediate vector in *VECTOR_TYPE_OUT
371 (if nonnull). */
372
373 bool
374 can_duplicate_and_interleave_p (vec_info *vinfo, unsigned int count,
375 tree elt_type, unsigned int *nvectors_out,
376 tree *vector_type_out,
377 tree *permutes)
378 {
379 tree base_vector_type = get_vectype_for_scalar_type (vinfo, elt_type, count);
380 if (!base_vector_type || !VECTOR_MODE_P (TYPE_MODE (base_vector_type)))
381 return false;
382
383 machine_mode base_vector_mode = TYPE_MODE (base_vector_type);
384 poly_int64 elt_bytes = count * GET_MODE_UNIT_SIZE (base_vector_mode);
385 unsigned int nvectors = 1;
386 for (;;)
387 {
388 scalar_int_mode int_mode;
389 poly_int64 elt_bits = elt_bytes * BITS_PER_UNIT;
390 if (int_mode_for_size (elt_bits, 1).exists (&int_mode))
391 {
392 /* Get the natural vector type for this SLP group size. */
393 tree int_type = build_nonstandard_integer_type
394 (GET_MODE_BITSIZE (int_mode), 1);
395 tree vector_type
396 = get_vectype_for_scalar_type (vinfo, int_type, count);
397 if (vector_type
398 && VECTOR_MODE_P (TYPE_MODE (vector_type))
399 && known_eq (GET_MODE_SIZE (TYPE_MODE (vector_type)),
400 GET_MODE_SIZE (base_vector_mode)))
401 {
402 /* Try fusing consecutive sequences of COUNT / NVECTORS elements
403 together into elements of type INT_TYPE and using the result
404 to build NVECTORS vectors. */
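/* As a concrete example, for a vector with NELTS == 4 and two input
   vectors A = { a0, a1, a2, a3 } and B = { b0, b1, b2, b3 } the
   encodings built below expand to
     indices1 = { 0, 4, 1, 5 }  i.e.  { a0, b0, a1, b1 }
     indices2 = { 2, 6, 3, 7 }  i.e.  { a2, b2, a3, b3 }
   which is the usual interleave-low / interleave-high pair.  */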
405 poly_uint64 nelts = GET_MODE_NUNITS (TYPE_MODE (vector_type));
406 vec_perm_builder sel1 (nelts, 2, 3);
407 vec_perm_builder sel2 (nelts, 2, 3);
408 poly_int64 half_nelts = exact_div (nelts, 2);
409 for (unsigned int i = 0; i < 3; ++i)
410 {
411 sel1.quick_push (i);
412 sel1.quick_push (i + nelts);
413 sel2.quick_push (half_nelts + i);
414 sel2.quick_push (half_nelts + i + nelts);
415 }
416 vec_perm_indices indices1 (sel1, 2, nelts);
417 vec_perm_indices indices2 (sel2, 2, nelts);
418 if (can_vec_perm_const_p (TYPE_MODE (vector_type), indices1)
419 && can_vec_perm_const_p (TYPE_MODE (vector_type), indices2))
420 {
421 if (nvectors_out)
422 *nvectors_out = nvectors;
423 if (vector_type_out)
424 *vector_type_out = vector_type;
425 if (permutes)
426 {
427 permutes[0] = vect_gen_perm_mask_checked (vector_type,
428 indices1);
429 permutes[1] = vect_gen_perm_mask_checked (vector_type,
430 indices2);
431 }
432 return true;
433 }
434 }
435 }
436 if (!multiple_p (elt_bytes, 2, &elt_bytes))
437 return false;
438 nvectors *= 2;
439 }
440 }
441
442 /* Return true if DTA and DTB match. */
443
444 static bool
445 vect_def_types_match (enum vect_def_type dta, enum vect_def_type dtb)
446 {
447 return (dta == dtb
448 || ((dta == vect_external_def || dta == vect_constant_def)
449 && (dtb == vect_external_def || dtb == vect_constant_def)));
450 }
451
452 /* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
453 they are of a valid type and that they match the defs of the first stmt of
454 the SLP group (stored in OPRNDS_INFO). This function tries to match stmts
455 by swapping operands of STMTS[STMT_NUM] when possible. A non-zero SWAP
456 indicates that a swap is required for cond_expr stmts. Specifically, SWAP
457 is 1 if STMT is a cond and the operands of the comparison need to be
458 swapped; SWAP is 2 if STMT is a cond and the comparison code needs to be
459 inverted. The function may also swap the operand entries of commutative
460 stmts in OPRNDS_INFO in order to make them match.
461 Return -1 if there was a fatal error; return 1 if the error could be
462 corrected by swapping operands of this node's father node; return 0 if
463 everything is ok. */
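/* For example, given the commutative group
     x0 = a0 + inv;
     x1 = inv + a1;
   where a0 and a1 are defined in the vectorized region and inv is
   defined outside of it, the def entries of the second stmt are swapped
   so that operand 0 collects { a0, a1 } (internal defs) and operand 1
   collects { inv, inv } (external defs) instead of failing the match.  */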
464 static int
465 vect_get_and_check_slp_defs (vec_info *vinfo, unsigned char swap,
466 bool *skip_args,
467 vec<stmt_vec_info> stmts, unsigned stmt_num,
468 vec<slp_oprnd_info> *oprnds_info)
469 {
470 stmt_vec_info stmt_info = stmts[stmt_num];
471 tree oprnd;
472 unsigned int i, number_of_oprnds;
473 enum vect_def_type dt = vect_uninitialized_def;
474 slp_oprnd_info oprnd_info;
475 int first_op_idx = 1;
476 unsigned int commutative_op = -1U;
477 bool first_op_cond = false;
478 bool first = stmt_num == 0;
479
480 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
481 {
482 number_of_oprnds = gimple_call_num_args (stmt);
483 first_op_idx = 3;
484 if (gimple_call_internal_p (stmt))
485 {
486 internal_fn ifn = gimple_call_internal_fn (stmt);
487 commutative_op = first_commutative_argument (ifn);
488
489 /* Masked load, only look at mask. */
490 if (ifn == IFN_MASK_LOAD)
491 {
492 number_of_oprnds = 1;
493 /* Mask operand index. */
494 first_op_idx = 5;
495 }
496 }
497 }
498 else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
499 {
500 enum tree_code code = gimple_assign_rhs_code (stmt);
501 number_of_oprnds = gimple_num_ops (stmt) - 1;
502 /* Swap can only be done for cond_expr if asked to, otherwise we
503 could result in different comparison code to the first stmt. */
504 if (code == COND_EXPR
505 && COMPARISON_CLASS_P (gimple_assign_rhs1 (stmt)))
506 {
507 first_op_cond = true;
508 number_of_oprnds++;
509 }
510 else
511 commutative_op = commutative_tree_code (code) ? 0U : -1U;
512 }
513 else if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
514 number_of_oprnds = gimple_phi_num_args (stmt);
515 else
516 return -1;
517
518 bool swapped = (swap != 0);
519 bool backedge = false;
520 gcc_assert (!swapped || first_op_cond);
521 enum vect_def_type *dts = XALLOCAVEC (enum vect_def_type, number_of_oprnds);
522 for (i = 0; i < number_of_oprnds; i++)
523 {
524 if (first_op_cond)
525 {
526 /* Map indicating how operands of cond_expr should be swapped. */
527 int maps[3][4] = {{0, 1, 2, 3}, {1, 0, 2, 3}, {0, 1, 3, 2}};
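/* maps[0] leaves all operands in place, maps[1] swaps the two operands
   of the comparison (SWAP == 1) and maps[2] swaps the two arms of the
   COND_EXPR (SWAP == 2, the comparison code being inverted).  */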
528 int *map = maps[swap];
529
530 if (i < 2)
531 oprnd = TREE_OPERAND (gimple_op (stmt_info->stmt,
532 first_op_idx), map[i]);
533 else
534 oprnd = gimple_op (stmt_info->stmt, map[i]);
535 }
536 else if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
537 {
538 oprnd = gimple_phi_arg_def (stmt, i);
539 backedge = dominated_by_p (CDI_DOMINATORS,
540 gimple_phi_arg_edge (stmt, i)->src,
541 gimple_bb (stmt_info->stmt));
542 }
543 else
544 oprnd = gimple_op (stmt_info->stmt, first_op_idx + (swapped ? !i : i));
545 if (TREE_CODE (oprnd) == VIEW_CONVERT_EXPR)
546 oprnd = TREE_OPERAND (oprnd, 0);
547
548 oprnd_info = (*oprnds_info)[i];
549
550 stmt_vec_info def_stmt_info;
551 if (!vect_is_simple_use (oprnd, vinfo, &dts[i], &def_stmt_info))
552 {
553 if (dump_enabled_p ())
554 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
555 "Build SLP failed: can't analyze def for %T\n",
556 oprnd);
557
558 return -1;
559 }
560
561 if (skip_args[i])
562 {
563 oprnd_info->def_stmts.quick_push (NULL);
564 oprnd_info->ops.quick_push (NULL_TREE);
565 oprnd_info->first_dt = vect_uninitialized_def;
566 continue;
567 }
568
569 oprnd_info->def_stmts.quick_push (def_stmt_info);
570 oprnd_info->ops.quick_push (oprnd);
571
572 if (def_stmt_info
573 && is_pattern_stmt_p (def_stmt_info))
574 {
575 if (STMT_VINFO_RELATED_STMT (vect_orig_stmt (def_stmt_info))
576 != def_stmt_info)
577 oprnd_info->any_pattern = true;
578 else
579 /* If we promote this to external use the original stmt def. */
580 oprnd_info->ops.last ()
581 = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
582 }
583
584 /* If there's an extern def on a backedge make sure we can
585 code-generate at the region start.
586 ??? This is another case that could be fixed by adjusting
587 how we split the function but at the moment we'd have conflicting
588 goals there. */
589 if (backedge
590 && dts[i] == vect_external_def
591 && is_a <bb_vec_info> (vinfo)
592 && TREE_CODE (oprnd) == SSA_NAME
593 && !SSA_NAME_IS_DEFAULT_DEF (oprnd)
594 && !dominated_by_p (CDI_DOMINATORS,
595 as_a <bb_vec_info> (vinfo)->bbs[0],
596 gimple_bb (SSA_NAME_DEF_STMT (oprnd))))
597 {
598 if (dump_enabled_p ())
599 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
600 "Build SLP failed: extern def %T only defined "
601 "on backedge\n", oprnd);
602 return -1;
603 }
604
605 if (first)
606 {
607 tree type = TREE_TYPE (oprnd);
608 dt = dts[i];
609 if ((dt == vect_constant_def
610 || dt == vect_external_def)
611 && !GET_MODE_SIZE (vinfo->vector_mode).is_constant ()
612 && (TREE_CODE (type) == BOOLEAN_TYPE
613 || !can_duplicate_and_interleave_p (vinfo, stmts.length (),
614 type)))
615 {
616 if (dump_enabled_p ())
617 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
618 "Build SLP failed: invalid type of def "
619 "for variable-length SLP %T\n", oprnd);
620 return -1;
621 }
622
623 /* For the swapping logic below force vect_reduction_def
624 for the reduction op in a SLP reduction group. */
625 if (!STMT_VINFO_DATA_REF (stmt_info)
626 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
627 && (int)i == STMT_VINFO_REDUC_IDX (stmt_info)
628 && def_stmt_info)
629 dts[i] = dt = vect_reduction_def;
630
631 /* Check the types of the definition. */
632 switch (dt)
633 {
634 case vect_external_def:
635 case vect_constant_def:
636 case vect_internal_def:
637 case vect_reduction_def:
638 case vect_induction_def:
639 case vect_nested_cycle:
640 break;
641
642 default:
643 /* FORNOW: Not supported. */
644 if (dump_enabled_p ())
645 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
646 "Build SLP failed: illegal type of def %T\n",
647 oprnd);
648 return -1;
649 }
650
651 oprnd_info->first_dt = dt;
652 oprnd_info->first_op_type = type;
653 }
654 }
655 if (first)
656 return 0;
657
658 /* Now match the operand definition types to that of the first stmt. */
659 for (i = 0; i < number_of_oprnds;)
660 {
661 if (skip_args[i])
662 {
663 ++i;
664 continue;
665 }
666
667 oprnd_info = (*oprnds_info)[i];
668 dt = dts[i];
669 stmt_vec_info def_stmt_info = oprnd_info->def_stmts[stmt_num];
670 oprnd = oprnd_info->ops[stmt_num];
671 tree type = TREE_TYPE (oprnd);
672
673 if (!types_compatible_p (oprnd_info->first_op_type, type))
674 {
675 if (dump_enabled_p ())
676 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
677 "Build SLP failed: different operand types\n");
678 return 1;
679 }
680
681 /* Not first stmt of the group, check that the def-stmt/s match
682 the def-stmt/s of the first stmt. Allow different definition
683 types for reduction chains: the first stmt must be a
684 vect_reduction_def (a phi node), and the rest
685 end in the reduction chain. */
686 if ((!vect_def_types_match (oprnd_info->first_dt, dt)
687 && !(oprnd_info->first_dt == vect_reduction_def
688 && !STMT_VINFO_DATA_REF (stmt_info)
689 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
690 && def_stmt_info
691 && !STMT_VINFO_DATA_REF (def_stmt_info)
692 && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
693 == REDUC_GROUP_FIRST_ELEMENT (stmt_info))))
694 || (!STMT_VINFO_DATA_REF (stmt_info)
695 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
696 && ((!def_stmt_info
697 || STMT_VINFO_DATA_REF (def_stmt_info)
698 || (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
699 != REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
700 != (oprnd_info->first_dt != vect_reduction_def))))
701 {
702 /* Try swapping operands if we got a mismatch. For BB
703 vectorization only in case it will clearly improve things. */
704 if (i == commutative_op && !swapped
705 && (!is_a <bb_vec_info> (vinfo)
706 || (!vect_def_types_match ((*oprnds_info)[i+1]->first_dt,
707 dts[i+1])
708 && (vect_def_types_match (oprnd_info->first_dt, dts[i+1])
709 || vect_def_types_match
710 ((*oprnds_info)[i+1]->first_dt, dts[i])))))
711 {
712 if (dump_enabled_p ())
713 dump_printf_loc (MSG_NOTE, vect_location,
714 "trying swapped operands\n");
715 std::swap (dts[i], dts[i+1]);
716 std::swap ((*oprnds_info)[i]->def_stmts[stmt_num],
717 (*oprnds_info)[i+1]->def_stmts[stmt_num]);
718 std::swap ((*oprnds_info)[i]->ops[stmt_num],
719 (*oprnds_info)[i+1]->ops[stmt_num]);
720 swapped = true;
721 continue;
722 }
723
724 if (is_a <bb_vec_info> (vinfo)
725 && !oprnd_info->any_pattern)
726 {
727 /* Now for commutative ops we should see whether we can
728 make the other operand matching. */
729 if (dump_enabled_p ())
730 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
731 "treating operand as external\n");
732 oprnd_info->first_dt = dt = vect_external_def;
733 }
734 else
735 {
736 if (dump_enabled_p ())
737 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
738 "Build SLP failed: different types\n");
739 return 1;
740 }
741 }
742
743 /* Make sure to demote the overall operand to external. */
744 if (dt == vect_external_def)
745 oprnd_info->first_dt = vect_external_def;
746 /* For a SLP reduction chain we want to duplicate the reduction to
747 each of the chain members. That gets us a sane SLP graph (still
748 the stmts are not 100% correct wrt the initial values). */
749 else if ((dt == vect_internal_def
750 || dt == vect_reduction_def)
751 && oprnd_info->first_dt == vect_reduction_def
752 && !STMT_VINFO_DATA_REF (stmt_info)
753 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
754 && !STMT_VINFO_DATA_REF (def_stmt_info)
755 && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
756 == REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
757 {
758 oprnd_info->def_stmts[stmt_num] = oprnd_info->def_stmts[0];
759 oprnd_info->ops[stmt_num] = oprnd_info->ops[0];
760 }
761
762 ++i;
763 }
764
765 /* Swap operands. */
766 if (swapped)
767 {
768 if (dump_enabled_p ())
769 dump_printf_loc (MSG_NOTE, vect_location,
770 "swapped operands to match def types in %G",
771 stmt_info->stmt);
772 }
773
774 return 0;
775 }
776
777 /* Try to assign vector type VECTYPE to STMT_INFO for BB vectorization.
778 Return true if we can, meaning that this choice doesn't conflict with
779 existing SLP nodes that use STMT_INFO. */
780
781 bool
782 vect_update_shared_vectype (stmt_vec_info stmt_info, tree vectype)
783 {
784 tree old_vectype = STMT_VINFO_VECTYPE (stmt_info);
785 if (old_vectype)
786 return useless_type_conversion_p (vectype, old_vectype);
787
788 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
789 {
790 /* We maintain the invariant that if any statement in the group is
791 used, all other members of the group have the same vector type. */
792 stmt_vec_info first_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
793 stmt_vec_info member_info = first_info;
794 for (; member_info; member_info = DR_GROUP_NEXT_ELEMENT (member_info))
795 if (is_pattern_stmt_p (member_info)
796 && !useless_type_conversion_p (vectype,
797 STMT_VINFO_VECTYPE (member_info)))
798 break;
799
800 if (!member_info)
801 {
802 for (member_info = first_info; member_info;
803 member_info = DR_GROUP_NEXT_ELEMENT (member_info))
804 STMT_VINFO_VECTYPE (member_info) = vectype;
805 return true;
806 }
807 }
808 else if (!is_pattern_stmt_p (stmt_info))
809 {
810 STMT_VINFO_VECTYPE (stmt_info) = vectype;
811 return true;
812 }
813
814 if (dump_enabled_p ())
815 {
816 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
817 "Build SLP failed: incompatible vector"
818 " types for: %G", stmt_info->stmt);
819 dump_printf_loc (MSG_NOTE, vect_location,
820 " old vector type: %T\n", old_vectype);
821 dump_printf_loc (MSG_NOTE, vect_location,
822 " new vector type: %T\n", vectype);
823 }
824 return false;
825 }
826
827 /* Return true if call statements CALL1 and CALL2 are similar enough
828 to be combined into the same SLP group. */
829
830 bool
831 compatible_calls_p (gcall *call1, gcall *call2)
832 {
833 unsigned int nargs = gimple_call_num_args (call1);
834 if (nargs != gimple_call_num_args (call2))
835 return false;
836
837 if (gimple_call_combined_fn (call1) != gimple_call_combined_fn (call2))
838 return false;
839
840 if (gimple_call_internal_p (call1))
841 {
842 if (!types_compatible_p (TREE_TYPE (gimple_call_lhs (call1)),
843 TREE_TYPE (gimple_call_lhs (call2))))
844 return false;
845 for (unsigned int i = 0; i < nargs; ++i)
846 if (!types_compatible_p (TREE_TYPE (gimple_call_arg (call1, i)),
847 TREE_TYPE (gimple_call_arg (call2, i))))
848 return false;
849 }
850 else
851 {
852 if (!operand_equal_p (gimple_call_fn (call1),
853 gimple_call_fn (call2), 0))
854 return false;
855
856 if (gimple_call_fntype (call1) != gimple_call_fntype (call2))
857 return false;
858 }
859 return true;
860 }
861
862 /* A subroutine of vect_build_slp_tree for checking VECTYPE, which is the
863 caller's attempt to find the vector type in STMT_INFO with the narrowest
864 element type. Return true if VECTYPE is nonnull and if it is valid
865 for STMT_INFO. When returning true, update MAX_NUNITS to reflect the
866 number of units in VECTYPE. GROUP_SIZE and MAX_NUNITS are as for
867 vect_build_slp_tree. */
868
869 static bool
870 vect_record_max_nunits (vec_info *vinfo, stmt_vec_info stmt_info,
871 unsigned int group_size,
872 tree vectype, poly_uint64 *max_nunits)
873 {
874 if (!vectype)
875 {
876 if (dump_enabled_p ())
877 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
878 "Build SLP failed: unsupported data-type in %G\n",
879 stmt_info->stmt);
880 /* Fatal mismatch. */
881 return false;
882 }
883
884 /* If populating the vector type requires unrolling then fail
885 before adjusting *max_nunits for basic-block vectorization. */
886 if (is_a <bb_vec_info> (vinfo)
887 && !multiple_p (group_size, TYPE_VECTOR_SUBPARTS (vectype)))
888 {
889 if (dump_enabled_p ())
890 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
891 "Build SLP failed: unrolling required "
892 "in basic block SLP\n");
893 /* Fatal mismatch. */
894 return false;
895 }
896
897 /* In case of multiple types we need to detect the smallest type. */
898 vect_update_max_nunits (max_nunits, vectype);
899 return true;
900 }
901
902 /* Verify that the scalar stmts STMTS are isomorphic, do not require data
903 permutation and are of supported types of operation. Return
904 true if they are, otherwise return false and indicate in *MATCHES
905 which stmts are not isomorphic to the first one. If MATCHES[0]
906 is false then this indicates that the comparison could not be
907 carried out or that the stmts will never be vectorized by SLP.
908
909 Note a COND_EXPR is possibly isomorphic to another one after swapping its
910 operands. Set SWAP[i] to 1 if stmt I is a COND_EXPR and isomorphic to
911 the first stmt by swapping the two operands of the comparison; set SWAP[i]
912 to 2 if stmt I is isomorphic to the first stmt by inverting the code
913 of the comparison. Take A1 >= B1 ? X1 : Y1 as an example: it can be swapped
914 to (B1 <= A1 ? X1 : Y1) or inverted to (A1 < B1) ? Y1 : X1. */
915
916 static bool
917 vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
918 vec<stmt_vec_info> stmts, unsigned int group_size,
919 poly_uint64 *max_nunits, bool *matches,
920 bool *two_operators, tree *node_vectype)
921 {
922 unsigned int i;
923 stmt_vec_info first_stmt_info = stmts[0];
924 enum tree_code first_stmt_code = ERROR_MARK;
925 enum tree_code alt_stmt_code = ERROR_MARK;
926 enum tree_code rhs_code = ERROR_MARK;
927 enum tree_code first_cond_code = ERROR_MARK;
928 tree lhs;
929 bool need_same_oprnds = false;
930 tree vectype = NULL_TREE, first_op1 = NULL_TREE;
931 optab optab;
932 int icode;
933 machine_mode optab_op2_mode;
934 machine_mode vec_mode;
935 stmt_vec_info first_load = NULL, prev_first_load = NULL;
936 bool first_stmt_load_p = false, load_p = false;
937 bool first_stmt_phi_p = false, phi_p = false;
938 bool maybe_soft_fail = false;
939 tree soft_fail_nunits_vectype = NULL_TREE;
940
941 /* For every stmt in NODE find its def stmt/s. */
942 stmt_vec_info stmt_info;
943 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
944 {
945 gimple *stmt = stmt_info->stmt;
946 swap[i] = 0;
947 matches[i] = false;
948
949 if (dump_enabled_p ())
950 dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for %G", stmt);
951
952 /* Fail to vectorize statements marked as unvectorizable, statements
953 that can throw internally, or statements with volatile operands. */
954 if (!STMT_VINFO_VECTORIZABLE (stmt_info)
955 || stmt_can_throw_internal (cfun, stmt)
956 || gimple_has_volatile_ops (stmt))
957 {
958 if (dump_enabled_p ())
959 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
960 "Build SLP failed: unvectorizable statement %G",
961 stmt);
962 /* ??? For BB vectorization we want to commute operands in a way
963 that shuffles all unvectorizable defs into one operand and keeps
964 the other one vectorizable. The following doesn't reliably
965 work for that though, but it's the easiest we can do here. */
966 if (is_a <bb_vec_info> (vinfo) && i != 0)
967 continue;
968 /* Fatal mismatch. */
969 matches[0] = false;
970 return false;
971 }
972
973 lhs = gimple_get_lhs (stmt);
974 if (lhs == NULL_TREE)
975 {
976 if (dump_enabled_p ())
977 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
978 "Build SLP failed: not GIMPLE_ASSIGN nor "
979 "GIMPLE_CALL %G", stmt);
980 if (is_a <bb_vec_info> (vinfo) && i != 0)
981 continue;
982 /* Fatal mismatch. */
983 matches[0] = false;
984 return false;
985 }
986
987 tree nunits_vectype;
988 if (!vect_get_vector_types_for_stmt (vinfo, stmt_info, &vectype,
989 &nunits_vectype, group_size))
990 {
991 if (is_a <bb_vec_info> (vinfo) && i != 0)
992 continue;
993 /* Fatal mismatch. */
994 matches[0] = false;
995 return false;
996 }
997 /* Record nunits required but continue analysis, producing matches[]
998 as if nunits was not an issue. This allows splitting of groups
999 to happen. */
1000 if (nunits_vectype
1001 && !vect_record_max_nunits (vinfo, stmt_info, group_size,
1002 nunits_vectype, max_nunits))
1003 {
1004 gcc_assert (is_a <bb_vec_info> (vinfo));
1005 maybe_soft_fail = true;
1006 soft_fail_nunits_vectype = nunits_vectype;
1007 }
1008
1009 gcc_assert (vectype);
1010
1011 gcall *call_stmt = dyn_cast <gcall *> (stmt);
1012 if (call_stmt)
1013 {
1014 rhs_code = CALL_EXPR;
1015
1016 if (gimple_call_internal_p (stmt, IFN_MASK_LOAD))
1017 load_p = true;
1018 else if ((gimple_call_internal_p (call_stmt)
1019 && (!vectorizable_internal_fn_p
1020 (gimple_call_internal_fn (call_stmt))))
1021 || gimple_call_tail_p (call_stmt)
1022 || gimple_call_noreturn_p (call_stmt)
1023 || !gimple_call_nothrow_p (call_stmt)
1024 || gimple_call_chain (call_stmt))
1025 {
1026 if (dump_enabled_p ())
1027 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1028 "Build SLP failed: unsupported call type %G",
1029 call_stmt);
1030 if (is_a <bb_vec_info> (vinfo) && i != 0)
1031 continue;
1032 /* Fatal mismatch. */
1033 matches[0] = false;
1034 return false;
1035 }
1036 }
1037 else if (gimple_code (stmt) == GIMPLE_PHI)
1038 {
1039 rhs_code = ERROR_MARK;
1040 phi_p = true;
1041 }
1042 else
1043 {
1044 rhs_code = gimple_assign_rhs_code (stmt);
1045 load_p = gimple_vuse (stmt);
1046 }
1047
1048 /* Check the operation. */
1049 if (i == 0)
1050 {
1051 *node_vectype = vectype;
1052 first_stmt_code = rhs_code;
1053 first_stmt_load_p = load_p;
1054 first_stmt_phi_p = phi_p;
1055
1056 /* Shift arguments should be equal in all the packed stmts for a
1057 vector shift with scalar shift operand. */
1058 if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
1059 || rhs_code == LROTATE_EXPR
1060 || rhs_code == RROTATE_EXPR)
1061 {
1062 vec_mode = TYPE_MODE (vectype);
1063
1064 /* First see if we have a vector/vector shift. */
1065 optab = optab_for_tree_code (rhs_code, vectype,
1066 optab_vector);
1067
1068 if (!optab
1069 || optab_handler (optab, vec_mode) == CODE_FOR_nothing)
1070 {
1071 /* No vector/vector shift, try for a vector/scalar shift. */
1072 optab = optab_for_tree_code (rhs_code, vectype,
1073 optab_scalar);
1074
1075 if (!optab)
1076 {
1077 if (dump_enabled_p ())
1078 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1079 "Build SLP failed: no optab.\n");
1080 if (is_a <bb_vec_info> (vinfo) && i != 0)
1081 continue;
1082 /* Fatal mismatch. */
1083 matches[0] = false;
1084 return false;
1085 }
1086 icode = (int) optab_handler (optab, vec_mode);
1087 if (icode == CODE_FOR_nothing)
1088 {
1089 if (dump_enabled_p ())
1090 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1091 "Build SLP failed: "
1092 "op not supported by target.\n");
1093 if (is_a <bb_vec_info> (vinfo) && i != 0)
1094 continue;
1095 /* Fatal mismatch. */
1096 matches[0] = false;
1097 return false;
1098 }
1099 optab_op2_mode = insn_data[icode].operand[2].mode;
1100 if (!VECTOR_MODE_P (optab_op2_mode))
1101 {
1102 need_same_oprnds = true;
1103 first_op1 = gimple_assign_rhs2 (stmt);
1104 }
1105 }
1106 }
1107 else if (rhs_code == WIDEN_LSHIFT_EXPR)
1108 {
1109 need_same_oprnds = true;
1110 first_op1 = gimple_assign_rhs2 (stmt);
1111 }
1112 else if (!load_p
1113 && rhs_code == BIT_FIELD_REF)
1114 {
1115 tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt), 0);
1116 if (!is_a <bb_vec_info> (vinfo)
1117 || TREE_CODE (vec) != SSA_NAME
1118 || !operand_equal_p (TYPE_SIZE (vectype),
1119 TYPE_SIZE (TREE_TYPE (vec))))
1120 {
1121 if (dump_enabled_p ())
1122 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1123 "Build SLP failed: "
1124 "BIT_FIELD_REF not supported\n");
1125 /* Fatal mismatch. */
1126 matches[0] = false;
1127 return false;
1128 }
1129 }
1130 else if (call_stmt
1131 && gimple_call_internal_p (call_stmt, IFN_DIV_POW2))
1132 {
1133 need_same_oprnds = true;
1134 first_op1 = gimple_call_arg (call_stmt, 1);
1135 }
1136 }
1137 else
1138 {
1139 if (first_stmt_code != rhs_code
1140 && alt_stmt_code == ERROR_MARK)
1141 alt_stmt_code = rhs_code;
1142 if ((first_stmt_code != rhs_code
1143 && (first_stmt_code != IMAGPART_EXPR
1144 || rhs_code != REALPART_EXPR)
1145 && (first_stmt_code != REALPART_EXPR
1146 || rhs_code != IMAGPART_EXPR)
1147 /* Handle mismatches in plus/minus by computing both
1148 and merging the results. */
1149 && !((first_stmt_code == PLUS_EXPR
1150 || first_stmt_code == MINUS_EXPR)
1151 && (alt_stmt_code == PLUS_EXPR
1152 || alt_stmt_code == MINUS_EXPR)
1153 && rhs_code == alt_stmt_code)
1154 && !(STMT_VINFO_GROUPED_ACCESS (stmt_info)
1155 && (first_stmt_code == ARRAY_REF
1156 || first_stmt_code == BIT_FIELD_REF
1157 || first_stmt_code == INDIRECT_REF
1158 || first_stmt_code == COMPONENT_REF
1159 || first_stmt_code == MEM_REF)))
1160 || first_stmt_load_p != load_p
1161 || first_stmt_phi_p != phi_p)
1162 {
1163 if (dump_enabled_p ())
1164 {
1165 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1166 "Build SLP failed: different operation "
1167 "in stmt %G", stmt);
1168 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1169 "original stmt %G", first_stmt_info->stmt);
1170 }
1171 /* Mismatch. */
1172 continue;
1173 }
1174
1175 if (!load_p
1176 && first_stmt_code == BIT_FIELD_REF
1177 && (TREE_OPERAND (gimple_assign_rhs1 (first_stmt_info->stmt), 0)
1178 != TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0)))
1179 {
1180 if (dump_enabled_p ())
1181 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1182 "Build SLP failed: different BIT_FIELD_REF "
1183 "arguments in %G", stmt);
1184 /* Mismatch. */
1185 continue;
1186 }
1187
1188 if (!load_p && rhs_code == CALL_EXPR)
1189 {
1190 if (!compatible_calls_p (as_a <gcall *> (stmts[0]->stmt),
1191 as_a <gcall *> (stmt)))
1192 {
1193 if (dump_enabled_p ())
1194 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1195 "Build SLP failed: different calls in %G",
1196 stmt);
1197 /* Mismatch. */
1198 continue;
1199 }
1200 }
1201
1202 if ((phi_p || gimple_could_trap_p (stmt_info->stmt))
1203 && (gimple_bb (first_stmt_info->stmt)
1204 != gimple_bb (stmt_info->stmt)))
1205 {
1206 if (dump_enabled_p ())
1207 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1208 "Build SLP failed: different BB for PHI "
1209 "or possibly trapping operation in %G", stmt);
1210 /* Mismatch. */
1211 continue;
1212 }
1213
1214 if (need_same_oprnds)
1215 {
1216 tree other_op1 = (call_stmt
1217 ? gimple_call_arg (call_stmt, 1)
1218 : gimple_assign_rhs2 (stmt));
1219 if (!operand_equal_p (first_op1, other_op1, 0))
1220 {
1221 if (dump_enabled_p ())
1222 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1223 "Build SLP failed: different shift "
1224 "arguments in %G", stmt);
1225 /* Mismatch. */
1226 continue;
1227 }
1228 }
1229
1230 if (!types_compatible_p (vectype, *node_vectype))
1231 {
1232 if (dump_enabled_p ())
1233 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1234 "Build SLP failed: different vector type "
1235 "in %G", stmt);
1236 /* Mismatch. */
1237 continue;
1238 }
1239 }
1240
1241 /* Grouped store or load. */
1242 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1243 {
1244 if (REFERENCE_CLASS_P (lhs))
1245 {
1246 /* Store. */
1247 ;
1248 }
1249 else
1250 {
1251 /* Load. */
1252 first_load = DR_GROUP_FIRST_ELEMENT (stmt_info);
1253 if (prev_first_load)
1254 {
1255 /* Check that there are no loads from different interleaving
1256 chains in the same node. */
1257 if (prev_first_load != first_load)
1258 {
1259 if (dump_enabled_p ())
1260 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
1261 vect_location,
1262 "Build SLP failed: different "
1263 "interleaving chains in one node %G",
1264 stmt);
1265 /* Mismatch. */
1266 continue;
1267 }
1268 }
1269 else
1270 prev_first_load = first_load;
1271 }
1272 } /* Grouped access. */
1273 else
1274 {
1275 if (load_p)
1276 {
1277 /* Not grouped load. */
1278 if (dump_enabled_p ())
1279 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1280 "Build SLP failed: not grouped load %G", stmt);
1281
1282 /* FORNOW: Not grouped loads are not supported. */
1283 if (is_a <bb_vec_info> (vinfo) && i != 0)
1284 continue;
1285 /* Fatal mismatch. */
1286 matches[0] = false;
1287 return false;
1288 }
1289
1290 /* Not memory operation. */
1291 if (!phi_p
1292 && TREE_CODE_CLASS (rhs_code) != tcc_binary
1293 && TREE_CODE_CLASS (rhs_code) != tcc_unary
1294 && TREE_CODE_CLASS (rhs_code) != tcc_expression
1295 && TREE_CODE_CLASS (rhs_code) != tcc_comparison
1296 && rhs_code != VIEW_CONVERT_EXPR
1297 && rhs_code != CALL_EXPR
1298 && rhs_code != BIT_FIELD_REF)
1299 {
1300 if (dump_enabled_p ())
1301 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1302 "Build SLP failed: operation unsupported %G",
1303 stmt);
1304 if (is_a <bb_vec_info> (vinfo) && i != 0)
1305 continue;
1306 /* Fatal mismatch. */
1307 matches[0] = false;
1308 return false;
1309 }
1310
1311 if (rhs_code == COND_EXPR)
1312 {
1313 tree cond_expr = gimple_assign_rhs1 (stmt);
1314 enum tree_code cond_code = TREE_CODE (cond_expr);
1315 enum tree_code swap_code = ERROR_MARK;
1316 enum tree_code invert_code = ERROR_MARK;
1317
1318 if (i == 0)
1319 first_cond_code = TREE_CODE (cond_expr);
1320 else if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
1321 {
1322 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
1323 swap_code = swap_tree_comparison (cond_code);
1324 invert_code = invert_tree_comparison (cond_code, honor_nans);
1325 }
1326
1327 if (first_cond_code == cond_code)
1328 ;
1329 /* Isomorphic can be achieved by swapping. */
1330 else if (first_cond_code == swap_code)
1331 swap[i] = 1;
1332 /* Isomorphic can be achieved by inverting. */
1333 else if (first_cond_code == invert_code)
1334 swap[i] = 2;
1335 else
1336 {
1337 if (dump_enabled_p ())
1338 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1339 "Build SLP failed: different"
1340 " operation %G", stmt);
1341 /* Mismatch. */
1342 continue;
1343 }
1344 }
1345 }
1346
1347 matches[i] = true;
1348 }
1349
1350 for (i = 0; i < group_size; ++i)
1351 if (!matches[i])
1352 return false;
1353
1354 /* If we allowed a two-operation SLP node verify the target can cope
1355 with the permute we are going to use. */
1356 if (alt_stmt_code != ERROR_MARK
1357 && TREE_CODE_CLASS (alt_stmt_code) != tcc_reference)
1358 {
1359 *two_operators = true;
1360 }
1361
1362 if (maybe_soft_fail)
1363 {
1364 unsigned HOST_WIDE_INT const_nunits;
1365 if (!TYPE_VECTOR_SUBPARTS
1366 (soft_fail_nunits_vectype).is_constant (&const_nunits)
1367 || const_nunits > group_size)
1368 matches[0] = false;
1369 else
1370 {
1371 /* With constant vector elements simulate a mismatch at the
1372 point we need to split. */
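/* For example, with GROUP_SIZE == 7 and CONST_NUNITS == 4 this clears
   matches[4..6] so the group can be split after the first four lanes.  */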
1373 unsigned tail = group_size & (const_nunits - 1);
1374 memset (&matches[group_size - tail], 0, sizeof (bool) * tail);
1375 }
1376 return false;
1377 }
1378
1379 return true;
1380 }
1381
1382 /* Traits for the hash_map used to cache SLP builds for a stmt set.
1383 Note we never remove entries apart from at destruction time so we do
1384 not need a special value for deleted that differs from empty. */
1385 struct bst_traits
1386 {
1387 typedef vec <stmt_vec_info> value_type;
1388 typedef vec <stmt_vec_info> compare_type;
1389 static inline hashval_t hash (value_type);
1390 static inline bool equal (value_type existing, value_type candidate);
1391 static inline bool is_empty (value_type x) { return !x.exists (); }
1392 static inline bool is_deleted (value_type x) { return !x.exists (); }
1393 static const bool empty_zero_p = true;
1394 static inline void mark_empty (value_type &x) { x.release (); }
1395 static inline void mark_deleted (value_type &x) { x.release (); }
1396 static inline void remove (value_type &x) { x.release (); }
1397 };
1398 inline hashval_t
1399 bst_traits::hash (value_type x)
1400 {
1401 inchash::hash h;
1402 for (unsigned i = 0; i < x.length (); ++i)
1403 h.add_int (gimple_uid (x[i]->stmt));
1404 return h.end ();
1405 }
1406 inline bool
1407 bst_traits::equal (value_type existing, value_type candidate)
1408 {
1409 if (existing.length () != candidate.length ())
1410 return false;
1411 for (unsigned i = 0; i < existing.length (); ++i)
1412 if (existing[i] != candidate[i])
1413 return false;
1414 return true;
1415 }
1416
1417 typedef hash_map <vec <stmt_vec_info>, slp_tree,
1418 simple_hashmap_traits <bst_traits, slp_tree> >
1419 scalar_stmts_to_slp_tree_map_t;
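/* This map caches the result of SLP discovery for a vector of scalar
   stmts: a successful build maps to its slp_tree (holding a reference on
   it), a failed build maps to NULL so the failure is not re-discovered.
   See vect_build_slp_tree below.  */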
1420
1421 static slp_tree
1422 vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
1423 vec<stmt_vec_info> stmts, unsigned int group_size,
1424 poly_uint64 *max_nunits,
1425 bool *matches, unsigned *limit, unsigned *tree_size,
1426 scalar_stmts_to_slp_tree_map_t *bst_map);
1427
1428 static slp_tree
1429 vect_build_slp_tree (vec_info *vinfo,
1430 vec<stmt_vec_info> stmts, unsigned int group_size,
1431 poly_uint64 *max_nunits,
1432 bool *matches, unsigned *limit, unsigned *tree_size,
1433 scalar_stmts_to_slp_tree_map_t *bst_map)
1434 {
1435 if (slp_tree *leader = bst_map->get (stmts))
1436 {
1437 if (dump_enabled_p ())
1438 dump_printf_loc (MSG_NOTE, vect_location, "re-using %sSLP tree %p\n",
1439 *leader ? "" : "failed ", *leader);
1440 if (*leader)
1441 {
1442 SLP_TREE_REF_COUNT (*leader)++;
1443 vect_update_max_nunits (max_nunits, (*leader)->max_nunits);
1444 stmts.release ();
1445 }
1446 return *leader;
1447 }
1448
1449 /* Seed the bst_map with a stub node to be filled by vect_build_slp_tree_2
1450 so we can pick up backedge destinations during discovery. */
1451 slp_tree res = new _slp_tree;
1452 SLP_TREE_DEF_TYPE (res) = vect_internal_def;
1453 SLP_TREE_SCALAR_STMTS (res) = stmts;
1454 bst_map->put (stmts.copy (), res);
1455
1456 if (*limit == 0)
1457 {
1458 if (dump_enabled_p ())
1459 dump_printf_loc (MSG_NOTE, vect_location,
1460 "SLP discovery limit exceeded\n");
1461 bool existed_p = bst_map->put (stmts, NULL);
1462 gcc_assert (existed_p);
1463 /* Mark the node invalid so we can detect those when still in use
1464 as backedge destinations. */
1465 SLP_TREE_SCALAR_STMTS (res) = vNULL;
1466 SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def;
1467 vect_free_slp_tree (res);
1468 memset (matches, 0, sizeof (bool) * group_size);
1469 return NULL;
1470 }
1471 --*limit;
1472
1473 poly_uint64 this_max_nunits = 1;
1474 slp_tree res_ = vect_build_slp_tree_2 (vinfo, res, stmts, group_size,
1475 &this_max_nunits,
1476 matches, limit, tree_size, bst_map);
1477 if (!res_)
1478 {
1479 bool existed_p = bst_map->put (stmts, NULL);
1480 gcc_assert (existed_p);
1481 /* Mark the node invalid so we can detect those when still in use
1482 as backedge destinations. */
1483 SLP_TREE_SCALAR_STMTS (res) = vNULL;
1484 SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def;
1485 vect_free_slp_tree (res);
1486 }
1487 else
1488 {
1489 gcc_assert (res_ == res);
1490 res->max_nunits = this_max_nunits;
1491 vect_update_max_nunits (max_nunits, this_max_nunits);
1492 /* Keep a reference for the bst_map use. */
1493 SLP_TREE_REF_COUNT (res)++;
1494 }
1495 return res_;
1496 }
1497
1498 /* Recursively build an SLP tree starting from NODE.
1499 Fail (and return NULL) if def-stmts are not isomorphic, require data
1500 permutation or are of unsupported types of operation. Otherwise,
1501 return the built SLP node, recording in MATCHES which lanes of the
1502 group matched the first one (MATCHES[0] being false indicates a
1503 fatal mismatch). */
1504
1505 static slp_tree
1506 vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
1507 vec<stmt_vec_info> stmts, unsigned int group_size,
1508 poly_uint64 *max_nunits,
1509 bool *matches, unsigned *limit, unsigned *tree_size,
1510 scalar_stmts_to_slp_tree_map_t *bst_map)
1511 {
1512 unsigned nops, i, this_tree_size = 0;
1513 poly_uint64 this_max_nunits = *max_nunits;
1514
1515 matches[0] = false;
1516
1517 stmt_vec_info stmt_info = stmts[0];
1518 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
1519 nops = gimple_call_num_args (stmt);
1520 else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
1521 {
1522 nops = gimple_num_ops (stmt) - 1;
1523 if (gimple_assign_rhs_code (stmt) == COND_EXPR)
1524 nops++;
1525 }
1526 else if (gphi *phi = dyn_cast <gphi *> (stmt_info->stmt))
1527 nops = gimple_phi_num_args (phi);
1528 else
1529 return NULL;
1530
1531 /* If the SLP node is a PHI (induction or reduction), terminate
1532 the recursion. */
1533 bool *skip_args = XALLOCAVEC (bool, nops);
1534 memset (skip_args, 0, sizeof (bool) * nops);
1535 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
1536 if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
1537 {
1538 tree scalar_type = TREE_TYPE (PHI_RESULT (stmt));
1539 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
1540 group_size);
1541 if (!vect_record_max_nunits (vinfo, stmt_info, group_size, vectype,
1542 max_nunits))
1543 return NULL;
1544
1545 vect_def_type def_type = STMT_VINFO_DEF_TYPE (stmt_info);
1546 if (def_type == vect_induction_def)
1547 {
1548 /* Induction PHIs are not cycles but walk the initial
1549 value. Only for inner loops though; for outer loops
1550 we need to pick up the value from the actual PHIs
1551 to more easily support peeling and epilogue vectorization. */
1552 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1553 if (!nested_in_vect_loop_p (loop, stmt_info))
1554 skip_args[loop_preheader_edge (loop)->dest_idx] = true;
1555 else
1556 loop = loop->inner;
1557 skip_args[loop_latch_edge (loop)->dest_idx] = true;
1558 }
1559 else if (def_type == vect_reduction_def
1560 || def_type == vect_double_reduction_def
1561 || def_type == vect_nested_cycle)
1562 {
1563 /* Else def types have to match. */
1564 stmt_vec_info other_info;
1565 bool all_same = true;
1566 FOR_EACH_VEC_ELT (stmts, i, other_info)
1567 {
1568 if (STMT_VINFO_DEF_TYPE (other_info) != def_type)
1569 return NULL;
1570 if (other_info != stmt_info)
1571 all_same = false;
1572 }
1573 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1574 /* Reduction initial values are not explicitly represented. */
1575 if (!nested_in_vect_loop_p (loop, stmt_info))
1576 skip_args[loop_preheader_edge (loop)->dest_idx] = true;
1577 /* Reduction chain backedge defs are filled manually.
1578 ??? Need a better way to identify a SLP reduction chain PHI.
1579 Or a better overall way to SLP match those. */
1580 if (all_same && def_type == vect_reduction_def)
1581 skip_args[loop_latch_edge (loop)->dest_idx] = true;
1582 }
1583 else if (def_type != vect_internal_def)
1584 return NULL;
1585 }
1586
1587
1588 bool two_operators = false;
1589 unsigned char *swap = XALLOCAVEC (unsigned char, group_size);
1590 tree vectype = NULL_TREE;
1591 if (!vect_build_slp_tree_1 (vinfo, swap, stmts, group_size,
1592 &this_max_nunits, matches, &two_operators,
1593 &vectype))
1594 return NULL;
1595
1596 /* If the SLP node is a load, terminate the recursion unless masked. */
1597 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1598 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1599 {
1600 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
1601 {
1602 /* Masked load. */
1603 gcc_assert (gimple_call_internal_p (stmt, IFN_MASK_LOAD));
1604 nops = 1;
1605 }
1606 else
1607 {
1608 *max_nunits = this_max_nunits;
1609 (*tree_size)++;
1610 node = vect_create_new_slp_node (node, stmts, 0);
1611 SLP_TREE_VECTYPE (node) = vectype;
1612 /* And compute the load permutation. Whether it is actually
1613 a permutation depends on the unrolling factor which is
1614 decided later. */
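/* For example, if the interleaving group starts at a[0] and the node's
   lanes load a[2], a[3], a[0] and a[1], the recorded load permutation
   is { 2, 3, 0, 1 }.  */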
1615 vec<unsigned> load_permutation;
1616 int j;
1617 stmt_vec_info load_info;
1618 load_permutation.create (group_size);
1619 stmt_vec_info first_stmt_info
1620 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
1621 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
1622 {
1623 int load_place = vect_get_place_in_interleaving_chain
1624 (load_info, first_stmt_info);
1625 gcc_assert (load_place != -1);
1626 load_permutation.safe_push (load_place);
1627 }
1628 SLP_TREE_LOAD_PERMUTATION (node) = load_permutation;
1629 return node;
1630 }
1631 }
1632 else if (gimple_assign_single_p (stmt_info->stmt)
1633 && !gimple_vuse (stmt_info->stmt)
1634 && gimple_assign_rhs_code (stmt_info->stmt) == BIT_FIELD_REF)
1635 {
1636 /* vect_build_slp_tree_1 determined all BIT_FIELD_REFs reference
1637 the same SSA name vector of a type compatible with vectype. */
1638 vec<std::pair<unsigned, unsigned> > lperm = vNULL;
1639 tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0);
1640 stmt_vec_info estmt_info;
1641 FOR_EACH_VEC_ELT (stmts, i, estmt_info)
1642 {
1643 gassign *estmt = as_a <gassign *> (estmt_info->stmt);
1644 tree bfref = gimple_assign_rhs1 (estmt);
1645 HOST_WIDE_INT lane;
1646 if (!known_eq (bit_field_size (bfref),
1647 tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (vectype))))
1648 || !constant_multiple_p (bit_field_offset (bfref),
1649 bit_field_size (bfref), &lane))
1650 {
1651 lperm.release ();
1652 return NULL;
1653 }
1654 lperm.safe_push (std::make_pair (0, (unsigned)lane));
1655 }
1656 slp_tree vnode = vect_create_new_slp_node (vNULL);
1657 /* ??? We record vectype here but we hide eventually necessary
1658 punning and instead rely on code generation to materialize
1659 VIEW_CONVERT_EXPRs as necessary. We instead should make
1660 this explicit somehow. */
1661 SLP_TREE_VECTYPE (vnode) = vectype;
1662 SLP_TREE_VEC_DEFS (vnode).safe_push (vec);
1663 /* We are always building a permutation node even if it is an identity
1664 permute to shield the rest of the vectorizer from the odd node
1665 representing an actual vector without any scalar ops.
1666 ??? We could hide it completely with making the permute node
1667 external? */
1668 node = vect_create_new_slp_node (node, stmts, 1);
1669 SLP_TREE_CODE (node) = VEC_PERM_EXPR;
1670 SLP_TREE_LANE_PERMUTATION (node) = lperm;
1671 SLP_TREE_VECTYPE (node) = vectype;
1672 SLP_TREE_CHILDREN (node).quick_push (vnode);
1673 return node;
1674 }
1675
1676 /* Get at the operands, verifying they are compatible. */
1677 vec<slp_oprnd_info> oprnds_info = vect_create_oprnd_info (nops, group_size);
1678 slp_oprnd_info oprnd_info;
1679 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
1680 {
1681 int res = vect_get_and_check_slp_defs (vinfo, swap[i], skip_args,
1682 stmts, i, &oprnds_info);
1683 if (res != 0)
1684 matches[(res == -1) ? 0 : i] = false;
1685 if (!matches[0])
1686 break;
1687 }
1688 for (i = 0; i < group_size; ++i)
1689 if (!matches[i])
1690 {
1691 vect_free_oprnd_info (oprnds_info);
1692 return NULL;
1693 }
1694 swap = NULL;
1695
1696 auto_vec<slp_tree, 4> children;
1697
1698 stmt_info = stmts[0];
1699
1700 /* Create SLP_TREE nodes for the definition node/s. */
1701 FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
1702 {
1703 slp_tree child;
1704 unsigned int j;
1705
1706 /* We're skipping certain operands from processing, for example
1707 outer loop reduction initial defs. */
1708 if (skip_args[i])
1709 {
1710 children.safe_push (NULL);
1711 continue;
1712 }
1713
1714 if (oprnd_info->first_dt == vect_uninitialized_def)
1715 {
1716 /* A COND_EXPR has one operand too many here if the condition
1717 is an SSA name rather than a comparison. */
1718 gcc_assert (i == 3 && nops == 4);
1719 continue;
1720 }
1721
1722 if (is_a <bb_vec_info> (vinfo)
1723 && oprnd_info->first_dt == vect_internal_def
1724 && !oprnd_info->any_pattern)
1725 {
1726 /* For BB vectorization, if all defs are the same do not
1727 bother to continue the build along the single-lane
1728 graph but use a splat of the scalar value. */
1729 stmt_vec_info first_def = oprnd_info->def_stmts[0];
1730 for (j = 1; j < group_size; ++j)
1731 if (oprnd_info->def_stmts[j] != first_def)
1732 break;
1733 if (j == group_size
1734 /* But avoid doing this for loads where we may be
1735 able to CSE things, unless the stmt is not
1736 vectorizable. */
1737 && (!STMT_VINFO_VECTORIZABLE (first_def)
1738 || !gimple_vuse (first_def->stmt)))
1739 {
1740 if (dump_enabled_p ())
1741 dump_printf_loc (MSG_NOTE, vect_location,
1742 "Using a splat of the uniform operand\n");
1743 oprnd_info->first_dt = vect_external_def;
1744 }
1745 }
1746
1747 if (oprnd_info->first_dt == vect_external_def
1748 || oprnd_info->first_dt == vect_constant_def)
1749 {
1750 slp_tree invnode = vect_create_new_slp_node (oprnd_info->ops);
1751 SLP_TREE_DEF_TYPE (invnode) = oprnd_info->first_dt;
1752 oprnd_info->ops = vNULL;
1753 children.safe_push (invnode);
1754 continue;
1755 }
1756
1757 if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
1758 group_size, &this_max_nunits,
1759 matches, limit,
1760 &this_tree_size, bst_map)) != NULL)
1761 {
1762 oprnd_info->def_stmts = vNULL;
1763 children.safe_push (child);
1764 continue;
1765 }
1766
1767 /* If the SLP build for operand zero failed and operand zero
1768 and one can be commutated try that for the scalar stmts
1769 that failed the match. */
1770 if (i == 0
1771 /* A first scalar stmt mismatch signals a fatal mismatch. */
1772 && matches[0]
1773 /* ??? For COND_EXPRs we can swap the comparison operands
1774 as well as the arms under some constraints. */
1775 && nops == 2
1776 && oprnds_info[1]->first_dt == vect_internal_def
1777 && is_gimple_assign (stmt_info->stmt)
1778 /* Swapping operands for reductions breaks assumptions later on. */
1779 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
1780 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
1781 {
1782 /* See whether we can swap the matching or the non-matching
1783 stmt operands. */
1784 bool swap_not_matching = true;
1785 do
1786 {
1787 for (j = 0; j < group_size; ++j)
1788 {
1789 if (matches[j] != !swap_not_matching)
1790 continue;
1791 stmt_vec_info stmt_info = stmts[j];
1792 /* Verify if we can swap operands of this stmt. */
1793 gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt);
1794 if (!stmt
1795 || !commutative_tree_code (gimple_assign_rhs_code (stmt)))
1796 {
1797 if (!swap_not_matching)
1798 goto fail;
1799 swap_not_matching = false;
1800 break;
1801 }
1802 }
1803 }
1804 while (j != group_size);
1805
1806 /* Swap mismatched definition stmts. */
1807 if (dump_enabled_p ())
1808 dump_printf_loc (MSG_NOTE, vect_location,
1809 "Re-trying with swapped operands of stmts ");
1810 for (j = 0; j < group_size; ++j)
1811 if (matches[j] == !swap_not_matching)
1812 {
1813 std::swap (oprnds_info[0]->def_stmts[j],
1814 oprnds_info[1]->def_stmts[j]);
1815 std::swap (oprnds_info[0]->ops[j],
1816 oprnds_info[1]->ops[j]);
1817 if (dump_enabled_p ())
1818 dump_printf (MSG_NOTE, "%d ", j);
1819 }
1820 if (dump_enabled_p ())
1821 dump_printf (MSG_NOTE, "\n");
1822 /* After swapping some operands we lose track of whether an
1823 operand has any pattern defs, so be conservative here.  */
1824 if (oprnds_info[0]->any_pattern || oprnds_info[1]->any_pattern)
1825 oprnds_info[0]->any_pattern = oprnds_info[1]->any_pattern = true;
1826 /* And try again with scratch 'matches' ... */
1827 bool *tem = XALLOCAVEC (bool, group_size);
1828 if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
1829 group_size, &this_max_nunits,
1830 tem, limit,
1831 &this_tree_size, bst_map)) != NULL)
1832 {
1833 oprnd_info->def_stmts = vNULL;
1834 children.safe_push (child);
1835 continue;
1836 }
1837 }
1838 fail:
1839
1840 /* If the SLP build failed and we are analyzing a basic block,
1841 simply treat nodes we fail to build as externally defined
1842 (and thus build vectors from the scalar defs).
1843 The cost model will reject outright expensive cases.
1844 ??? This doesn't treat cases where permutation ultimately
1845 fails (or we don't try permutation below).  Ideally we'd
1846 even compute a permutation that will end up with the maximum
1847 SLP tree size...  */
1848 if (is_a <bb_vec_info> (vinfo)
1849 /* ??? Rejecting patterns this way doesn't work. We'd have to
1850 do extra work to cancel the pattern so the uses see the
1851 scalar version. */
1852 && !is_pattern_stmt_p (stmt_info)
1853 && !oprnd_info->any_pattern)
1854 {
1855 /* But if there's a leading vector-sized set of matching stmts,
1856 fail here so we can split the group.  This matches the condition
1857 vect_analyze_slp_instance uses.  */
1858 /* ??? We might want to split here and combine the results to support
1859 multiple vector sizes better. */
1860 for (j = 0; j < group_size; ++j)
1861 if (!matches[j])
1862 break;
1863 if (!known_ge (j, TYPE_VECTOR_SUBPARTS (vectype)))
1864 {
1865 if (dump_enabled_p ())
1866 dump_printf_loc (MSG_NOTE, vect_location,
1867 "Building vector operands from scalars\n");
1868 this_tree_size++;
1869 child = vect_create_new_slp_node (oprnd_info->ops);
1870 children.safe_push (child);
1871 oprnd_info->ops = vNULL;
1872 continue;
1873 }
1874 }
1875
1876 gcc_assert (child == NULL);
1877 FOR_EACH_VEC_ELT (children, j, child)
1878 if (child)
1879 vect_free_slp_tree (child);
1880 vect_free_oprnd_info (oprnds_info);
1881 return NULL;
1882 }
1883
1884 vect_free_oprnd_info (oprnds_info);
1885
1886 /* If all children of this node are built up from uniform scalars,
1887 or if building it requires more than one possibly expensive vector
1888 construction, just throw it away, causing it to be built up from
1889 scalars instead.  The exception is the SLP node for the vector store.  */
1890 if (is_a <bb_vec_info> (vinfo)
1891 && !STMT_VINFO_GROUPED_ACCESS (stmt_info)
1892 /* ??? Rejecting patterns this way doesn't work. We'd have to
1893 do extra work to cancel the pattern so the uses see the
1894 scalar version. */
1895 && !is_pattern_stmt_p (stmt_info))
1896 {
1897 slp_tree child;
1898 unsigned j;
1899 bool all_uniform_p = true;
1900 unsigned n_vector_builds = 0;
1901 FOR_EACH_VEC_ELT (children, j, child)
1902 {
1903 if (!child)
1904 ;
1905 else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
1906 all_uniform_p = false;
1907 else if (!vect_slp_tree_uniform_p (child))
1908 {
1909 all_uniform_p = false;
1910 if (SLP_TREE_DEF_TYPE (child) == vect_external_def)
1911 n_vector_builds++;
1912 }
1913 }
1914 if (all_uniform_p
1915 || n_vector_builds > 1
1916 || (n_vector_builds == children.length ()
1917 && is_a <gphi *> (stmt_info->stmt)))
1918 {
1919 /* Roll back. */
1920 matches[0] = false;
1921 FOR_EACH_VEC_ELT (children, j, child)
1922 if (child)
1923 vect_free_slp_tree (child);
1924
1925 if (dump_enabled_p ())
1926 dump_printf_loc (MSG_NOTE, vect_location,
1927 "Building parent vector operands from "
1928 "scalars instead\n");
1929 return NULL;
1930 }
1931 }
1932
1933 *tree_size += this_tree_size + 1;
1934 *max_nunits = this_max_nunits;
1935
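/* For example, given a two-operator group, say
   { a0 + b0, a1 - b1, a2 + b2, a3 - b3 },
   the code below builds two internal nodes: ONE carrying the PLUS_EXPR
   lanes and TWO carrying the MINUS_EXPR lanes, both sharing the same
   children.  The original node becomes a VEC_PERM_EXPR whose lane
   permutation is { (0,0), (1,1), (0,2), (1,3) }, i.e. lane i is taken
   from ONE when stmt i matches the code of the first stmt and from
   TWO otherwise.  */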
1936 if (two_operators)
1937 {
1938 /* ??? We'd likely want to either cache in bst_map something like
1939 { a+b, NULL, a+b, NULL } and { NULL, a-b, NULL, a-b } or
1940 the true { a+b, a+b, a+b, a+b } ... but there we don't have
1941 explicit stmts to put in so the keying on 'stmts' doesn't
1942 work (but we have the same issue with nodes that use 'ops'). */
1943 slp_tree one = new _slp_tree;
1944 slp_tree two = new _slp_tree;
1945 SLP_TREE_DEF_TYPE (one) = vect_internal_def;
1946 SLP_TREE_DEF_TYPE (two) = vect_internal_def;
1947 SLP_TREE_VECTYPE (one) = vectype;
1948 SLP_TREE_VECTYPE (two) = vectype;
1949 SLP_TREE_CHILDREN (one).safe_splice (children);
1950 SLP_TREE_CHILDREN (two).safe_splice (children);
1951 slp_tree child;
1952 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (two), i, child)
1953 SLP_TREE_REF_COUNT (child)++;
1954
1955 /* Here we record the original defs since this
1956 node represents the final lane configuration. */
1957 node = vect_create_new_slp_node (node, stmts, 2);
1958 SLP_TREE_VECTYPE (node) = vectype;
1959 SLP_TREE_CODE (node) = VEC_PERM_EXPR;
1960 SLP_TREE_CHILDREN (node).quick_push (one);
1961 SLP_TREE_CHILDREN (node).quick_push (two);
1962 gassign *stmt = as_a <gassign *> (stmts[0]->stmt);
1963 enum tree_code code0 = gimple_assign_rhs_code (stmt);
1964 enum tree_code ocode = ERROR_MARK;
1965 stmt_vec_info ostmt_info;
1966 unsigned j = 0;
1967 FOR_EACH_VEC_ELT (stmts, i, ostmt_info)
1968 {
1969 gassign *ostmt = as_a <gassign *> (ostmt_info->stmt);
1970 if (gimple_assign_rhs_code (ostmt) != code0)
1971 {
1972 SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (1, i));
1973 ocode = gimple_assign_rhs_code (ostmt);
1974 j = i;
1975 }
1976 else
1977 SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (0, i));
1978 }
1979 SLP_TREE_CODE (one) = code0;
1980 SLP_TREE_CODE (two) = ocode;
1981 SLP_TREE_LANES (one) = stmts.length ();
1982 SLP_TREE_LANES (two) = stmts.length ();
1983 SLP_TREE_REPRESENTATIVE (one) = stmts[0];
1984 SLP_TREE_REPRESENTATIVE (two) = stmts[j];
1985 return node;
1986 }
1987
1988 node = vect_create_new_slp_node (node, stmts, nops);
1989 SLP_TREE_VECTYPE (node) = vectype;
1990 SLP_TREE_CHILDREN (node).splice (children);
1991 return node;
1992 }
1993
1994 /* Dump a single SLP tree NODE. */
1995
1996 static void
1997 vect_print_slp_tree (dump_flags_t dump_kind, dump_location_t loc,
1998 slp_tree node)
1999 {
2000 unsigned i, j;
2001 slp_tree child;
2002 stmt_vec_info stmt_info;
2003 tree op;
2004
2005 dump_metadata_t metadata (dump_kind, loc.get_impl_location ());
2006 dump_user_location_t user_loc = loc.get_user_location ();
2007 dump_printf_loc (metadata, user_loc, "node%s %p (max_nunits=%u, refcnt=%u)\n",
2008 SLP_TREE_DEF_TYPE (node) == vect_external_def
2009 ? " (external)"
2010 : (SLP_TREE_DEF_TYPE (node) == vect_constant_def
2011 ? " (constant)"
2012 : ""), node,
2013 estimated_poly_value (node->max_nunits),
2014 SLP_TREE_REF_COUNT (node));
2015 if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
2016 {
2017 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
2018 dump_printf_loc (metadata, user_loc, "op: VEC_PERM_EXPR\n");
2019 else
2020 dump_printf_loc (metadata, user_loc, "op template: %G",
2021 SLP_TREE_REPRESENTATIVE (node)->stmt);
2022 }
2023 if (SLP_TREE_SCALAR_STMTS (node).exists ())
2024 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2025 dump_printf_loc (metadata, user_loc, "\tstmt %u %G", i, stmt_info->stmt);
2026 else
2027 {
2028 dump_printf_loc (metadata, user_loc, "\t{ ");
2029 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
2030 dump_printf (metadata, "%T%s ", op,
2031 i < SLP_TREE_SCALAR_OPS (node).length () - 1 ? "," : "");
2032 dump_printf (metadata, "}\n");
2033 }
2034 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2035 {
2036 dump_printf_loc (metadata, user_loc, "\tload permutation {");
2037 FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (node), i, j)
2038 dump_printf (dump_kind, " %u", j);
2039 dump_printf (dump_kind, " }\n");
2040 }
2041 if (SLP_TREE_LANE_PERMUTATION (node).exists ())
2042 {
2043 dump_printf_loc (metadata, user_loc, "\tlane permutation {");
2044 for (i = 0; i < SLP_TREE_LANE_PERMUTATION (node).length (); ++i)
2045 dump_printf (dump_kind, " %u[%u]",
2046 SLP_TREE_LANE_PERMUTATION (node)[i].first,
2047 SLP_TREE_LANE_PERMUTATION (node)[i].second);
2048 dump_printf (dump_kind, " }\n");
2049 }
2050 if (SLP_TREE_CHILDREN (node).is_empty ())
2051 return;
2052 dump_printf_loc (metadata, user_loc, "\tchildren");
2053 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2054 dump_printf (dump_kind, " %p", (void *)child);
2055 dump_printf (dump_kind, "\n");
2056 }
2057
2058 DEBUG_FUNCTION void
2059 debug (slp_tree node)
2060 {
2061 debug_dump_context ctx;
2062 vect_print_slp_tree (MSG_NOTE,
2063 dump_location_t::from_location_t (UNKNOWN_LOCATION),
2064 node);
2065 }
2066
2067 /* Dump a slp tree NODE using flags specified in DUMP_KIND. */
2068
2069 static void
2070 vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
2071 slp_tree node, hash_set<slp_tree> &visited)
2072 {
2073 unsigned i;
2074 slp_tree child;
2075
2076 if (visited.add (node))
2077 return;
2078
2079 vect_print_slp_tree (dump_kind, loc, node);
2080
2081 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2082 if (child)
2083 vect_print_slp_graph (dump_kind, loc, child, visited);
2084 }
2085
2086 static void
2087 vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
2088 slp_tree entry)
2089 {
2090 hash_set<slp_tree> visited;
2091 vect_print_slp_graph (dump_kind, loc, entry, visited);
2092 }
2093
2094 /* Mark the tree rooted at NODE with PURE_SLP. */
2095
2096 static void
2097 vect_mark_slp_stmts (slp_tree node, hash_set<slp_tree> &visited)
2098 {
2099 int i;
2100 stmt_vec_info stmt_info;
2101 slp_tree child;
2102
2103 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2104 return;
2105
2106 if (visited.add (node))
2107 return;
2108
2109 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2110 STMT_SLP_TYPE (stmt_info) = pure_slp;
2111
2112 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2113 if (child)
2114 vect_mark_slp_stmts (child, visited);
2115 }
2116
2117 static void
2118 vect_mark_slp_stmts (slp_tree node)
2119 {
2120 hash_set<slp_tree> visited;
2121 vect_mark_slp_stmts (node, visited);
2122 }
2123
2124 /* Mark the statements of the tree rooted at NODE as relevant (vect_used). */
2125
2126 static void
2127 vect_mark_slp_stmts_relevant (slp_tree node, hash_set<slp_tree> &visited)
2128 {
2129 int i;
2130 stmt_vec_info stmt_info;
2131 slp_tree child;
2132
2133 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2134 return;
2135
2136 if (visited.add (node))
2137 return;
2138
2139 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2140 {
2141 gcc_assert (!STMT_VINFO_RELEVANT (stmt_info)
2142 || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope);
2143 STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope;
2144 }
2145
2146 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2147 if (child)
2148 vect_mark_slp_stmts_relevant (child, visited);
2149 }
2150
2151 static void
2152 vect_mark_slp_stmts_relevant (slp_tree node)
2153 {
2154 hash_set<slp_tree> visited;
2155 vect_mark_slp_stmts_relevant (node, visited);
2156 }
2157
2158
2159 /* Gather loads in the SLP graph rooted at NODE and populate the LOADS array.  */
2160
2161 static void
2162 vect_gather_slp_loads (vec<slp_tree> &loads, slp_tree node,
2163 hash_set<slp_tree> &visited)
2164 {
2165 if (!node || visited.add (node))
2166 return;
2167
2168 if (SLP_TREE_CHILDREN (node).length () == 0)
2169 {
2170 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2171 return;
2172 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2173 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2174 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
2175 loads.safe_push (node);
2176 }
2177 else
2178 {
2179 unsigned i;
2180 slp_tree child;
2181 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2182 vect_gather_slp_loads (loads, child, visited);
2183 }
2184 }
2185
2186
2187 /* Find the last scalar stmt in NODE.  */
2188
2189 stmt_vec_info
2190 vect_find_last_scalar_stmt_in_slp (slp_tree node)
2191 {
2192 stmt_vec_info last = NULL;
2193 stmt_vec_info stmt_vinfo;
2194
2195 for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
2196 {
2197 stmt_vinfo = vect_orig_stmt (stmt_vinfo);
2198 last = last ? get_later_stmt (stmt_vinfo, last) : stmt_vinfo;
2199 }
2200
2201 return last;
2202 }
2203
2204 /* Find the first stmt in NODE. */
2205
2206 stmt_vec_info
2207 vect_find_first_scalar_stmt_in_slp (slp_tree node)
2208 {
2209 stmt_vec_info first = NULL;
2210 stmt_vec_info stmt_vinfo;
2211
2212 for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
2213 {
2214 stmt_vinfo = vect_orig_stmt (stmt_vinfo);
2215 if (!first
2216 || get_later_stmt (stmt_vinfo, first) == first)
2217 first = stmt_vinfo;
2218 }
2219
2220 return first;
2221 }
2222
2223 /* Splits a group of stores, currently beginning at FIRST_VINFO, into
2224 two groups: one (still beginning at FIRST_VINFO) of size GROUP1_SIZE
2225 (also containing the first GROUP1_SIZE stmts, since stores are
2226 consecutive), the second containing the remainder.
2227 Return the first stmt in the second group. */
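/* For example, splitting a store group of size 7 with GROUP1_SIZE 4
   leaves the first group with DR_GROUP_SIZE 4 and the second with
   DR_GROUP_SIZE 3.  The second group's DR_GROUP_GAP becomes the
   original gap plus 4 (it starts after the first group's stmts),
   while the first group's DR_GROUP_GAP grows by 3 so that its
   accesses still skip over the elements now owned by the second
   group.  */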
2228
2229 static stmt_vec_info
2230 vect_split_slp_store_group (stmt_vec_info first_vinfo, unsigned group1_size)
2231 {
2232 gcc_assert (DR_GROUP_FIRST_ELEMENT (first_vinfo) == first_vinfo);
2233 gcc_assert (group1_size > 0);
2234 int group2_size = DR_GROUP_SIZE (first_vinfo) - group1_size;
2235 gcc_assert (group2_size > 0);
2236 DR_GROUP_SIZE (first_vinfo) = group1_size;
2237
2238 stmt_vec_info stmt_info = first_vinfo;
2239 for (unsigned i = group1_size; i > 1; i--)
2240 {
2241 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info);
2242 gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
2243 }
2244 /* STMT is now the last element of the first group. */
2245 stmt_vec_info group2 = DR_GROUP_NEXT_ELEMENT (stmt_info);
2246 DR_GROUP_NEXT_ELEMENT (stmt_info) = 0;
2247
2248 DR_GROUP_SIZE (group2) = group2_size;
2249 for (stmt_info = group2; stmt_info;
2250 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info))
2251 {
2252 DR_GROUP_FIRST_ELEMENT (stmt_info) = group2;
2253 gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
2254 }
2255
2256 /* For the second group, the DR_GROUP_GAP is that before the original group,
2257 plus skipping over the first vector. */
2258 DR_GROUP_GAP (group2) = DR_GROUP_GAP (first_vinfo) + group1_size;
2259
2260 /* DR_GROUP_GAP of the first group now has to skip over the second group too. */
2261 DR_GROUP_GAP (first_vinfo) += group2_size;
2262
2263 if (dump_enabled_p ())
2264 dump_printf_loc (MSG_NOTE, vect_location, "Split group into %d and %d\n",
2265 group1_size, group2_size);
2266
2267 return group2;
2268 }
2269
2270 /* Calculate the unrolling factor for an SLP instance with GROUP_SIZE
2271 statements and a vector of NUNITS elements. */
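/* For example, with 4-element vectors (NUNITS 4) and a group of 6
   stmts the common multiple is 12, so the unrolling factor is
   12 / 6 = 2: two copies of the group are needed to fill whole
   vectors.  With NUNITS 4 and GROUP_SIZE 8 the factor is 1.  */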
2272
2273 static poly_uint64
2274 calculate_unrolling_factor (poly_uint64 nunits, unsigned int group_size)
2275 {
2276 return exact_div (common_multiple (nunits, group_size), group_size);
2277 }
2278
2279 /* Helper that checks whether a node is a load node.  */
2280
2281 static inline bool
2282 vect_is_slp_load_node (slp_tree root)
2283 {
2284 return SLP_TREE_DEF_TYPE (root) == vect_internal_def
2285 && STMT_VINFO_GROUPED_ACCESS (SLP_TREE_REPRESENTATIVE (root))
2286 && DR_IS_READ (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (root)));
2287 }
2288
2289
2290 /* Helper function of optimize_load_redistribution that performs the operation
2291 recursively. */
2292
2293 static slp_tree
2294 optimize_load_redistribution_1 (scalar_stmts_to_slp_tree_map_t *bst_map,
2295 vec_info *vinfo, unsigned int group_size,
2296 hash_map<slp_tree, slp_tree> *load_map,
2297 slp_tree root)
2298 {
2299 if (slp_tree *leader = load_map->get (root))
2300 return *leader;
2301
2302 slp_tree node;
2303 unsigned i;
2304
2305 /* For now, we don't know anything about externals so do not do anything. */
2306 if (!root || SLP_TREE_DEF_TYPE (root) != vect_internal_def)
2307 return NULL;
2308 else if (SLP_TREE_CODE (root) == VEC_PERM_EXPR)
2309 {
2310 /* First convert this node into a load node and add it to the leaves
2311 list, flattening the permute from a lane permutation into a load
2312 one.  If it's unneeded it will be elided later.  */
2313 vec<stmt_vec_info> stmts;
2314 stmts.create (SLP_TREE_LANES (root));
2315 lane_permutation_t lane_perm = SLP_TREE_LANE_PERMUTATION (root);
2316 for (unsigned j = 0; j < lane_perm.length (); j++)
2317 {
2318 std::pair<unsigned, unsigned> perm = lane_perm[j];
2319 node = SLP_TREE_CHILDREN (root)[perm.first];
2320
2321 if (!vect_is_slp_load_node (node)
2322 || SLP_TREE_CHILDREN (node).exists ())
2323 {
2324 stmts.release ();
2325 goto next;
2326 }
2327
2328 stmts.quick_push (SLP_TREE_SCALAR_STMTS (node)[perm.second]);
2329 }
2330
2331 if (dump_enabled_p ())
2332 dump_printf_loc (MSG_NOTE, vect_location,
2333 "converting stmts on permute node %p\n", root);
2334
2335 bool *matches = XALLOCAVEC (bool, group_size);
2336 poly_uint64 max_nunits = 1;
2337 unsigned tree_size = 0, limit = 1;
2338 node = vect_build_slp_tree (vinfo, stmts, group_size, &max_nunits,
2339 matches, &limit, &tree_size, bst_map);
2340 if (!node)
2341 stmts.release ();
2342
2343 load_map->put (root, node);
2344 return node;
2345 }
2346
2347 next:
2348 load_map->put (root, NULL);
2349
2350 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
2351 {
2352 slp_tree value
2353 = optimize_load_redistribution_1 (bst_map, vinfo, group_size, load_map,
2354 node);
2355 if (value)
2356 {
2357 SLP_TREE_REF_COUNT (value)++;
2358 SLP_TREE_CHILDREN (root)[i] = value;
2359 /* ??? We know the original leafs of the replaced nodes will
2360 be referenced by bst_map, only the permutes created by
2361 pattern matching are not. */
2362 if (SLP_TREE_REF_COUNT (node) == 1)
2363 load_map->remove (node);
2364 vect_free_slp_tree (node);
2365 }
2366 }
2367
2368 return NULL;
2369 }
2370
2371 /* Temporary workaround for loads not being CSEd during SLP build. This
2372 function will traverse the SLP tree rooted in ROOT for INSTANCE and find
2373 VEC_PERM nodes that blend vectors from multiple nodes that all read from the
2374 same DR such that the final operation is equal to a permuted load. Such
2375 NODES are then directly converted into LOADS themselves. The nodes are
2376 CSEd using BST_MAP. */
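/* For example, a VEC_PERM node whose lane permutation selects lane 0
   from a load of { a[0], a[1] } and lane 1 from a load of
   { a[2], a[3] } (both leaves reading the same DR group) is rebuilt
   as a single load node for the scalar stmts { a[0], a[3] }, i.e. a
   permuted load.  */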
2377
2378 static void
2379 optimize_load_redistribution (scalar_stmts_to_slp_tree_map_t *bst_map,
2380 vec_info *vinfo, unsigned int group_size,
2381 hash_map<slp_tree, slp_tree> *load_map,
2382 slp_tree root)
2383 {
2384 slp_tree node;
2385 unsigned i;
2386
2387 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
2388 {
2389 slp_tree value
2390 = optimize_load_redistribution_1 (bst_map, vinfo, group_size, load_map,
2391 node);
2392 if (value)
2393 {
2394 SLP_TREE_REF_COUNT (value)++;
2395 SLP_TREE_CHILDREN (root)[i] = value;
2396 /* ??? We know the original leafs of the replaced nodes will
2397 be referenced by bst_map, only the permutes created by
2398 pattern matching are not. */
2399 if (SLP_TREE_REF_COUNT (node) == 1)
2400 load_map->remove (node);
2401 vect_free_slp_tree (node);
2402 }
2403 }
2404 }
2405
2406 /* Helper function of vect_match_slp_patterns.
2407
2408 Attempts to match patterns against the slp tree rooted in REF_NODE using
2409 VINFO. Patterns are matched in post-order traversal.
2410
2411 If any pattern matched, the tree in REF_NODE is updated in place and
2412 true is returned; otherwise false is returned.  */
2413
2414 static bool
2415 vect_match_slp_patterns_2 (slp_tree *ref_node, vec_info *vinfo,
2416 slp_tree_to_load_perm_map_t *perm_cache,
2417 slp_compat_nodes_map_t *compat_cache,
2418 hash_set<slp_tree> *visited)
2419 {
2420 unsigned i;
2421 slp_tree node = *ref_node;
2422 bool found_p = false;
2423 if (!node || visited->add (node))
2424 return false;
2425
2426 slp_tree child;
2427 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2428 found_p |= vect_match_slp_patterns_2 (&SLP_TREE_CHILDREN (node)[i],
2429 vinfo, perm_cache, compat_cache,
2430 visited);
2431
2432 for (unsigned x = 0; x < num__slp_patterns; x++)
2433 {
2434 vect_pattern *pattern
2435 = slp_patterns[x] (perm_cache, compat_cache, ref_node);
2436 if (pattern)
2437 {
2438 pattern->build (vinfo);
2439 delete pattern;
2440 found_p = true;
2441 }
2442 }
2443
2444 return found_p;
2445 }
2446
2447 /* Applies pattern matching to the SLP tree of INSTANCE using vec_info
2448 VINFO.
2449 
2450 The tree is modified in place and true is returned if any pattern
2451 matched.  Patterns are tried in order and multiple patterns may match.  */
2452
2453 static bool
2454 vect_match_slp_patterns (slp_instance instance, vec_info *vinfo,
2455 hash_set<slp_tree> *visited,
2456 slp_tree_to_load_perm_map_t *perm_cache,
2457 slp_compat_nodes_map_t *compat_cache)
2458 {
2459 DUMP_VECT_SCOPE ("vect_match_slp_patterns");
2460 slp_tree *ref_node = &SLP_INSTANCE_TREE (instance);
2461
2462 if (dump_enabled_p ())
2463 dump_printf_loc (MSG_NOTE, vect_location,
2464 "Analyzing SLP tree %p for patterns\n",
2465 SLP_INSTANCE_TREE (instance));
2466
2467 return vect_match_slp_patterns_2 (ref_node, vinfo, perm_cache, compat_cache,
2468 visited);
2469 }
2470
2471 /* STMT_INFO is a store group of size GROUP_SIZE that we are considering
2472 splitting into two, with the first split group having size NEW_GROUP_SIZE.
2473 Return true if we could use IFN_STORE_LANES instead and if that appears
2474 to be the better approach. */
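/* For example, splitting a group of 3 into 2 + 1 with two-element
   vectors is allowed (the group of 2 fills a vector), so this returns
   false.  Splitting 4 into 3 + 1 with two-element vectors leaves
   neither part a multiple of the vector length, so IFN_STORE_LANES is
   preferred whenever the target supports it for the full group.  */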
2475
2476 static bool
2477 vect_slp_prefer_store_lanes_p (vec_info *vinfo, stmt_vec_info stmt_info,
2478 unsigned int group_size,
2479 unsigned int new_group_size)
2480 {
2481 tree scalar_type = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
2482 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
2483 if (!vectype)
2484 return false;
2485 /* Allow the split if one of the two new groups would operate on full
2486 vectors *within* rather than across one scalar loop iteration.
2487 This is purely a heuristic, but it should work well for group
2488 sizes of 3 and 4, where the possible splits are:
2489
2490 3->2+1: OK if the vector has exactly two elements
2491 4->2+2: Likewise
2492 4->3+1: Less clear-cut. */
2493 if (multiple_p (group_size - new_group_size, TYPE_VECTOR_SUBPARTS (vectype))
2494 || multiple_p (new_group_size, TYPE_VECTOR_SUBPARTS (vectype)))
2495 return false;
2496 return vect_store_lanes_supported (vectype, group_size, false);
2497 }
2498
2499 /* Analyze an SLP instance starting from a group of grouped stores. Call
2500 vect_build_slp_tree to build a tree of packed stmts if possible.
2501 Return FALSE if it's impossible to SLP any stmt in the loop. */
2502
2503 static bool
2504 vect_analyze_slp_instance (vec_info *vinfo,
2505 scalar_stmts_to_slp_tree_map_t *bst_map,
2506 stmt_vec_info stmt_info, slp_instance_kind kind,
2507 unsigned max_tree_size, unsigned *limit);
2508
2509 /* Analyze an SLP instance starting from SCALAR_STMTS which are a group
2510 of KIND. Return true if successful. */
2511
2512 static bool
2513 vect_build_slp_instance (vec_info *vinfo,
2514 slp_instance_kind kind,
2515 vec<stmt_vec_info> &scalar_stmts,
2516 stmt_vec_info root_stmt_info,
2517 unsigned max_tree_size, unsigned *limit,
2518 scalar_stmts_to_slp_tree_map_t *bst_map,
2519 /* ??? We need stmt_info for group splitting. */
2520 stmt_vec_info stmt_info_)
2521 {
2522 if (dump_enabled_p ())
2523 {
2524 dump_printf_loc (MSG_NOTE, vect_location,
2525 "Starting SLP discovery for\n");
2526 for (unsigned i = 0; i < scalar_stmts.length (); ++i)
2527 dump_printf_loc (MSG_NOTE, vect_location,
2528 " %G", scalar_stmts[i]->stmt);
2529 }
2530
2531 /* Build the tree for the SLP instance. */
2532 unsigned int group_size = scalar_stmts.length ();
2533 bool *matches = XALLOCAVEC (bool, group_size);
2534 poly_uint64 max_nunits = 1;
2535 unsigned tree_size = 0;
2536 unsigned i;
2537 slp_tree node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
2538 &max_nunits, matches, limit,
2539 &tree_size, bst_map);
2540 if (node != NULL)
2541 {
2542 /* Calculate the unrolling factor based on the smallest type. */
2543 poly_uint64 unrolling_factor
2544 = calculate_unrolling_factor (max_nunits, group_size);
2545
2546 if (maybe_ne (unrolling_factor, 1U)
2547 && is_a <bb_vec_info> (vinfo))
2548 {
2549 unsigned HOST_WIDE_INT const_max_nunits;
2550 if (!max_nunits.is_constant (&const_max_nunits)
2551 || const_max_nunits > group_size)
2552 {
2553 if (dump_enabled_p ())
2554 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2555 "Build SLP failed: store group "
2556 "size not a multiple of the vector size "
2557 "in basic block SLP\n");
2558 vect_free_slp_tree (node);
2559 return false;
2560 }
2561 /* Fatal mismatch. */
2562 if (dump_enabled_p ())
2563 dump_printf_loc (MSG_NOTE, vect_location,
2564 "SLP discovery succeeded but node needs "
2565 "splitting\n");
2566 memset (matches, true, group_size);
2567 matches[group_size / const_max_nunits * const_max_nunits] = false;
2568 vect_free_slp_tree (node);
2569 }
2570 else
2571 {
2572 /* Create a new SLP instance. */
2573 slp_instance new_instance = XNEW (class _slp_instance);
2574 SLP_INSTANCE_TREE (new_instance) = node;
2575 SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
2576 SLP_INSTANCE_LOADS (new_instance) = vNULL;
2577 SLP_INSTANCE_ROOT_STMT (new_instance) = root_stmt_info;
2578 SLP_INSTANCE_KIND (new_instance) = kind;
2579 new_instance->reduc_phis = NULL;
2580 new_instance->cost_vec = vNULL;
2581 new_instance->subgraph_entries = vNULL;
2582
2583 if (dump_enabled_p ())
2584 dump_printf_loc (MSG_NOTE, vect_location,
2585 "SLP size %u vs. limit %u.\n",
2586 tree_size, max_tree_size);
2587
2588 /* Fixup SLP reduction chains. */
2589 if (kind == slp_inst_kind_reduc_chain)
2590 {
2591 /* If this is a reduction chain with a conversion in front
2592 amend the SLP tree with a node for that. */
2593 gimple *scalar_def
2594 = vect_orig_stmt (scalar_stmts[group_size - 1])->stmt;
2595 if (STMT_VINFO_DEF_TYPE (scalar_stmts[0]) != vect_reduction_def)
2596 {
2597 /* Get at the conversion stmt - we know it's the single use
2598 of the last stmt of the reduction chain. */
2599 use_operand_p use_p;
2600 bool r = single_imm_use (gimple_assign_lhs (scalar_def),
2601 &use_p, &scalar_def);
2602 gcc_assert (r);
2603 stmt_vec_info next_info = vinfo->lookup_stmt (scalar_def);
2604 next_info = vect_stmt_to_vectorize (next_info);
2605 scalar_stmts = vNULL;
2606 scalar_stmts.create (group_size);
2607 for (unsigned i = 0; i < group_size; ++i)
2608 scalar_stmts.quick_push (next_info);
2609 slp_tree conv = vect_create_new_slp_node (scalar_stmts, 1);
2610 SLP_TREE_VECTYPE (conv) = STMT_VINFO_VECTYPE (next_info);
2611 SLP_TREE_CHILDREN (conv).quick_push (node);
2612 SLP_INSTANCE_TREE (new_instance) = conv;
2613 /* We also have to fake this conversion stmt as SLP reduction
2614 group so we don't have to mess with too much code
2615 elsewhere. */
2616 REDUC_GROUP_FIRST_ELEMENT (next_info) = next_info;
2617 REDUC_GROUP_NEXT_ELEMENT (next_info) = NULL;
2618 }
2619 /* Fill the backedge child of the PHI SLP node. The
2620 general matching code cannot find it because the
2621 scalar code does not reflect how we vectorize the
2622 reduction. */
2623 use_operand_p use_p;
2624 imm_use_iterator imm_iter;
2625 class loop *loop = LOOP_VINFO_LOOP (as_a <loop_vec_info> (vinfo));
2626 FOR_EACH_IMM_USE_FAST (use_p, imm_iter,
2627 gimple_get_lhs (scalar_def))
2628 /* There are exactly two non-debug uses, the reduction
2629 PHI and the loop-closed PHI node. */
2630 if (!is_gimple_debug (USE_STMT (use_p))
2631 && gimple_bb (USE_STMT (use_p)) == loop->header)
2632 {
2633 auto_vec<stmt_vec_info, 64> phis (group_size);
2634 stmt_vec_info phi_info
2635 = vinfo->lookup_stmt (USE_STMT (use_p));
2636 for (unsigned i = 0; i < group_size; ++i)
2637 phis.quick_push (phi_info);
2638 slp_tree *phi_node = bst_map->get (phis);
2639 unsigned dest_idx = loop_latch_edge (loop)->dest_idx;
2640 SLP_TREE_CHILDREN (*phi_node)[dest_idx]
2641 = SLP_INSTANCE_TREE (new_instance);
2642 SLP_INSTANCE_TREE (new_instance)->refcnt++;
2643 }
2644 }
2645
2646 vinfo->slp_instances.safe_push (new_instance);
2647
2648 /* ??? We've replaced the old SLP_INSTANCE_GROUP_SIZE with
2649 the number of scalar stmts in the root in a few places.
2650 Verify that assumption holds. */
2651 gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
2652 .length () == group_size);
2653
2654 if (dump_enabled_p ())
2655 {
2656 dump_printf_loc (MSG_NOTE, vect_location,
2657 "Final SLP tree for instance %p:\n", new_instance);
2658 vect_print_slp_graph (MSG_NOTE, vect_location,
2659 SLP_INSTANCE_TREE (new_instance));
2660 }
2661
2662 return true;
2663 }
2664 }
2665 else
2666 {
2667 /* Failed to SLP. */
2668 /* Free the allocated memory. */
2669 scalar_stmts.release ();
2670 }
2671
2672 stmt_vec_info stmt_info = stmt_info_;
2673 /* Try to break the group up into pieces. */
2674 if (kind == slp_inst_kind_store)
2675 {
2676 /* ??? We could delay all the actual splitting of store-groups
2677 until after SLP discovery of the original group completed.
2678 Then we can recurse to vect_build_slp_instance directly. */
2679 for (i = 0; i < group_size; i++)
2680 if (!matches[i])
2681 break;
2682
2683 /* For basic block SLP, try to break the group up into multiples of
2684 a vector size. */
2685 if (is_a <bb_vec_info> (vinfo)
2686 && (i > 1 && i < group_size))
2687 {
2688 tree scalar_type
2689 = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
2690 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
2691 1 << floor_log2 (i));
2692 unsigned HOST_WIDE_INT const_nunits;
2693 if (vectype
2694 && TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits))
2695 {
2696 /* Split into two groups at the first vector boundary. */
2697 gcc_assert ((const_nunits & (const_nunits - 1)) == 0);
2698 unsigned group1_size = i & ~(const_nunits - 1);
2699
2700 if (dump_enabled_p ())
2701 dump_printf_loc (MSG_NOTE, vect_location,
2702 "Splitting SLP group at stmt %u\n", i);
2703 stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
2704 group1_size);
2705 bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
2706 kind, max_tree_size,
2707 limit);
2708 /* Split the rest at the failure point and possibly
2709 re-analyze the remaining matching part if it has
2710 at least two lanes. */
2711 if (group1_size < i
2712 && (i + 1 < group_size
2713 || i - group1_size > 1))
2714 {
2715 stmt_vec_info rest2 = rest;
2716 rest = vect_split_slp_store_group (rest, i - group1_size);
2717 if (i - group1_size > 1)
2718 res |= vect_analyze_slp_instance (vinfo, bst_map, rest2,
2719 kind, max_tree_size,
2720 limit);
2721 }
2722 /* Re-analyze the non-matching tail if it has at least
2723 two lanes. */
2724 if (i + 1 < group_size)
2725 res |= vect_analyze_slp_instance (vinfo, bst_map,
2726 rest, kind, max_tree_size,
2727 limit);
2728 return res;
2729 }
2730 }
2731
2732 /* For loop vectorization split into arbitrary pieces of size > 1. */
2733 if (is_a <loop_vec_info> (vinfo)
2734 && (i > 1 && i < group_size)
2735 && !vect_slp_prefer_store_lanes_p (vinfo, stmt_info, group_size, i))
2736 {
2737 unsigned group1_size = i;
2738
2739 if (dump_enabled_p ())
2740 dump_printf_loc (MSG_NOTE, vect_location,
2741 "Splitting SLP group at stmt %u\n", i);
2742
2743 stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
2744 group1_size);
2745 /* Loop vectorization cannot handle gaps in stores; make sure
2746 the split group appears as strided.  */
2747 STMT_VINFO_STRIDED_P (rest) = 1;
2748 DR_GROUP_GAP (rest) = 0;
2749 STMT_VINFO_STRIDED_P (stmt_info) = 1;
2750 DR_GROUP_GAP (stmt_info) = 0;
2751
2752 bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
2753 kind, max_tree_size, limit);
2754 if (i + 1 < group_size)
2755 res |= vect_analyze_slp_instance (vinfo, bst_map,
2756 rest, kind, max_tree_size, limit);
2757
2758 return res;
2759 }
2760
2761 /* Even though the first vector did not all match, we might be able to SLP
2762 (some) of the remainder. FORNOW ignore this possibility. */
2763 }
2764
2765 /* Failed to SLP. */
2766 if (dump_enabled_p ())
2767 dump_printf_loc (MSG_NOTE, vect_location, "SLP discovery failed\n");
2768 return false;
2769 }
2770
2771
2772 /* Analyze an SLP instance starting from a group of grouped stores. Call
2773 vect_build_slp_tree to build a tree of packed stmts if possible.
2774 Return FALSE if it's impossible to SLP any stmt in the loop. */
2775
2776 static bool
2777 vect_analyze_slp_instance (vec_info *vinfo,
2778 scalar_stmts_to_slp_tree_map_t *bst_map,
2779 stmt_vec_info stmt_info,
2780 slp_instance_kind kind,
2781 unsigned max_tree_size, unsigned *limit)
2782 {
2783 unsigned int i;
2784 vec<stmt_vec_info> scalar_stmts;
2785
2786 if (is_a <bb_vec_info> (vinfo))
2787 vect_location = stmt_info->stmt;
2788
2789 stmt_vec_info next_info = stmt_info;
2790 if (kind == slp_inst_kind_store)
2791 {
2792 /* Collect the stores and store them in scalar_stmts. */
2793 scalar_stmts.create (DR_GROUP_SIZE (stmt_info));
2794 while (next_info)
2795 {
2796 scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info));
2797 next_info = DR_GROUP_NEXT_ELEMENT (next_info);
2798 }
2799 }
2800 else if (kind == slp_inst_kind_reduc_chain)
2801 {
2802 /* Collect the reduction stmts and store them in scalar_stmts. */
2803 scalar_stmts.create (REDUC_GROUP_SIZE (stmt_info));
2804 while (next_info)
2805 {
2806 scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info));
2807 next_info = REDUC_GROUP_NEXT_ELEMENT (next_info);
2808 }
2809 /* Mark the first element of the reduction chain as reduction to properly
2810 transform the node. In the reduction analysis phase only the last
2811 element of the chain is marked as reduction. */
2812 STMT_VINFO_DEF_TYPE (stmt_info)
2813 = STMT_VINFO_DEF_TYPE (scalar_stmts.last ());
2814 STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))
2815 = STMT_VINFO_REDUC_DEF (vect_orig_stmt (scalar_stmts.last ()));
2816 }
2817 else if (kind == slp_inst_kind_ctor)
2818 {
2819 tree rhs = gimple_assign_rhs1 (stmt_info->stmt);
2820 tree val;
2821 scalar_stmts.create (CONSTRUCTOR_NELTS (rhs));
2822 FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), i, val)
2823 {
2824 stmt_vec_info def_info = vinfo->lookup_def (val);
2825 def_info = vect_stmt_to_vectorize (def_info);
2826 scalar_stmts.quick_push (def_info);
2827 }
2828 if (dump_enabled_p ())
2829 dump_printf_loc (MSG_NOTE, vect_location,
2830 "Analyzing vectorizable constructor: %G\n",
2831 stmt_info->stmt);
2832 }
2833 else if (kind == slp_inst_kind_reduc_group)
2834 {
2835 /* Collect reduction statements. */
2836 vec<stmt_vec_info> reductions = as_a <loop_vec_info> (vinfo)->reductions;
2837 scalar_stmts.create (reductions.length ());
2838 for (i = 0; reductions.iterate (i, &next_info); i++)
2839 if ((STMT_VINFO_RELEVANT_P (next_info)
2840 || STMT_VINFO_LIVE_P (next_info))
2841 /* ??? Make sure we didn't skip a conversion around a reduction
2842 path. In that case we'd have to reverse engineer that conversion
2843 stmt following the chain using reduc_idx and from the PHI
2844 using reduc_def. */
2845 && STMT_VINFO_DEF_TYPE (next_info) == vect_reduction_def)
2846 scalar_stmts.quick_push (next_info);
2847 /* If less than two were relevant/live there's nothing to SLP. */
2848 if (scalar_stmts.length () < 2)
2849 return false;
2850 }
2851 else
2852 gcc_unreachable ();
2853
2854 /* Build the tree for the SLP instance. */
2855 bool res = vect_build_slp_instance (vinfo, kind, scalar_stmts,
2856 kind == slp_inst_kind_ctor
2857 ? stmt_info : NULL,
2858 max_tree_size, limit, bst_map,
2859 kind == slp_inst_kind_store
2860 ? stmt_info : NULL);
2861
2862 /* ??? If this is slp_inst_kind_store and the above succeeded here's
2863 where we should do store group splitting. */
2864
2865 return res;
2866 }
2867
2868 /* Check if there are stmts in the loop that can be vectorized using SLP.
2869 Build SLP trees of packed scalar stmts if SLP is possible.  */
2870
2871 opt_result
2872 vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
2873 {
2874 unsigned int i;
2875 stmt_vec_info first_element;
2876 slp_instance instance;
2877
2878 DUMP_VECT_SCOPE ("vect_analyze_slp");
2879
2880 unsigned limit = max_tree_size;
2881
2882 scalar_stmts_to_slp_tree_map_t *bst_map
2883 = new scalar_stmts_to_slp_tree_map_t ();
2884
2885 /* Find SLP sequences starting from groups of grouped stores. */
2886 FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
2887 vect_analyze_slp_instance (vinfo, bst_map, first_element,
2888 STMT_VINFO_GROUPED_ACCESS (first_element)
2889 ? slp_inst_kind_store : slp_inst_kind_ctor,
2890 max_tree_size, &limit);
2891
2892 if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
2893 {
2894 for (unsigned i = 0; i < bb_vinfo->roots.length (); ++i)
2895 {
2896 vect_location = bb_vinfo->roots[i].root->stmt;
2897 if (vect_build_slp_instance (bb_vinfo, bb_vinfo->roots[i].kind,
2898 bb_vinfo->roots[i].stmts,
2899 bb_vinfo->roots[i].root,
2900 max_tree_size, &limit, bst_map, NULL))
2901 bb_vinfo->roots[i].stmts = vNULL;
2902 }
2903 }
2904
2905 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
2906 {
2907 /* Find SLP sequences starting from reduction chains. */
2908 FOR_EACH_VEC_ELT (loop_vinfo->reduction_chains, i, first_element)
2909 if (! STMT_VINFO_RELEVANT_P (first_element)
2910 && ! STMT_VINFO_LIVE_P (first_element))
2911 ;
2912 else if (! vect_analyze_slp_instance (vinfo, bst_map, first_element,
2913 slp_inst_kind_reduc_chain,
2914 max_tree_size, &limit))
2915 {
2916 /* Dissolve reduction chain group. */
2917 stmt_vec_info vinfo = first_element;
2918 stmt_vec_info last = NULL;
2919 while (vinfo)
2920 {
2921 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
2922 REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
2923 REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
2924 last = vinfo;
2925 vinfo = next;
2926 }
2927 STMT_VINFO_DEF_TYPE (first_element) = vect_internal_def;
2928 /* It can still be vectorized as part of an SLP reduction.  */
2929 loop_vinfo->reductions.safe_push (last);
2930 }
2931
2932 /* Find SLP sequences starting from groups of reductions. */
2933 if (loop_vinfo->reductions.length () > 1)
2934 vect_analyze_slp_instance (vinfo, bst_map, loop_vinfo->reductions[0],
2935 slp_inst_kind_reduc_group, max_tree_size,
2936 &limit);
2937 }
2938
2939 hash_set<slp_tree> visited_patterns;
2940 slp_tree_to_load_perm_map_t perm_cache;
2941 slp_compat_nodes_map_t compat_cache;
2942
2943 /* See if any patterns can be found in the SLP tree. */
2944 bool pattern_found = false;
2945 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
2946 pattern_found |= vect_match_slp_patterns (instance, vinfo,
2947 &visited_patterns, &perm_cache,
2948 &compat_cache);
2949
2950 /* If any were found optimize permutations of loads. */
2951 if (pattern_found)
2952 {
2953 hash_map<slp_tree, slp_tree> load_map;
2954 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
2955 {
2956 slp_tree root = SLP_INSTANCE_TREE (instance);
2957 optimize_load_redistribution (bst_map, vinfo, SLP_TREE_LANES (root),
2958 &load_map, root);
2959 }
2960 }
2961
2962
2963
2964 /* The map keeps a reference on SLP nodes built, release that. */
2965 for (scalar_stmts_to_slp_tree_map_t::iterator it = bst_map->begin ();
2966 it != bst_map->end (); ++it)
2967 if ((*it).second)
2968 vect_free_slp_tree ((*it).second);
2969 delete bst_map;
2970
2971 if (pattern_found && dump_enabled_p ())
2972 {
2973 dump_printf_loc (MSG_NOTE, vect_location,
2974 "Pattern matched SLP tree\n");
2975 hash_set<slp_tree> visited;
2976 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
2977 vect_print_slp_graph (MSG_NOTE, vect_location,
2978 SLP_INSTANCE_TREE (instance), visited);
2979 }
2980
2981 return opt_result::success ();
2982 }
2983
2984 /* Fill the vertices and leafs vector with all nodes in the SLP graph. */
2985
2986 static void
2987 vect_slp_build_vertices (hash_set<slp_tree> &visited, slp_tree node,
2988 vec<slp_tree> &vertices, vec<int> &leafs)
2989 {
2990 unsigned i;
2991 slp_tree child;
2992
2993 if (visited.add (node))
2994 return;
2995
2996 node->vertex = vertices.length ();
2997 vertices.safe_push (node);
2998
2999 bool leaf = true;
3000 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3001 if (child)
3002 {
3003 leaf = false;
3004 vect_slp_build_vertices (visited, child, vertices, leafs);
3005 }
3006 if (leaf)
3007 leafs.safe_push (node->vertex);
3008 }
3009
3010 /* Fill the vertices and leafs vector with all nodes in the SLP graph. */
3011
3012 static void
3013 vect_slp_build_vertices (vec_info *info, vec<slp_tree> &vertices,
3014 vec<int> &leafs)
3015 {
3016 hash_set<slp_tree> visited;
3017 unsigned i;
3018 slp_instance instance;
3019 FOR_EACH_VEC_ELT (info->slp_instances, i, instance)
3020 {
3021 unsigned n_v = vertices.length ();
3022 unsigned n_l = leafs.length ();
3023 vect_slp_build_vertices (visited, SLP_INSTANCE_TREE (instance), vertices,
3024 leafs);
3025 /* If we added vertices but no entries to the reverse graph we've
3026 added a cycle that is not backwards-reachable.  Push the entry
3027 so that it mimics a leaf then.  */
3028 if (vertices.length () > n_v
3029 && leafs.length () == n_l)
3030 leafs.safe_push (SLP_INSTANCE_TREE (instance)->vertex);
3031 }
3032 }
3033
3034 /* Apply (reverse) bijective PERM to VEC.  */
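/* For example, with PERM = { 2, 0, 1 } and VEC = { a, b, c } the
   forward direction (REVERSE false) computes vec[i] = saved[perm[i]],
   giving { c, a, b }, while the reverse direction (REVERSE true)
   computes vec[perm[i]] = saved[i], giving { b, c, a }.

   A minimal standalone sketch of the same operation (hypothetical
   names, not part of this file), returning a new vector instead of
   permuting in place:

     #include <vector>
     static std::vector<int>
     apply_perm (const std::vector<unsigned> &perm,
		 const std::vector<int> &v, bool reverse)
     {
       std::vector<int> r (v.size ());
       for (unsigned i = 0; i < v.size (); ++i)
	 if (reverse)
	   r[perm[i]] = v[i];   // scatter old lane i to perm[i]
	 else
	   r[i] = v[perm[i]];   // gather new lane i from perm[i]
       return r;
     }  */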
3035
3036 template <class T>
3037 static void
3038 vect_slp_permute (vec<unsigned> perm,
3039 vec<T> &vec, bool reverse)
3040 {
3041 auto_vec<T, 64> saved;
3042 saved.create (vec.length ());
3043 for (unsigned i = 0; i < vec.length (); ++i)
3044 saved.quick_push (vec[i]);
3045
3046 if (reverse)
3047 {
3048 for (unsigned i = 0; i < vec.length (); ++i)
3049 vec[perm[i]] = saved[i];
3050 for (unsigned i = 0; i < vec.length (); ++i)
3051 gcc_assert (vec[perm[i]] == saved[i]);
3052 }
3053 else
3054 {
3055 for (unsigned i = 0; i < vec.length (); ++i)
3056 vec[i] = saved[perm[i]];
3057 for (unsigned i = 0; i < vec.length (); ++i)
3058 gcc_assert (vec[i] == saved[perm[i]]);
3059 }
3060 }
3061
3062 /* Return whether permutations PERM_A and PERM_B as recorded in the
3063 PERMS vector are equal. */
3064
3065 static bool
3066 vect_slp_perms_eq (const vec<vec<unsigned> > &perms,
3067 int perm_a, int perm_b)
3068 {
3069 return (perm_a == perm_b
3070 || (perms[perm_a].length () == perms[perm_b].length ()
3071 && memcmp (&perms[perm_a][0], &perms[perm_b][0],
3072 sizeof (unsigned) * perms[perm_a].length ()) == 0));
3073 }
3074
3075 /* Optimize the SLP graph of VINFO. */
3076
3077 void
3078 vect_optimize_slp (vec_info *vinfo)
3079 {
3080 if (vinfo->slp_instances.is_empty ())
3081 return;
3082
3083 slp_tree node;
3084 unsigned i;
3085 auto_vec<slp_tree> vertices;
3086 auto_vec<int> leafs;
3087 vect_slp_build_vertices (vinfo, vertices, leafs);
3088
3089 struct graph *slpg = new_graph (vertices.length ());
3090 FOR_EACH_VEC_ELT (vertices, i, node)
3091 {
3092 unsigned j;
3093 slp_tree child;
3094 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
3095 if (child)
3096 add_edge (slpg, i, child->vertex);
3097 }
3098
3099 /* Compute (reverse) postorder on the inverted graph. */
3100 auto_vec<int> ipo;
3101 graphds_dfs (slpg, &leafs[0], leafs.length (), &ipo, false, NULL, NULL);
3102
3103 auto_sbitmap n_visited (vertices.length ());
3104 auto_sbitmap n_materialize (vertices.length ());
3105 auto_vec<int> n_perm (vertices.length ());
3106 auto_vec<vec<unsigned> > perms;
3107
3108 bitmap_clear (n_visited);
3109 bitmap_clear (n_materialize);
3110 n_perm.quick_grow_cleared (vertices.length ());
3111 perms.safe_push (vNULL); /* zero is no permute */
3112
3113 /* Produce initial permutations. */
3114 for (i = 0; i < leafs.length (); ++i)
3115 {
3116 int idx = leafs[i];
3117 slp_tree node = vertices[idx];
3118
3119 /* Handle externals and constants optimistically throughout the
3120 iteration. */
3121 if (SLP_TREE_DEF_TYPE (node) == vect_external_def
3122 || SLP_TREE_DEF_TYPE (node) == vect_constant_def)
3123 continue;
3124
3125 /* Leafs do not change across iterations. Note leafs also double
3126 as entries to the reverse graph. */
3127 if (!slpg->vertices[idx].succ)
3128 bitmap_set_bit (n_visited, idx);
3129 /* Loads are the only thing generating permutes. */
3130 if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
3131 continue;
3132
3133 /* If splitting out a SLP_TREE_LANE_PERMUTATION can make the
3134 node unpermuted, record this permute. */
3135 stmt_vec_info dr_stmt = SLP_TREE_REPRESENTATIVE (node);
3136 if (!STMT_VINFO_GROUPED_ACCESS (dr_stmt))
3137 continue;
3138 dr_stmt = DR_GROUP_FIRST_ELEMENT (dr_stmt);
3139 unsigned imin = DR_GROUP_SIZE (dr_stmt) + 1, imax = 0;
3140 bool any_permute = false;
3141 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
3142 {
3143 unsigned idx = SLP_TREE_LOAD_PERMUTATION (node)[j];
3144 imin = MIN (imin, idx);
3145 imax = MAX (imax, idx);
3146 if (idx - SLP_TREE_LOAD_PERMUTATION (node)[0] != j)
3147 any_permute = true;
3148 }
3149 /* If there's no permute no need to split one out. */
3150 if (!any_permute)
3151 continue;
3152 /* If the span doesn't match we'd disrupt VF computation, avoid
3153 that for now. */
3154 if (imax - imin + 1 != SLP_TREE_LANES (node))
3155 continue;
3156
3157 /* For now only handle true permutes, like
3158 vect_attempt_slp_rearrange_stmts did. This allows us to be lazy
3159 when permuting constants and invariants, keeping the permute
3160 bijective. */
3161 auto_sbitmap load_index (SLP_TREE_LANES (node));
3162 bitmap_clear (load_index);
3163 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
3164 bitmap_set_bit (load_index, SLP_TREE_LOAD_PERMUTATION (node)[j] - imin);
3165 unsigned j;
3166 for (j = 0; j < SLP_TREE_LANES (node); ++j)
3167 if (!bitmap_bit_p (load_index, j))
3168 break;
3169 if (j != SLP_TREE_LANES (node))
3170 continue;
3171
3172 vec<unsigned> perm = vNULL;
3173 perm.safe_grow (SLP_TREE_LANES (node), true);
3174 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
3175 perm[j] = SLP_TREE_LOAD_PERMUTATION (node)[j] - imin;
3176 perms.safe_push (perm);
3177 n_perm[idx] = perms.length () - 1;
3178 }
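/* For example, the loop above records the permute { 3, 2, 1, 0 } for
   a load node with load permutation { 3, 2, 1, 0 }, and records
   { 0, 2, 1, 3 } for a load permutation { 4, 6, 5, 7 } (IMIN 4).
   A permutation like { 0, 0, 2, 3 } is not bijective and { 0, 1, 2, 4 }
   does not span exactly SLP_TREE_LANES consecutive elements, so
   neither gets a permute recorded.  */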
3179
3180 /* In addition to the above we have to mark permutes that reach
3181 non-reduction graph entries (which are not otherwise represented)
3182 as to be materialized.  */
3183 for (slp_instance instance : vinfo->slp_instances)
3184 if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_ctor)
3185 bitmap_set_bit (n_materialize, SLP_INSTANCE_TREE (instance)->vertex);
3186
3187 /* Propagate permutes along the graph and compute materialization points. */
3188 bool changed;
3189 unsigned iteration = 0;
3190 do
3191 {
3192 changed = false;
3193 ++iteration;
3194
3195 for (i = vertices.length (); i > 0 ; --i)
3196 {
3197 int idx = ipo[i-1];
3198 slp_tree node = vertices[idx];
3199 /* For leafs there's nothing to do - we've seeded permutes
3200 on those above. */
3201 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
3202 continue;
3203
3204 bitmap_set_bit (n_visited, idx);
3205
3206 /* We do not handle stores with a permutation. */
3207 stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node);
3208 if (STMT_VINFO_DATA_REF (rep)
3209 && DR_IS_WRITE (STMT_VINFO_DATA_REF (rep)))
3210 continue;
3211 /* We cannot move a permute across an operation that is
3212 not independent of the lanes.  Note this is an explicit
3213 negative list since that's much shorter than the respective
3214 positive one, but it's critical to keep it maintained.  */
3215 if (is_gimple_call (STMT_VINFO_STMT (rep)))
3216 switch (gimple_call_combined_fn (STMT_VINFO_STMT (rep)))
3217 {
3218 case CFN_COMPLEX_ADD_ROT90:
3219 case CFN_COMPLEX_ADD_ROT270:
3220 case CFN_COMPLEX_MUL:
3221 case CFN_COMPLEX_MUL_CONJ:
3222 continue;
3223 default:;
3224 }
3225
3226 int perm = -1;
3227 for (graph_edge *succ = slpg->vertices[idx].succ;
3228 succ; succ = succ->succ_next)
3229 {
3230 int succ_idx = succ->dest;
3231 /* Handle unvisited nodes optimistically. */
3232 /* ??? But for constants once we want to handle non-bijective
3233 permutes we have to verify the permute, when unifying lanes,
3234 will not unify different constants. For example see
3235 gcc.dg/vect/bb-slp-14.c for a case that would break. */
3236 if (!bitmap_bit_p (n_visited, succ_idx))
3237 continue;
3238 int succ_perm = n_perm[succ_idx];
3239 /* Once we materialize the successor's permutation its output
3240 lanes appear unpermuted to us.  */
3241 if (bitmap_bit_p (n_materialize, succ_idx))
3242 succ_perm = 0;
3243 if (perm == -1)
3244 perm = succ_perm;
3245 else if (succ_perm == 0)
3246 {
3247 perm = 0;
3248 break;
3249 }
3250 else if (!vect_slp_perms_eq (perms, perm, succ_perm))
3251 {
3252 perm = 0;
3253 break;
3254 }
3255 }
3256
3257 if (perm == -1)
3258 /* Pick up pre-computed leaf values. */
3259 perm = n_perm[idx];
3260 else if (!vect_slp_perms_eq (perms, perm, n_perm[idx]))
3261 {
3262 if (iteration > 1)
3263 /* Make sure we eventually converge. */
3264 gcc_checking_assert (perm == 0);
3265 n_perm[idx] = perm;
3266 if (perm == 0)
3267 bitmap_clear_bit (n_materialize, idx);
3268 changed = true;
3269 }
3270
3271 if (perm == 0)
3272 continue;
3273
3274 /* Elide pruning at materialization points in the first
3275 iteration so every node is visited at least once.  */
3276 if (iteration == 1)
3277 continue;
3278
3279 /* Decide on permute materialization. Look whether there's
3280 a use (pred) edge that is permuted differently than us.
3281 In that case mark ourselves so the permutation is applied.
3282 For VEC_PERM_EXPRs the permutation doesn't carry along
3283 from children to parents so force materialization at the
3284 point of the VEC_PERM_EXPR. In principle VEC_PERM_EXPRs
3285 are a source of an arbitrary permutation again, similar
3286 to constants/externals - that's something we do not yet
3287 optimally handle. */
3288 bool all_preds_permuted = (SLP_TREE_CODE (node) != VEC_PERM_EXPR
3289 && slpg->vertices[idx].pred != NULL);
3290 if (all_preds_permuted)
3291 for (graph_edge *pred = slpg->vertices[idx].pred;
3292 pred; pred = pred->pred_next)
3293 {
3294 gcc_checking_assert (bitmap_bit_p (n_visited, pred->src));
3295 int pred_perm = n_perm[pred->src];
3296 if (!vect_slp_perms_eq (perms, perm, pred_perm))
3297 {
3298 all_preds_permuted = false;
3299 break;
3300 }
3301 }
3302 if (!all_preds_permuted)
3303 {
3304 if (!bitmap_bit_p (n_materialize, idx))
3305 changed = true;
3306 bitmap_set_bit (n_materialize, idx);
3307 }
3308 }
3309 }
3310 while (changed || iteration == 1);
3311
3312 /* Materialize. */
3313 for (i = 0; i < vertices.length (); ++i)
3314 {
3315 int perm = n_perm[i];
3316 if (perm <= 0)
3317 continue;
3318
3319 slp_tree node = vertices[i];
3320
3321 /* First permute invariant/external original successors. */
3322 unsigned j;
3323 slp_tree child;
3324 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
3325 {
3326 if (!child || SLP_TREE_DEF_TYPE (child) == vect_internal_def)
3327 continue;
3328
3329 /* If the vector is uniform there's nothing to do. */
3330 if (vect_slp_tree_uniform_p (child))
3331 continue;
3332
3333 /* We can end up sharing some externals via two_operator
3334 handling. Be prepared to unshare those. */
3335 if (child->refcnt != 1)
3336 {
3337 gcc_assert (slpg->vertices[child->vertex].pred->pred_next);
3338 SLP_TREE_CHILDREN (node)[j] = child
3339 = vect_create_new_slp_node
3340 (SLP_TREE_SCALAR_OPS (child).copy ());
3341 }
3342 vect_slp_permute (perms[perm],
3343 SLP_TREE_SCALAR_OPS (child), true);
3344 }
3345
3346 if (bitmap_bit_p (n_materialize, i))
3347 {
3348 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
3349 /* For loads simply drop the permutation, the load permutation
3350 already performs the desired permutation. */
3351 ;
3352 else if (SLP_TREE_LANE_PERMUTATION (node).exists ())
3353 {
3354 /* If the node is already a permute node we can apply
3355 the permutation to the lane selection, effectively
3356 materializing it on the incoming vectors. */
3357 if (dump_enabled_p ())
3358 dump_printf_loc (MSG_NOTE, vect_location,
3359 "simplifying permute node %p\n",
3360 node);
3361
3362 for (unsigned k = 0;
3363 k < SLP_TREE_LANE_PERMUTATION (node).length (); ++k)
3364 SLP_TREE_LANE_PERMUTATION (node)[k].second
3365 = perms[perm][SLP_TREE_LANE_PERMUTATION (node)[k].second];
3366 }
3367 else
3368 {
3369 if (dump_enabled_p ())
3370 dump_printf_loc (MSG_NOTE, vect_location,
3371 "inserting permute node in place of %p\n",
3372 node);
3373
3374 /* Make a copy of NODE and in-place change it to a
3375 VEC_PERM node to permute the lanes of the copy. */
3376 slp_tree copy = new _slp_tree;
3377 SLP_TREE_CHILDREN (copy) = SLP_TREE_CHILDREN (node);
3378 SLP_TREE_CHILDREN (node) = vNULL;
3379 SLP_TREE_SCALAR_STMTS (copy)
3380 = SLP_TREE_SCALAR_STMTS (node).copy ();
3381 vect_slp_permute (perms[perm],
3382 SLP_TREE_SCALAR_STMTS (copy), true);
3383 gcc_assert (!SLP_TREE_SCALAR_OPS (node).exists ());
3384 SLP_TREE_REPRESENTATIVE (copy) = SLP_TREE_REPRESENTATIVE (node);
3385 gcc_assert (!SLP_TREE_LOAD_PERMUTATION (node).exists ());
3386 SLP_TREE_LANE_PERMUTATION (copy)
3387 = SLP_TREE_LANE_PERMUTATION (node);
3388 SLP_TREE_LANE_PERMUTATION (node) = vNULL;
3389 SLP_TREE_VECTYPE (copy) = SLP_TREE_VECTYPE (node);
3390 copy->refcnt = 1;
3391 copy->max_nunits = node->max_nunits;
3392 SLP_TREE_DEF_TYPE (copy) = SLP_TREE_DEF_TYPE (node);
3393 SLP_TREE_LANES (copy) = SLP_TREE_LANES (node);
3394 SLP_TREE_CODE (copy) = SLP_TREE_CODE (node);
3395
3396 /* Now turn NODE into a VEC_PERM. */
3397 SLP_TREE_CHILDREN (node).safe_push (copy);
3398 SLP_TREE_LANE_PERMUTATION (node).create (SLP_TREE_LANES (node));
3399 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
3400 SLP_TREE_LANE_PERMUTATION (node)
3401 .quick_push (std::make_pair (0, perms[perm][j]));
3402 SLP_TREE_CODE (node) = VEC_PERM_EXPR;
3403 }
3404 }
3405 else
3406 {
3407 /* Apply the reverse permutation to our stmts. */
3408 vect_slp_permute (perms[perm],
3409 SLP_TREE_SCALAR_STMTS (node), true);
3410 /* And to the load permutation, which we can simply
3411 make regular by design. */
3412 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
3413 {
3414 /* ??? When we handle non-bijective permutes the idea
3415 is that we can force the load-permutation to be
3416 { min, min + 1, min + 2, ... max }. But then the
3417 scalar defs might no longer match the lane content
3418 which means wrong-code with live lane vectorization.
3419 So we possibly have to have NULL entries for those. */
3420 vect_slp_permute (perms[perm],
3421 SLP_TREE_LOAD_PERMUTATION (node), true);
3422 }
3423 }
3424 }
3425
3426 /* Free the perms vector used for propagation. */
3427 while (!perms.is_empty ())
3428 perms.pop ().release ();
3429 free_graph (slpg);
3430
3431
3432 /* Now elide load permutations that are not necessary. */
3433 for (i = 0; i < leafs.length (); ++i)
3434 {
3435 node = vertices[leafs[i]];
3436 if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
3437 continue;
3438
3439 /* In basic block vectorization we allow any subchain of an interleaving
3440 chain.
3441 FORNOW: not in loop SLP because of realignment complications. */
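/* For example, a load node whose scalar stmts are elements 2 and 3 of
   a 4-element interleaving chain (consecutive DR_GROUP elements, each
   with DR_GROUP_GAP 1) forms such a subchain and its load permutation
   is simply released below.  */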
3442 if (is_a <bb_vec_info> (vinfo))
3443 {
3444 bool subchain_p = true;
3445 stmt_vec_info next_load_info = NULL;
3446 stmt_vec_info load_info;
3447 unsigned j;
3448 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
3449 {
3450 if (j != 0
3451 && (next_load_info != load_info
3452 || DR_GROUP_GAP (load_info) != 1))
3453 {
3454 subchain_p = false;
3455 break;
3456 }
3457 next_load_info = DR_GROUP_NEXT_ELEMENT (load_info);
3458 }
3459 if (subchain_p)
3460 {
3461 SLP_TREE_LOAD_PERMUTATION (node).release ();
3462 continue;
3463 }
3464 }
3465 else
3466 {
3467 stmt_vec_info load_info;
3468 bool this_load_permuted = false;
3469 unsigned j;
3470 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
3471 if (SLP_TREE_LOAD_PERMUTATION (node)[j] != j)
3472 {
3473 this_load_permuted = true;
3474 break;
3475 }
3476 stmt_vec_info first_stmt_info
3477 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
3478 if (!this_load_permuted
3479 /* The load requires permutation when unrolling exposes
3480 a gap either because the group is larger than the SLP
3481 group-size or because there is a gap between the groups. */
3482 && (known_eq (LOOP_VINFO_VECT_FACTOR
3483 (as_a <loop_vec_info> (vinfo)), 1U)
3484 || ((SLP_TREE_LANES (node) == DR_GROUP_SIZE (first_stmt_info))
3485 && DR_GROUP_GAP (first_stmt_info) == 0)))
3486 {
3487 SLP_TREE_LOAD_PERMUTATION (node).release ();
3488 continue;
3489 }
3490 }
3491 }
3492 }
3493
3494 /* Gather loads reachable from the individual SLP graph entries. */
3495
3496 void
3497 vect_gather_slp_loads (vec_info *vinfo)
3498 {
3499 unsigned i;
3500 slp_instance instance;
3501 FOR_EACH_VEC_ELT (vinfo->slp_instances, i, instance)
3502 {
3503 hash_set<slp_tree> visited;
3504 vect_gather_slp_loads (SLP_INSTANCE_LOADS (instance),
3505 SLP_INSTANCE_TREE (instance), visited);
3506 }
3507 }
3508
3509
3510 /* For each possible SLP instance decide whether to SLP it and calculate overall
3511 unrolling factor needed to SLP the loop. Return TRUE if decided to SLP at
3512 least one instance. */
3513
3514 bool
3515 vect_make_slp_decision (loop_vec_info loop_vinfo)
3516 {
3517 unsigned int i;
3518 poly_uint64 unrolling_factor = 1;
3519 vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
3520 slp_instance instance;
3521 int decided_to_slp = 0;
3522
3523 DUMP_VECT_SCOPE ("vect_make_slp_decision");
3524
3525 FOR_EACH_VEC_ELT (slp_instances, i, instance)
3526 {
3527 /* FORNOW: SLP if you can. */
3528 /* All unroll factors have the form:
3529
3530 GET_MODE_SIZE (vinfo->vector_mode) * X
3531
3532 for some rational X, so they must have a common multiple. */
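/* E.g. instance unrolling factors of 2 and 3 combine to an overall
   factor of 6.  */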
3533 unrolling_factor
3534 = force_common_multiple (unrolling_factor,
3535 SLP_INSTANCE_UNROLLING_FACTOR (instance));
3536
3537 /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts. Later we
3538 call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
3539 loop-based vectorization. Such stmts will be marked as HYBRID. */
3540 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
3541 decided_to_slp++;
3542 }
3543
3544 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = unrolling_factor;
3545
3546 if (decided_to_slp && dump_enabled_p ())
3547 {
3548 dump_printf_loc (MSG_NOTE, vect_location,
3549 "Decided to SLP %d instances. Unrolling factor ",
3550 decided_to_slp);
3551 dump_dec (MSG_NOTE, unrolling_factor);
3552 dump_printf (MSG_NOTE, "\n");
3553 }
3554
3555 return (decided_to_slp > 0);
3556 }
3557
3558 /* Private data for vect_detect_hybrid_slp. */
3559 struct vdhs_data
3560 {
3561 loop_vec_info loop_vinfo;
3562 vec<stmt_vec_info> *worklist;
3563 };
3564
3565 /* Walker for walk_gimple_op. */
3566
3567 static tree
3568 vect_detect_hybrid_slp (tree *tp, int *, void *data)
3569 {
3570 walk_stmt_info *wi = (walk_stmt_info *)data;
3571 vdhs_data *dat = (vdhs_data *)wi->info;
3572
3573 if (wi->is_lhs)
3574 return NULL_TREE;
3575
3576 stmt_vec_info def_stmt_info = dat->loop_vinfo->lookup_def (*tp);
3577 if (!def_stmt_info)
3578 return NULL_TREE;
3579 def_stmt_info = vect_stmt_to_vectorize (def_stmt_info);
3580 if (PURE_SLP_STMT (def_stmt_info))
3581 {
3582 if (dump_enabled_p ())
3583 dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: %G",
3584 def_stmt_info->stmt);
3585 STMT_SLP_TYPE (def_stmt_info) = hybrid;
3586 dat->worklist->safe_push (def_stmt_info);
3587 }
3588
3589 return NULL_TREE;
3590 }
3591
3592 /* Look if STMT_INFO is consumed by SLP indirectly and mark it pure_slp
3593 if so, otherwise push it to WORKLIST. */
3594
3595 static void
3596 maybe_push_to_hybrid_worklist (vec_info *vinfo,
3597 vec<stmt_vec_info> &worklist,
3598 stmt_vec_info stmt_info)
3599 {
3600 if (dump_enabled_p ())
3601 dump_printf_loc (MSG_NOTE, vect_location,
3602 "Processing hybrid candidate : %G", stmt_info->stmt);
3603 stmt_vec_info orig_info = vect_orig_stmt (stmt_info);
3604 imm_use_iterator iter2;
3605 ssa_op_iter iter1;
3606 use_operand_p use_p;
3607 def_operand_p def_p;
3608 bool any_def = false;
3609 FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_info->stmt, iter1, SSA_OP_DEF)
3610 {
3611 any_def = true;
3612 FOR_EACH_IMM_USE_FAST (use_p, iter2, DEF_FROM_PTR (def_p))
3613 {
3614 if (is_gimple_debug (USE_STMT (use_p)))
3615 continue;
3616 stmt_vec_info use_info = vinfo->lookup_stmt (USE_STMT (use_p));
3617 /* An out-of-loop use means this is a loop_vect sink. */
3618 if (!use_info)
3619 {
3620 if (dump_enabled_p ())
3621 dump_printf_loc (MSG_NOTE, vect_location,
3622 "Found loop_vect sink: %G", stmt_info->stmt);
3623 worklist.safe_push (stmt_info);
3624 return;
3625 }
3626 else if (!STMT_SLP_TYPE (vect_stmt_to_vectorize (use_info)))
3627 {
3628 if (dump_enabled_p ())
3629 dump_printf_loc (MSG_NOTE, vect_location,
3630 "Found loop_vect use: %G", use_info->stmt);
3631 worklist.safe_push (stmt_info);
3632 return;
3633 }
3634 }
3635 }
3636 /* No def means this is a loop_vect sink. */
3637 if (!any_def)
3638 {
3639 if (dump_enabled_p ())
3640 dump_printf_loc (MSG_NOTE, vect_location,
3641 "Found loop_vect sink: %G", stmt_info->stmt);
3642 worklist.safe_push (stmt_info);
3643 return;
3644 }
3645 if (dump_enabled_p ())
3646 dump_printf_loc (MSG_NOTE, vect_location,
3647 "Marked SLP consumed stmt pure: %G", stmt_info->stmt);
3648 STMT_SLP_TYPE (stmt_info) = pure_slp;
3649 }
3650
3651 /* Find stmts that must be both vectorized and SLPed. */
3652
3653 void
3654 vect_detect_hybrid_slp (loop_vec_info loop_vinfo)
3655 {
3656 DUMP_VECT_SCOPE ("vect_detect_hybrid_slp");
3657
3658 /* All stmts participating in SLP are marked pure_slp, all other
3659 stmts are loop_vect.
3660 First collect all loop_vect stmts into a worklist.
3661 SLP patterns cause not all original scalar stmts to appear in
3662 SLP_TREE_SCALAR_STMTS and thus not all of them are marked pure_slp.
3663 Rectify this here and do a backward walk over the IL only considering
3664 stmts as loop_vect when they are used by a loop_vect stmt and otherwise
3665 mark them as pure_slp. */
3666 auto_vec<stmt_vec_info> worklist;
3667 for (int i = LOOP_VINFO_LOOP (loop_vinfo)->num_nodes - 1; i >= 0; --i)
3668 {
3669 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
3670 for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
3671 gsi_next (&gsi))
3672 {
3673 gphi *phi = gsi.phi ();
3674 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (phi);
3675 if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
3676 maybe_push_to_hybrid_worklist (loop_vinfo,
3677 worklist, stmt_info);
3678 }
3679 for (gimple_stmt_iterator gsi = gsi_last_bb (bb); !gsi_end_p (gsi);
3680 gsi_prev (&gsi))
3681 {
3682 gimple *stmt = gsi_stmt (gsi);
3683 if (is_gimple_debug (stmt))
3684 continue;
3685 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
3686 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
3687 {
3688 for (gimple_stmt_iterator gsi2
3689 = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info));
3690 !gsi_end_p (gsi2); gsi_next (&gsi2))
3691 {
3692 stmt_vec_info patt_info
3693 = loop_vinfo->lookup_stmt (gsi_stmt (gsi2));
3694 if (!STMT_SLP_TYPE (patt_info)
3695 && STMT_VINFO_RELEVANT (patt_info))
3696 maybe_push_to_hybrid_worklist (loop_vinfo,
3697 worklist, patt_info);
3698 }
3699 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
3700 }
3701 if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
3702 maybe_push_to_hybrid_worklist (loop_vinfo,
3703 worklist, stmt_info);
3704 }
3705 }
3706
3707 /* Now we have a worklist of non-SLP stmts; follow use->def chains and
3708 mark any SLP vectorized stmt as hybrid.
3709 ??? We're visiting def stmts N times (once for each non-SLP and
3710 once for each hybrid-SLP use). */
3711 walk_stmt_info wi;
3712 vdhs_data dat;
3713 dat.worklist = &worklist;
3714 dat.loop_vinfo = loop_vinfo;
3715 memset (&wi, 0, sizeof (wi));
3716 wi.info = (void *)&dat;
3717 while (!worklist.is_empty ())
3718 {
3719 stmt_vec_info stmt_info = worklist.pop ();
3720 /* Since SSA operands are not set up for pattern stmts we need
3721 to use walk_gimple_op. */
3722 wi.is_lhs = 0;
3723 walk_gimple_op (stmt_info->stmt, vect_detect_hybrid_slp, &wi);
3724 }
3725 }
3726
3727
3728 /* Initialize a bb_vec_info struct for the statements in BBS basic blocks. */
3729
3730 _bb_vec_info::_bb_vec_info (vec<basic_block> _bbs, vec_info_shared *shared)
3731 : vec_info (vec_info::bb, init_cost (NULL), shared), bbs (_bbs), roots (vNULL)
3732 {
3733 for (unsigned i = 0; i < bbs.length (); ++i)
3734 {
3735 if (i != 0)
3736 for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
3737 gsi_next (&si))
3738 {
3739 gphi *phi = si.phi ();
3740 gimple_set_uid (phi, 0);
3741 add_stmt (phi);
3742 }
3743 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
3744 !gsi_end_p (gsi); gsi_next (&gsi))
3745 {
3746 gimple *stmt = gsi_stmt (gsi);
3747 gimple_set_uid (stmt, 0);
3748 if (is_gimple_debug (stmt))
3749 continue;
3750 add_stmt (stmt);
3751 }
3752 }
3753 }
3754
3755
3756 /* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
3757 stmts in the basic block. */
3758
3759 _bb_vec_info::~_bb_vec_info ()
3760 {
3761 /* Reset region marker. */
3762 for (unsigned i = 0; i < bbs.length (); ++i)
3763 {
3764 if (i != 0)
3765 for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
3766 gsi_next (&si))
3767 {
3768 gphi *phi = si.phi ();
3769 gimple_set_uid (phi, -1);
3770 }
3771 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
3772 !gsi_end_p (gsi); gsi_next (&gsi))
3773 {
3774 gimple *stmt = gsi_stmt (gsi);
3775 gimple_set_uid (stmt, -1);
3776 }
3777 }
3778
3779 for (unsigned i = 0; i < roots.length (); ++i)
3780 roots[i].stmts.release ();
3781 roots.release ();
3782 }
3783
3784 /* Subroutine of vect_slp_analyze_node_operations. Handle the root of NODE,
3785 given that its child nodes have already been processed, and that
3786 their def types currently match their SLP node's def type. */
3787
3788 static bool
3789 vect_slp_analyze_node_operations_1 (vec_info *vinfo, slp_tree node,
3790 slp_instance node_instance,
3791 stmt_vector_for_cost *cost_vec)
3792 {
3793 stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
3794
3795 /* Calculate the number of vector statements to be created for the
3796 scalar stmts in this node. For SLP reductions it is equal to the
3797 number of vector statements in the children (which has already been
3798 calculated by the recursive call). Otherwise it is the number of
3799 scalar elements in one scalar iteration (DR_GROUP_SIZE) multiplied by
3800 VF divided by the number of elements in a vector. */
3801 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
3802 && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
3803 {
3804 for (unsigned i = 0; i < SLP_TREE_CHILDREN (node).length (); ++i)
3805 if (SLP_TREE_DEF_TYPE (SLP_TREE_CHILDREN (node)[i]) == vect_internal_def)
3806 {
3807 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
3808 = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_CHILDREN (node)[i]);
3809 break;
3810 }
3811 }
3812 else
3813 {
3814 poly_uint64 vf;
3815 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3816 vf = loop_vinfo->vectorization_factor;
3817 else
3818 vf = 1;
3819 unsigned int group_size = SLP_TREE_LANES (node);
3820 tree vectype = SLP_TREE_VECTYPE (node);
3821 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
3822 = vect_get_num_vectors (vf * group_size, vectype);
3823 }
3824
3825 /* Handle purely internal nodes. */
3826 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
3827 return vectorizable_slp_permutation (vinfo, NULL, node, cost_vec);
3828
3829 gcc_assert (STMT_SLP_TYPE (stmt_info) != loop_vect);
3830 if (is_a <bb_vec_info> (vinfo)
3831 && !vect_update_shared_vectype (stmt_info, SLP_TREE_VECTYPE (node)))
3832 {
3833 if (dump_enabled_p ())
3834 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3835 "desired vector type conflicts with earlier one "
3836 "for %G", stmt_info->stmt);
3837 return false;
3838 }
3839
3840 bool dummy;
3841 return vect_analyze_stmt (vinfo, stmt_info, &dummy,
3842 node, node_instance, cost_vec);
3843 }
3844
3845 /* Try to build NODE from scalars, returning true on success.
3846 NODE_INSTANCE is the SLP instance that contains NODE. */
3847
3848 static bool
3849 vect_slp_convert_to_external (vec_info *vinfo, slp_tree node,
3850 slp_instance node_instance)
3851 {
3852 stmt_vec_info stmt_info;
3853 unsigned int i;
3854
3855 if (!is_a <bb_vec_info> (vinfo)
3856 || node == SLP_INSTANCE_TREE (node_instance)
3857 || !SLP_TREE_SCALAR_STMTS (node).exists ()
3858 || vect_contains_pattern_stmt_p (SLP_TREE_SCALAR_STMTS (node)))
3859 return false;
3860
3861 if (dump_enabled_p ())
3862 dump_printf_loc (MSG_NOTE, vect_location,
3863 "Building vector operands of %p from scalars instead\n", node);
3864
3865 /* Don't remove and free the child nodes here, since they could be
3866 referenced by other structures. The analysis and scheduling phases
3867 (need to) ignore child nodes of anything that isn't vect_internal_def. */
3868 unsigned int group_size = SLP_TREE_LANES (node);
3869 SLP_TREE_DEF_TYPE (node) = vect_external_def;
3870 SLP_TREE_SCALAR_OPS (node).safe_grow (group_size, true);
3871 SLP_TREE_LOAD_PERMUTATION (node).release ();
3872 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
3873 {
3874 tree lhs = gimple_get_lhs (vect_orig_stmt (stmt_info)->stmt);
3875 SLP_TREE_SCALAR_OPS (node)[i] = lhs;
3876 }
3877 return true;
3878 }
3879
3880 /* Compute the prologue cost for invariant or constant operands represented
3881 by NODE. */
3882
3883 static void
3884 vect_prologue_cost_for_slp (slp_tree node,
3885 stmt_vector_for_cost *cost_vec)
3886 {
3887 /* There's a special case of an existing vector, which costs nothing. */
3888 if (SLP_TREE_SCALAR_OPS (node).length () == 0
3889 && !SLP_TREE_VEC_DEFS (node).is_empty ())
3890 return;
3891 /* Without looking at the actual initializer a vector of
3892 constants can be implemented as a load from the constant pool.
3893 When all elements are the same we can use a splat. */
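/* E.g. a uniform external operand { x, x, x, x } is costed below as a
   single scalar_to_vec (splat), a mixed { a, b, c, d } as a vec_construct
   and a constant operand as a vector_load.  */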
3894 tree vectype = SLP_TREE_VECTYPE (node);
3895 unsigned group_size = SLP_TREE_SCALAR_OPS (node).length ();
3896 unsigned num_vects_to_check;
3897 unsigned HOST_WIDE_INT const_nunits;
3898 unsigned nelt_limit;
3899 if (TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits)
3900 && ! multiple_p (const_nunits, group_size))
3901 {
3902 num_vects_to_check = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
3903 nelt_limit = const_nunits;
3904 }
3905 else
3906 {
3907 /* If either the vector has variable length or the vectors
3908 are composed of repeated whole groups we only need to
3909 cost construction once. All vectors will be the same. */
3910 num_vects_to_check = 1;
3911 nelt_limit = group_size;
3912 }
3913 tree elt = NULL_TREE;
3914 unsigned nelt = 0;
3915 for (unsigned j = 0; j < num_vects_to_check * nelt_limit; ++j)
3916 {
3917 unsigned si = j % group_size;
3918 if (nelt == 0)
3919 elt = SLP_TREE_SCALAR_OPS (node)[si];
3920 /* ??? We're just tracking whether all operands of a single
3921 vector initializer are the same, ideally we'd check if
3922 we emitted the same one already. */
3923 else if (elt != SLP_TREE_SCALAR_OPS (node)[si])
3924 elt = NULL_TREE;
3925 nelt++;
3926 if (nelt == nelt_limit)
3927 {
3928 record_stmt_cost (cost_vec, 1,
3929 SLP_TREE_DEF_TYPE (node) == vect_external_def
3930 ? (elt ? scalar_to_vec : vec_construct)
3931 : vector_load,
3932 NULL, vectype, 0, vect_prologue);
3933 nelt = 0;
3934 }
3935 }
3936 }
3937
3938 /* Analyze statements contained in SLP tree NODE after recursively analyzing
3939 the subtree. NODE_INSTANCE contains NODE and VINFO contains INSTANCE.
3940
3941 Return true if the operations are supported. */
3942
3943 static bool
3944 vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node,
3945 slp_instance node_instance,
3946 hash_set<slp_tree> &visited_set,
3947 vec<slp_tree> &visited_vec,
3948 stmt_vector_for_cost *cost_vec)
3949 {
3950 int i, j;
3951 slp_tree child;
3952
3953 /* Assume we can code-generate all invariants. */
3954 if (!node
3955 || SLP_TREE_DEF_TYPE (node) == vect_constant_def
3956 || SLP_TREE_DEF_TYPE (node) == vect_external_def)
3957 return true;
3958
3959 if (SLP_TREE_DEF_TYPE (node) == vect_uninitialized_def)
3960 {
3961 if (dump_enabled_p ())
3962 dump_printf_loc (MSG_NOTE, vect_location,
3963 "Failed cyclic SLP reference in %p\n", node);
3964 return false;
3965 }
3966 gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_internal_def);
3967
3968 /* If we already analyzed the exact same set of scalar stmts we're done.
3969 We share the generated vector stmts for those. */
3970 if (visited_set.add (node))
3971 return true;
3972 visited_vec.safe_push (node);
3973
3974 bool res = true;
3975 unsigned visited_rec_start = visited_vec.length ();
3976 unsigned cost_vec_rec_start = cost_vec->length ();
3977 bool seen_non_constant_child = false;
3978 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3979 {
3980 res = vect_slp_analyze_node_operations (vinfo, child, node_instance,
3981 visited_set, visited_vec,
3982 cost_vec);
3983 if (!res)
3984 break;
3985 if (child && SLP_TREE_DEF_TYPE (child) != vect_constant_def)
3986 seen_non_constant_child = true;
3987 }
3988 /* We're having difficulties scheduling nodes with just constant
3989 operands and no scalar stmts since we then cannot compute a stmt
3990 insertion place. */
3991 if (!seen_non_constant_child && SLP_TREE_SCALAR_STMTS (node).is_empty ())
3992 {
3993 if (dump_enabled_p ())
3994 dump_printf_loc (MSG_NOTE, vect_location,
3995 "Cannot vectorize all-constant op node %p\n", node);
3996 res = false;
3997 }
3998
3999 if (res)
4000 res = vect_slp_analyze_node_operations_1 (vinfo, node, node_instance,
4001 cost_vec);
4002 /* If analysis failed we have to pop all recursive visited nodes
4003 plus ourselves. */
4004 if (!res)
4005 {
4006 while (visited_vec.length () >= visited_rec_start)
4007 visited_set.remove (visited_vec.pop ());
4008 cost_vec->truncate (cost_vec_rec_start);
4009 }
4010
4011 /* When the node can be vectorized, cost the invariant nodes it references.
4012 This is not done in DFS order to allow the referring node's
4013 vectorizable_* calls to nail down the invariant node's vector type
4014 and possibly unshare it if it needs a different vector type than
4015 other referrers. */
4016 if (res)
4017 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
4018 if (child
4019 && (SLP_TREE_DEF_TYPE (child) == vect_constant_def
4020 || SLP_TREE_DEF_TYPE (child) == vect_external_def)
4021 /* Perform usual caching, note code-generation still
4022 code-gens these nodes multiple times but we expect
4023 to CSE them later. */
4024 && !visited_set.add (child))
4025 {
4026 visited_vec.safe_push (child);
4027 /* ??? After auditing more code paths make a "default"
4028 and push the vector type from NODE to all children
4029 if it is not already set. */
4030 /* Compute the number of vectors to be generated. */
4031 tree vector_type = SLP_TREE_VECTYPE (child);
4032 if (!vector_type)
4033 {
4034 /* For shifts with a scalar argument we don't need
4035 to cost or code-generate anything.
4036 ??? Represent this more explicitly. */
4037 gcc_assert ((STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (node))
4038 == shift_vec_info_type)
4039 && j == 1);
4040 continue;
4041 }
4042 unsigned group_size = SLP_TREE_LANES (child);
4043 poly_uint64 vf = 1;
4044 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
4045 vf = loop_vinfo->vectorization_factor;
4046 SLP_TREE_NUMBER_OF_VEC_STMTS (child)
4047 = vect_get_num_vectors (vf * group_size, vector_type);
4048 /* And cost them. */
4049 vect_prologue_cost_for_slp (child, cost_vec);
4050 }
4051
4052 /* If this node or any of its children can't be vectorized, try pruning
4053 the tree here rather than felling the whole thing. */
4054 if (!res && vect_slp_convert_to_external (vinfo, node, node_instance))
4055 {
4056 /* We'll need to revisit this for invariant costing and number
4057 of vectorized stmt setting. */
4058 res = true;
4059 }
4060
4061 return res;
4062 }
4063
4064
4065 /* Mark lanes of NODE that are live outside of the basic-block vectorized
4066 region and that can be vectorized using vectorizable_live_operation
4067 with STMT_VINFO_LIVE_P. Live operations that are not handled will cause
4068 the scalar code computing them to be retained. */
4069
4070 static void
4071 vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo, slp_tree node,
4072 slp_instance instance,
4073 stmt_vector_for_cost *cost_vec,
4074 hash_set<stmt_vec_info> &svisited,
4075 hash_set<slp_tree> &visited)
4076 {
4077 if (visited.add (node))
4078 return;
4079
4080 unsigned i;
4081 stmt_vec_info stmt_info;
4082 stmt_vec_info last_stmt = vect_find_last_scalar_stmt_in_slp (node);
4083 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
4084 {
4085 if (svisited.contains (stmt_info))
4086 continue;
4087 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
4088 if (STMT_VINFO_IN_PATTERN_P (orig_stmt_info)
4089 && STMT_VINFO_RELATED_STMT (orig_stmt_info) != stmt_info)
4090 /* Only the pattern root stmt computes the original scalar value. */
4091 continue;
4092 bool mark_visited = true;
4093 gimple *orig_stmt = orig_stmt_info->stmt;
4094 ssa_op_iter op_iter;
4095 def_operand_p def_p;
4096 FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_stmt, op_iter, SSA_OP_DEF)
4097 {
4098 imm_use_iterator use_iter;
4099 gimple *use_stmt;
4100 stmt_vec_info use_stmt_info;
4101 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
4102 if (!is_gimple_debug (use_stmt))
4103 {
4104 use_stmt_info = bb_vinfo->lookup_stmt (use_stmt);
4105 if (!use_stmt_info
4106 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
4107 {
4108 STMT_VINFO_LIVE_P (stmt_info) = true;
4109 if (vectorizable_live_operation (bb_vinfo, stmt_info,
4110 NULL, node, instance, i,
4111 false, cost_vec))
4112 /* ??? So we know we can vectorize the live stmt
4113 from one SLP node. If we cannot do so from all
4114 or none consistently we'd have to record which
4115 SLP node (and lane) we want to use for the live
4116 operation. So make sure we can code-generate
4117 from all nodes. */
4118 mark_visited = false;
4119 else
4120 STMT_VINFO_LIVE_P (stmt_info) = false;
4121 break;
4122 }
4123 }
4124 /* We have to verify whether we can insert the lane extract
4125 before all uses. The following is a conservative approximation.
4126 We cannot put this into vectorizable_live_operation because
4127 iterating over all use stmts from inside a FOR_EACH_IMM_USE_STMT
4128 doesn't work.
4129 Note that while the fact that we emit code for loads at the
4130 first load should make this a non-problem, leafs we construct
4131 from scalars are vectorized after the last scalar def.
4132 ??? If we'd actually compute the insert location during
4133 analysis we could use sth less conservative than the last
4134 scalar stmt in the node for the dominance check. */
4135 /* ??? What remains is "live" uses in vector CTORs in the same
4136 SLP graph which is where those uses can end up code-generated
4137 right after their definition instead of close to their original
4138 use. But that would restrict us to code-generate lane-extracts
4139 from the latest stmt in a node. So we compensate for this
4140 during code-generation, simply not replacing uses for those
4141 hopefully rare cases. */
4142 if (STMT_VINFO_LIVE_P (stmt_info))
4143 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
4144 if (!is_gimple_debug (use_stmt)
4145 && (!(use_stmt_info = bb_vinfo->lookup_stmt (use_stmt))
4146 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
4147 && !vect_stmt_dominates_stmt_p (last_stmt->stmt, use_stmt))
4148 {
4149 if (dump_enabled_p ())
4150 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4151 "Cannot determine insertion place for "
4152 "lane extract\n");
4153 STMT_VINFO_LIVE_P (stmt_info) = false;
4154 mark_visited = true;
4155 }
4156 }
4157 if (mark_visited)
4158 svisited.add (stmt_info);
4159 }
4160
4161 slp_tree child;
4162 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4163 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
4164 vect_bb_slp_mark_live_stmts (bb_vinfo, child, instance,
4165 cost_vec, svisited, visited);
4166 }
4167
4168 /* Analyze statements in SLP instances of VINFO. Return true if the
4169 operations are supported. */
4170
4171 bool
4172 vect_slp_analyze_operations (vec_info *vinfo)
4173 {
4174 slp_instance instance;
4175 int i;
4176
4177 DUMP_VECT_SCOPE ("vect_slp_analyze_operations");
4178
4179 hash_set<slp_tree> visited;
4180 for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
4181 {
4182 auto_vec<slp_tree> visited_vec;
4183 stmt_vector_for_cost cost_vec;
4184 cost_vec.create (2);
4185 if (is_a <bb_vec_info> (vinfo))
4186 vect_location = instance->location ();
4187 if (!vect_slp_analyze_node_operations (vinfo,
4188 SLP_INSTANCE_TREE (instance),
4189 instance, visited, visited_vec,
4190 &cost_vec)
4191 /* Instances with a root stmt require vectorized defs for the
4192 SLP tree root. */
4193 || (SLP_INSTANCE_ROOT_STMT (instance)
4194 && (SLP_TREE_DEF_TYPE (SLP_INSTANCE_TREE (instance))
4195 != vect_internal_def
4196 /* Make sure we vectorized with the expected type. */
4197 || !useless_type_conversion_p
4198 (TREE_TYPE (TREE_TYPE (gimple_assign_rhs1
4199 (instance->root_stmt->stmt))),
4200 TREE_TYPE (SLP_TREE_VECTYPE
4201 (SLP_INSTANCE_TREE (instance)))))))
4202 {
4203 slp_tree node = SLP_INSTANCE_TREE (instance);
4204 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
4205 if (dump_enabled_p ())
4206 dump_printf_loc (MSG_NOTE, vect_location,
4207 "removing SLP instance operations starting from: %G",
4208 stmt_info->stmt);
4209 vect_free_slp_instance (instance);
4210 vinfo->slp_instances.ordered_remove (i);
4211 cost_vec.release ();
4212 while (!visited_vec.is_empty ())
4213 visited.remove (visited_vec.pop ());
4214 }
4215 else
4216 {
4217 i++;
4218
4219 /* For BB vectorization remember the SLP graph entry
4220 cost for later. */
4221 if (is_a <bb_vec_info> (vinfo))
4222 instance->cost_vec = cost_vec;
4223 else
4224 {
4225 add_stmt_costs (vinfo, vinfo->target_cost_data, &cost_vec);
4226 cost_vec.release ();
4227 }
4228 }
4229 }
4230
4231 /* Compute vectorizable live stmts. */
4232 if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
4233 {
4234 hash_set<stmt_vec_info> svisited;
4235 hash_set<slp_tree> visited;
4236 for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
4237 {
4238 vect_location = instance->location ();
4239 vect_bb_slp_mark_live_stmts (bb_vinfo, SLP_INSTANCE_TREE (instance),
4240 instance, &instance->cost_vec, svisited,
4241 visited);
4242 }
4243 }
4244
4245 return !vinfo->slp_instances.is_empty ();
4246 }
4247
4248 /* Get the SLP instance leader from INSTANCE_LEADER thereby transitively
4249 closing the eventual chain. */
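/* This is union-find "find" with path compression over the INSTANCE_LEADER
   map.  */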
4250
4251 static slp_instance
4252 get_ultimate_leader (slp_instance instance,
4253 hash_map<slp_instance, slp_instance> &instance_leader)
4254 {
4255 auto_vec<slp_instance *, 8> chain;
4256 slp_instance *tem;
4257 while (*(tem = instance_leader.get (instance)) != instance)
4258 {
4259 chain.safe_push (tem);
4260 instance = *tem;
4261 }
4262 while (!chain.is_empty ())
4263 *chain.pop () = instance;
4264 return instance;
4265 }
4266
4267 /* Worker of vect_bb_partition_graph, recurse on NODE. */
4268
4269 static void
4270 vect_bb_partition_graph_r (bb_vec_info bb_vinfo,
4271 slp_instance instance, slp_tree node,
4272 hash_map<stmt_vec_info, slp_instance> &stmt_to_instance,
4273 hash_map<slp_instance, slp_instance> &instance_leader,
4274 hash_set<slp_tree> &visited)
4275 {
4276 stmt_vec_info stmt_info;
4277 unsigned i;
4278
4279 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
4280 {
4281 bool existed_p;
4282 slp_instance &stmt_instance
4283 = stmt_to_instance.get_or_insert (stmt_info, &existed_p);
4284 if (!existed_p)
4285 ;
4286 else if (stmt_instance != instance)
4287 {
4288 /* If we're running into a previously marked stmt make us the
4289 leader of the current ultimate leader. This keeps the
4290 leader chain acyclic and works even when the current instance
4291 connects two previously independent graph parts. */
4292 slp_instance stmt_leader
4293 = get_ultimate_leader (stmt_instance, instance_leader);
4294 if (stmt_leader != instance)
4295 instance_leader.put (stmt_leader, instance);
4296 }
4297 stmt_instance = instance;
4298 }
4299
4300 if (!SLP_TREE_SCALAR_STMTS (node).is_empty () && visited.add (node))
4301 return;
4302
4303 slp_tree child;
4304 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4305 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
4306 vect_bb_partition_graph_r (bb_vinfo, instance, child, stmt_to_instance,
4307 instance_leader, visited);
4308 }
4309
4310 /* Partition the SLP graph into pieces that can be costed independently. */
4311
4312 static void
4313 vect_bb_partition_graph (bb_vec_info bb_vinfo)
4314 {
4315 DUMP_VECT_SCOPE ("vect_bb_partition_graph");
4316
4317 /* First walk the SLP graph assigning each involved scalar stmt a
4318 corresponding SLP graph entry and upon visiting a previously
4319 marked stmt, make the stmt's leader the current SLP graph entry. */
4320 hash_map<stmt_vec_info, slp_instance> stmt_to_instance;
4321 hash_map<slp_instance, slp_instance> instance_leader;
4322 hash_set<slp_tree> visited;
4323 slp_instance instance;
4324 for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
4325 {
4326 instance_leader.put (instance, instance);
4327 vect_bb_partition_graph_r (bb_vinfo,
4328 instance, SLP_INSTANCE_TREE (instance),
4329 stmt_to_instance, instance_leader,
4330 visited);
4331 }
4332
4333 /* Then collect entries to each independent subgraph. */
4334 for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
4335 {
4336 slp_instance leader = get_ultimate_leader (instance, instance_leader);
4337 leader->subgraph_entries.safe_push (instance);
4338 if (dump_enabled_p ()
4339 && leader != instance)
4340 dump_printf_loc (MSG_NOTE, vect_location,
4341 "instance %p is leader of %p\n",
4342 leader, instance);
4343 }
4344 }
4345
4346 /* Compute the scalar cost of the SLP node NODE and its children
4347 and return it. Do not account defs that are marked in LIFE and
4348 update LIFE according to uses of NODE. */
4349
4350 static void
4351 vect_bb_slp_scalar_cost (vec_info *vinfo,
4352 slp_tree node, vec<bool, va_heap> *life,
4353 stmt_vector_for_cost *cost_vec,
4354 hash_set<slp_tree> &visited)
4355 {
4356 unsigned i;
4357 stmt_vec_info stmt_info;
4358 slp_tree child;
4359
4360 if (visited.add (node))
4361 return;
4362
4363 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
4364 {
4365 ssa_op_iter op_iter;
4366 def_operand_p def_p;
4367
4368 if ((*life)[i])
4369 continue;
4370
4371 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
4372 gimple *orig_stmt = orig_stmt_info->stmt;
4373
4374 /* If there is a non-vectorized use of the defs then the scalar
4375 stmt is kept live in which case we do not account it or any
4376 required defs in the SLP children in the scalar cost. This
4377 way we make the vectorization more costly when compared to
4378 the scalar cost. */
4379 if (!STMT_VINFO_LIVE_P (stmt_info))
4380 {
4381 FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_stmt, op_iter, SSA_OP_DEF)
4382 {
4383 imm_use_iterator use_iter;
4384 gimple *use_stmt;
4385 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
4386 if (!is_gimple_debug (use_stmt))
4387 {
4388 stmt_vec_info use_stmt_info = vinfo->lookup_stmt (use_stmt);
4389 if (!use_stmt_info
4390 || !PURE_SLP_STMT
4391 (vect_stmt_to_vectorize (use_stmt_info)))
4392 {
4393 (*life)[i] = true;
4394 break;
4395 }
4396 }
4397 }
4398 if ((*life)[i])
4399 continue;
4400 }
4401
4402 /* Count scalar stmts only once. */
4403 if (gimple_visited_p (orig_stmt))
4404 continue;
4405 gimple_set_visited (orig_stmt, true);
4406
4407 vect_cost_for_stmt kind;
4408 if (STMT_VINFO_DATA_REF (orig_stmt_info))
4409 {
4410 if (DR_IS_READ (STMT_VINFO_DATA_REF (orig_stmt_info)))
4411 kind = scalar_load;
4412 else
4413 kind = scalar_store;
4414 }
4415 else if (vect_nop_conversion_p (orig_stmt_info))
4416 continue;
4417 /* For single-argument PHIs assume coalescing which means zero cost
4418 for the scalar and the vector PHIs. This avoids artificially
4419 favoring the vector path (but may pessimize it in some cases). */
4420 else if (is_a <gphi *> (orig_stmt_info->stmt)
4421 && gimple_phi_num_args
4422 (as_a <gphi *> (orig_stmt_info->stmt)) == 1)
4423 continue;
4424 else
4425 kind = scalar_stmt;
4426 record_stmt_cost (cost_vec, 1, kind, orig_stmt_info,
4427 SLP_TREE_VECTYPE (node), 0, vect_body);
4428 }
4429
4430 auto_vec<bool, 20> subtree_life;
4431 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4432 {
4433 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
4434 {
4435 /* Do not directly pass LIFE to the recursive call, copy it to
4436 confine changes in the callee to the current child/subtree. */
4437 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
4438 {
4439 subtree_life.safe_grow_cleared (SLP_TREE_LANES (child), true);
4440 for (unsigned j = 0;
4441 j < SLP_TREE_LANE_PERMUTATION (node).length (); ++j)
4442 {
4443 auto perm = SLP_TREE_LANE_PERMUTATION (node)[j];
4444 if (perm.first == i)
4445 subtree_life[perm.second] = (*life)[j];
4446 }
4447 }
4448 else
4449 {
4450 gcc_assert (SLP_TREE_LANES (node) == SLP_TREE_LANES (child));
4451 subtree_life.safe_splice (*life);
4452 }
4453 vect_bb_slp_scalar_cost (vinfo, child, &subtree_life, cost_vec,
4454 visited);
4455 subtree_life.truncate (0);
4456 }
4457 }
4458 }
4459
4460 /* Comparator for the loop-index sorted cost vectors. */
4461
4462 static int
4463 li_cost_vec_cmp (const void *a_, const void *b_)
4464 {
4465 auto *a = (const std::pair<unsigned, stmt_info_for_cost *> *)a_;
4466 auto *b = (const std::pair<unsigned, stmt_info_for_cost *> *)b_;
4467 if (a->first < b->first)
4468 return -1;
4469 else if (a->first == b->first)
4470 return 0;
4471 return 1;
4472 }
4473
4474 /* Check if vectorization of the basic block is profitable for the
4475 subgraph denoted by SLP_INSTANCES. */
4476
4477 static bool
4478 vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo,
4479 vec<slp_instance> slp_instances)
4480 {
4481 slp_instance instance;
4482 int i;
4483 unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
4484 unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;
4485
4486 if (dump_enabled_p ())
4487 {
4488 dump_printf_loc (MSG_NOTE, vect_location, "Costing subgraph: \n");
4489 hash_set<slp_tree> visited;
4490 FOR_EACH_VEC_ELT (slp_instances, i, instance)
4491 vect_print_slp_graph (MSG_NOTE, vect_location,
4492 SLP_INSTANCE_TREE (instance), visited);
4493 }
4494
4495 /* Calculate scalar cost and sum the cost for the vector stmts
4496 previously collected. */
4497 stmt_vector_for_cost scalar_costs = vNULL;
4498 stmt_vector_for_cost vector_costs = vNULL;
4499 hash_set<slp_tree> visited;
4500 FOR_EACH_VEC_ELT (slp_instances, i, instance)
4501 {
4502 auto_vec<bool, 20> life;
4503 life.safe_grow_cleared (SLP_TREE_LANES (SLP_INSTANCE_TREE (instance)),
4504 true);
4505 if (SLP_INSTANCE_ROOT_STMT (instance))
4506 record_stmt_cost (&scalar_costs, 1, scalar_stmt,
4507 SLP_INSTANCE_ROOT_STMT (instance), 0, vect_body);
4508 vect_bb_slp_scalar_cost (bb_vinfo,
4509 SLP_INSTANCE_TREE (instance),
4510 &life, &scalar_costs, visited);
4511 vector_costs.safe_splice (instance->cost_vec);
4512 instance->cost_vec.release ();
4513 }
4514 /* Unset visited flag. */
4515 stmt_info_for_cost *cost;
4516 FOR_EACH_VEC_ELT (scalar_costs, i, cost)
4517 gimple_set_visited (cost->stmt_info->stmt, false);
4518
4519 if (dump_enabled_p ())
4520 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
4521
4522 /* When costing non-loop vectorization we need to consider each covered
4523 loop independently and make sure vectorization is profitable. For
4524 now we assume a loop may not be entered or may be executed an arbitrary
4525 number of iterations (??? static information can provide more
4526 precise info here), which means we can simply cost each containing
4527 loop's stmts separately. */
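/* Concretely, each cost entry is keyed below by the number of the loop
   containing its stmt and the scalar vs. vector comparison is then done
   for each such loop separately.  */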
4528
4529 /* First produce cost vectors sorted by loop index. */
4530 auto_vec<std::pair<unsigned, stmt_info_for_cost *> >
4531 li_scalar_costs (scalar_costs.length ());
4532 auto_vec<std::pair<unsigned, stmt_info_for_cost *> >
4533 li_vector_costs (vector_costs.length ());
4534 FOR_EACH_VEC_ELT (scalar_costs, i, cost)
4535 {
4536 unsigned l = gimple_bb (cost->stmt_info->stmt)->loop_father->num;
4537 li_scalar_costs.quick_push (std::make_pair (l, cost));
4538 }
4539 /* Use an arbitrary covered loop as fallback in case the first vector_costs
4540 entry does not have a stmt_info associated with it. */
4541 unsigned l = li_scalar_costs[0].first;
4542 FOR_EACH_VEC_ELT (vector_costs, i, cost)
4543 {
4544 /* We inherit the loop from the previous COST; invariants, externals and
4545 extracts immediately follow the cost for the related stmt. */
4546 if (cost->stmt_info)
4547 l = gimple_bb (cost->stmt_info->stmt)->loop_father->num;
4548 li_vector_costs.quick_push (std::make_pair (l, cost));
4549 }
4550 li_scalar_costs.qsort (li_cost_vec_cmp);
4551 li_vector_costs.qsort (li_cost_vec_cmp);
4552
4553 /* Now cost the portions individually. */
4554 unsigned vi = 0;
4555 unsigned si = 0;
4556 while (si < li_scalar_costs.length ()
4557 && vi < li_vector_costs.length ())
4558 {
4559 unsigned sl = li_scalar_costs[si].first;
4560 unsigned vl = li_vector_costs[vi].first;
4561 if (sl != vl)
4562 {
4563 if (dump_enabled_p ())
4564 dump_printf_loc (MSG_NOTE, vect_location,
4565 "Scalar %d and vector %d loop part do not "
4566 "match up, skipping scalar part\n", sl, vl);
4567 /* Skip the scalar part, assuming zero cost on the vector side. */
4568 do
4569 {
4570 si++;
4571 }
4572 while (si < li_scalar_costs.length ()
4573 && li_scalar_costs[si].first == sl);
4574 continue;
4575 }
4576
4577 void *scalar_target_cost_data = init_cost (NULL);
4578 do
4579 {
4580 add_stmt_cost (bb_vinfo, scalar_target_cost_data,
4581 li_scalar_costs[si].second);
4582 si++;
4583 }
4584 while (si < li_scalar_costs.length ()
4585 && li_scalar_costs[si].first == sl);
4586 unsigned dummy;
4587 finish_cost (scalar_target_cost_data, &dummy, &scalar_cost, &dummy);
4588 destroy_cost_data (scalar_target_cost_data);
4589
4590 /* Complete the target-specific vector cost calculation. */
4591 void *vect_target_cost_data = init_cost (NULL);
4592 do
4593 {
4594 add_stmt_cost (bb_vinfo, vect_target_cost_data,
4595 li_vector_costs[vi].second);
4596 vi++;
4597 }
4598 while (vi < li_vector_costs.length ()
4599 && li_vector_costs[vi].first == vl);
4600 finish_cost (vect_target_cost_data, &vec_prologue_cost,
4601 &vec_inside_cost, &vec_epilogue_cost);
4602 destroy_cost_data (vect_target_cost_data);
4603
4604 vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;
4605
4606 if (dump_enabled_p ())
4607 {
4608 dump_printf_loc (MSG_NOTE, vect_location,
4609 "Cost model analysis for part in loop %d:\n", sl);
4610 dump_printf (MSG_NOTE, " Vector cost: %d\n",
4611 vec_inside_cost + vec_outside_cost);
4612 dump_printf (MSG_NOTE, " Scalar cost: %d\n", scalar_cost);
4613 }
4614
4615 /* Vectorization is profitable if its cost is no more than the cost of the
4616 scalar version. Note that we err on the vector side for equal cost because
4617 the cost estimate is otherwise quite pessimistic (constant uses are
4618 free on the scalar side but cost a load on the vector side for
4619 example). */
4620 if (vec_outside_cost + vec_inside_cost > scalar_cost)
4621 {
4622 scalar_costs.release ();
4623 vector_costs.release ();
4624 return false;
4625 }
4626 }
4627 if (vi < li_vector_costs.length ())
4628 {
4629 if (dump_enabled_p ())
4630 dump_printf_loc (MSG_NOTE, vect_location,
4631 "Excess vector cost for part in loop %d:\n",
4632 li_vector_costs[vi].first);
4633 scalar_costs.release ();
4634 vector_costs.release ();
4635 return false;
4636 }
4637
4638 scalar_costs.release ();
4639 vector_costs.release ();
4640 return true;
4641 }
4642
4643 /* qsort comparator for lane defs. */
4644
4645 static int
4646 vld_cmp (const void *a_, const void *b_)
4647 {
4648 auto *a = (const std::pair<unsigned, tree> *)a_;
4649 auto *b = (const std::pair<unsigned, tree> *)b_;
4650 return a->first - b->first;
4651 }
4652
4653 /* Return true if USE_STMT is a vector lane insert into VEC and set
4654 *THIS_LANE to the lane number that is set. */
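/* For example, assuming a V4SF vector and hypothetical SSA names,
   vec_1 = BIT_INSERT_EXPR <vec_0, s_2, 32> inserts s_2 into lane 1.  */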
4655
4656 static bool
4657 vect_slp_is_lane_insert (gimple *use_stmt, tree vec, unsigned *this_lane)
4658 {
4659 gassign *use_ass = dyn_cast <gassign *> (use_stmt);
4660 if (!use_ass
4661 || gimple_assign_rhs_code (use_ass) != BIT_INSERT_EXPR
4662 || (vec
4663 ? gimple_assign_rhs1 (use_ass) != vec
4664 : ((vec = gimple_assign_rhs1 (use_ass)), false))
4665 || !useless_type_conversion_p (TREE_TYPE (TREE_TYPE (vec)),
4666 TREE_TYPE (gimple_assign_rhs2 (use_ass)))
4667 || !constant_multiple_p
4668 (tree_to_poly_uint64 (gimple_assign_rhs3 (use_ass)),
4669 tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (TREE_TYPE (vec)))),
4670 this_lane))
4671 return false;
4672 return true;
4673 }
4674
4675 /* Find any vectorizable constructors and add them to the grouped_store
4676 array. */
4677
4678 static void
4679 vect_slp_check_for_constructors (bb_vec_info bb_vinfo)
4680 {
4681 for (unsigned i = 0; i < bb_vinfo->bbs.length (); ++i)
4682 for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[i]);
4683 !gsi_end_p (gsi); gsi_next (&gsi))
4684 {
4685 gassign *assign = dyn_cast<gassign *> (gsi_stmt (gsi));
4686 if (!assign)
4687 continue;
4688
4689 tree rhs = gimple_assign_rhs1 (assign);
4690 if (gimple_assign_rhs_code (assign) == CONSTRUCTOR)
4691 {
4692 if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
4693 || maybe_ne (TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)),
4694 CONSTRUCTOR_NELTS (rhs))
4695 || VECTOR_TYPE_P (TREE_TYPE (CONSTRUCTOR_ELT (rhs, 0)->value))
4696 || uniform_vector_p (rhs))
4697 continue;
4698
4699 unsigned j;
4700 tree val;
4701 FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), j, val)
4702 if (TREE_CODE (val) != SSA_NAME
4703 || !bb_vinfo->lookup_def (val))
4704 break;
4705 if (j != CONSTRUCTOR_NELTS (rhs))
4706 continue;
4707
4708 stmt_vec_info stmt_info = bb_vinfo->lookup_stmt (assign);
4709 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt_info);
4710 }
4711 else if (gimple_assign_rhs_code (assign) == BIT_INSERT_EXPR
4712 && VECTOR_TYPE_P (TREE_TYPE (rhs))
4713 && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).is_constant ()
4714 && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).to_constant () > 1
4715 && integer_zerop (gimple_assign_rhs3 (assign))
4716 && useless_type_conversion_p
4717 (TREE_TYPE (TREE_TYPE (rhs)),
4718 TREE_TYPE (gimple_assign_rhs2 (assign)))
4719 && bb_vinfo->lookup_def (gimple_assign_rhs2 (assign)))
4720 {
4721 /* We start to match on insert to lane zero but since the
4722 inserts need not be ordered we'd have to search both
4723 the def and the use chains. */
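/* For example (hypothetical SSA names), the lane zero insert we start
   from may sit in the middle of a chain
     v_1 = BIT_INSERT_EXPR <v_0, b_1, 32>;
     v_2 = BIT_INSERT_EXPR <v_1, a_1, 0>;
     v_3 = BIT_INSERT_EXPR <v_2, c_1, 64>;
   so both directions get searched.  */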
4724 tree vectype = TREE_TYPE (rhs);
4725 unsigned nlanes = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
4726 auto_vec<std::pair<unsigned, tree> > lane_defs (nlanes);
4727 auto_sbitmap lanes (nlanes);
4728 bitmap_clear (lanes);
4729 bitmap_set_bit (lanes, 0);
4730 tree def = gimple_assign_lhs (assign);
4731 lane_defs.quick_push
4732 (std::make_pair (0, gimple_assign_rhs2 (assign)));
4733 unsigned lanes_found = 1;
4734 /* Start with the use chains, the last stmt will be the root. */
4735 stmt_vec_info last = bb_vinfo->lookup_stmt (assign);
4736 do
4737 {
4738 use_operand_p use_p;
4739 gimple *use_stmt;
4740 if (!single_imm_use (def, &use_p, &use_stmt))
4741 break;
4742 unsigned this_lane;
4743 if (!bb_vinfo->lookup_stmt (use_stmt)
4744 || !vect_slp_is_lane_insert (use_stmt, def, &this_lane)
4745 || !bb_vinfo->lookup_def (gimple_assign_rhs2 (use_stmt)))
4746 break;
4747 if (bitmap_bit_p (lanes, this_lane))
4748 break;
4749 lanes_found++;
4750 bitmap_set_bit (lanes, this_lane);
4751 gassign *use_ass = as_a <gassign *> (use_stmt);
4752 lane_defs.quick_push (std::make_pair
4753 (this_lane, gimple_assign_rhs2 (use_ass)));
4754 last = bb_vinfo->lookup_stmt (use_ass);
4755 def = gimple_assign_lhs (use_ass);
4756 }
4757 while (lanes_found < nlanes);
4758 if (lanes_found < nlanes)
4759 {
4760 /* Now search the def chain. */
4761 def = gimple_assign_rhs1 (assign);
4762 do
4763 {
4764 if (TREE_CODE (def) != SSA_NAME
4765 || !has_single_use (def))
4766 break;
4767 gimple *def_stmt = SSA_NAME_DEF_STMT (def);
4768 unsigned this_lane;
4769 if (!bb_vinfo->lookup_stmt (def_stmt)
4770 || !vect_slp_is_lane_insert (def_stmt,
4771 NULL_TREE, &this_lane)
4772 || !bb_vinfo->lookup_def (gimple_assign_rhs2 (def_stmt)))
4773 break;
4774 if (bitmap_bit_p (lanes, this_lane))
4775 break;
4776 lanes_found++;
4777 bitmap_set_bit (lanes, this_lane);
4778 lane_defs.quick_push (std::make_pair
4779 (this_lane,
4780 gimple_assign_rhs2 (def_stmt)));
4781 def = gimple_assign_rhs1 (def_stmt);
4782 }
4783 while (lanes_found < nlanes);
4784 }
4785 if (lanes_found == nlanes)
4786 {
4787 /* Sort lane_defs by the lane index and register the root. */
4788 lane_defs.qsort (vld_cmp);
4789 vec<stmt_vec_info> stmts;
4790 stmts.create (nlanes);
4791 for (unsigned i = 0; i < nlanes; ++i)
4792 stmts.quick_push (bb_vinfo->lookup_def (lane_defs[i].second));
4793 bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_ctor,
4794 stmts, last));
4795 }
4796 }
4797 }
4798 }
4799
4800 /* Walk the grouped store chains and replace entries with their
4801 pattern variant if any. */
4802
4803 static void
4804 vect_fixup_store_groups_with_patterns (vec_info *vinfo)
4805 {
4806 stmt_vec_info first_element;
4807 unsigned i;
4808
4809 FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
4810 {
4811 /* We also have CTORs in this array. */
4812 if (!STMT_VINFO_GROUPED_ACCESS (first_element))
4813 continue;
4814 if (STMT_VINFO_IN_PATTERN_P (first_element))
4815 {
4816 stmt_vec_info orig = first_element;
4817 first_element = STMT_VINFO_RELATED_STMT (first_element);
4818 DR_GROUP_FIRST_ELEMENT (first_element) = first_element;
4819 DR_GROUP_SIZE (first_element) = DR_GROUP_SIZE (orig);
4820 DR_GROUP_GAP (first_element) = DR_GROUP_GAP (orig);
4821 DR_GROUP_NEXT_ELEMENT (first_element) = DR_GROUP_NEXT_ELEMENT (orig);
4822 vinfo->grouped_stores[i] = first_element;
4823 }
4824 stmt_vec_info prev = first_element;
4825 while (DR_GROUP_NEXT_ELEMENT (prev))
4826 {
4827 stmt_vec_info elt = DR_GROUP_NEXT_ELEMENT (prev);
4828 if (STMT_VINFO_IN_PATTERN_P (elt))
4829 {
4830 stmt_vec_info orig = elt;
4831 elt = STMT_VINFO_RELATED_STMT (elt);
4832 DR_GROUP_NEXT_ELEMENT (prev) = elt;
4833 DR_GROUP_GAP (elt) = DR_GROUP_GAP (orig);
4834 DR_GROUP_NEXT_ELEMENT (elt) = DR_GROUP_NEXT_ELEMENT (orig);
4835 }
4836 DR_GROUP_FIRST_ELEMENT (elt) = first_element;
4837 prev = elt;
4838 }
4839 }
4840 }
4841
4842 /* Check if the region described by BB_VINFO can be vectorized, returning
4843 true if so. When returning false, set FATAL to true if the same failure
4844 would prevent vectorization at other vector sizes, false if it is still
4845 worth trying other sizes. N_STMTS is the number of statements in the
4846 region. */
4847
4848 static bool
4849 vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal,
4850 vec<int> *dataref_groups)
4851 {
4852 DUMP_VECT_SCOPE ("vect_slp_analyze_bb");
4853
4854 slp_instance instance;
4855 int i;
4856 poly_uint64 min_vf = 2;
4857
4858 /* The first group of checks is independent of the vector size. */
4859 fatal = true;
4860
4861 /* Analyze the data references. */
4862
4863 if (!vect_analyze_data_refs (bb_vinfo, &min_vf, NULL))
4864 {
4865 if (dump_enabled_p ())
4866 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4867 "not vectorized: unhandled data-ref in basic "
4868 "block.\n");
4869 return false;
4870 }
4871
4872 if (!vect_analyze_data_ref_accesses (bb_vinfo, dataref_groups))
4873 {
4874 if (dump_enabled_p ())
4875 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4876 "not vectorized: unhandled data access in "
4877 "basic block.\n");
4878 return false;
4879 }
4880
4881 vect_slp_check_for_constructors (bb_vinfo);
4882
4883 /* If there are no grouped stores and no constructors in the region
4884 there is no need to continue with pattern recog as vect_analyze_slp
4885 will fail anyway. */
4886 if (bb_vinfo->grouped_stores.is_empty ()
4887 && bb_vinfo->roots.is_empty ())
4888 {
4889 if (dump_enabled_p ())
4890 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4891 "not vectorized: no grouped stores in "
4892 "basic block.\n");
4893 return false;
4894 }
4895
4896 /* Failures in the rest of the analysis depend on the vector size, so they are no longer fatal. */
4897 fatal = false;
4898
4899 vect_pattern_recog (bb_vinfo);
4900
4901 /* Update store groups from pattern processing. */
4902 vect_fixup_store_groups_with_patterns (bb_vinfo);
4903
4904 /* Check the SLP opportunities in the basic block, analyze and build SLP
4905 trees. */
4906 if (!vect_analyze_slp (bb_vinfo, n_stmts))
4907 {
4908 if (dump_enabled_p ())
4909 {
4910 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4911 "Failed to SLP the basic block.\n");
4912 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4913 "not vectorized: failed to find SLP opportunities "
4914 "in basic block.\n");
4915 }
4916 return false;
4917 }
4918
4919 /* Optimize permutations. */
4920 vect_optimize_slp (bb_vinfo);
4921
4922 /* Gather the loads reachable from the SLP graph entries. */
4923 vect_gather_slp_loads (bb_vinfo);
4924
4925 vect_record_base_alignments (bb_vinfo);
4926
4927 /* Analyze and verify the alignment of data references and the
4928 dependence in the SLP instances. */
4929 for (i = 0; BB_VINFO_SLP_INSTANCES (bb_vinfo).iterate (i, &instance); )
4930 {
4931 vect_location = instance->location ();
4932 if (! vect_slp_analyze_instance_alignment (bb_vinfo, instance)
4933 || ! vect_slp_analyze_instance_dependence (bb_vinfo, instance))
4934 {
4935 slp_tree node = SLP_INSTANCE_TREE (instance);
4936 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
4937 if (dump_enabled_p ())
4938 dump_printf_loc (MSG_NOTE, vect_location,
4939 "removing SLP instance operations starting from: %G",
4940 stmt_info->stmt);
4941 vect_free_slp_instance (instance);
4942 BB_VINFO_SLP_INSTANCES (bb_vinfo).ordered_remove (i);
4943 continue;
4944 }
4945
4946 /* Mark all the statements that we want to vectorize as pure SLP and
4947 relevant. */
4948 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
4949 vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));
4950 if (stmt_vec_info root = SLP_INSTANCE_ROOT_STMT (instance))
4951 {
4952 STMT_SLP_TYPE (root) = pure_slp;
4953 if (is_gimple_assign (root->stmt)
4954 && gimple_assign_rhs_code (root->stmt) == BIT_INSERT_EXPR)
4955 {
4956 /* ??? We should probably record the whole vector of
4957 root stmts so we do not have to back-track here... */
4958 for (unsigned n = SLP_TREE_LANES (SLP_INSTANCE_TREE (instance));
4959 n != 1; --n)
4960 {
4961 root = bb_vinfo->lookup_def (gimple_assign_rhs1 (root->stmt));
4962 STMT_SLP_TYPE (root) = pure_slp;
4963 }
4964 }
4965 }
4966
4967 i++;
4968 }
4969 if (! BB_VINFO_SLP_INSTANCES (bb_vinfo).length ())
4970 return false;
4971
4972 if (!vect_slp_analyze_operations (bb_vinfo))
4973 {
4974 if (dump_enabled_p ())
4975 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4976 "not vectorized: bad operation in basic block.\n");
4977 return false;
4978 }
4979
4980 vect_bb_partition_graph (bb_vinfo);
4981
4982 return true;
4983 }
4984
4985 /* Subroutine of vect_slp_bb. Try to vectorize the statements for all
4986 basic blocks in BBS, returning true on success.
4987 The region has N_STMTS statements and has the datarefs given by DATAREFS. */
4988
4989 static bool
4990 vect_slp_region (vec<basic_block> bbs, vec<data_reference_p> datarefs,
4991 vec<int> *dataref_groups, unsigned int n_stmts)
4992 {
4993 bb_vec_info bb_vinfo;
4994 auto_vector_modes vector_modes;
4995
4996 /* Autodetect first vector size we try. */
4997 machine_mode next_vector_mode = VOIDmode;
4998 targetm.vectorize.autovectorize_vector_modes (&vector_modes, false);
4999 unsigned int mode_i = 0;
5000
5001 vec_info_shared shared;
5002
5003 machine_mode autodetected_vector_mode = VOIDmode;
5004 while (1)
5005 {
5006 bool vectorized = false;
5007 bool fatal = false;
5008 bb_vinfo = new _bb_vec_info (bbs, &shared);
5009
5010 bool first_time_p = shared.datarefs.is_empty ();
5011 BB_VINFO_DATAREFS (bb_vinfo) = datarefs;
5012 if (first_time_p)
5013 bb_vinfo->shared->save_datarefs ();
5014 else
5015 bb_vinfo->shared->check_datarefs ();
5016 bb_vinfo->vector_mode = next_vector_mode;
5017
5018 if (vect_slp_analyze_bb_1 (bb_vinfo, n_stmts, fatal, dataref_groups))
5019 {
5020 if (dump_enabled_p ())
5021 {
5022 dump_printf_loc (MSG_NOTE, vect_location,
5023 "***** Analysis succeeded with vector mode"
5024 " %s\n", GET_MODE_NAME (bb_vinfo->vector_mode));
5025 dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB part\n");
5026 }
5027
5028 bb_vinfo->shared->check_datarefs ();
5029
5030 unsigned i;
5031 slp_instance instance;
5032 FOR_EACH_VEC_ELT (BB_VINFO_SLP_INSTANCES (bb_vinfo), i, instance)
5033 {
5034 if (instance->subgraph_entries.is_empty ())
5035 continue;
5036
5037 vect_location = instance->location ();
5038 if (!unlimited_cost_model (NULL)
5039 && !vect_bb_vectorization_profitable_p
5040 (bb_vinfo, instance->subgraph_entries))
5041 {
5042 if (dump_enabled_p ())
5043 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5044 "not vectorized: vectorization is not "
5045 "profitable.\n");
5046 continue;
5047 }
5048
5049 if (!dbg_cnt (vect_slp))
5050 continue;
5051
5052 if (!vectorized && dump_enabled_p ())
5053 dump_printf_loc (MSG_NOTE, vect_location,
5054 "Basic block will be vectorized "
5055 "using SLP\n");
5056 vectorized = true;
5057
5058 vect_schedule_slp (bb_vinfo, instance->subgraph_entries);
5059
5060 unsigned HOST_WIDE_INT bytes;
5061 if (dump_enabled_p ())
5062 {
5063 if (GET_MODE_SIZE
5064 (bb_vinfo->vector_mode).is_constant (&bytes))
5065 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
5066 "basic block part vectorized using %wu "
5067 "byte vectors\n", bytes);
5068 else
5069 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
5070 "basic block part vectorized using "
5071 "variable length vectors\n");
5072 }
5073 }
5074 }
5075 else
5076 {
5077 if (dump_enabled_p ())
5078 dump_printf_loc (MSG_NOTE, vect_location,
5079 "***** Analysis failed with vector mode %s\n",
5080 GET_MODE_NAME (bb_vinfo->vector_mode));
5081 }
5082
5083 if (mode_i == 0)
5084 autodetected_vector_mode = bb_vinfo->vector_mode;
5085
5086 if (!fatal)
5087 while (mode_i < vector_modes.length ()
5088 && vect_chooses_same_modes_p (bb_vinfo, vector_modes[mode_i]))
5089 {
5090 if (dump_enabled_p ())
5091 dump_printf_loc (MSG_NOTE, vect_location,
5092 "***** The result for vector mode %s would"
5093 " be the same\n",
5094 GET_MODE_NAME (vector_modes[mode_i]));
5095 mode_i += 1;
5096 }
5097
5098 delete bb_vinfo;
5099
5100 if (mode_i < vector_modes.length ()
5101 && VECTOR_MODE_P (autodetected_vector_mode)
5102 && (related_vector_mode (vector_modes[mode_i],
5103 GET_MODE_INNER (autodetected_vector_mode))
5104 == autodetected_vector_mode)
5105 && (related_vector_mode (autodetected_vector_mode,
5106 GET_MODE_INNER (vector_modes[mode_i]))
5107 == vector_modes[mode_i]))
5108 {
5109 if (dump_enabled_p ())
5110 dump_printf_loc (MSG_NOTE, vect_location,
5111 "***** Skipping vector mode %s, which would"
5112 " repeat the analysis for %s\n",
5113 GET_MODE_NAME (vector_modes[mode_i]),
5114 GET_MODE_NAME (autodetected_vector_mode));
5115 mode_i += 1;
5116 }
5117
5118 if (vectorized
5119 || mode_i == vector_modes.length ()
5120 || autodetected_vector_mode == VOIDmode
5121 /* If vect_slp_analyze_bb_1 signaled that analysis for all
5122 vector sizes will fail do not bother iterating. */
5123 || fatal)
5124 return vectorized;
5125
5126 /* Try the next biggest vector size. */
5127 next_vector_mode = vector_modes[mode_i++];
5128 if (dump_enabled_p ())
5129 dump_printf_loc (MSG_NOTE, vect_location,
5130 "***** Re-trying analysis with vector mode %s\n",
5131 GET_MODE_NAME (next_vector_mode));
5132 }
5133 }
5134
5135
5136 /* Main entry for the BB vectorizer. Analyze and transform BBS, returns
5137 true if anything in the basic-block was vectorized. */
5138
5139 static bool
5140 vect_slp_bbs (vec<basic_block> bbs)
5141 {
5142 vec<data_reference_p> datarefs = vNULL;
5143 auto_vec<int> dataref_groups;
5144 int insns = 0;
5145 int current_group = 0;
5146
5147 for (unsigned i = 0; i < bbs.length (); i++)
5148 {
5149 basic_block bb = bbs[i];
5150 for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (gsi);
5151 gsi_next (&gsi))
5152 {
5153 gimple *stmt = gsi_stmt (gsi);
5154 if (is_gimple_debug (stmt))
5155 continue;
5156
5157 insns++;
5158
5159 if (gimple_location (stmt) != UNKNOWN_LOCATION)
5160 vect_location = stmt;
5161
5162 if (!vect_find_stmt_data_reference (NULL, stmt, &datarefs,
5163 &dataref_groups, current_group))
5164 ++current_group;
5165 }
5166 /* New BBs always start a new DR group. */
5167 ++current_group;
5168 }
5169
5170 return vect_slp_region (bbs, datarefs, &dataref_groups, insns);
5171 }
5172
5173 /* Main entry for the BB vectorizer. Analyze and transform BB, returns
5174 true if anything in the basic-block was vectorized. */
5175
5176 bool
5177 vect_slp_bb (basic_block bb)
5178 {
5179 auto_vec<basic_block> bbs;
5180 bbs.safe_push (bb);
5181 return vect_slp_bbs (bbs);
5182 }
5183
5184 /* Main entry for the BB vectorizer. Analyze and transform all basic
5185 blocks in FUN, returns true if anything was vectorized. */
5186
5187 bool
5188 vect_slp_function (function *fun)
5189 {
5190 bool r = false;
5191 int *rpo = XNEWVEC (int, n_basic_blocks_for_fn (fun));
5192 unsigned n = pre_and_rev_post_order_compute_fn (fun, NULL, rpo, false);
5193
5194 /* For the moment split the function into pieces to avoid making
5195 the iteration on the vector mode moot. Split at points we know
5196 to not handle well, which are CFG merges (SLP discovery doesn't
5197 handle non-loop-header PHIs) and loop exits. Since pattern
5198 recog requires reverse iteration to visit uses before defs,
5199 simply chop the RPO into pieces. */
5200 auto_vec<basic_block> bbs;
5201 for (unsigned i = 0; i < n; i++)
5202 {
5203 basic_block bb = BASIC_BLOCK_FOR_FN (fun, rpo[i]);
5204 bool split = false;
5205
5206 /* Split when a BB is not dominated by the first block. */
5207 if (!bbs.is_empty ()
5208 && !dominated_by_p (CDI_DOMINATORS, bb, bbs[0]))
5209 {
5210 if (dump_enabled_p ())
5211 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5212 "splitting region at dominance boundary bb%d\n",
5213 bb->index);
5214 split = true;
5215 }
5216 /* Split when the loop determined by the first block
5217 is exited. This is because we eventually insert
5218 invariants at region begin. */
5219 else if (!bbs.is_empty ()
5220 && bbs[0]->loop_father != bb->loop_father
5221 && !flow_loop_nested_p (bbs[0]->loop_father, bb->loop_father))
5222 {
5223 if (dump_enabled_p ())
5224 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5225 "splitting region at loop %d exit at bb%d\n",
5226 bbs[0]->loop_father->num, bb->index);
5227 split = true;
5228 }
5229
5230 if (split && !bbs.is_empty ())
5231 {
5232 r |= vect_slp_bbs (bbs);
5233 bbs.truncate (0);
5234 bbs.quick_push (bb);
5235 }
5236 else
5237 bbs.safe_push (bb);
5238
5239 /* When we have a stmt ending this block and defining a
5240 value we have to insert on edges when inserting after it for
5241 a vector containing its definition. Avoid this for now. */
5242 if (gimple *last = last_stmt (bb))
5243 if (gimple_get_lhs (last)
5244 && is_ctrl_altering_stmt (last))
5245 {
5246 if (dump_enabled_p ())
5247 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5248 "splitting region at control altering "
5249 "definition %G", last);
5250 r |= vect_slp_bbs (bbs);
5251 bbs.truncate (0);
5252 }
5253 }
5254
5255 if (!bbs.is_empty ())
5256 r |= vect_slp_bbs (bbs);
5257
5258 free (rpo);
5259
5260 return r;
5261 }
5262
5263 /* Build a variable-length vector in which the elements in ELTS are repeated
5264 to fill NRESULTS vectors of type VECTOR_TYPE. Store the vectors in
5265 RESULTS and add any new instructions to SEQ.
5266
5267 The approach we use is:
5268
5269 (1) Find a vector mode VM with integer elements of mode IM.
5270
5271 (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
5272 ELTS' has mode IM. This involves creating NELTS' VIEW_CONVERT_EXPRs
5273 from small vectors to IM.
5274
5275 (3) Duplicate each ELTS'[I] into a vector of mode VM.
5276
5277 (4) Use a tree of interleaving VEC_PERM_EXPRs to create VMs with the
5278 correct byte contents.
5279
5280 (5) Use VIEW_CONVERT_EXPR to cast the final VMs to the required type.
5281
5282 We try to find the largest IM for which this sequence works, in order
5283 to cut down on the number of interleaves. */
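/* As a purely illustrative example (the exact modes chosen depend on
   can_duplicate_and_interleave_p): with ELTS = { a, b, c, d } of 32-bit
   elements and a variable-length VECTOR_TYPE such as VNx4SI, step (1)
   might choose IM = DImode, VM = VNx2DI and NVECTORS = 2.  Step (2)
   view-converts { a, b } and { c, d } to DImode values AB and CD,
   step (3) duplicates them into { AB, AB, ... } and { CD, CD, ... },
   a single interleave in step (4) yields { AB, CD, AB, CD, ... } and
   step (5) view-converts that back to { a, b, c, d, a, b, c, d, ... }.  */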
5284
5285 void
5286 duplicate_and_interleave (vec_info *vinfo, gimple_seq *seq, tree vector_type,
5287 vec<tree> elts, unsigned int nresults,
5288 vec<tree> &results)
5289 {
5290 unsigned int nelts = elts.length ();
5291 tree element_type = TREE_TYPE (vector_type);
5292
5293 /* (1) Find a vector mode VM with integer elements of mode IM. */
5294 unsigned int nvectors = 1;
5295 tree new_vector_type;
5296 tree permutes[2];
5297 if (!can_duplicate_and_interleave_p (vinfo, nelts, element_type,
5298 &nvectors, &new_vector_type,
5299 permutes))
5300 gcc_unreachable ();
5301
5302 /* Get a vector type that holds ELTS[0:NELTS/NELTS']. */
5303 unsigned int partial_nelts = nelts / nvectors;
5304 tree partial_vector_type = build_vector_type (element_type, partial_nelts);
5305
5306 tree_vector_builder partial_elts;
5307 auto_vec<tree, 32> pieces (nvectors * 2);
5308 pieces.quick_grow_cleared (nvectors * 2);
5309 for (unsigned int i = 0; i < nvectors; ++i)
5310 {
5311 /* (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
5312 ELTS' has mode IM. */
5313 partial_elts.new_vector (partial_vector_type, partial_nelts, 1);
5314 for (unsigned int j = 0; j < partial_nelts; ++j)
5315 partial_elts.quick_push (elts[i * partial_nelts + j]);
5316 tree t = gimple_build_vector (seq, &partial_elts);
5317 t = gimple_build (seq, VIEW_CONVERT_EXPR,
5318 TREE_TYPE (new_vector_type), t);
5319
5320 /* (3) Duplicate each ELTS'[I] into a vector of mode VM. */
5321 pieces[i] = gimple_build_vector_from_val (seq, new_vector_type, t);
5322 }
5323
5324 /* (4) Use a tree of VEC_PERM_EXPRs to create a single VM with the
5325 correct byte contents.
5326
5327 Conceptually, we need to repeat the following operation log2(nvectors)
5328 times, where hi_start = nvectors / 2:
5329
5330 out[i * 2] = VEC_PERM_EXPR (in[i], in[i + hi_start], lo_permute);
5331 out[i * 2 + 1] = VEC_PERM_EXPR (in[i], in[i + hi_start], hi_permute);
5332
5333 However, if each input repeats every N elements and the VF is
5334 a multiple of N * 2, the HI result is the same as the LO result.
5335 This will be true for the first N1 iterations of the outer loop,
5336 followed by N2 iterations for which both the LO and HI results
5337 are needed. I.e.:
5338
5339 N1 + N2 = log2(nvectors)
5340
5341 Each "N1 iteration" doubles the number of redundant vectors and the
5342 effect of the process as a whole is to have a sequence of nvectors/2**N1
5343 vectors that repeats 2**N1 times. Rather than generate these redundant
5344 vectors, we halve the number of vectors for each N1 iteration. */
5345 unsigned int in_start = 0;
5346 unsigned int out_start = nvectors;
5347 unsigned int new_nvectors = nvectors;
5348 for (unsigned int in_repeat = 1; in_repeat < nvectors; in_repeat *= 2)
5349 {
5350 unsigned int hi_start = new_nvectors / 2;
5351 unsigned int out_i = 0;
5352 for (unsigned int in_i = 0; in_i < new_nvectors; ++in_i)
5353 {
5354 if ((in_i & 1) != 0
5355 && multiple_p (TYPE_VECTOR_SUBPARTS (new_vector_type),
5356 2 * in_repeat))
5357 continue;
5358
5359 tree output = make_ssa_name (new_vector_type);
5360 tree input1 = pieces[in_start + (in_i / 2)];
5361 tree input2 = pieces[in_start + (in_i / 2) + hi_start];
5362 gassign *stmt = gimple_build_assign (output, VEC_PERM_EXPR,
5363 input1, input2,
5364 permutes[in_i & 1]);
5365 gimple_seq_add_stmt (seq, stmt);
5366 pieces[out_start + out_i] = output;
5367 out_i += 1;
5368 }
5369 std::swap (in_start, out_start);
5370 new_nvectors = out_i;
5371 }
5372
5373 /* (5) Use VIEW_CONVERT_EXPR to cast the final VM to the required type. */
5374 results.reserve (nresults);
5375 for (unsigned int i = 0; i < nresults; ++i)
5376 if (i < new_nvectors)
5377 results.quick_push (gimple_build (seq, VIEW_CONVERT_EXPR, vector_type,
5378 pieces[in_start + i]));
5379 else
5380 results.quick_push (results[i - new_nvectors]);
5381 }
5382
5383
5384 /* For constant and loop invariant defs in OP_NODE this function creates
5385 vector defs that will be used in the vectorized stmts and stores them
5386 to SLP_TREE_VEC_DEFS of OP_NODE. */
5387
5388 static void
5389 vect_create_constant_vectors (vec_info *vinfo, slp_tree op_node)
5390 {
5391 unsigned HOST_WIDE_INT nunits;
5392 tree vec_cst;
5393 unsigned j, number_of_places_left_in_vector;
5394 tree vector_type;
5395 tree vop;
5396 int group_size = op_node->ops.length ();
5397 unsigned int vec_num, i;
5398 unsigned number_of_copies = 1;
5399 bool constant_p;
5400 gimple_seq ctor_seq = NULL;
5401 auto_vec<tree, 16> permute_results;
5402
5403 /* We always want SLP_TREE_VECTYPE (op_node) here correctly set. */
5404 vector_type = SLP_TREE_VECTYPE (op_node);
5405
5406 unsigned int number_of_vectors = SLP_TREE_NUMBER_OF_VEC_STMTS (op_node);
5407 SLP_TREE_VEC_DEFS (op_node).create (number_of_vectors);
5408 auto_vec<tree> voprnds (number_of_vectors);
5409
5410 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
5411 created vectors. It is greater than 1 if unrolling is performed.
5412
5413 For example, we have two scalar operands, s1 and s2 (e.g., group of
5414 strided accesses of size two), while NUNITS is four (i.e., four scalars
5415 of this type can be packed in a vector). The output vector will contain
5416 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
5417 will be 2).
5418
5419 If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
5420 containing the operands.
5421
5422 For example, NUNITS is four as before, and the group size is 8
5423 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
5424 {s5, s6, s7, s8}. */
5425
5426 /* When using duplicate_and_interleave, we just need one element for
5427 each scalar statement. */
5428 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
5429 nunits = group_size;
5430
5431 number_of_copies = nunits * number_of_vectors / group_size;
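/* E.g. in the first example above NUNITS is 4, NUMBER_OF_VECTORS is 1
   and GROUP_SIZE is 2, giving NUMBER_OF_COPIES = 2; in the second
   NUNITS is 4, NUMBER_OF_VECTORS is 2 and GROUP_SIZE is 8, giving
   NUMBER_OF_COPIES = 1.  */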
5432
5433 number_of_places_left_in_vector = nunits;
5434 constant_p = true;
5435 tree_vector_builder elts (vector_type, nunits, 1);
5436 elts.quick_grow (nunits);
5437 stmt_vec_info insert_after = NULL;
5438 for (j = 0; j < number_of_copies; j++)
5439 {
5440 tree op;
5441 for (i = group_size - 1; op_node->ops.iterate (i, &op); i--)
5442 {
5443 /* Create 'vect_ = {op0,op1,...,opn}'. */
5444 number_of_places_left_in_vector--;
5445 tree orig_op = op;
5446 if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op)))
5447 {
5448 if (CONSTANT_CLASS_P (op))
5449 {
5450 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
5451 {
5452 /* Can't use VIEW_CONVERT_EXPR for booleans because
5453 of possibly different sizes of scalar value and
5454 vector element. */
5455 if (integer_zerop (op))
5456 op = build_int_cst (TREE_TYPE (vector_type), 0);
5457 else if (integer_onep (op))
5458 op = build_all_ones_cst (TREE_TYPE (vector_type));
5459 else
5460 gcc_unreachable ();
5461 }
5462 else
5463 op = fold_unary (VIEW_CONVERT_EXPR,
5464 TREE_TYPE (vector_type), op);
5465 gcc_assert (op && CONSTANT_CLASS_P (op));
5466 }
5467 else
5468 {
5469 tree new_temp = make_ssa_name (TREE_TYPE (vector_type));
5470 gimple *init_stmt;
5471 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
5472 {
5473 tree true_val
5474 = build_all_ones_cst (TREE_TYPE (vector_type));
5475 tree false_val
5476 = build_zero_cst (TREE_TYPE (vector_type));
5477 gcc_assert (INTEGRAL_TYPE_P (TREE_TYPE (op)));
5478 init_stmt = gimple_build_assign (new_temp, COND_EXPR,
5479 op, true_val,
5480 false_val);
5481 }
5482 else
5483 {
5484 op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type),
5485 op);
5486 init_stmt
5487 = gimple_build_assign (new_temp, VIEW_CONVERT_EXPR,
5488 op);
5489 }
5490 gimple_seq_add_stmt (&ctor_seq, init_stmt);
5491 op = new_temp;
5492 }
5493 }
5494 elts[number_of_places_left_in_vector] = op;
5495 if (!CONSTANT_CLASS_P (op))
5496 constant_p = false;
5497 /* For BB vectorization we have to compute an insert location
5498 when a def is inside the analyzed region since we cannot
5499 simply insert at the BB start in this case. */
5500 stmt_vec_info opdef;
5501 if (TREE_CODE (orig_op) == SSA_NAME
5502 && !SSA_NAME_IS_DEFAULT_DEF (orig_op)
5503 && is_a <bb_vec_info> (vinfo)
5504 && (opdef = vinfo->lookup_def (orig_op)))
5505 {
5506 if (!insert_after)
5507 insert_after = opdef;
5508 else
5509 insert_after = get_later_stmt (insert_after, opdef);
5510 }
5511
5512 if (number_of_places_left_in_vector == 0)
5513 {
5514 if (constant_p
5515 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
5516 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
5517 vec_cst = gimple_build_vector (&ctor_seq, &elts);
5518 else
5519 {
5520 if (permute_results.is_empty ())
5521 duplicate_and_interleave (vinfo, &ctor_seq, vector_type,
5522 elts, number_of_vectors,
5523 permute_results);
5524 vec_cst = permute_results[number_of_vectors - j - 1];
5525 }
5526 if (!gimple_seq_empty_p (ctor_seq))
5527 {
5528 if (insert_after)
5529 {
5530 gimple_stmt_iterator gsi;
5531 if (gimple_code (insert_after->stmt) == GIMPLE_PHI)
5532 {
5533 gsi = gsi_after_labels (gimple_bb (insert_after->stmt));
5534 gsi_insert_seq_before (&gsi, ctor_seq,
5535 GSI_CONTINUE_LINKING);
5536 }
5537 else if (!stmt_ends_bb_p (insert_after->stmt))
5538 {
5539 gsi = gsi_for_stmt (insert_after->stmt);
5540 gsi_insert_seq_after (&gsi, ctor_seq,
5541 GSI_CONTINUE_LINKING);
5542 }
5543 else
5544 {
5545 /* When we want to insert after a def whose
5546 defining stmt throws, insert on the fallthru
5547 edge instead. */
5548 edge e = find_fallthru_edge
5549 (gimple_bb (insert_after->stmt)->succs);
5550 basic_block new_bb
5551 = gsi_insert_seq_on_edge_immediate (e, ctor_seq);
5552 gcc_assert (!new_bb);
5553 }
5554 }
5555 else
5556 vinfo->insert_seq_on_entry (NULL, ctor_seq);
5557 ctor_seq = NULL;
5558 }
5559 voprnds.quick_push (vec_cst);
5560 insert_after = NULL;
5561 number_of_places_left_in_vector = nunits;
5562 constant_p = true;
5563 elts.new_vector (vector_type, nunits, 1);
5564 elts.quick_grow (nunits);
5565 }
5566 }
5567 }
5568
5569 /* Since the vectors are created in the reverse order, we should invert
5570 them. */
5571 vec_num = voprnds.length ();
5572 for (j = vec_num; j != 0; j--)
5573 {
5574 vop = voprnds[j - 1];
5575 SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
5576 }
5577
5578 /* In case that VF is greater than the unrolling factor needed for the SLP
5579 group of stmts, NUMBER_OF_VECTORS to be created is greater than
5580 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
5581 to replicate the vectors. */
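/* For instance, if NUMBER_OF_VECTORS is 4 but only the two defs
   { v0, v1 } were created above, the result is { v0, v1, v0, v1 }.  */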
5582 while (number_of_vectors > SLP_TREE_VEC_DEFS (op_node).length ())
5583 for (i = 0; SLP_TREE_VEC_DEFS (op_node).iterate (i, &vop) && i < vec_num;
5584 i++)
5585 SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
5586 }
5587
5588 /* Get the Ith vectorized definition from SLP_NODE. */
5589
5590 tree
5591 vect_get_slp_vect_def (slp_tree slp_node, unsigned i)
5592 {
5593 if (SLP_TREE_VEC_STMTS (slp_node).exists ())
5594 return gimple_get_lhs (SLP_TREE_VEC_STMTS (slp_node)[i]);
5595 else
5596 return SLP_TREE_VEC_DEFS (slp_node)[i];
5597 }
5598
5599 /* Get the vectorized definitions of SLP_NODE in *VEC_DEFS. */
5600
5601 void
5602 vect_get_slp_defs (slp_tree slp_node, vec<tree> *vec_defs)
5603 {
5604 vec_defs->create (SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node));
5605 if (SLP_TREE_DEF_TYPE (slp_node) == vect_internal_def)
5606 {
5607 unsigned j;
5608 gimple *vec_def_stmt;
5609 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (slp_node), j, vec_def_stmt)
5610 vec_defs->quick_push (gimple_get_lhs (vec_def_stmt));
5611 }
5612 else
5613 vec_defs->splice (SLP_TREE_VEC_DEFS (slp_node));
5614 }
5615
5616 /* Get N vectorized definitions for SLP_NODE. */
5617
5618 void
5619 vect_get_slp_defs (vec_info *,
5620 slp_tree slp_node, vec<vec<tree> > *vec_oprnds, unsigned n)
5621 {
5622 if (n == -1U)
5623 n = SLP_TREE_CHILDREN (slp_node).length ();
5624
5625 for (unsigned i = 0; i < n; ++i)
5626 {
5627 slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
5628 vec<tree> vec_defs = vNULL;
5629 vect_get_slp_defs (child, &vec_defs);
5630 vec_oprnds->quick_push (vec_defs);
5631 }
5632 }
5633
5634 /* Generate vector permute statements from a list of loads in DR_CHAIN.
5635 If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
5636 permute statements for the SLP node NODE. Store the number of vector
5637 permute instructions in *N_PERMS and the number of vector load
5638 instructions in *N_LOADS. */
5639
5640 bool
5641 vect_transform_slp_perm_load (vec_info *vinfo,
5642 slp_tree node, vec<tree> dr_chain,
5643 gimple_stmt_iterator *gsi, poly_uint64 vf,
5644 bool analyze_only, unsigned *n_perms,
5645 unsigned int *n_loads)
5646 {
5647 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
5648 int vec_index = 0;
5649 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5650 unsigned int group_size = SLP_TREE_SCALAR_STMTS (node).length ();
5651 unsigned int mask_element;
5652 machine_mode mode;
5653
5654 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
5655 return false;
5656
5657 stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
5658
5659 mode = TYPE_MODE (vectype);
5660 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
5661
5662 /* Initialize the vect stmts of NODE to properly insert the generated
5663 stmts later. */
5664 if (! analyze_only)
5665 for (unsigned i = SLP_TREE_VEC_STMTS (node).length ();
5666 i < SLP_TREE_NUMBER_OF_VEC_STMTS (node); i++)
5667 SLP_TREE_VEC_STMTS (node).quick_push (NULL);
5668
5669 /* Generate permutation masks for every NODE. Number of masks for each NODE
5670 is equal to GROUP_SIZE.
5671 E.g., we have a group of three nodes with three loads from the same
5672 location in each node, and the vector size is 4. I.e., we have an
5673 a0b0c0a1b1c1... sequence and we need to create the following vectors:
5674 for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
5675 for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
5676 ...
5677
5678 The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
5679 The last mask is illegal since we assume two operands for permute
5680 operation, and the mask element values can't be outside that range.
5681 Hence, the last mask must be converted into {2,5,5,5}.
5682 For the first two permutations we need the first and the second input
5683 vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
5684 we need the second and the third vectors: {b1,c1,a2,b2} and
5685 {c2,a3,b3,c3}. */
5686
5687 int vect_stmts_counter = 0;
5688 unsigned int index = 0;
5689 int first_vec_index = -1;
5690 int second_vec_index = -1;
5691 bool noop_p = true;
5692 *n_perms = 0;
5693
5694 vec_perm_builder mask;
5695 unsigned int nelts_to_build;
5696 unsigned int nvectors_per_build;
5697 unsigned int in_nlanes;
5698 bool repeating_p = (group_size == DR_GROUP_SIZE (stmt_info)
5699 && multiple_p (nunits, group_size));
5700 if (repeating_p)
5701 {
5702 /* A single vector contains a whole number of copies of the node, so:
5703 (a) all permutes can use the same mask; and
5704 (b) the permutes only need a single vector input. */
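/* For example (illustrative), a group of size 2 with load permutation
   { 1, 0 } and 8 lanes per vector gets the single mask
   { 1, 0, 3, 2, 5, 4, 7, 6 } applied to every loaded vector.  */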
5705 mask.new_vector (nunits, group_size, 3);
5706 nelts_to_build = mask.encoded_nelts ();
5707 nvectors_per_build = SLP_TREE_VEC_STMTS (node).length ();
5708 in_nlanes = DR_GROUP_SIZE (stmt_info) * 3;
5709 }
5710 else
5711 {
5712 /* We need to construct a separate mask for each vector statement. */
5713 unsigned HOST_WIDE_INT const_nunits, const_vf;
5714 if (!nunits.is_constant (&const_nunits)
5715 || !vf.is_constant (&const_vf))
5716 return false;
5717 mask.new_vector (const_nunits, const_nunits, 1);
5718 nelts_to_build = const_vf * group_size;
5719 nvectors_per_build = 1;
5720 in_nlanes = const_vf * DR_GROUP_SIZE (stmt_info);
5721 }
5722 auto_sbitmap used_in_lanes (in_nlanes);
5723 bitmap_clear (used_in_lanes);
5724
5725 unsigned int count = mask.encoded_nelts ();
5726 mask.quick_grow (count);
5727 vec_perm_indices indices;
5728
5729 for (unsigned int j = 0; j < nelts_to_build; j++)
5730 {
5731 unsigned int iter_num = j / group_size;
5732 unsigned int stmt_num = j % group_size;
5733 unsigned int i = (iter_num * DR_GROUP_SIZE (stmt_info)
5734 + SLP_TREE_LOAD_PERMUTATION (node)[stmt_num]);
5735 bitmap_set_bit (used_in_lanes, i);
5736 if (repeating_p)
5737 {
5738 first_vec_index = 0;
5739 mask_element = i;
5740 }
5741 else
5742 {
5743 /* Enforced before the loop when !repeating_p. */
5744 unsigned int const_nunits = nunits.to_constant ();
5745 vec_index = i / const_nunits;
5746 mask_element = i % const_nunits;
5747 if (vec_index == first_vec_index
5748 || first_vec_index == -1)
5749 {
5750 first_vec_index = vec_index;
5751 }
5752 else if (vec_index == second_vec_index
5753 || second_vec_index == -1)
5754 {
5755 second_vec_index = vec_index;
5756 mask_element += const_nunits;
5757 }
5758 else
5759 {
5760 if (dump_enabled_p ())
5761 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5762 "permutation requires at "
5763 "least three vectors %G",
5764 stmt_info->stmt);
5765 gcc_assert (analyze_only);
5766 return false;
5767 }
5768
5769 gcc_assert (mask_element < 2 * const_nunits);
5770 }
5771
5772 if (mask_element != index)
5773 noop_p = false;
5774 mask[index++] = mask_element;
5775
5776 if (index == count && !noop_p)
5777 {
5778 indices.new_vector (mask, second_vec_index == -1 ? 1 : 2, nunits);
5779 if (!can_vec_perm_const_p (mode, indices))
5780 {
5781 if (dump_enabled_p ())
5782 {
5783 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
5784 vect_location,
5785 "unsupported vect permute { ");
5786 for (i = 0; i < count; ++i)
5787 {
5788 dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
5789 dump_printf (MSG_MISSED_OPTIMIZATION, " ");
5790 }
5791 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
5792 }
5793 gcc_assert (analyze_only);
5794 return false;
5795 }
5796
5797 ++*n_perms;
5798 }
5799
5800 if (index == count)
5801 {
5802 if (!analyze_only)
5803 {
5804 tree mask_vec = NULL_TREE;
5805
5806 if (! noop_p)
5807 mask_vec = vect_gen_perm_mask_checked (vectype, indices);
5808
5809 if (second_vec_index == -1)
5810 second_vec_index = first_vec_index;
5811
5812 for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
5813 {
5814 /* Generate the permute statement if necessary. */
5815 tree first_vec = dr_chain[first_vec_index + ri];
5816 tree second_vec = dr_chain[second_vec_index + ri];
5817 gimple *perm_stmt;
5818 if (! noop_p)
5819 {
5820 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
5821 tree perm_dest
5822 = vect_create_destination_var (gimple_assign_lhs (stmt),
5823 vectype);
5824 perm_dest = make_ssa_name (perm_dest);
5825 perm_stmt
5826 = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
5827 first_vec, second_vec,
5828 mask_vec);
5829 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt,
5830 gsi);
5831 }
5832 else
5833 /* If mask was NULL_TREE generate the requested
5834 identity transform. */
5835 perm_stmt = SSA_NAME_DEF_STMT (first_vec);
5836
5837 /* Store the vector statement in NODE. */
5838 SLP_TREE_VEC_STMTS (node)[vect_stmts_counter++] = perm_stmt;
5839 }
5840 }
5841
5842 index = 0;
5843 first_vec_index = -1;
5844 second_vec_index = -1;
5845 noop_p = true;
5846 }
5847 }
5848
5849 if (n_loads)
5850 {
5851 if (repeating_p)
5852 *n_loads = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
5853 else
5854 {
5855 /* Enforced above when !repeating_p. */
5856 unsigned int const_nunits = nunits.to_constant ();
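/* Count how many of the loaded vectors are actually used: each
   block of const_nunits consecutive bits in USED_IN_LANES
   corresponds to one vector load, and a load is needed iff at
   least one bit in its block is set.  */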
5857 *n_loads = 0;
5858 bool load_seen = false;
5859 for (unsigned i = 0; i < in_nlanes; ++i)
5860 {
5861 if (i % const_nunits == 0)
5862 {
5863 if (load_seen)
5864 *n_loads += 1;
5865 load_seen = false;
5866 }
5867 if (bitmap_bit_p (used_in_lanes, i))
5868 load_seen = true;
5869 }
5870 if (load_seen)
5871 *n_loads += 1;
5872 }
5873 }
5874
5875 return true;
5876 }
5877
5878 /* Produce the next vector result for SLP permutation NODE by adding a vector
5879 statement at GSI. If MASK_VEC is nonnull, add:
5880
5881 <new SSA name> = VEC_PERM_EXPR <FIRST_DEF, SECOND_DEF, MASK_VEC>
5882
5883 otherwise add:
5884
5885 <new SSA name> = FIRST_DEF. */
5886
5887 static void
5888 vect_add_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
5889 slp_tree node, tree first_def, tree second_def,
5890 tree mask_vec)
5891 {
5892 tree vectype = SLP_TREE_VECTYPE (node);
5893
5894 /* ??? We SLP match existing vector element extracts but
5895 allow punning which we need to re-instantiate at uses
5896 but have no good way of explicitly representing. */
5897 if (!types_compatible_p (TREE_TYPE (first_def), vectype))
5898 {
5899 gassign *conv_stmt
5900 = gimple_build_assign (make_ssa_name (vectype),
5901 build1 (VIEW_CONVERT_EXPR, vectype, first_def));
5902 vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi);
5903 first_def = gimple_assign_lhs (conv_stmt);
5904 }
5905 gassign *perm_stmt;
5906 tree perm_dest = make_ssa_name (vectype);
5907 if (mask_vec)
5908 {
5909 if (!types_compatible_p (TREE_TYPE (second_def), vectype))
5910 {
5911 gassign *conv_stmt
5912 = gimple_build_assign (make_ssa_name (vectype),
5913 build1 (VIEW_CONVERT_EXPR,
5914 vectype, second_def));
5915 vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi);
5916 second_def = gimple_assign_lhs (conv_stmt);
5917 }
5918 perm_stmt = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
5919 first_def, second_def,
5920 mask_vec);
5921 }
5922 else
5923 /* We need a copy here in case the def was external. */
5924 perm_stmt = gimple_build_assign (perm_dest, first_def);
5925 vect_finish_stmt_generation (vinfo, NULL, perm_stmt, gsi);
5926 /* Store the vector statement in NODE. */
5927 SLP_TREE_VEC_STMTS (node).quick_push (perm_stmt);
5928 }
5929
5930 /* Vectorize the SLP permutations in NODE as specified
5931 in SLP_TREE_LANE_PERMUTATION which is a vector of pairs of SLP
5932 child number and lane number.
5933 Interleaving of two two-lane two-child SLP subtrees (not supported):
5934 [ { 0, 0 }, { 1, 0 }, { 0, 1 }, { 1, 1 } ]
5935 A blend of two four-lane two-child SLP subtrees:
5936 [ { 0, 0 }, { 1, 1 }, { 0, 2 }, { 1, 3 } ]
5937 Highpart of a four-lane one-child SLP subtree (not supported):
5938 [ { 0, 2 }, { 0, 3 } ]
5939 Currently only a subset of these is supported by the code generation below. */
5940
5941 static bool
5942 vectorizable_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
5943 slp_tree node, stmt_vector_for_cost *cost_vec)
5944 {
5945 tree vectype = SLP_TREE_VECTYPE (node);
5946
5947 /* ??? We currently only support all same vector input and output types
5948 while the SLP IL should really do a concat + select and thus accept
5949 arbitrary mismatches. */
5950 slp_tree child;
5951 unsigned i;
5952 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
5953 bool repeating_p = multiple_p (nunits, SLP_TREE_LANES (node));
5954 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
5955 {
5956 if (!vect_maybe_update_slp_op_vectype (child, vectype)
5957 || !types_compatible_p (SLP_TREE_VECTYPE (child), vectype))
5958 {
5959 if (dump_enabled_p ())
5960 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5961 "Unsupported lane permutation\n");
5962 return false;
5963 }
5964 if (SLP_TREE_LANES (child) != SLP_TREE_LANES (node))
5965 repeating_p = false;
5966 }
5967
5968 vec<std::pair<unsigned, unsigned> > &perm = SLP_TREE_LANE_PERMUTATION (node);
5969 gcc_assert (perm.length () == SLP_TREE_LANES (node));
5970 if (dump_enabled_p ())
5971 {
5972 dump_printf_loc (MSG_NOTE, vect_location,
5973 "vectorizing permutation");
5974 for (unsigned i = 0; i < perm.length (); ++i)
5975 dump_printf (MSG_NOTE, " op%u[%u]", perm[i].first, perm[i].second);
5976 if (repeating_p)
5977 dump_printf (MSG_NOTE, " (repeat %d)\n", SLP_TREE_LANES (node));
5978 dump_printf (MSG_NOTE, "\n");
5979 }
5980
5981 /* REPEATING_P is true if every output vector is guaranteed to use the
5982 same permute vector. We can handle that case for both variable-length
5983 and constant-length vectors, but we only handle other cases for
5984 constant-length vectors.
5985
5986 Set:
5987
5988 - NPATTERNS and NELTS_PER_PATTERN to the encoding of the permute
5989 mask vector that we want to build.
5990
5991 - NCOPIES to the number of copies of PERM that we need in order
5992 to build the necessary permute mask vectors.
5993
5994 - NOUTPUTS_PER_MASK to the number of output vectors we want to create
5995 for each permute mask vector. This is only relevant when GSI is
5996 nonnull. */
5997 uint64_t npatterns;
5998 unsigned nelts_per_pattern;
5999 uint64_t ncopies;
6000 unsigned noutputs_per_mask;
6001 if (repeating_p)
6002 {
6003 /* We need a single permute mask vector that has the form:
6004
6005 { X1, ..., Xn, X1 + n, ..., Xn + n, X1 + 2n, ..., Xn + 2n, ... }
6006
6007 In other words, the original n-element permute in PERM is
6008 "unrolled" to fill a full vector. The stepped vector encoding
6009 that we use for permutes requires 3n elements. */
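/* For example (illustrative), a two-lane permute
   PERM = { op0[1], op0[0] } gives NPATTERNS = 2 and the encoded
   mask { 1, 0, 3, 2, 5, 4 }, which unrolls to swapping each
   adjacent pair of lanes for the whole vector length.  */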
6010 npatterns = SLP_TREE_LANES (node);
6011 nelts_per_pattern = ncopies = 3;
6012 noutputs_per_mask = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
6013 }
6014 else
6015 {
6016 /* Calculate every element of every permute mask vector explicitly,
6017 instead of relying on the pattern described above. */
6018 if (!nunits.is_constant (&npatterns))
6019 return false;
6020 nelts_per_pattern = ncopies = 1;
6021 if (loop_vec_info linfo = dyn_cast <loop_vec_info> (vinfo))
6022 if (!LOOP_VINFO_VECT_FACTOR (linfo).is_constant (&ncopies))
6023 return false;
6024 noutputs_per_mask = 1;
6025 }
6026 unsigned olanes = ncopies * SLP_TREE_LANES (node);
6027 gcc_assert (repeating_p || multiple_p (olanes, nunits));
6028
6029 /* Compute the { { SLP operand, vector index}, lane } permutation sequence
6030 from the { SLP operand, scalar lane } permutation as recorded in the
6031 SLP node as an intermediate step. This part should already work
6032 with SLP children with arbitrary number of lanes. */
6033 auto_vec<std::pair<std::pair<unsigned, unsigned>, unsigned> > vperm;
6034 auto_vec<unsigned> active_lane;
6035 vperm.create (olanes);
6036 active_lane.safe_grow_cleared (SLP_TREE_CHILDREN (node).length (), true);
6037 for (unsigned i = 0; i < ncopies; ++i)
6038 {
6039 for (unsigned pi = 0; pi < perm.length (); ++pi)
6040 {
6041 std::pair<unsigned, unsigned> p = perm[pi];
6042 tree vtype = SLP_TREE_VECTYPE (SLP_TREE_CHILDREN (node)[p.first]);
6043 if (repeating_p)
6044 vperm.quick_push ({{p.first, 0}, p.second + active_lane[p.first]});
6045 else
6046 {
6047 /* We checked above that the vectors are constant-length. */
6048 unsigned vnunits = TYPE_VECTOR_SUBPARTS (vtype).to_constant ();
6049 unsigned vi = (active_lane[p.first] + p.second) / vnunits;
6050 unsigned vl = (active_lane[p.first] + p.second) % vnunits;
6051 vperm.quick_push ({{p.first, vi}, vl});
6052 }
6053 }
6054 /* Advance to the next group. */
6055 for (unsigned j = 0; j < SLP_TREE_CHILDREN (node).length (); ++j)
6056 active_lane[j] += SLP_TREE_LANES (SLP_TREE_CHILDREN (node)[j]);
6057 }
6058
6059 if (dump_enabled_p ())
6060 {
6061 dump_printf_loc (MSG_NOTE, vect_location, "as");
6062 for (unsigned i = 0; i < vperm.length (); ++i)
6063 {
6064 if (i != 0
6065 && (repeating_p
6066 ? multiple_p (i, npatterns)
6067 : multiple_p (i, TYPE_VECTOR_SUBPARTS (vectype))))
6068 dump_printf (MSG_NOTE, ",");
6069 dump_printf (MSG_NOTE, " vops%u[%u][%u]",
6070 vperm[i].first.first, vperm[i].first.second,
6071 vperm[i].second);
6072 }
6073 dump_printf (MSG_NOTE, "\n");
6074 }
6075
6076 /* We can only handle two-vector permutes, everything else should
6077 be lowered on the SLP level. The following is closely inspired
6078 by vect_transform_slp_perm_load and is supposed to eventually
6079 replace it.
6080 ??? As intermediate step do code-gen in the SLP tree representation
6081 somehow? */
6082 std::pair<unsigned, unsigned> first_vec = std::make_pair (-1U, -1U);
6083 std::pair<unsigned, unsigned> second_vec = std::make_pair (-1U, -1U);
6084 unsigned int index = 0;
6085 poly_uint64 mask_element;
6086 vec_perm_builder mask;
6087 mask.new_vector (nunits, npatterns, nelts_per_pattern);
6088 unsigned int count = mask.encoded_nelts ();
6089 mask.quick_grow (count);
6090 vec_perm_indices indices;
6091 unsigned nperms = 0;
6092 for (unsigned i = 0; i < vperm.length (); ++i)
6093 {
6094 mask_element = vperm[i].second;
6095 if (first_vec.first == -1U
6096 || first_vec == vperm[i].first)
6097 first_vec = vperm[i].first;
6098 else if (second_vec.first == -1U
6099 || second_vec == vperm[i].first)
6100 {
6101 second_vec = vperm[i].first;
6102 mask_element += nunits;
6103 }
6104 else
6105 {
6106 if (dump_enabled_p ())
6107 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6108 "permutation requires at "
6109 "least three vectors\n");
6110 gcc_assert (!gsi);
6111 return false;
6112 }
6113
6114 mask[index++] = mask_element;
6115
6116 if (index == count)
6117 {
6118 indices.new_vector (mask, second_vec.first == -1U ? 1 : 2, nunits);
6119 bool identity_p = indices.series_p (0, 1, 0, 1);
6120 if (!identity_p
6121 && !can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6122 {
6123 if (dump_enabled_p ())
6124 {
6125 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
6126 vect_location,
6127 "unsupported vect permute { ");
6128 for (i = 0; i < count; ++i)
6129 {
6130 dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
6131 dump_printf (MSG_MISSED_OPTIMIZATION, " ");
6132 }
6133 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
6134 }
6135 gcc_assert (!gsi);
6136 return false;
6137 }
6138
6139 if (!identity_p)
6140 nperms++;
6141 if (gsi)
6142 {
6143 if (second_vec.first == -1U)
6144 second_vec = first_vec;
6145
6146 slp_tree
6147 first_node = SLP_TREE_CHILDREN (node)[first_vec.first],
6148 second_node = SLP_TREE_CHILDREN (node)[second_vec.first];
6149
6150 tree mask_vec = NULL_TREE;
6151 if (!identity_p)
6152 mask_vec = vect_gen_perm_mask_checked (vectype, indices);
6153
6154 for (unsigned int vi = 0; vi < noutputs_per_mask; ++vi)
6155 {
6156 tree first_def
6157 = vect_get_slp_vect_def (first_node,
6158 first_vec.second + vi);
6159 tree second_def
6160 = vect_get_slp_vect_def (second_node,
6161 second_vec.second + vi);
6162 vect_add_slp_permutation (vinfo, gsi, node, first_def,
6163 second_def, mask_vec);
6164 }
6165 }
6166
6167 index = 0;
6168 first_vec = std::make_pair (-1U, -1U);
6169 second_vec = std::make_pair (-1U, -1U);
6170 }
6171 }
6172
6173 if (!gsi)
6174 record_stmt_cost (cost_vec, nperms, vec_perm, NULL, vectype, 0, vect_body);
6175
6176 return true;
6177 }
6178
6179 /* Vectorize SLP NODE. */
6180
6181 static void
6182 vect_schedule_slp_node (vec_info *vinfo,
6183 slp_tree node, slp_instance instance)
6184 {
6185 gimple_stmt_iterator si;
6186 int i;
6187 slp_tree child;
6188
6189 /* For existing vectors there's nothing to do. */
6190 if (SLP_TREE_VEC_DEFS (node).exists ())
6191 return;
6192
6193 gcc_assert (SLP_TREE_VEC_STMTS (node).is_empty ());
6194
6195 /* Vectorize externals and constants. */
6196 if (SLP_TREE_DEF_TYPE (node) == vect_constant_def
6197 || SLP_TREE_DEF_TYPE (node) == vect_external_def)
6198 {
6199 /* ??? vectorizable_shift can end up using a scalar operand which is
6200 currently denoted as !SLP_TREE_VECTYPE. No need to vectorize the
6201 node in this case. */
6202 if (!SLP_TREE_VECTYPE (node))
6203 return;
6204
6205 vect_create_constant_vectors (vinfo, node);
6206 return;
6207 }
6208
6209 stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
6210
6211 gcc_assert (SLP_TREE_NUMBER_OF_VEC_STMTS (node) != 0);
6212 SLP_TREE_VEC_STMTS (node).create (SLP_TREE_NUMBER_OF_VEC_STMTS (node));
6213
6214 if (dump_enabled_p ())
6215 dump_printf_loc (MSG_NOTE, vect_location,
6216 "------>vectorizing SLP node starting from: %G",
6217 stmt_info->stmt);
6218
6219 if (STMT_VINFO_DATA_REF (stmt_info)
6220 && SLP_TREE_CODE (node) != VEC_PERM_EXPR)
6221 {
6222 /* Vectorized loads go before the first scalar load to make it
6223 ready early; vectorized stores go before the last scalar
6224 stmt, which is where all uses are ready.
6225 stmt_vec_info last_stmt_info = NULL;
6226 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
6227 last_stmt_info = vect_find_first_scalar_stmt_in_slp (node);
6228 else /* DR_IS_WRITE */
6229 last_stmt_info = vect_find_last_scalar_stmt_in_slp (node);
6230 si = gsi_for_stmt (last_stmt_info->stmt);
6231 }
6232 else if ((STMT_VINFO_TYPE (stmt_info) == cycle_phi_info_type
6233 || STMT_VINFO_TYPE (stmt_info) == induc_vec_info_type
6234 || STMT_VINFO_TYPE (stmt_info) == phi_info_type)
6235 && SLP_TREE_CODE (node) != VEC_PERM_EXPR)
6236 {
6237 /* For PHI node vectorization we do not use the insertion iterator. */
6238 si = gsi_none ();
6239 }
6240 else
6241 {
6242 /* Emit other stmts after the children's vectorized defs, which is
6243 the earliest possible point. */
6244 gimple *last_stmt = NULL;
6245 bool seen_vector_def = false;
6246 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6247 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
6248 {
6249 /* For fold-left reductions we are retaining the scalar
6250 reduction PHI but we still have SLP_TREE_NUM_VEC_STMTS
6251 set so the representation isn't perfect. Resort to the
6252 last scalar def here. */
6253 if (SLP_TREE_VEC_STMTS (child).is_empty ())
6254 {
6255 gcc_assert (STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (child))
6256 == cycle_phi_info_type);
6257 gphi *phi = as_a <gphi *>
6258 (vect_find_last_scalar_stmt_in_slp (child)->stmt);
6259 if (!last_stmt
6260 || vect_stmt_dominates_stmt_p (last_stmt, phi))
6261 last_stmt = phi;
6262 }
6263 /* We are emitting all vectorized stmts in the same place and
6264 the last one is the last.
6265 ??? Unless we have a load permutation applied and that
6266 figures to re-use an earlier generated load. */
6267 unsigned j;
6268 gimple *vstmt;
6269 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (child), j, vstmt)
6270 if (!last_stmt
6271 || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
6272 last_stmt = vstmt;
6273 }
6274 else if (!SLP_TREE_VECTYPE (child))
6275 {
6276 /* For externals without a vector type the scalar defs are used as-is, so look at all of them. */
6277 unsigned j;
6278 tree def;
6279 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (child), j, def)
6280 if (TREE_CODE (def) == SSA_NAME
6281 && !SSA_NAME_IS_DEFAULT_DEF (def))
6282 {
6283 gimple *stmt = SSA_NAME_DEF_STMT (def);
6284 if (!last_stmt
6285 || vect_stmt_dominates_stmt_p (last_stmt, stmt))
6286 last_stmt = stmt;
6287 }
6288 }
6289 else
6290 {
6291 /* For externals we have to look at all defs since their
6292 insertion place is decided per vector. But beware
6293 of pre-existing vectors where we need to make sure
6294 we do not insert before the region boundary. */
6295 if (SLP_TREE_SCALAR_OPS (child).is_empty ()
6296 && !vinfo->lookup_def (SLP_TREE_VEC_DEFS (child)[0]))
6297 seen_vector_def = true;
6298 else
6299 {
6300 unsigned j;
6301 tree vdef;
6302 FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (child), j, vdef)
6303 if (TREE_CODE (vdef) == SSA_NAME
6304 && !SSA_NAME_IS_DEFAULT_DEF (vdef))
6305 {
6306 gimple *vstmt = SSA_NAME_DEF_STMT (vdef);
6307 if (!last_stmt
6308 || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
6309 last_stmt = vstmt;
6310 }
6311 }
6312 }
6313 /* This can happen when all children are pre-existing vectors or
6314 constants. */
6315 if (!last_stmt)
6316 last_stmt = vect_find_first_scalar_stmt_in_slp (node)->stmt;
6317 if (!last_stmt)
6318 {
6319 gcc_assert (seen_vector_def);
6320 si = gsi_after_labels (as_a <bb_vec_info> (vinfo)->bbs[0]);
6321 }
6322 else if (is_ctrl_altering_stmt (last_stmt))
6323 {
6324 /* We split regions to vectorize at control altering stmts
6325 with a definition so this must be an external which
6326 we can insert at the start of the region. */
6327 si = gsi_after_labels (as_a <bb_vec_info> (vinfo)->bbs[0]);
6328 }
6329 else if (is_a <bb_vec_info> (vinfo)
6330 && gimple_bb (last_stmt) != gimple_bb (stmt_info->stmt)
6331 && gimple_could_trap_p (stmt_info->stmt))
6332 {
6333 /* We've constrained possibly trapping operations to all come
6334 from the same basic-block; even if vectorized defs would allow earlier
6335 scheduling, still force the vectorized stmts to the original block.
6336 This is only necessary for BB vectorization since for loop vect
6337 all operations are in a single BB and scalar stmt based
6338 placement doesn't play well with epilogue vectorization. */
6339 gcc_assert (dominated_by_p (CDI_DOMINATORS,
6340 gimple_bb (stmt_info->stmt),
6341 gimple_bb (last_stmt)));
6342 si = gsi_after_labels (gimple_bb (stmt_info->stmt));
6343 }
6344 else if (is_a <gphi *> (last_stmt))
6345 si = gsi_after_labels (gimple_bb (last_stmt));
6346 else
6347 {
6348 si = gsi_for_stmt (last_stmt);
6349 gsi_next (&si);
6350 }
6351 }
6352
6353 bool done_p = false;
6354
6355 /* Handle purely internal nodes. */
6356 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
6357 {
6358 /* ??? the transform kind is stored to STMT_VINFO_TYPE which might
6359 be shared with different SLP nodes (but usually it's the same
6360 operation apart from the case the stmt is only there for denoting
6361 the actual scalar lane defs ...). So do not call vect_transform_stmt
6362 but open-code it here (partly). */
6363 bool done = vectorizable_slp_permutation (vinfo, &si, node, NULL);
6364 gcc_assert (done);
6365 done_p = true;
6366 }
6367 if (!done_p)
6368 vect_transform_stmt (vinfo, stmt_info, &si, node, instance);
6369 }
6370
6371 /* Replace scalar calls from SLP node NODE with setting of their lhs to zero.
6372 For loop vectorization this is done in vectorizable_call, but for SLP
6373 it needs to be deferred until end of vect_schedule_slp, because multiple
6374 SLP instances may refer to the same scalar stmt. */
6375
6376 static void
6377 vect_remove_slp_scalar_calls (vec_info *vinfo,
6378 slp_tree node, hash_set<slp_tree> &visited)
6379 {
6380 gimple *new_stmt;
6381 gimple_stmt_iterator gsi;
6382 int i;
6383 slp_tree child;
6384 tree lhs;
6385 stmt_vec_info stmt_info;
6386
6387 if (!node || SLP_TREE_DEF_TYPE (node) != vect_internal_def)
6388 return;
6389
6390 if (visited.add (node))
6391 return;
6392
6393 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6394 vect_remove_slp_scalar_calls (vinfo, child, visited);
6395
6396 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
6397 {
6398 gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt);
6399 if (!stmt || gimple_bb (stmt) == NULL)
6400 continue;
6401 if (is_pattern_stmt_p (stmt_info)
6402 || !PURE_SLP_STMT (stmt_info))
6403 continue;
6404 lhs = gimple_call_lhs (stmt);
6405 new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
6406 gsi = gsi_for_stmt (stmt);
6407 vinfo->replace_stmt (&gsi, stmt_info, new_stmt);
6408 SSA_NAME_DEF_STMT (gimple_assign_lhs (new_stmt)) = new_stmt;
6409 }
6410 }
6411
6412 static void
6413 vect_remove_slp_scalar_calls (vec_info *vinfo, slp_tree node)
6414 {
6415 hash_set<slp_tree> visited;
6416 vect_remove_slp_scalar_calls (vinfo, node, visited);
6417 }
6418
6419 /* Vectorize the instance root. */
6420
6421 void
6422 vectorize_slp_instance_root_stmt (slp_tree node, slp_instance instance)
6423 {
6424 gassign *rstmt = NULL;
6425
6426 if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) == 1)
6427 {
6428 gimple *child_stmt;
6429 int j;
6430
6431 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (node), j, child_stmt)
6432 {
6433 tree vect_lhs = gimple_get_lhs (child_stmt);
6434 tree root_lhs = gimple_get_lhs (instance->root_stmt->stmt);
6435 if (!useless_type_conversion_p (TREE_TYPE (root_lhs),
6436 TREE_TYPE (vect_lhs)))
6437 vect_lhs = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (root_lhs),
6438 vect_lhs);
6439 rstmt = gimple_build_assign (root_lhs, vect_lhs);
6440 break;
6441 }
6442 }
6443 else if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) > 1)
6444 {
6445 int nelts = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
6446 gimple *child_stmt;
6447 int j;
6448 vec<constructor_elt, va_gc> *v;
6449 vec_alloc (v, nelts);
6450
6451 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (node), j, child_stmt)
6452 {
6453 CONSTRUCTOR_APPEND_ELT (v,
6454 NULL_TREE,
6455 gimple_get_lhs (child_stmt));
6456 }
6457 tree lhs = gimple_get_lhs (instance->root_stmt->stmt);
6458 tree rtype = TREE_TYPE (gimple_assign_rhs1 (instance->root_stmt->stmt));
6459 tree r_constructor = build_constructor (rtype, v);
6460 rstmt = gimple_build_assign (lhs, r_constructor);
6461 }
6462
6463 gcc_assert (rstmt);
6464
6465 gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmt->stmt);
6466 gsi_replace (&rgsi, rstmt, true);
6467 }
6468
6469 struct slp_scc_info
6470 {
6471 bool on_stack;
6472 int dfs;
6473 int lowlink;
6474 };
6475
6476 /* Schedule the SLP INSTANCE doing a DFS walk and collecting SCCs. */
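/* The dfs/lowlink/on_stack bookkeeping below follows Tarjan's SCC
   algorithm: an SCC is complete when a node's lowlink equals its own
   dfs number, at which point the nodes above it on the stack form the
   component and are scheduled together.  */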
6477
6478 static void
6479 vect_schedule_scc (vec_info *vinfo, slp_tree node, slp_instance instance,
6480 hash_map<slp_tree, slp_scc_info> &scc_info,
6481 int &maxdfs, vec<slp_tree> &stack)
6482 {
6483 bool existed_p;
6484 slp_scc_info *info = &scc_info.get_or_insert (node, &existed_p);
6485 gcc_assert (!existed_p);
6486 info->dfs = maxdfs;
6487 info->lowlink = maxdfs;
6488 maxdfs++;
6489
6490 /* Leaf. */
6491 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
6492 {
6493 info->on_stack = false;
6494 vect_schedule_slp_node (vinfo, node, instance);
6495 return;
6496 }
6497
6498 info->on_stack = true;
6499 stack.safe_push (node);
6500
6501 unsigned i;
6502 slp_tree child;
6503 /* DFS recurse. */
6504 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6505 {
6506 if (!child)
6507 continue;
6508 slp_scc_info *child_info = scc_info.get (child);
6509 if (!child_info)
6510 {
6511 vect_schedule_scc (vinfo, child, instance, scc_info, maxdfs, stack);
6512 /* Recursion might have re-allocated the node. */
6513 info = scc_info.get (node);
6514 child_info = scc_info.get (child);
6515 info->lowlink = MIN (info->lowlink, child_info->lowlink);
6516 }
6517 else if (child_info->on_stack)
6518 info->lowlink = MIN (info->lowlink, child_info->dfs);
6519 }
6520 if (info->lowlink != info->dfs)
6521 return;
6522
6523 auto_vec<slp_tree, 4> phis_to_fixup;
6524
6525 /* Singleton. */
6526 if (stack.last () == node)
6527 {
6528 stack.pop ();
6529 info->on_stack = false;
6530 vect_schedule_slp_node (vinfo, node, instance);
6531 if (SLP_TREE_CODE (node) != VEC_PERM_EXPR
6532 && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (node)->stmt))
6533 phis_to_fixup.quick_push (node);
6534 }
6535 else
6536 {
6537 /* SCC. */
6538 int last_idx = stack.length () - 1;
6539 while (stack[last_idx] != node)
6540 last_idx--;
6541 /* We can break the cycle at PHIs that have at least one
6542 code-generated child. Then we could re-start the DFS walk until
6543 all nodes in the SCC are covered (we might have new entries
6544 for only back-reachable nodes). But it's simpler to just
6545 iterate and schedule those that are ready. */
6546 unsigned todo = stack.length () - last_idx;
6547 do
6548 {
6549 for (int idx = stack.length () - 1; idx >= last_idx; --idx)
6550 {
6551 slp_tree entry = stack[idx];
6552 if (!entry)
6553 continue;
6554 bool phi = (SLP_TREE_CODE (entry) != VEC_PERM_EXPR
6555 && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (entry)->stmt));
6556 bool ready = !phi;
6557 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (entry), i, child)
6558 if (!child)
6559 {
6560 gcc_assert (phi);
6561 ready = true;
6562 break;
6563 }
6564 else if (scc_info.get (child)->on_stack)
6565 {
6566 if (!phi)
6567 {
6568 ready = false;
6569 break;
6570 }
6571 }
6572 else
6573 {
6574 if (phi)
6575 {
6576 ready = true;
6577 break;
6578 }
6579 }
6580 if (ready)
6581 {
6582 vect_schedule_slp_node (vinfo, entry, instance);
6583 scc_info.get (entry)->on_stack = false;
6584 stack[idx] = NULL;
6585 todo--;
6586 if (phi)
6587 phis_to_fixup.safe_push (entry);
6588 }
6589 }
6590 }
6591 while (todo != 0);
6592
6593 /* Pop the SCC. */
6594 stack.truncate (last_idx);
6595 }
6596
6597 /* Now fixup the backedge def of the vectorized PHIs in this SCC. */
6598 slp_tree phi_node;
6599 FOR_EACH_VEC_ELT (phis_to_fixup, i, phi_node)
6600 {
6601 gphi *phi = as_a <gphi *> (SLP_TREE_REPRESENTATIVE (phi_node)->stmt);
6602 edge_iterator ei;
6603 edge e;
6604 FOR_EACH_EDGE (e, ei, gimple_bb (phi)->preds)
6605 {
6606 unsigned dest_idx = e->dest_idx;
6607 child = SLP_TREE_CHILDREN (phi_node)[dest_idx];
6608 if (!child || SLP_TREE_DEF_TYPE (child) != vect_internal_def)
6609 continue;
6610 /* Simply fill all args. */
6611 for (unsigned i = 0; i < SLP_TREE_VEC_STMTS (phi_node).length (); ++i)
6612 add_phi_arg (as_a <gphi *> (SLP_TREE_VEC_STMTS (phi_node)[i]),
6613 vect_get_slp_vect_def (child, i),
6614 e, gimple_phi_arg_location (phi, dest_idx));
6615 }
6616 }
6617 }
6618
6619 /* Generate vector code for SLP_INSTANCES in the loop/basic block. */
6620
6621 void
6622 vect_schedule_slp (vec_info *vinfo, vec<slp_instance> slp_instances)
6623 {
6624 slp_instance instance;
6625 unsigned int i;
6626
6627 hash_map<slp_tree, slp_scc_info> scc_info;
6628 int maxdfs = 0;
6629 FOR_EACH_VEC_ELT (slp_instances, i, instance)
6630 {
6631 slp_tree node = SLP_INSTANCE_TREE (instance);
6632 if (dump_enabled_p ())
6633 {
6634 dump_printf_loc (MSG_NOTE, vect_location,
6635 "Vectorizing SLP tree:\n");
6636 if (SLP_INSTANCE_ROOT_STMT (instance))
6637 dump_printf_loc (MSG_NOTE, vect_location, "Root stmt: %G",
6638 SLP_INSTANCE_ROOT_STMT (instance)->stmt);
6639 vect_print_slp_graph (MSG_NOTE, vect_location,
6640 SLP_INSTANCE_TREE (instance));
6641 }
6642 /* Schedule the tree of INSTANCE, scheduling SCCs in a way to
6643 have a PHI be the node breaking the cycle. */
6644 auto_vec<slp_tree> stack;
6645 if (!scc_info.get (node))
6646 vect_schedule_scc (vinfo, node, instance, scc_info, maxdfs, stack);
6647
6648 if (SLP_INSTANCE_ROOT_STMT (instance))
6649 vectorize_slp_instance_root_stmt (node, instance);
6650
6651 if (dump_enabled_p ())
6652 dump_printf_loc (MSG_NOTE, vect_location,
6653 "vectorizing stmts using SLP.\n");
6654 }
6655
6656 FOR_EACH_VEC_ELT (slp_instances, i, instance)
6657 {
6658 slp_tree root = SLP_INSTANCE_TREE (instance);
6659 stmt_vec_info store_info;
6660 unsigned int j;
6661
6662 /* Remove scalar call stmts. Do not do this for basic-block
6663 vectorization as not all uses may be vectorized.
6664 ??? Why should this be necessary? DCE should be able to
6665 remove the stmts itself.
6666 ??? For BB vectorization we can as well remove scalar
6667 stmts starting from the SLP tree root if they have no
6668 uses. */
6669 if (is_a <loop_vec_info> (vinfo))
6670 vect_remove_slp_scalar_calls (vinfo, root);
6671
6672 /* Remove the vectorized stores' original scalar stmts. */
6673 for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store_info); j++)
6674 {
6675 if (!STMT_VINFO_DATA_REF (store_info)
6676 || !DR_IS_WRITE (STMT_VINFO_DATA_REF (store_info)))
6677 break;
6678
6679 store_info = vect_orig_stmt (store_info);
6680 /* Free the attached stmt_vec_info and remove the stmt. */
6681 vinfo->remove_stmt (store_info);
6682
6683 /* Invalidate SLP_TREE_REPRESENTATIVE in case we released it
6684 to not crash in vect_free_slp_tree later. */
6685 if (SLP_TREE_REPRESENTATIVE (root) == store_info)
6686 SLP_TREE_REPRESENTATIVE (root) = NULL;
6687 }
6688 }
6689 }