gcc.gnu.org Git - gcc.git/blob - gcc/tree-vect-slp.c
7bff5118bd00c5ed660c35b02bf70b1bb4b31834
[gcc.git] / gcc / tree-vect-slp.c
1 /* SLP - Basic Block Vectorization
2 Copyright (C) 2007-2021 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "tree-pass.h"
31 #include "ssa.h"
32 #include "optabs-tree.h"
33 #include "insn-config.h"
34 #include "recog.h" /* FIXME: for insn_data */
35 #include "fold-const.h"
36 #include "stor-layout.h"
37 #include "gimple-iterator.h"
38 #include "cfgloop.h"
39 #include "tree-vectorizer.h"
40 #include "langhooks.h"
41 #include "gimple-walk.h"
42 #include "dbgcnt.h"
43 #include "tree-vector-builder.h"
44 #include "vec-perm-indices.h"
45 #include "gimple-fold.h"
46 #include "internal-fn.h"
47 #include "dump-context.h"
48 #include "cfganal.h"
49 #include "tree-eh.h"
50 #include "tree-cfg.h"
51 #include "alloc-pool.h"
52
53 static bool vectorizable_slp_permutation (vec_info *, gimple_stmt_iterator *,
54 slp_tree, stmt_vector_for_cost *);
55 static void vect_print_slp_tree (dump_flags_t, dump_location_t, slp_tree);
56
57 static object_allocator<_slp_tree> *slp_tree_pool;
58 static slp_tree slp_first_node;
59
60 void
61 vect_slp_init (void)
62 {
63 slp_tree_pool = new object_allocator<_slp_tree> ("SLP nodes");
64 }
65
66 void
67 vect_slp_fini (void)
68 {
69 while (slp_first_node)
70 delete slp_first_node;
71 delete slp_tree_pool;
72 slp_tree_pool = NULL;
73 }
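
/* All SLP nodes are allocated from SLP_TREE_POOL and the _slp_tree
   constructor links each one into the SLP_FIRST_NODE list, so any nodes
   still live at this point (for example ones kept alive by cycles in the
   SLP graph) are reclaimed by the loop above before the pool itself is
   freed.  */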
74
75 void *
76 _slp_tree::operator new (size_t n)
77 {
78 gcc_assert (n == sizeof (_slp_tree));
79 return slp_tree_pool->allocate_raw ();
80 }
81
82 void
83 _slp_tree::operator delete (void *node, size_t n)
84 {
85 gcc_assert (n == sizeof (_slp_tree));
86 slp_tree_pool->remove_raw (node);
87 }
88
89
90 /* Initialize an SLP node. */
91
92 _slp_tree::_slp_tree ()
93 {
94 this->prev_node = NULL;
95 if (slp_first_node)
96 slp_first_node->prev_node = this;
97 this->next_node = slp_first_node;
98 slp_first_node = this;
99 SLP_TREE_SCALAR_STMTS (this) = vNULL;
100 SLP_TREE_SCALAR_OPS (this) = vNULL;
101 SLP_TREE_VEC_STMTS (this) = vNULL;
102 SLP_TREE_VEC_DEFS (this) = vNULL;
103 SLP_TREE_NUMBER_OF_VEC_STMTS (this) = 0;
104 SLP_TREE_CHILDREN (this) = vNULL;
105 SLP_TREE_LOAD_PERMUTATION (this) = vNULL;
106 SLP_TREE_LANE_PERMUTATION (this) = vNULL;
107 SLP_TREE_DEF_TYPE (this) = vect_uninitialized_def;
108 SLP_TREE_CODE (this) = ERROR_MARK;
109 SLP_TREE_VECTYPE (this) = NULL_TREE;
110 SLP_TREE_REPRESENTATIVE (this) = NULL;
111 SLP_TREE_REF_COUNT (this) = 1;
112 this->failed = NULL;
113 this->max_nunits = 1;
114 this->lanes = 0;
115 }
116
117 /* Tear down an SLP node. */
118
119 _slp_tree::~_slp_tree ()
120 {
121 if (this->prev_node)
122 this->prev_node->next_node = this->next_node;
123 else
124 slp_first_node = this->next_node;
125 if (this->next_node)
126 this->next_node->prev_node = this->prev_node;
127 SLP_TREE_CHILDREN (this).release ();
128 SLP_TREE_SCALAR_STMTS (this).release ();
129 SLP_TREE_SCALAR_OPS (this).release ();
130 SLP_TREE_VEC_STMTS (this).release ();
131 SLP_TREE_VEC_DEFS (this).release ();
132 SLP_TREE_LOAD_PERMUTATION (this).release ();
133 SLP_TREE_LANE_PERMUTATION (this).release ();
134 if (this->failed)
135 free (failed);
136 }
137
138 /* Recursively free the memory allocated for the SLP tree rooted at NODE. */
139
140 void
141 vect_free_slp_tree (slp_tree node)
142 {
143 int i;
144 slp_tree child;
145
146 if (--SLP_TREE_REF_COUNT (node) != 0)
147 return;
148
149 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
150 if (child)
151 vect_free_slp_tree (child);
152
153 /* If the node defines any SLP only patterns then those patterns are no
154 longer valid and should be removed. */
155 stmt_vec_info rep_stmt_info = SLP_TREE_REPRESENTATIVE (node);
156 if (rep_stmt_info && STMT_VINFO_SLP_VECT_ONLY_PATTERN (rep_stmt_info))
157 {
158 stmt_vec_info stmt_info = vect_orig_stmt (rep_stmt_info);
159 STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
160 STMT_SLP_TYPE (stmt_info) = STMT_SLP_TYPE (rep_stmt_info);
161 }
162
163 delete node;
164 }
165
166 /* Return a location suitable for dumps related to the SLP instance. */
167
168 dump_user_location_t
169 _slp_instance::location () const
170 {
171 if (!root_stmts.is_empty ())
172 return root_stmts[0]->stmt;
173 else
174 return SLP_TREE_SCALAR_STMTS (root)[0]->stmt;
175 }
176
177
178 /* Free the memory allocated for the SLP instance. */
179
180 void
181 vect_free_slp_instance (slp_instance instance)
182 {
183 vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
184 SLP_INSTANCE_LOADS (instance).release ();
185 SLP_INSTANCE_ROOT_STMTS (instance).release ();
186 instance->subgraph_entries.release ();
187 instance->cost_vec.release ();
188 free (instance);
189 }
190
191
192 /* Create an SLP node with NOPS children and operation CODE. */
193
194 slp_tree
195 vect_create_new_slp_node (unsigned nops, tree_code code)
196 {
197 slp_tree node = new _slp_tree;
198 SLP_TREE_SCALAR_STMTS (node) = vNULL;
199 SLP_TREE_CHILDREN (node).create (nops);
200 SLP_TREE_DEF_TYPE (node) = vect_internal_def;
201 SLP_TREE_CODE (node) = code;
202 return node;
203 }
204 /* Create an SLP node for SCALAR_STMTS. */
205
206 static slp_tree
207 vect_create_new_slp_node (slp_tree node,
208 vec<stmt_vec_info> scalar_stmts, unsigned nops)
209 {
210 SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
211 SLP_TREE_CHILDREN (node).create (nops);
212 SLP_TREE_DEF_TYPE (node) = vect_internal_def;
213 SLP_TREE_REPRESENTATIVE (node) = scalar_stmts[0];
214 SLP_TREE_LANES (node) = scalar_stmts.length ();
215 return node;
216 }
217
218 /* Create an SLP node for SCALAR_STMTS. */
219
220 static slp_tree
221 vect_create_new_slp_node (vec<stmt_vec_info> scalar_stmts, unsigned nops)
222 {
223 return vect_create_new_slp_node (new _slp_tree, scalar_stmts, nops);
224 }
225
226 /* Create an SLP node for OPS. */
227
228 static slp_tree
229 vect_create_new_slp_node (slp_tree node, vec<tree> ops)
230 {
231 SLP_TREE_SCALAR_OPS (node) = ops;
232 SLP_TREE_DEF_TYPE (node) = vect_external_def;
233 SLP_TREE_LANES (node) = ops.length ();
234 return node;
235 }
236
237 /* Create an SLP node for OPS. */
238
239 static slp_tree
240 vect_create_new_slp_node (vec<tree> ops)
241 {
242 return vect_create_new_slp_node (new _slp_tree, ops);
243 }
244
245
246 /* This structure is used in creation of an SLP tree. Each instance
247 corresponds to the same operand in a group of scalar stmts in an SLP
248 node. */
249 typedef struct _slp_oprnd_info
250 {
251 /* Def-stmts for the operands. */
252 vec<stmt_vec_info> def_stmts;
253 /* Operands. */
254 vec<tree> ops;
255 /* Information about the first statement, its vector def-type, type, the
256 operand itself in case it's constant, and an indication if it's a pattern
257 stmt. */
258 tree first_op_type;
259 enum vect_def_type first_dt;
260 bool any_pattern;
261 } *slp_oprnd_info;
262
263
264 /* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
265 operand. */
266 static vec<slp_oprnd_info>
267 vect_create_oprnd_info (int nops, int group_size)
268 {
269 int i;
270 slp_oprnd_info oprnd_info;
271 vec<slp_oprnd_info> oprnds_info;
272
273 oprnds_info.create (nops);
274 for (i = 0; i < nops; i++)
275 {
276 oprnd_info = XNEW (struct _slp_oprnd_info);
277 oprnd_info->def_stmts.create (group_size);
278 oprnd_info->ops.create (group_size);
279 oprnd_info->first_dt = vect_uninitialized_def;
280 oprnd_info->first_op_type = NULL_TREE;
281 oprnd_info->any_pattern = false;
282 oprnds_info.quick_push (oprnd_info);
283 }
284
285 return oprnds_info;
286 }
287
288
289 /* Free operands info. */
290
291 static void
292 vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
293 {
294 int i;
295 slp_oprnd_info oprnd_info;
296
297 FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
298 {
299 oprnd_info->def_stmts.release ();
300 oprnd_info->ops.release ();
301 XDELETE (oprnd_info);
302 }
303
304 oprnds_info.release ();
305 }
306
307
308 /* Return true if STMTS contains a pattern statement. */
309
310 static bool
311 vect_contains_pattern_stmt_p (vec<stmt_vec_info> stmts)
312 {
313 stmt_vec_info stmt_info;
314 unsigned int i;
315 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
316 if (is_pattern_stmt_p (stmt_info))
317 return true;
318 return false;
319 }
320
321 /* Return true when all lanes in the external or constant NODE have
322 the same value. */
323
324 static bool
325 vect_slp_tree_uniform_p (slp_tree node)
326 {
327 gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_constant_def
328 || SLP_TREE_DEF_TYPE (node) == vect_external_def);
329
330 /* Pre-existing vectors. */
331 if (SLP_TREE_SCALAR_OPS (node).is_empty ())
332 return false;
333
334 unsigned i;
335 tree op, first = NULL_TREE;
336 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
337 if (!first)
338 first = op;
339 else if (!operand_equal_p (first, op, 0))
340 return false;
341
342 return true;
343 }
344
345 /* Find the place of the data-ref in STMT_INFO in the interleaving chain
346 that starts from FIRST_STMT_INFO. Return -1 if the data-ref is not a part
347 of the chain. */
348
349 int
350 vect_get_place_in_interleaving_chain (stmt_vec_info stmt_info,
351 stmt_vec_info first_stmt_info)
352 {
353 stmt_vec_info next_stmt_info = first_stmt_info;
354 int result = 0;
355
356 if (first_stmt_info != DR_GROUP_FIRST_ELEMENT (stmt_info))
357 return -1;
358
359 do
360 {
361 if (next_stmt_info == stmt_info)
362 return result;
363 next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
364 if (next_stmt_info)
365 result += DR_GROUP_GAP (next_stmt_info);
366 }
367 while (next_stmt_info);
368
369 return -1;
370 }
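
/* For example, in a group that loads a[0], a[1] and a[3] the places
   returned for the three loads are 0, 1 and 3: DR_GROUP_GAP of each
   non-first member counts its distance from the previous member, so
   gaps in the access pattern show up as skipped places.  */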
371
372 /* Check whether it is possible to load COUNT elements of type ELT_TYPE
373 using the method implemented by duplicate_and_interleave. Return true
374 if so, storing the number of intermediate vectors in *NVECTORS_OUT
375 (if nonnull), the type of each intermediate vector in *VECTOR_TYPE_OUT
376 (if nonnull) and the two interleaving permute masks in PERMUTES (if nonnull). */
377
378 bool
379 can_duplicate_and_interleave_p (vec_info *vinfo, unsigned int count,
380 tree elt_type, unsigned int *nvectors_out,
381 tree *vector_type_out,
382 tree *permutes)
383 {
384 tree base_vector_type = get_vectype_for_scalar_type (vinfo, elt_type, count);
385 if (!base_vector_type || !VECTOR_MODE_P (TYPE_MODE (base_vector_type)))
386 return false;
387
388 machine_mode base_vector_mode = TYPE_MODE (base_vector_type);
389 poly_int64 elt_bytes = count * GET_MODE_UNIT_SIZE (base_vector_mode);
390 unsigned int nvectors = 1;
391 for (;;)
392 {
393 scalar_int_mode int_mode;
394 poly_int64 elt_bits = elt_bytes * BITS_PER_UNIT;
395 if (int_mode_for_size (elt_bits, 1).exists (&int_mode))
396 {
397 /* Get the natural vector type for this SLP group size. */
398 tree int_type = build_nonstandard_integer_type
399 (GET_MODE_BITSIZE (int_mode), 1);
400 tree vector_type
401 = get_vectype_for_scalar_type (vinfo, int_type, count);
402 if (vector_type
403 && VECTOR_MODE_P (TYPE_MODE (vector_type))
404 && known_eq (GET_MODE_SIZE (TYPE_MODE (vector_type)),
405 GET_MODE_SIZE (base_vector_mode)))
406 {
407 /* Try fusing consecutive sequences of COUNT / NVECTORS elements
408 together into elements of type INT_TYPE and using the result
409 to build NVECTORS vectors. */
410 poly_uint64 nelts = GET_MODE_NUNITS (TYPE_MODE (vector_type));
411 vec_perm_builder sel1 (nelts, 2, 3);
412 vec_perm_builder sel2 (nelts, 2, 3);
413 poly_int64 half_nelts = exact_div (nelts, 2);
414 for (unsigned int i = 0; i < 3; ++i)
415 {
416 sel1.quick_push (i);
417 sel1.quick_push (i + nelts);
418 sel2.quick_push (half_nelts + i);
419 sel2.quick_push (half_nelts + i + nelts);
420 }
421 vec_perm_indices indices1 (sel1, 2, nelts);
422 vec_perm_indices indices2 (sel2, 2, nelts);
423 if (can_vec_perm_const_p (TYPE_MODE (vector_type), indices1)
424 && can_vec_perm_const_p (TYPE_MODE (vector_type), indices2))
425 {
426 if (nvectors_out)
427 *nvectors_out = nvectors;
428 if (vector_type_out)
429 *vector_type_out = vector_type;
430 if (permutes)
431 {
432 permutes[0] = vect_gen_perm_mask_checked (vector_type,
433 indices1);
434 permutes[1] = vect_gen_perm_mask_checked (vector_type,
435 indices2);
436 }
437 return true;
438 }
439 }
440 }
441 if (!multiple_p (elt_bytes, 2, &elt_bytes))
442 return false;
443 nvectors *= 2;
444 }
445 }
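
/* The loop above halves ELT_BYTES and doubles NVECTORS until it finds an
   integer mode whose vectors have the same size as the base vectors and
   for which both interleaving permutes are supported;
   duplicate_and_interleave then builds the COUNT-element sequences by
   fusing consecutive scalars into those wider integers.  */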
446
447 /* Return true if DTA and DTB match. */
448
449 static bool
450 vect_def_types_match (enum vect_def_type dta, enum vect_def_type dtb)
451 {
452 return (dta == dtb
453 || ((dta == vect_external_def || dta == vect_constant_def)
454 && (dtb == vect_external_def || dtb == vect_constant_def)));
455 }
456
457 static const int cond_expr_maps[3][5] = {
458 { 4, -1, -2, 1, 2 },
459 { 4, -2, -1, 1, 2 },
460 { 4, -1, -2, 2, 1 }
461 };
462 static const int arg1_map[] = { 1, 1 };
463 static const int arg2_map[] = { 1, 2 };
464 static const int arg1_arg4_map[] = { 2, 1, 4 };
465
466 /* For most SLP statements, there is a one-to-one mapping between
467 gimple arguments and child nodes. If that is not true for STMT,
468 return an array that contains:
469
470 - the number of child nodes, followed by
471 - for each child node, the index of the argument associated with that node.
472 The special index -1 is the first operand of an embedded comparison and
473 the special index -2 is the second operand of an embedded comparison.
474
475 SWAP is as for vect_get_and_check_slp_defs. */
476
477 static const int *
478 vect_get_operand_map (const gimple *stmt, unsigned char swap = 0)
479 {
480 if (auto assign = dyn_cast<const gassign *> (stmt))
481 {
482 if (gimple_assign_rhs_code (assign) == COND_EXPR
483 && COMPARISON_CLASS_P (gimple_assign_rhs1 (assign)))
484 return cond_expr_maps[swap];
485 }
486 gcc_assert (!swap);
487 if (auto call = dyn_cast<const gcall *> (stmt))
488 {
489 if (gimple_call_internal_p (call))
490 switch (gimple_call_internal_fn (call))
491 {
492 case IFN_MASK_LOAD:
493 return arg2_map;
494
495 case IFN_GATHER_LOAD:
496 return arg1_map;
497
498 case IFN_MASK_GATHER_LOAD:
499 return arg1_arg4_map;
500
501 default:
502 break;
503 }
504 }
505 return nullptr;
506 }
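
/* As an example, for a conditional assignment
     x_1 = a_2 < b_3 ? c_4 : d_5;
   cond_expr_maps[0] above is { 4, -1, -2, 1, 2 }: four child nodes, built
   from the two operands of the embedded comparison (a_2 and b_3) and from
   statement arguments 1 and 2 (the values c_4 and d_5).  Similarly
   arg2_map { 1, 2 } says an IFN_MASK_LOAD call has a single child node,
   built from call argument 2, the mask.  */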
507
508 /* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
509 they are of a valid type and that they match the defs of the first stmt of
510 the SLP group (stored in OPRNDS_INFO). This function tries to match stmts
511 by swapping operands of STMTS[STMT_NUM] when possible. Non-zero SWAP
512 indicates that a swap is required for cond_expr stmts. Specifically, SWAP
513 is 1 if STMT is a cond and the operands of its comparison need to be swapped;
514 SWAP is 2 if STMT is a cond and the code of its comparison needs to be inverted.
515
516 If there was a fatal error return -1; if the error could be corrected by
517 swapping operands of the parent node of this one, return 1; if everything is
518 ok return 0. */
519 static int
520 vect_get_and_check_slp_defs (vec_info *vinfo, unsigned char swap,
521 bool *skip_args,
522 vec<stmt_vec_info> stmts, unsigned stmt_num,
523 vec<slp_oprnd_info> *oprnds_info)
524 {
525 stmt_vec_info stmt_info = stmts[stmt_num];
526 tree oprnd;
527 unsigned int i, number_of_oprnds;
528 enum vect_def_type dt = vect_uninitialized_def;
529 slp_oprnd_info oprnd_info;
530 unsigned int commutative_op = -1U;
531 bool first = stmt_num == 0;
532
533 if (!is_a<gcall *> (stmt_info->stmt)
534 && !is_a<gassign *> (stmt_info->stmt)
535 && !is_a<gphi *> (stmt_info->stmt))
536 return -1;
537
538 number_of_oprnds = gimple_num_args (stmt_info->stmt);
539 const int *map = vect_get_operand_map (stmt_info->stmt, swap);
540 if (map)
541 number_of_oprnds = *map++;
542 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
543 {
544 if (gimple_call_internal_p (stmt))
545 {
546 internal_fn ifn = gimple_call_internal_fn (stmt);
547 commutative_op = first_commutative_argument (ifn);
548 }
549 }
550 else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
551 {
552 if (commutative_tree_code (gimple_assign_rhs_code (stmt)))
553 commutative_op = 0;
554 }
555
556 bool swapped = (swap != 0);
557 bool backedge = false;
558 enum vect_def_type *dts = XALLOCAVEC (enum vect_def_type, number_of_oprnds);
559 for (i = 0; i < number_of_oprnds; i++)
560 {
561 int opno = map ? map[i] : int (i);
562 if (opno < 0)
563 oprnd = TREE_OPERAND (gimple_arg (stmt_info->stmt, 0), -1 - opno);
564 else
565 {
566 oprnd = gimple_arg (stmt_info->stmt, opno);
567 if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
568 backedge = dominated_by_p (CDI_DOMINATORS,
569 gimple_phi_arg_edge (stmt, opno)->src,
570 gimple_bb (stmt_info->stmt));
571 }
572 if (TREE_CODE (oprnd) == VIEW_CONVERT_EXPR)
573 oprnd = TREE_OPERAND (oprnd, 0);
574
575 oprnd_info = (*oprnds_info)[i];
576
577 stmt_vec_info def_stmt_info;
578 if (!vect_is_simple_use (oprnd, vinfo, &dts[i], &def_stmt_info))
579 {
580 if (dump_enabled_p ())
581 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
582 "Build SLP failed: can't analyze def for %T\n",
583 oprnd);
584
585 return -1;
586 }
587
588 if (skip_args[i])
589 {
590 oprnd_info->def_stmts.quick_push (NULL);
591 oprnd_info->ops.quick_push (NULL_TREE);
592 oprnd_info->first_dt = vect_uninitialized_def;
593 continue;
594 }
595
596 oprnd_info->def_stmts.quick_push (def_stmt_info);
597 oprnd_info->ops.quick_push (oprnd);
598
599 if (def_stmt_info
600 && is_pattern_stmt_p (def_stmt_info))
601 {
602 if (STMT_VINFO_RELATED_STMT (vect_orig_stmt (def_stmt_info))
603 != def_stmt_info)
604 oprnd_info->any_pattern = true;
605 else
606 /* If we promote this to external, use the original stmt def. */
607 oprnd_info->ops.last ()
608 = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
609 }
610
611 /* If there's an extern def on a backedge, make sure we can
612 code-generate at the region start.
613 ??? This is another case that could be fixed by adjusting
614 how we split the function but at the moment we'd have conflicting
615 goals there. */
616 if (backedge
617 && dts[i] == vect_external_def
618 && is_a <bb_vec_info> (vinfo)
619 && TREE_CODE (oprnd) == SSA_NAME
620 && !SSA_NAME_IS_DEFAULT_DEF (oprnd)
621 && !dominated_by_p (CDI_DOMINATORS,
622 as_a <bb_vec_info> (vinfo)->bbs[0],
623 gimple_bb (SSA_NAME_DEF_STMT (oprnd))))
624 {
625 if (dump_enabled_p ())
626 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
627 "Build SLP failed: extern def %T only defined "
628 "on backedge\n", oprnd);
629 return -1;
630 }
631
632 if (first)
633 {
634 tree type = TREE_TYPE (oprnd);
635 dt = dts[i];
636 if ((dt == vect_constant_def
637 || dt == vect_external_def)
638 && !GET_MODE_SIZE (vinfo->vector_mode).is_constant ()
639 && (TREE_CODE (type) == BOOLEAN_TYPE
640 || !can_duplicate_and_interleave_p (vinfo, stmts.length (),
641 type)))
642 {
643 if (dump_enabled_p ())
644 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
645 "Build SLP failed: invalid type of def "
646 "for variable-length SLP %T\n", oprnd);
647 return -1;
648 }
649
650 /* For the swapping logic below force vect_reduction_def
651 for the reduction op in an SLP reduction group. */
652 if (!STMT_VINFO_DATA_REF (stmt_info)
653 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
654 && (int)i == STMT_VINFO_REDUC_IDX (stmt_info)
655 && def_stmt_info)
656 dts[i] = dt = vect_reduction_def;
657
658 /* Check the types of the definition. */
659 switch (dt)
660 {
661 case vect_external_def:
662 case vect_constant_def:
663 case vect_internal_def:
664 case vect_reduction_def:
665 case vect_induction_def:
666 case vect_nested_cycle:
667 break;
668
669 default:
670 /* FORNOW: Not supported. */
671 if (dump_enabled_p ())
672 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
673 "Build SLP failed: illegal type of def %T\n",
674 oprnd);
675 return -1;
676 }
677
678 oprnd_info->first_dt = dt;
679 oprnd_info->first_op_type = type;
680 }
681 }
682 if (first)
683 return 0;
684
685 /* Now match the operand definition types to that of the first stmt. */
686 for (i = 0; i < number_of_oprnds;)
687 {
688 if (skip_args[i])
689 {
690 ++i;
691 continue;
692 }
693
694 oprnd_info = (*oprnds_info)[i];
695 dt = dts[i];
696 stmt_vec_info def_stmt_info = oprnd_info->def_stmts[stmt_num];
697 oprnd = oprnd_info->ops[stmt_num];
698 tree type = TREE_TYPE (oprnd);
699
700 if (!types_compatible_p (oprnd_info->first_op_type, type))
701 {
702 if (dump_enabled_p ())
703 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
704 "Build SLP failed: different operand types\n");
705 return 1;
706 }
707
708 /* Not first stmt of the group, check that the def-stmt/s match
709 the def-stmt/s of the first stmt. Allow different definition
710 types for reduction chains: the first stmt must be a
711 vect_reduction_def (a phi node), and the rest
712 must belong to the reduction chain. */
713 if ((!vect_def_types_match (oprnd_info->first_dt, dt)
714 && !(oprnd_info->first_dt == vect_reduction_def
715 && !STMT_VINFO_DATA_REF (stmt_info)
716 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
717 && def_stmt_info
718 && !STMT_VINFO_DATA_REF (def_stmt_info)
719 && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
720 == REDUC_GROUP_FIRST_ELEMENT (stmt_info))))
721 || (!STMT_VINFO_DATA_REF (stmt_info)
722 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
723 && ((!def_stmt_info
724 || STMT_VINFO_DATA_REF (def_stmt_info)
725 || (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
726 != REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
727 != (oprnd_info->first_dt != vect_reduction_def))))
728 {
729 /* Try swapping operands if we got a mismatch. For BB
730 vectorization only in case it will clearly improve things. */
731 if (i == commutative_op && !swapped
732 && (!is_a <bb_vec_info> (vinfo)
733 || (!vect_def_types_match ((*oprnds_info)[i+1]->first_dt,
734 dts[i+1])
735 && (vect_def_types_match (oprnd_info->first_dt, dts[i+1])
736 || vect_def_types_match
737 ((*oprnds_info)[i+1]->first_dt, dts[i])))))
738 {
739 if (dump_enabled_p ())
740 dump_printf_loc (MSG_NOTE, vect_location,
741 "trying swapped operands\n");
742 std::swap (dts[i], dts[i+1]);
743 std::swap ((*oprnds_info)[i]->def_stmts[stmt_num],
744 (*oprnds_info)[i+1]->def_stmts[stmt_num]);
745 std::swap ((*oprnds_info)[i]->ops[stmt_num],
746 (*oprnds_info)[i+1]->ops[stmt_num]);
747 swapped = true;
748 continue;
749 }
750
751 if (is_a <bb_vec_info> (vinfo)
752 && !oprnd_info->any_pattern)
753 {
754 /* Now for commutative ops we should see whether we can
755 make the other operand match. */
756 if (dump_enabled_p ())
757 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
758 "treating operand as external\n");
759 oprnd_info->first_dt = dt = vect_external_def;
760 }
761 else
762 {
763 if (dump_enabled_p ())
764 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
765 "Build SLP failed: different types\n");
766 return 1;
767 }
768 }
769
770 /* Make sure to demote the overall operand to external. */
771 if (dt == vect_external_def)
772 oprnd_info->first_dt = vect_external_def;
773 /* For an SLP reduction chain we want to duplicate the reduction to
774 each of the chain members. That gets us a sane SLP graph (though
775 the stmts are not 100% correct wrt the initial values). */
776 else if ((dt == vect_internal_def
777 || dt == vect_reduction_def)
778 && oprnd_info->first_dt == vect_reduction_def
779 && !STMT_VINFO_DATA_REF (stmt_info)
780 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
781 && !STMT_VINFO_DATA_REF (def_stmt_info)
782 && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
783 == REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
784 {
785 oprnd_info->def_stmts[stmt_num] = oprnd_info->def_stmts[0];
786 oprnd_info->ops[stmt_num] = oprnd_info->ops[0];
787 }
788
789 ++i;
790 }
791
792 /* Swap operands. */
793 if (swapped)
794 {
795 if (dump_enabled_p ())
796 dump_printf_loc (MSG_NOTE, vect_location,
797 "swapped operands to match def types in %G",
798 stmt_info->stmt);
799 }
800
801 return 0;
802 }
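
/* As an example of the operand swapping above, consider the two lanes
     x_1 = a_2 + 1;
     y_3 = 2 + b_4;
   In lane 0 operand zero has an internal def and operand one is a
   constant; lane 1 has them the other way around.  Since PLUS_EXPR is
   commutative the operands of lane 1 get swapped (for BB vectorization
   only when that clearly improves matching) so that both lanes agree on
   the def type of each operand.  */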
803
804 /* Return true if call statements CALL1 and CALL2 are similar enough
805 to be combined into the same SLP group. */
806
807 static bool
808 compatible_calls_p (gcall *call1, gcall *call2)
809 {
810 unsigned int nargs = gimple_call_num_args (call1);
811 if (nargs != gimple_call_num_args (call2))
812 return false;
813
814 if (gimple_call_combined_fn (call1) != gimple_call_combined_fn (call2))
815 return false;
816
817 if (gimple_call_internal_p (call1))
818 {
819 if (!types_compatible_p (TREE_TYPE (gimple_call_lhs (call1)),
820 TREE_TYPE (gimple_call_lhs (call2))))
821 return false;
822 for (unsigned int i = 0; i < nargs; ++i)
823 if (!types_compatible_p (TREE_TYPE (gimple_call_arg (call1, i)),
824 TREE_TYPE (gimple_call_arg (call2, i))))
825 return false;
826 }
827 else
828 {
829 if (!operand_equal_p (gimple_call_fn (call1),
830 gimple_call_fn (call2), 0))
831 return false;
832
833 if (gimple_call_fntype (call1) != gimple_call_fntype (call2))
834 return false;
835 }
836
837 /* Check that any unvectorized arguments are equal. */
838 if (const int *map = vect_get_operand_map (call1))
839 {
840 unsigned int nkept = *map++;
841 unsigned int mapi = 0;
842 for (unsigned int i = 0; i < nargs; ++i)
843 if (mapi < nkept && map[mapi] == int (i))
844 mapi += 1;
845 else if (!operand_equal_p (gimple_call_arg (call1, i),
846 gimple_call_arg (call2, i)))
847 return false;
848 }
849
850 return true;
851 }
852
853 /* A subroutine of vect_build_slp_tree for checking VECTYPE, which is the
854 caller's attempt to find the vector type in STMT_INFO with the narrowest
855 element type. Return true if VECTYPE is nonnull and if it is valid
856 for STMT_INFO. When returning true, update MAX_NUNITS to reflect the
857 number of units in VECTYPE. GROUP_SIZE and MAX_NUNITS are as for
858 vect_build_slp_tree. */
859
860 static bool
861 vect_record_max_nunits (vec_info *vinfo, stmt_vec_info stmt_info,
862 unsigned int group_size,
863 tree vectype, poly_uint64 *max_nunits)
864 {
865 if (!vectype)
866 {
867 if (dump_enabled_p ())
868 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
869 "Build SLP failed: unsupported data-type in %G\n",
870 stmt_info->stmt);
871 /* Fatal mismatch. */
872 return false;
873 }
874
875 /* If populating the vector type requires unrolling then fail
876 before adjusting *max_nunits for basic-block vectorization. */
877 if (is_a <bb_vec_info> (vinfo)
878 && !multiple_p (group_size, TYPE_VECTOR_SUBPARTS (vectype)))
879 {
880 if (dump_enabled_p ())
881 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
882 "Build SLP failed: unrolling required "
883 "in basic block SLP\n");
884 /* Fatal mismatch. */
885 return false;
886 }
887
888 /* In case of multiple types we need to detect the smallest type. */
889 vect_update_max_nunits (max_nunits, vectype);
890 return true;
891 }
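
/* For basic-block SLP this requires the group size to be a multiple of
   the vector lane count: a group of eight ints is fine with a V4SI
   vector type (two vector stmts), whereas a group of six would need
   unrolling and is rejected here, leaving it to the callers to split
   the group instead.  */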
892
893 /* Verify that the scalar stmts STMTS are isomorphic, do not require data
894 permutation and are not of unsupported types of operation. Return
895 true if they are, otherwise return false and indicate in *MATCHES
896 which stmts are not isomorphic to the first one. If MATCHES[0]
897 is false then this indicates the comparison could not be
898 carried out or the stmts will never be vectorized by SLP.
899
900 Note COND_EXPR is possibly isomorphic to another one after swapping its
901 operands. Set SWAP[i] to 1 if stmt I is COND_EXPR and isomorphic to
902 the first stmt by swapping the two operands of comparison; set SWAP[i]
903 to 2 if stmt I is isomorphic to the first stmt by inverting the code
904 of comparison. Take A1 >= B1 ? X1 : Y1 as an example: it can be swapped
905 to (B1 <= A1 ? X1 : Y1) or be inverted to (A1 < B1) ? Y1 : X1. */
906
907 static bool
908 vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
909 vec<stmt_vec_info> stmts, unsigned int group_size,
910 poly_uint64 *max_nunits, bool *matches,
911 bool *two_operators, tree *node_vectype)
912 {
913 unsigned int i;
914 stmt_vec_info first_stmt_info = stmts[0];
915 code_helper first_stmt_code = ERROR_MARK;
916 code_helper alt_stmt_code = ERROR_MARK;
917 code_helper rhs_code = ERROR_MARK;
918 code_helper first_cond_code = ERROR_MARK;
919 tree lhs;
920 bool need_same_oprnds = false;
921 tree vectype = NULL_TREE, first_op1 = NULL_TREE;
922 stmt_vec_info first_load = NULL, prev_first_load = NULL;
923 bool first_stmt_load_p = false, load_p = false;
924 bool first_stmt_phi_p = false, phi_p = false;
925 bool maybe_soft_fail = false;
926 tree soft_fail_nunits_vectype = NULL_TREE;
927
928 /* For every stmt in NODE find its def stmt/s. */
929 stmt_vec_info stmt_info;
930 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
931 {
932 gimple *stmt = stmt_info->stmt;
933 swap[i] = 0;
934 matches[i] = false;
935
936 if (dump_enabled_p ())
937 dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for %G", stmt);
938
939 /* Fail to vectorize statements marked as unvectorizable, throw
940 or are volatile. */
941 if (!STMT_VINFO_VECTORIZABLE (stmt_info)
942 || stmt_can_throw_internal (cfun, stmt)
943 || gimple_has_volatile_ops (stmt))
944 {
945 if (dump_enabled_p ())
946 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
947 "Build SLP failed: unvectorizable statement %G",
948 stmt);
949 /* ??? For BB vectorization we want to commute operands in a way
950 that shuffles all unvectorizable defs into one operand and keeps
951 the other still vectorizable. The following doesn't reliably
952 work for this, but it's the easiest we can do here. */
953 if (is_a <bb_vec_info> (vinfo) && i != 0)
954 continue;
955 /* Fatal mismatch. */
956 matches[0] = false;
957 return false;
958 }
959
960 lhs = gimple_get_lhs (stmt);
961 if (lhs == NULL_TREE)
962 {
963 if (dump_enabled_p ())
964 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
965 "Build SLP failed: not GIMPLE_ASSIGN nor "
966 "GIMPLE_CALL %G", stmt);
967 if (is_a <bb_vec_info> (vinfo) && i != 0)
968 continue;
969 /* Fatal mismatch. */
970 matches[0] = false;
971 return false;
972 }
973
974 tree nunits_vectype;
975 if (!vect_get_vector_types_for_stmt (vinfo, stmt_info, &vectype,
976 &nunits_vectype, group_size))
977 {
978 if (is_a <bb_vec_info> (vinfo) && i != 0)
979 continue;
980 /* Fatal mismatch. */
981 matches[0] = false;
982 return false;
983 }
984 /* Record nunits required but continue analysis, producing matches[]
985 as if nunits was not an issue. This allows splitting of groups
986 to happen. */
987 if (nunits_vectype
988 && !vect_record_max_nunits (vinfo, stmt_info, group_size,
989 nunits_vectype, max_nunits))
990 {
991 gcc_assert (is_a <bb_vec_info> (vinfo));
992 maybe_soft_fail = true;
993 soft_fail_nunits_vectype = nunits_vectype;
994 }
995
996 gcc_assert (vectype);
997
998 gcall *call_stmt = dyn_cast <gcall *> (stmt);
999 if (call_stmt)
1000 {
1001 combined_fn cfn = gimple_call_combined_fn (call_stmt);
1002 if (cfn != CFN_LAST)
1003 rhs_code = cfn;
1004 else
1005 rhs_code = CALL_EXPR;
1006
1007 if (cfn == CFN_MASK_LOAD
1008 || cfn == CFN_GATHER_LOAD
1009 || cfn == CFN_MASK_GATHER_LOAD)
1010 load_p = true;
1011 else if ((internal_fn_p (cfn)
1012 && !vectorizable_internal_fn_p (as_internal_fn (cfn)))
1013 || gimple_call_tail_p (call_stmt)
1014 || gimple_call_noreturn_p (call_stmt)
1015 || gimple_call_chain (call_stmt))
1016 {
1017 if (dump_enabled_p ())
1018 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1019 "Build SLP failed: unsupported call type %G",
1020 call_stmt);
1021 if (is_a <bb_vec_info> (vinfo) && i != 0)
1022 continue;
1023 /* Fatal mismatch. */
1024 matches[0] = false;
1025 return false;
1026 }
1027 }
1028 else if (gimple_code (stmt) == GIMPLE_PHI)
1029 {
1030 rhs_code = ERROR_MARK;
1031 phi_p = true;
1032 }
1033 else
1034 {
1035 rhs_code = gimple_assign_rhs_code (stmt);
1036 load_p = gimple_vuse (stmt);
1037 }
1038
1039 /* Check the operation. */
1040 if (i == 0)
1041 {
1042 *node_vectype = vectype;
1043 first_stmt_code = rhs_code;
1044 first_stmt_load_p = load_p;
1045 first_stmt_phi_p = phi_p;
1046
1047 /* Shift arguments should be equal in all the packed stmts for a
1048 vector shift with scalar shift operand. */
1049 if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
1050 || rhs_code == LROTATE_EXPR
1051 || rhs_code == RROTATE_EXPR)
1052 {
1053 /* First see if we have a vector/vector shift. */
1054 if (!directly_supported_p (rhs_code, vectype, optab_vector))
1055 {
1056 /* No vector/vector shift, try for a vector/scalar shift. */
1057 if (!directly_supported_p (rhs_code, vectype, optab_scalar))
1058 {
1059 if (dump_enabled_p ())
1060 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1061 "Build SLP failed: "
1062 "op not supported by target.\n");
1063 if (is_a <bb_vec_info> (vinfo) && i != 0)
1064 continue;
1065 /* Fatal mismatch. */
1066 matches[0] = false;
1067 return false;
1068 }
1069 need_same_oprnds = true;
1070 first_op1 = gimple_assign_rhs2 (stmt);
1071 }
1072 }
1073 else if (rhs_code == WIDEN_LSHIFT_EXPR)
1074 {
1075 need_same_oprnds = true;
1076 first_op1 = gimple_assign_rhs2 (stmt);
1077 }
1078 else if (!load_p
1079 && rhs_code == BIT_FIELD_REF)
1080 {
1081 tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt), 0);
1082 if (!is_a <bb_vec_info> (vinfo)
1083 || TREE_CODE (vec) != SSA_NAME
1084 || !operand_equal_p (TYPE_SIZE (vectype),
1085 TYPE_SIZE (TREE_TYPE (vec))))
1086 {
1087 if (dump_enabled_p ())
1088 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1089 "Build SLP failed: "
1090 "BIT_FIELD_REF not supported\n");
1091 /* Fatal mismatch. */
1092 matches[0] = false;
1093 return false;
1094 }
1095 }
1096 else if (rhs_code == CFN_DIV_POW2)
1097 {
1098 need_same_oprnds = true;
1099 first_op1 = gimple_call_arg (call_stmt, 1);
1100 }
1101 }
1102 else
1103 {
1104 if (first_stmt_code != rhs_code
1105 && alt_stmt_code == ERROR_MARK)
1106 alt_stmt_code = rhs_code;
1107 if ((first_stmt_code != rhs_code
1108 && (first_stmt_code != IMAGPART_EXPR
1109 || rhs_code != REALPART_EXPR)
1110 && (first_stmt_code != REALPART_EXPR
1111 || rhs_code != IMAGPART_EXPR)
1112 /* Handle mismatches in plus/minus by computing both
1113 and merging the results. */
1114 && !((first_stmt_code == PLUS_EXPR
1115 || first_stmt_code == MINUS_EXPR)
1116 && (alt_stmt_code == PLUS_EXPR
1117 || alt_stmt_code == MINUS_EXPR)
1118 && rhs_code == alt_stmt_code)
1119 && !(STMT_VINFO_GROUPED_ACCESS (stmt_info)
1120 && (first_stmt_code == ARRAY_REF
1121 || first_stmt_code == BIT_FIELD_REF
1122 || first_stmt_code == INDIRECT_REF
1123 || first_stmt_code == COMPONENT_REF
1124 || first_stmt_code == MEM_REF)))
1125 || first_stmt_load_p != load_p
1126 || first_stmt_phi_p != phi_p)
1127 {
1128 if (dump_enabled_p ())
1129 {
1130 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1131 "Build SLP failed: different operation "
1132 "in stmt %G", stmt);
1133 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1134 "original stmt %G", first_stmt_info->stmt);
1135 }
1136 /* Mismatch. */
1137 continue;
1138 }
1139
1140 if (!load_p
1141 && first_stmt_code == BIT_FIELD_REF
1142 && (TREE_OPERAND (gimple_assign_rhs1 (first_stmt_info->stmt), 0)
1143 != TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0)))
1144 {
1145 if (dump_enabled_p ())
1146 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1147 "Build SLP failed: different BIT_FIELD_REF "
1148 "arguments in %G", stmt);
1149 /* Mismatch. */
1150 continue;
1151 }
1152
1153 if (call_stmt && first_stmt_code != CFN_MASK_LOAD)
1154 {
1155 if (!compatible_calls_p (as_a <gcall *> (stmts[0]->stmt),
1156 call_stmt))
1157 {
1158 if (dump_enabled_p ())
1159 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1160 "Build SLP failed: different calls in %G",
1161 stmt);
1162 /* Mismatch. */
1163 continue;
1164 }
1165 }
1166
1167 if ((phi_p || gimple_could_trap_p (stmt_info->stmt))
1168 && (gimple_bb (first_stmt_info->stmt)
1169 != gimple_bb (stmt_info->stmt)))
1170 {
1171 if (dump_enabled_p ())
1172 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1173 "Build SLP failed: different BB for PHI "
1174 "or possibly trapping operation in %G", stmt);
1175 /* Mismatch. */
1176 continue;
1177 }
1178
1179 if (need_same_oprnds)
1180 {
1181 tree other_op1 = gimple_arg (stmt, 1);
1182 if (!operand_equal_p (first_op1, other_op1, 0))
1183 {
1184 if (dump_enabled_p ())
1185 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1186 "Build SLP failed: different shift "
1187 "arguments in %G", stmt);
1188 /* Mismatch. */
1189 continue;
1190 }
1191 }
1192
1193 if (!types_compatible_p (vectype, *node_vectype))
1194 {
1195 if (dump_enabled_p ())
1196 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1197 "Build SLP failed: different vector type "
1198 "in %G", stmt);
1199 /* Mismatch. */
1200 continue;
1201 }
1202 }
1203
1204 /* Grouped store or load. */
1205 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1206 {
1207 if (REFERENCE_CLASS_P (lhs))
1208 {
1209 /* Store. */
1210 ;
1211 }
1212 else
1213 {
1214 /* Load. */
1215 first_load = DR_GROUP_FIRST_ELEMENT (stmt_info);
1216 if (prev_first_load)
1217 {
1218 /* Check that there are no loads from different interleaving
1219 chains in the same node. */
1220 if (prev_first_load != first_load)
1221 {
1222 if (dump_enabled_p ())
1223 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
1224 vect_location,
1225 "Build SLP failed: different "
1226 "interleaving chains in one node %G",
1227 stmt);
1228 /* Mismatch. */
1229 continue;
1230 }
1231 }
1232 else
1233 prev_first_load = first_load;
1234 }
1235 } /* Grouped access. */
1236 else
1237 {
1238 if (load_p
1239 && rhs_code != CFN_GATHER_LOAD
1240 && rhs_code != CFN_MASK_GATHER_LOAD)
1241 {
1242 /* Not grouped load. */
1243 if (dump_enabled_p ())
1244 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1245 "Build SLP failed: not grouped load %G", stmt);
1246
1247 /* FORNOW: Not grouped loads are not supported. */
1248 if (is_a <bb_vec_info> (vinfo) && i != 0)
1249 continue;
1250 /* Fatal mismatch. */
1251 matches[0] = false;
1252 return false;
1253 }
1254
1255 /* Not memory operation. */
1256 if (!phi_p
1257 && rhs_code.is_tree_code ()
1258 && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_binary
1259 && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_unary
1260 && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_expression
1261 && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_comparison
1262 && rhs_code != VIEW_CONVERT_EXPR
1263 && rhs_code != CALL_EXPR
1264 && rhs_code != BIT_FIELD_REF)
1265 {
1266 if (dump_enabled_p ())
1267 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1268 "Build SLP failed: operation unsupported %G",
1269 stmt);
1270 if (is_a <bb_vec_info> (vinfo) && i != 0)
1271 continue;
1272 /* Fatal mismatch. */
1273 matches[0] = false;
1274 return false;
1275 }
1276
1277 if (rhs_code == COND_EXPR)
1278 {
1279 tree cond_expr = gimple_assign_rhs1 (stmt);
1280 enum tree_code cond_code = TREE_CODE (cond_expr);
1281 enum tree_code swap_code = ERROR_MARK;
1282 enum tree_code invert_code = ERROR_MARK;
1283
1284 if (i == 0)
1285 first_cond_code = TREE_CODE (cond_expr);
1286 else if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
1287 {
1288 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
1289 swap_code = swap_tree_comparison (cond_code);
1290 invert_code = invert_tree_comparison (cond_code, honor_nans);
1291 }
1292
1293 if (first_cond_code == cond_code)
1294 ;
1295 /* Isomorphic can be achieved by swapping. */
1296 else if (first_cond_code == swap_code)
1297 swap[i] = 1;
1298 /* Isomorphic can be achieved by inverting. */
1299 else if (first_cond_code == invert_code)
1300 swap[i] = 2;
1301 else
1302 {
1303 if (dump_enabled_p ())
1304 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1305 "Build SLP failed: different"
1306 " operation %G", stmt);
1307 /* Mismatch. */
1308 continue;
1309 }
1310 }
1311 }
1312
1313 matches[i] = true;
1314 }
1315
1316 for (i = 0; i < group_size; ++i)
1317 if (!matches[i])
1318 return false;
1319
1320 /* If we allowed a two-operation SLP node verify the target can cope
1321 with the permute we are going to use. */
1322 if (alt_stmt_code != ERROR_MARK
1323 && (!alt_stmt_code.is_tree_code ()
1324 || TREE_CODE_CLASS (tree_code (alt_stmt_code)) != tcc_reference))
1325 {
1326 *two_operators = true;
1327 }
1328
1329 if (maybe_soft_fail)
1330 {
1331 unsigned HOST_WIDE_INT const_nunits;
1332 if (!TYPE_VECTOR_SUBPARTS
1333 (soft_fail_nunits_vectype).is_constant (&const_nunits)
1334 || const_nunits > group_size)
1335 matches[0] = false;
1336 else
1337 {
1338 /* With constant vector elements simulate a mismatch at the
1339 point we need to split. */
1340 unsigned tail = group_size & (const_nunits - 1);
1341 memset (&matches[group_size - tail], 0, sizeof (bool) * tail);
1342 }
1343 return false;
1344 }
1345
1346 return true;
1347 }
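
/* As an example of a two-operator group, the lanes
     a[0] = b[0] + c[0];
     a[1] = b[1] - c[1];
   are accepted with *TWO_OPERATORS set: both the PLUS_EXPR and the
   MINUS_EXPR variant are code generated later and the desired lanes are
   selected from the two results with a VEC_PERM_EXPR.  */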
1348
1349 /* Traits for the hash_set to record failed SLP builds for a stmt set.
1350 Note we never remove apart from at destruction time so we do not
1351 need a special value for deleted that differs from empty. */
1352 struct bst_traits
1353 {
1354 typedef vec <stmt_vec_info> value_type;
1355 typedef vec <stmt_vec_info> compare_type;
1356 static inline hashval_t hash (value_type);
1357 static inline bool equal (value_type existing, value_type candidate);
1358 static inline bool is_empty (value_type x) { return !x.exists (); }
1359 static inline bool is_deleted (value_type x) { return !x.exists (); }
1360 static const bool empty_zero_p = true;
1361 static inline void mark_empty (value_type &x) { x.release (); }
1362 static inline void mark_deleted (value_type &x) { x.release (); }
1363 static inline void remove (value_type &x) { x.release (); }
1364 };
1365 inline hashval_t
1366 bst_traits::hash (value_type x)
1367 {
1368 inchash::hash h;
1369 for (unsigned i = 0; i < x.length (); ++i)
1370 h.add_int (gimple_uid (x[i]->stmt));
1371 return h.end ();
1372 }
1373 inline bool
1374 bst_traits::equal (value_type existing, value_type candidate)
1375 {
1376 if (existing.length () != candidate.length ())
1377 return false;
1378 for (unsigned i = 0; i < existing.length (); ++i)
1379 if (existing[i] != candidate[i])
1380 return false;
1381 return true;
1382 }
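
/* These traits key a map on the exact vector of scalar stmts, hashing
   the gimple UIDs of the statements, so that vect_build_slp_tree can
   re-use an already discovered subtree (or quickly re-fail) when the
   same stmt set is reached again from a different part of the graph.  */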
1383
1384 /* ??? This was std::pair<std::pair<tree_code, vect_def_type>, tree>
1385 but then vec::insert does memmove and that's not compatible with
1386 std::pair. */
1387 struct chain_op_t
1388 {
1389 chain_op_t (tree_code code_, vect_def_type dt_, tree op_)
1390 : code (code_), dt (dt_), op (op_) {}
1391 tree_code code;
1392 vect_def_type dt;
1393 tree op;
1394 };
1395
1396 /* Comparator for sorting associatable chains. */
1397
1398 static int
1399 dt_sort_cmp (const void *op1_, const void *op2_, void *)
1400 {
1401 auto *op1 = (const chain_op_t *) op1_;
1402 auto *op2 = (const chain_op_t *) op2_;
1403 if (op1->dt != op2->dt)
1404 return (int)op1->dt - (int)op2->dt;
1405 return (int)op1->code - (int)op2->code;
1406 }
1407
1408 /* Linearize the associatable expression chain at START with the
1409 associatable operation CODE (where PLUS_EXPR also allows MINUS_EXPR),
1410 filling CHAIN with the result and using WORKLIST as intermediate storage.
1411 CODE_STMT and ALT_CODE_STMT are filled with the first stmt using CODE
1412 or MINUS_EXPR. *CHAIN_STMTS if not NULL is filled with all computation
1413 stmts, starting with START. */
1414
1415 static void
1416 vect_slp_linearize_chain (vec_info *vinfo,
1417 vec<std::pair<tree_code, gimple *> > &worklist,
1418 vec<chain_op_t> &chain,
1419 enum tree_code code, gimple *start,
1420 gimple *&code_stmt, gimple *&alt_code_stmt,
1421 vec<gimple *> *chain_stmts)
1422 {
1423 /* For each lane linearize the addition/subtraction (or other
1424 uniform associatable operation) expression tree. */
1425 worklist.safe_push (std::make_pair (code, start));
1426 while (!worklist.is_empty ())
1427 {
1428 auto entry = worklist.pop ();
1429 gassign *stmt = as_a <gassign *> (entry.second);
1430 enum tree_code in_code = entry.first;
1431 enum tree_code this_code = gimple_assign_rhs_code (stmt);
1432 /* Pick some stmts suitable for SLP_TREE_REPRESENTATIVE. */
1433 if (!code_stmt
1434 && gimple_assign_rhs_code (stmt) == code)
1435 code_stmt = stmt;
1436 else if (!alt_code_stmt
1437 && gimple_assign_rhs_code (stmt) == MINUS_EXPR)
1438 alt_code_stmt = stmt;
1439 if (chain_stmts)
1440 chain_stmts->safe_push (stmt);
1441 for (unsigned opnum = 1; opnum <= 2; ++opnum)
1442 {
1443 tree op = gimple_op (stmt, opnum);
1444 vect_def_type dt;
1445 stmt_vec_info def_stmt_info;
1446 bool res = vect_is_simple_use (op, vinfo, &dt, &def_stmt_info);
1447 gcc_assert (res);
1448 if (dt == vect_internal_def
1449 && is_pattern_stmt_p (def_stmt_info))
1450 op = gimple_get_lhs (def_stmt_info->stmt);
1451 gimple *use_stmt;
1452 use_operand_p use_p;
1453 if (dt == vect_internal_def
1454 && single_imm_use (op, &use_p, &use_stmt)
1455 && is_gimple_assign (def_stmt_info->stmt)
1456 && (gimple_assign_rhs_code (def_stmt_info->stmt) == code
1457 || (code == PLUS_EXPR
1458 && (gimple_assign_rhs_code (def_stmt_info->stmt)
1459 == MINUS_EXPR))))
1460 {
1461 tree_code op_def_code = this_code;
1462 if (op_def_code == MINUS_EXPR && opnum == 1)
1463 op_def_code = PLUS_EXPR;
1464 if (in_code == MINUS_EXPR)
1465 op_def_code = op_def_code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR;
1466 worklist.safe_push (std::make_pair (op_def_code,
1467 def_stmt_info->stmt));
1468 }
1469 else
1470 {
1471 tree_code op_def_code = this_code;
1472 if (op_def_code == MINUS_EXPR && opnum == 1)
1473 op_def_code = PLUS_EXPR;
1474 if (in_code == MINUS_EXPR)
1475 op_def_code = op_def_code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR;
1476 chain.safe_push (chain_op_t (op_def_code, dt, op));
1477 }
1478 }
1479 }
1480 }
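
/* Signs are propagated while the chain is flattened, so for CODE
   PLUS_EXPR both (a - b) + c and a - (b - c) linearize to the same
   multiset of chain entries { +a, -b, +c }, which makes it possible to
   match lanes that merely associate the same terms differently.  */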
1481
1482 typedef hash_map <vec <stmt_vec_info>, slp_tree,
1483 simple_hashmap_traits <bst_traits, slp_tree> >
1484 scalar_stmts_to_slp_tree_map_t;
1485
1486 static slp_tree
1487 vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
1488 vec<stmt_vec_info> stmts, unsigned int group_size,
1489 poly_uint64 *max_nunits,
1490 bool *matches, unsigned *limit, unsigned *tree_size,
1491 scalar_stmts_to_slp_tree_map_t *bst_map);
1492
1493 static slp_tree
1494 vect_build_slp_tree (vec_info *vinfo,
1495 vec<stmt_vec_info> stmts, unsigned int group_size,
1496 poly_uint64 *max_nunits,
1497 bool *matches, unsigned *limit, unsigned *tree_size,
1498 scalar_stmts_to_slp_tree_map_t *bst_map)
1499 {
1500 if (slp_tree *leader = bst_map->get (stmts))
1501 {
1502 if (dump_enabled_p ())
1503 dump_printf_loc (MSG_NOTE, vect_location, "re-using %sSLP tree %p\n",
1504 !(*leader)->failed ? "" : "failed ", *leader);
1505 if (!(*leader)->failed)
1506 {
1507 SLP_TREE_REF_COUNT (*leader)++;
1508 vect_update_max_nunits (max_nunits, (*leader)->max_nunits);
1509 stmts.release ();
1510 return *leader;
1511 }
1512 memcpy (matches, (*leader)->failed, sizeof (bool) * group_size);
1513 return NULL;
1514 }
1515
1516 /* Seed the bst_map with a stub node to be filled by vect_build_slp_tree_2
1517 so we can pick up backedge destinations during discovery. */
1518 slp_tree res = new _slp_tree;
1519 SLP_TREE_DEF_TYPE (res) = vect_internal_def;
1520 SLP_TREE_SCALAR_STMTS (res) = stmts;
1521 bst_map->put (stmts.copy (), res);
1522
1523 if (*limit == 0)
1524 {
1525 if (dump_enabled_p ())
1526 dump_printf_loc (MSG_NOTE, vect_location,
1527 "SLP discovery limit exceeded\n");
1528 /* Mark the node invalid so we can detect those when still in use
1529 as backedge destinations. */
1530 SLP_TREE_SCALAR_STMTS (res) = vNULL;
1531 SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def;
1532 res->failed = XNEWVEC (bool, group_size);
1533 memset (res->failed, 0, sizeof (bool) * group_size);
1534 memset (matches, 0, sizeof (bool) * group_size);
1535 return NULL;
1536 }
1537 --*limit;
1538
1539 if (dump_enabled_p ())
1540 dump_printf_loc (MSG_NOTE, vect_location,
1541 "starting SLP discovery for node %p\n", res);
1542
1543 poly_uint64 this_max_nunits = 1;
1544 slp_tree res_ = vect_build_slp_tree_2 (vinfo, res, stmts, group_size,
1545 &this_max_nunits,
1546 matches, limit, tree_size, bst_map);
1547 if (!res_)
1548 {
1549 if (dump_enabled_p ())
1550 dump_printf_loc (MSG_NOTE, vect_location,
1551 "SLP discovery for node %p failed\n", res);
1552 /* Mark the node invalid so we can detect those when still in use
1553 as backedge destinations. */
1554 SLP_TREE_SCALAR_STMTS (res) = vNULL;
1555 SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def;
1556 res->failed = XNEWVEC (bool, group_size);
1557 if (flag_checking)
1558 {
1559 unsigned i;
1560 for (i = 0; i < group_size; ++i)
1561 if (!matches[i])
1562 break;
1563 gcc_assert (i < group_size);
1564 }
1565 memcpy (res->failed, matches, sizeof (bool) * group_size);
1566 }
1567 else
1568 {
1569 if (dump_enabled_p ())
1570 dump_printf_loc (MSG_NOTE, vect_location,
1571 "SLP discovery for node %p succeeded\n", res);
1572 gcc_assert (res_ == res);
1573 res->max_nunits = this_max_nunits;
1574 vect_update_max_nunits (max_nunits, this_max_nunits);
1575 /* Keep a reference for the bst_map use. */
1576 SLP_TREE_REF_COUNT (res)++;
1577 }
1578 return res_;
1579 }
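
/* vect_build_slp_tree wraps the actual discovery in vect_build_slp_tree_2
   with memoization: previously built nodes (or recorded failures) in
   BST_MAP are re-used, a stub node is seeded into the map so backedges
   can resolve to it during recursion, the overall discovery LIMIT is
   decremented, and on failure the per-lane MATCHES pattern is stored so
   later attempts on the same stmt set report the same result.  */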
1580
1581 /* Helper for building an associated SLP node chain. */
1582
1583 static void
1584 vect_slp_build_two_operator_nodes (slp_tree perm, tree vectype,
1585 slp_tree op0, slp_tree op1,
1586 stmt_vec_info oper1, stmt_vec_info oper2,
1587 vec<std::pair<unsigned, unsigned> > lperm)
1588 {
1589 unsigned group_size = SLP_TREE_LANES (op1);
1590
1591 slp_tree child1 = new _slp_tree;
1592 SLP_TREE_DEF_TYPE (child1) = vect_internal_def;
1593 SLP_TREE_VECTYPE (child1) = vectype;
1594 SLP_TREE_LANES (child1) = group_size;
1595 SLP_TREE_CHILDREN (child1).create (2);
1596 SLP_TREE_CHILDREN (child1).quick_push (op0);
1597 SLP_TREE_CHILDREN (child1).quick_push (op1);
1598 SLP_TREE_REPRESENTATIVE (child1) = oper1;
1599
1600 slp_tree child2 = new _slp_tree;
1601 SLP_TREE_DEF_TYPE (child2) = vect_internal_def;
1602 SLP_TREE_VECTYPE (child2) = vectype;
1603 SLP_TREE_LANES (child2) = group_size;
1604 SLP_TREE_CHILDREN (child2).create (2);
1605 SLP_TREE_CHILDREN (child2).quick_push (op0);
1606 SLP_TREE_REF_COUNT (op0)++;
1607 SLP_TREE_CHILDREN (child2).quick_push (op1);
1608 SLP_TREE_REF_COUNT (op1)++;
1609 SLP_TREE_REPRESENTATIVE (child2) = oper2;
1610
1611 SLP_TREE_DEF_TYPE (perm) = vect_internal_def;
1612 SLP_TREE_CODE (perm) = VEC_PERM_EXPR;
1613 SLP_TREE_VECTYPE (perm) = vectype;
1614 SLP_TREE_LANES (perm) = group_size;
1615 /* ??? We should set this NULL but that's not expected. */
1616 SLP_TREE_REPRESENTATIVE (perm) = oper1;
1617 SLP_TREE_LANE_PERMUTATION (perm) = lperm;
1618 SLP_TREE_CHILDREN (perm).quick_push (child1);
1619 SLP_TREE_CHILDREN (perm).quick_push (child2);
1620 }
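
/* The result is a three-node subgraph: CHILD1 computes OPER1 and CHILD2
   computes OPER2, both on the same operands OP0 and OP1 for every lane,
   and PERM selects via LPERM which of the two results supplies each
   lane, e.g. the even lanes from the MINUS and the odd lanes from the
   PLUS for an addsub pattern.  */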
1621
1622 /* Recursively build an SLP tree starting from NODE.
1623 Fail (and return NULL) if the def-stmts are not isomorphic, require
1624 data permutation or are of unsupported types of operation, recording
1625 in MATCHES which lanes failed to match the first one.
1626 Otherwise populate NODE from the group of stmts STMTS and return
1627 it. */
1628
1629 static slp_tree
1630 vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
1631 vec<stmt_vec_info> stmts, unsigned int group_size,
1632 poly_uint64 *max_nunits,
1633 bool *matches, unsigned *limit, unsigned *tree_size,
1634 scalar_stmts_to_slp_tree_map_t *bst_map)
1635 {
1636 unsigned nops, i, this_tree_size = 0;
1637 poly_uint64 this_max_nunits = *max_nunits;
1638
1639 matches[0] = false;
1640
1641 stmt_vec_info stmt_info = stmts[0];
1642 if (!is_a<gcall *> (stmt_info->stmt)
1643 && !is_a<gassign *> (stmt_info->stmt)
1644 && !is_a<gphi *> (stmt_info->stmt))
1645 return NULL;
1646
1647 nops = gimple_num_args (stmt_info->stmt);
1648 if (const int *map = vect_get_operand_map (stmt_info->stmt))
1649 nops = map[0];
1650
1651 /* If the SLP node is a PHI (induction or reduction), terminate
1652 the recursion. */
1653 bool *skip_args = XALLOCAVEC (bool, nops);
1654 memset (skip_args, 0, sizeof (bool) * nops);
1655 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
1656 if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
1657 {
1658 tree scalar_type = TREE_TYPE (PHI_RESULT (stmt));
1659 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
1660 group_size);
1661 if (!vect_record_max_nunits (vinfo, stmt_info, group_size, vectype,
1662 max_nunits))
1663 return NULL;
1664
1665 vect_def_type def_type = STMT_VINFO_DEF_TYPE (stmt_info);
1666 if (def_type == vect_induction_def)
1667 {
1668 /* Induction PHIs are not cycles but walk the initial
1669 value. Only for inner loops though; for outer loops
1670 we need to pick up the value from the actual PHIs
1671 to more easily support peeling and epilogue vectorization. */
1672 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1673 if (!nested_in_vect_loop_p (loop, stmt_info))
1674 skip_args[loop_preheader_edge (loop)->dest_idx] = true;
1675 else
1676 loop = loop->inner;
1677 skip_args[loop_latch_edge (loop)->dest_idx] = true;
1678 }
1679 else if (def_type == vect_reduction_def
1680 || def_type == vect_double_reduction_def
1681 || def_type == vect_nested_cycle)
1682 {
1683 /* Else def types have to match. */
1684 stmt_vec_info other_info;
1685 bool all_same = true;
1686 FOR_EACH_VEC_ELT (stmts, i, other_info)
1687 {
1688 if (STMT_VINFO_DEF_TYPE (other_info) != def_type)
1689 return NULL;
1690 if (other_info != stmt_info)
1691 all_same = false;
1692 }
1693 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1694 /* Reduction initial values are not explicitly represented. */
1695 if (!nested_in_vect_loop_p (loop, stmt_info))
1696 skip_args[loop_preheader_edge (loop)->dest_idx] = true;
1697 /* Reduction chain backedge defs are filled manually.
1698 ??? Need a better way to identify an SLP reduction chain PHI.
1699 Or a better overall way to SLP match those. */
1700 if (all_same && def_type == vect_reduction_def)
1701 skip_args[loop_latch_edge (loop)->dest_idx] = true;
1702 }
1703 else if (def_type != vect_internal_def)
1704 return NULL;
1705 }
1706
1707
1708 bool two_operators = false;
1709 unsigned char *swap = XALLOCAVEC (unsigned char, group_size);
1710 tree vectype = NULL_TREE;
1711 if (!vect_build_slp_tree_1 (vinfo, swap, stmts, group_size,
1712 &this_max_nunits, matches, &two_operators,
1713 &vectype))
1714 return NULL;
1715
1716 /* If the SLP node is a load, terminate the recursion unless masked. */
1717 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1718 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1719 {
1720 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
1721 gcc_assert (gimple_call_internal_p (stmt, IFN_MASK_LOAD)
1722 || gimple_call_internal_p (stmt, IFN_GATHER_LOAD)
1723 || gimple_call_internal_p (stmt, IFN_MASK_GATHER_LOAD));
1724 else
1725 {
1726 *max_nunits = this_max_nunits;
1727 (*tree_size)++;
1728 node = vect_create_new_slp_node (node, stmts, 0);
1729 SLP_TREE_VECTYPE (node) = vectype;
1730 /* And compute the load permutation. Whether it is actually
1731 a permutation depends on the unrolling factor which is
1732 decided later. */
1733 vec<unsigned> load_permutation;
1734 int j;
1735 stmt_vec_info load_info;
1736 load_permutation.create (group_size);
1737 stmt_vec_info first_stmt_info
1738 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
1739 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
1740 {
1741 int load_place = vect_get_place_in_interleaving_chain
1742 (load_info, first_stmt_info);
1743 gcc_assert (load_place != -1);
1744 load_permutation.safe_push (load_place);
1745 }
1746 SLP_TREE_LOAD_PERMUTATION (node) = load_permutation;
1747 return node;
1748 }
1749 }
1750 else if (gimple_assign_single_p (stmt_info->stmt)
1751 && !gimple_vuse (stmt_info->stmt)
1752 && gimple_assign_rhs_code (stmt_info->stmt) == BIT_FIELD_REF)
1753 {
1754 /* vect_build_slp_tree_1 determined all BIT_FIELD_REFs reference
1755 the same SSA name vector of a compatible type to vectype. */
1756 vec<std::pair<unsigned, unsigned> > lperm = vNULL;
1757 tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0);
1758 stmt_vec_info estmt_info;
1759 FOR_EACH_VEC_ELT (stmts, i, estmt_info)
1760 {
1761 gassign *estmt = as_a <gassign *> (estmt_info->stmt);
1762 tree bfref = gimple_assign_rhs1 (estmt);
1763 HOST_WIDE_INT lane;
1764 if (!known_eq (bit_field_size (bfref),
1765 tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (vectype))))
1766 || !constant_multiple_p (bit_field_offset (bfref),
1767 bit_field_size (bfref), &lane))
1768 {
1769 lperm.release ();
1770 matches[0] = false;
1771 return NULL;
1772 }
1773 lperm.safe_push (std::make_pair (0, (unsigned)lane));
1774 }
1775 slp_tree vnode = vect_create_new_slp_node (vNULL);
1776 /* ??? We record vectype here but we hide eventually necessary
1777 punning and instead rely on code generation to materialize
1778 VIEW_CONVERT_EXPRs as necessary. We instead should make
1779 this explicit somehow. */
1780 SLP_TREE_VECTYPE (vnode) = vectype;
1781 SLP_TREE_VEC_DEFS (vnode).safe_push (vec);
1782 /* We are always building a permutation node even if it is an identity
1783 permute to shield the rest of the vectorizer from the odd node
1784 representing an actual vector without any scalar ops.
1785 ??? We could hide it completely by making the permute node
1786 external? */
1787 node = vect_create_new_slp_node (node, stmts, 1);
1788 SLP_TREE_CODE (node) = VEC_PERM_EXPR;
1789 SLP_TREE_LANE_PERMUTATION (node) = lperm;
1790 SLP_TREE_VECTYPE (node) = vectype;
1791 SLP_TREE_CHILDREN (node).quick_push (vnode);
1792 return node;
1793 }
1794 /* When discovery reaches an associatable operation see whether we can
1795 improve that to match up lanes in a way superior to the operand
1796 swapping code which at most looks at two defs.
1797 ??? For BB vectorization we cannot do the brute-force search
1798 for matching as we can succeed by means of builds from scalars
1799 and have no good way to "cost" one build against another. */
1800 else if (is_a <loop_vec_info> (vinfo)
1801 /* ??? We don't handle !vect_internal_def defs below. */
1802 && STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
1803 && is_gimple_assign (stmt_info->stmt)
1804 && (associative_tree_code (gimple_assign_rhs_code (stmt_info->stmt))
1805 || gimple_assign_rhs_code (stmt_info->stmt) == MINUS_EXPR)
1806 && ((FLOAT_TYPE_P (vectype) && flag_associative_math)
1807 || (INTEGRAL_TYPE_P (TREE_TYPE (vectype))
1808 && TYPE_OVERFLOW_WRAPS (TREE_TYPE (vectype)))))
1809 {
1810 /* See if we have a chain of (mixed) adds or subtracts or other
1811 associatable ops. */
1812 enum tree_code code = gimple_assign_rhs_code (stmt_info->stmt);
1813 if (code == MINUS_EXPR)
1814 code = PLUS_EXPR;
1815 stmt_vec_info other_op_stmt_info = NULL;
1816 stmt_vec_info op_stmt_info = NULL;
1817 unsigned chain_len = 0;
1818 auto_vec<chain_op_t> chain;
1819 auto_vec<std::pair<tree_code, gimple *> > worklist;
1820 auto_vec<vec<chain_op_t> > chains (group_size);
1821 auto_vec<slp_tree, 4> children;
1822 bool hard_fail = true;
1823 for (unsigned lane = 0; lane < group_size; ++lane)
1824 {
1825 /* For each lane linearize the addition/subtraction (or other
1826 uniform associatable operation) expression tree. */
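/* For example the lane statement _1 = a + b - c is linearized to,
   roughly, the chain { (PLUS, a), (PLUS, b), (MINUS, c) } with CODE
   being PLUS_EXPR.  */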
1827 gimple *op_stmt = NULL, *other_op_stmt = NULL;
1828 vect_slp_linearize_chain (vinfo, worklist, chain, code,
1829 stmts[lane]->stmt, op_stmt, other_op_stmt,
1830 NULL);
1831 if (!op_stmt_info && op_stmt)
1832 op_stmt_info = vinfo->lookup_stmt (op_stmt);
1833 if (!other_op_stmt_info && other_op_stmt)
1834 other_op_stmt_info = vinfo->lookup_stmt (other_op_stmt);
1835 if (chain.length () == 2)
1836 {
1837 /* In a chain of just two elements resort to the regular
1838 operand swapping scheme. If we run into a length
1839 mismatch still hard-FAIL. */
1840 if (chain_len == 0)
1841 hard_fail = false;
1842 else
1843 {
1844 matches[lane] = false;
1845 /* ??? We might want to process the other lanes, but
1846 make sure to not give false matching hints to the
1847 caller for lanes we did not process. */
1848 if (lane != group_size - 1)
1849 matches[0] = false;
1850 }
1851 break;
1852 }
1853 else if (chain_len == 0)
1854 chain_len = chain.length ();
1855 else if (chain.length () != chain_len)
1856 {
1857 /* ??? Here we could slip in magic to compensate with
1858 neutral operands. */
1859 matches[lane] = false;
1860 if (lane != group_size - 1)
1861 matches[0] = false;
1862 break;
1863 }
1864 chains.quick_push (chain.copy ());
1865 chain.truncate (0);
1866 }
1867 if (chains.length () == group_size)
1868 {
1869 /* We cannot yet use SLP_TREE_CODE to communicate the operation. */
1870 if (!op_stmt_info)
1871 {
1872 hard_fail = false;
1873 goto out;
1874 }
1875 /* Now we have a set of chains with the same length. */
1876 /* 1. pre-sort according to def_type and operation. */
1877 for (unsigned lane = 0; lane < group_size; ++lane)
1878 chains[lane].stablesort (dt_sort_cmp, vinfo);
1879 if (dump_enabled_p ())
1880 {
1881 dump_printf_loc (MSG_NOTE, vect_location,
1882 "pre-sorted chains of %s\n",
1883 get_tree_code_name (code));
1884 for (unsigned lane = 0; lane < group_size; ++lane)
1885 {
1886 for (unsigned opnum = 0; opnum < chain_len; ++opnum)
1887 dump_printf (MSG_NOTE, "%s %T ",
1888 get_tree_code_name (chains[lane][opnum].code),
1889 chains[lane][opnum].op);
1890 dump_printf (MSG_NOTE, "\n");
1891 }
1892 }
1893 /* 2. try to build children nodes, associating as necessary. */
1894 for (unsigned n = 0; n < chain_len; ++n)
1895 {
1896 vect_def_type dt = chains[0][n].dt;
1897 unsigned lane;
1898 for (lane = 0; lane < group_size; ++lane)
1899 if (chains[lane][n].dt != dt)
1900 {
1901 if (dt == vect_constant_def
1902 && chains[lane][n].dt == vect_external_def)
1903 dt = vect_external_def;
1904 else if (dt == vect_external_def
1905 && chains[lane][n].dt == vect_constant_def)
1906 ;
1907 else
1908 break;
1909 }
1910 if (lane != group_size)
1911 {
1912 if (dump_enabled_p ())
1913 dump_printf_loc (MSG_NOTE, vect_location,
1914 "giving up on chain due to mismatched "
1915 "def types\n");
1916 matches[lane] = false;
1917 if (lane != group_size - 1)
1918 matches[0] = false;
1919 goto out;
1920 }
1921 if (dt == vect_constant_def
1922 || dt == vect_external_def)
1923 {
1924 /* We can always build those. Might want to sort last
1925 or defer building. */
1926 vec<tree> ops;
1927 ops.create (group_size);
1928 for (lane = 0; lane < group_size; ++lane)
1929 ops.quick_push (chains[lane][n].op);
1930 slp_tree child = vect_create_new_slp_node (ops);
1931 SLP_TREE_DEF_TYPE (child) = dt;
1932 children.safe_push (child);
1933 }
1934 else if (dt != vect_internal_def)
1935 {
1936 /* Not sure, we might need something special.
1937 gcc.dg/vect/pr96854.c,
1938 gfortran.dg/vect/fast-math-pr37021.f90
1939 and gfortran.dg/vect/pr61171.f trigger. */
1940 /* Soft-fail for now. */
1941 hard_fail = false;
1942 goto out;
1943 }
1944 else
1945 {
1946 vec<stmt_vec_info> op_stmts;
1947 op_stmts.create (group_size);
1948 slp_tree child = NULL;
1949 /* Brute-force our way. We have to consider a lane
1950 failing after fixing an earlier fail up in the
1951 SLP discovery recursion. So track the current
1952 permute per lane. */
1953 unsigned *perms = XALLOCAVEC (unsigned, group_size);
1954 memset (perms, 0, sizeof (unsigned) * group_size);
1955 do
1956 {
1957 op_stmts.truncate (0);
1958 for (lane = 0; lane < group_size; ++lane)
1959 op_stmts.quick_push
1960 (vinfo->lookup_def (chains[lane][n].op));
1961 child = vect_build_slp_tree (vinfo, op_stmts,
1962 group_size, &this_max_nunits,
1963 matches, limit,
1964 &this_tree_size, bst_map);
1965 /* ??? We're likely getting too many fatal mismatches
1966 here so maybe we want to ignore them (but then we
1967 have no idea which lanes fatally mismatched). */
1968 if (child || !matches[0])
1969 break;
1970 /* Swap another lane we have not yet matched up into
1971 lanes that did not match. If we run out of
1972 permute possibilities for a lane terminate the
1973 search. */
1974 bool term = false;
1975 for (lane = 1; lane < group_size; ++lane)
1976 if (!matches[lane])
1977 {
1978 if (n + perms[lane] + 1 == chain_len)
1979 {
1980 term = true;
1981 break;
1982 }
1983 std::swap (chains[lane][n],
1984 chains[lane][n + perms[lane] + 1]);
1985 perms[lane]++;
1986 }
1987 if (term)
1988 break;
1989 }
1990 while (1);
1991 if (!child)
1992 {
1993 if (dump_enabled_p ())
1994 dump_printf_loc (MSG_NOTE, vect_location,
1995 "failed to match up op %d\n", n);
1996 op_stmts.release ();
1997 if (lane != group_size - 1)
1998 matches[0] = false;
1999 else
2000 matches[lane] = false;
2001 goto out;
2002 }
2003 if (dump_enabled_p ())
2004 {
2005 dump_printf_loc (MSG_NOTE, vect_location,
2006 "matched up op %d to\n", n);
2007 vect_print_slp_tree (MSG_NOTE, vect_location, child);
2008 }
2009 children.safe_push (child);
2010 }
2011 }
2012 /* 3. build SLP nodes to combine the chain. */
2013 for (unsigned lane = 0; lane < group_size; ++lane)
2014 if (chains[lane][0].code != code)
2015 {
2016 /* See if there's any alternate all-PLUS entry. */
2017 unsigned n;
2018 for (n = 1; n < chain_len; ++n)
2019 {
2020 for (lane = 0; lane < group_size; ++lane)
2021 if (chains[lane][n].code != code)
2022 break;
2023 if (lane == group_size)
2024 break;
2025 }
2026 if (n != chain_len)
2027 {
2028 /* Swap that in at first position. */
2029 std::swap (children[0], children[n]);
2030 for (lane = 0; lane < group_size; ++lane)
2031 std::swap (chains[lane][0], chains[lane][n]);
2032 }
2033 else
2034 {
2035 /* ??? When this triggers and we end up with two
2036 vect_constant/external_def up-front things break (ICE)
2037 spectacularly finding an insertion place for the
2038 all-constant op. We should have a fully
2039 vect_internal_def operand though(?) so we can swap
2040 that into first place and then prepend the all-zero
2041 constant. */
2042 if (dump_enabled_p ())
2043 dump_printf_loc (MSG_NOTE, vect_location,
2044 "inserting constant zero to compensate "
2045 "for (partially) negated first "
2046 "operand\n");
2047 chain_len++;
2048 for (lane = 0; lane < group_size; ++lane)
2049 chains[lane].safe_insert
2050 (0, chain_op_t (code, vect_constant_def, NULL_TREE));
2051 vec<tree> zero_ops;
2052 zero_ops.create (group_size);
2053 zero_ops.quick_push (build_zero_cst (TREE_TYPE (vectype)));
2054 for (lane = 1; lane < group_size; ++lane)
2055 zero_ops.quick_push (zero_ops[0]);
2056 slp_tree zero = vect_create_new_slp_node (zero_ops);
2057 SLP_TREE_DEF_TYPE (zero) = vect_constant_def;
2058 children.safe_insert (0, zero);
2059 }
2060 break;
2061 }
2062 for (unsigned i = 1; i < children.length (); ++i)
2063 {
2064 slp_tree op0 = children[i - 1];
2065 slp_tree op1 = children[i];
2066 bool this_two_op = false;
2067 for (unsigned lane = 0; lane < group_size; ++lane)
2068 if (chains[lane][i].code != chains[0][i].code)
2069 {
2070 this_two_op = true;
2071 break;
2072 }
2073 slp_tree child;
2074 if (i == children.length () - 1)
2075 child = vect_create_new_slp_node (node, stmts, 2);
2076 else
2077 child = vect_create_new_slp_node (2, ERROR_MARK);
2078 if (this_two_op)
2079 {
2080 vec<std::pair<unsigned, unsigned> > lperm;
2081 lperm.create (group_size);
2082 for (unsigned lane = 0; lane < group_size; ++lane)
2083 lperm.quick_push (std::make_pair
2084 (chains[lane][i].code != chains[0][i].code, lane));
2085 vect_slp_build_two_operator_nodes (child, vectype, op0, op1,
2086 (chains[0][i].code == code
2087 ? op_stmt_info
2088 : other_op_stmt_info),
2089 (chains[0][i].code == code
2090 ? other_op_stmt_info
2091 : op_stmt_info),
2092 lperm);
2093 }
2094 else
2095 {
2096 SLP_TREE_DEF_TYPE (child) = vect_internal_def;
2097 SLP_TREE_VECTYPE (child) = vectype;
2098 SLP_TREE_LANES (child) = group_size;
2099 SLP_TREE_CHILDREN (child).quick_push (op0);
2100 SLP_TREE_CHILDREN (child).quick_push (op1);
2101 SLP_TREE_REPRESENTATIVE (child)
2102 = (chains[0][i].code == code
2103 ? op_stmt_info : other_op_stmt_info);
2104 }
2105 children[i] = child;
2106 }
2107 *tree_size += this_tree_size + 1;
2108 *max_nunits = this_max_nunits;
2109 while (!chains.is_empty ())
2110 chains.pop ().release ();
2111 return node;
2112 }
2113 out:
2114 while (!children.is_empty ())
2115 vect_free_slp_tree (children.pop ());
2116 while (!chains.is_empty ())
2117 chains.pop ().release ();
2118 /* Hard-fail, otherwise we might run into quadratic processing of the
2119 chains starting one stmt into the chain again. */
2120 if (hard_fail)
2121 return NULL;
2122 /* Fall thru to normal processing. */
2123 }
2124
2125 /* Get at the operands, verifying they are compatible. */
2126 vec<slp_oprnd_info> oprnds_info = vect_create_oprnd_info (nops, group_size);
2127 slp_oprnd_info oprnd_info;
2128 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
2129 {
2130 int res = vect_get_and_check_slp_defs (vinfo, swap[i], skip_args,
2131 stmts, i, &oprnds_info);
2132 if (res != 0)
2133 matches[(res == -1) ? 0 : i] = false;
2134 if (!matches[0])
2135 break;
2136 }
2137 for (i = 0; i < group_size; ++i)
2138 if (!matches[i])
2139 {
2140 vect_free_oprnd_info (oprnds_info);
2141 return NULL;
2142 }
2143 swap = NULL;
2144
2145 auto_vec<slp_tree, 4> children;
2146
2147 stmt_info = stmts[0];
2148
2149 /* Create SLP_TREE nodes for the definition node/s. */
2150 FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
2151 {
2152 slp_tree child;
2153 unsigned int j;
2154
2155 /* We're skipping certain operands from processing, for example
2156 outer loop reduction initial defs. */
2157 if (skip_args[i])
2158 {
2159 children.safe_push (NULL);
2160 continue;
2161 }
2162
2163 if (oprnd_info->first_dt == vect_uninitialized_def)
2164 {
2165 /* COND_EXPRs have one operand too many when the condition
2166 is an SSA name. */
2167 gcc_assert (i == 3 && nops == 4);
2168 continue;
2169 }
2170
2171 if (is_a <bb_vec_info> (vinfo)
2172 && oprnd_info->first_dt == vect_internal_def
2173 && !oprnd_info->any_pattern)
2174 {
2175 /* For BB vectorization, if all defs are the same do not
2176 bother to continue the build along the single-lane
2177 graph but use a splat of the scalar value. */
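/* E.g. if every lane uses the same scalar def t_1 the operand is
   treated as external and later built as a vector splat of t_1.  */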
2178 stmt_vec_info first_def = oprnd_info->def_stmts[0];
2179 for (j = 1; j < group_size; ++j)
2180 if (oprnd_info->def_stmts[j] != first_def)
2181 break;
2182 if (j == group_size
2183 /* But avoid doing this for loads where we may be
2184 able to CSE things, unless the stmt is not
2185 vectorizable. */
2186 && (!STMT_VINFO_VECTORIZABLE (first_def)
2187 || !gimple_vuse (first_def->stmt)))
2188 {
2189 if (dump_enabled_p ())
2190 dump_printf_loc (MSG_NOTE, vect_location,
2191 "Using a splat of the uniform operand\n");
2192 oprnd_info->first_dt = vect_external_def;
2193 }
2194 }
2195
2196 if (oprnd_info->first_dt == vect_external_def
2197 || oprnd_info->first_dt == vect_constant_def)
2198 {
2199 slp_tree invnode = vect_create_new_slp_node (oprnd_info->ops);
2200 SLP_TREE_DEF_TYPE (invnode) = oprnd_info->first_dt;
2201 oprnd_info->ops = vNULL;
2202 children.safe_push (invnode);
2203 continue;
2204 }
2205
2206 if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
2207 group_size, &this_max_nunits,
2208 matches, limit,
2209 &this_tree_size, bst_map)) != NULL)
2210 {
2211 oprnd_info->def_stmts = vNULL;
2212 children.safe_push (child);
2213 continue;
2214 }
2215
2216 /* If the SLP build for operand zero failed and operand zero
2217 and one can be commuted try that for the scalar stmts
2218 that failed the match. */
2219 if (i == 0
2220 /* A first scalar stmt mismatch signals a fatal mismatch. */
2221 && matches[0]
2222 /* ??? For COND_EXPRs we can swap the comparison operands
2223 as well as the arms under some constraints. */
2224 && nops == 2
2225 && oprnds_info[1]->first_dt == vect_internal_def
2226 && is_gimple_assign (stmt_info->stmt)
2227 /* Swapping operands for reductions breaks assumptions later on. */
2228 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
2229 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
2230 {
2231 /* See whether we can swap the matching or the non-matching
2232 stmt operands. */
2233 bool swap_not_matching = true;
2234 do
2235 {
2236 for (j = 0; j < group_size; ++j)
2237 {
2238 if (matches[j] != !swap_not_matching)
2239 continue;
2240 stmt_vec_info stmt_info = stmts[j];
2241 /* Verify if we can swap operands of this stmt. */
2242 gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt);
2243 if (!stmt
2244 || !commutative_tree_code (gimple_assign_rhs_code (stmt)))
2245 {
2246 if (!swap_not_matching)
2247 goto fail;
2248 swap_not_matching = false;
2249 break;
2250 }
2251 }
2252 }
2253 while (j != group_size);
2254
2255 /* Swap mismatched definition stmts. */
2256 if (dump_enabled_p ())
2257 dump_printf_loc (MSG_NOTE, vect_location,
2258 "Re-trying with swapped operands of stmts ");
2259 for (j = 0; j < group_size; ++j)
2260 if (matches[j] == !swap_not_matching)
2261 {
2262 std::swap (oprnds_info[0]->def_stmts[j],
2263 oprnds_info[1]->def_stmts[j]);
2264 std::swap (oprnds_info[0]->ops[j],
2265 oprnds_info[1]->ops[j]);
2266 if (dump_enabled_p ())
2267 dump_printf (MSG_NOTE, "%d ", j);
2268 }
2269 if (dump_enabled_p ())
2270 dump_printf (MSG_NOTE, "\n");
2271 /* After swapping some operands we lost track whether an
2272 operand has any pattern defs so be conservative here. */
2273 if (oprnds_info[0]->any_pattern || oprnds_info[1]->any_pattern)
2274 oprnds_info[0]->any_pattern = oprnds_info[1]->any_pattern = true;
2275 /* And try again with scratch 'matches' ... */
2276 bool *tem = XALLOCAVEC (bool, group_size);
2277 if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
2278 group_size, &this_max_nunits,
2279 tem, limit,
2280 &this_tree_size, bst_map)) != NULL)
2281 {
2282 oprnd_info->def_stmts = vNULL;
2283 children.safe_push (child);
2284 continue;
2285 }
2286 }
2287 fail:
2288
2289 /* If the SLP build failed and we analyze a basic-block
2290 simply treat nodes we fail to build as externally defined
2291 (and thus build vectors from the scalar defs).
2292 The cost model will reject outright expensive cases.
2293 ??? This doesn't treat cases where permutation ultimately
2294 fails (or we don't try permutation below). Ideally we'd
2295 even compute a permutation that will end up with the maximum
2296 SLP tree size... */
2297 if (is_a <bb_vec_info> (vinfo)
2298 /* ??? Rejecting patterns this way doesn't work. We'd have to
2299 do extra work to cancel the pattern so the uses see the
2300 scalar version. */
2301 && !is_pattern_stmt_p (stmt_info)
2302 && !oprnd_info->any_pattern)
2303 {
2304 /* But if there's a leading vector sized set of matching stmts
2305 fail here so we can split the group. This matches the condition
2306 vect_analyze_slp_instance uses. */
2307 /* ??? We might want to split here and combine the results to support
2308 multiple vector sizes better. */
2309 for (j = 0; j < group_size; ++j)
2310 if (!matches[j])
2311 break;
2312 if (!known_ge (j, TYPE_VECTOR_SUBPARTS (vectype)))
2313 {
2314 if (dump_enabled_p ())
2315 dump_printf_loc (MSG_NOTE, vect_location,
2316 "Building vector operands from scalars\n");
2317 this_tree_size++;
2318 child = vect_create_new_slp_node (oprnd_info->ops);
2319 children.safe_push (child);
2320 oprnd_info->ops = vNULL;
2321 continue;
2322 }
2323 }
2324
2325 gcc_assert (child == NULL);
2326 FOR_EACH_VEC_ELT (children, j, child)
2327 if (child)
2328 vect_free_slp_tree (child);
2329 vect_free_oprnd_info (oprnds_info);
2330 return NULL;
2331 }
2332
2333 vect_free_oprnd_info (oprnds_info);
2334
2335 /* If all children of this node are built up from uniform scalars,
2336 or if building them requires more than one possibly expensive
2337 vector construction, throw the node away so it is built up from
2338 scalars instead. The exception is the SLP node for the vector store. */
2339 if (is_a <bb_vec_info> (vinfo)
2340 && !STMT_VINFO_GROUPED_ACCESS (stmt_info)
2341 /* ??? Rejecting patterns this way doesn't work. We'd have to
2342 do extra work to cancel the pattern so the uses see the
2343 scalar version. */
2344 && !is_pattern_stmt_p (stmt_info))
2345 {
2346 slp_tree child;
2347 unsigned j;
2348 bool all_uniform_p = true;
2349 unsigned n_vector_builds = 0;
2350 FOR_EACH_VEC_ELT (children, j, child)
2351 {
2352 if (!child)
2353 ;
2354 else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
2355 all_uniform_p = false;
2356 else if (!vect_slp_tree_uniform_p (child))
2357 {
2358 all_uniform_p = false;
2359 if (SLP_TREE_DEF_TYPE (child) == vect_external_def)
2360 n_vector_builds++;
2361 }
2362 }
2363 if (all_uniform_p
2364 || n_vector_builds > 1
2365 || (n_vector_builds == children.length ()
2366 && is_a <gphi *> (stmt_info->stmt)))
2367 {
2368 /* Roll back. */
2369 matches[0] = false;
2370 FOR_EACH_VEC_ELT (children, j, child)
2371 if (child)
2372 vect_free_slp_tree (child);
2373
2374 if (dump_enabled_p ())
2375 dump_printf_loc (MSG_NOTE, vect_location,
2376 "Building parent vector operands from "
2377 "scalars instead\n");
2378 return NULL;
2379 }
2380 }
2381
2382 *tree_size += this_tree_size + 1;
2383 *max_nunits = this_max_nunits;
2384
2385 if (two_operators)
2386 {
2387 /* ??? We'd likely want to either cache in bst_map something like
2388 { a+b, NULL, a+b, NULL } and { NULL, a-b, NULL, a-b } or
2389 the true { a+b, a+b, a+b, a+b } ... but there we don't have
2390 explicit stmts to put in so the keying on 'stmts' doesn't
2391 work (but we have the same issue with nodes that use 'ops'). */
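/* For example a two-operator group { a0 + b0, a1 - b1 } builds node ONE
   with code PLUS_EXPR, node TWO with MINUS_EXPR and a lane permutation
   { (0,0), (1,1) } selecting lane 0 from ONE and lane 1 from TWO.  */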
2392 slp_tree one = new _slp_tree;
2393 slp_tree two = new _slp_tree;
2394 SLP_TREE_DEF_TYPE (one) = vect_internal_def;
2395 SLP_TREE_DEF_TYPE (two) = vect_internal_def;
2396 SLP_TREE_VECTYPE (one) = vectype;
2397 SLP_TREE_VECTYPE (two) = vectype;
2398 SLP_TREE_CHILDREN (one).safe_splice (children);
2399 SLP_TREE_CHILDREN (two).safe_splice (children);
2400 slp_tree child;
2401 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (two), i, child)
2402 SLP_TREE_REF_COUNT (child)++;
2403
2404 /* Here we record the original defs since this
2405 node represents the final lane configuration. */
2406 node = vect_create_new_slp_node (node, stmts, 2);
2407 SLP_TREE_VECTYPE (node) = vectype;
2408 SLP_TREE_CODE (node) = VEC_PERM_EXPR;
2409 SLP_TREE_CHILDREN (node).quick_push (one);
2410 SLP_TREE_CHILDREN (node).quick_push (two);
2411 gassign *stmt = as_a <gassign *> (stmts[0]->stmt);
2412 enum tree_code code0 = gimple_assign_rhs_code (stmt);
2413 enum tree_code ocode = ERROR_MARK;
2414 stmt_vec_info ostmt_info;
2415 unsigned j = 0;
2416 FOR_EACH_VEC_ELT (stmts, i, ostmt_info)
2417 {
2418 gassign *ostmt = as_a <gassign *> (ostmt_info->stmt);
2419 if (gimple_assign_rhs_code (ostmt) != code0)
2420 {
2421 SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (1, i));
2422 ocode = gimple_assign_rhs_code (ostmt);
2423 j = i;
2424 }
2425 else
2426 SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (0, i));
2427 }
2428 SLP_TREE_CODE (one) = code0;
2429 SLP_TREE_CODE (two) = ocode;
2430 SLP_TREE_LANES (one) = stmts.length ();
2431 SLP_TREE_LANES (two) = stmts.length ();
2432 SLP_TREE_REPRESENTATIVE (one) = stmts[0];
2433 SLP_TREE_REPRESENTATIVE (two) = stmts[j];
2434 return node;
2435 }
2436
2437 node = vect_create_new_slp_node (node, stmts, nops);
2438 SLP_TREE_VECTYPE (node) = vectype;
2439 SLP_TREE_CHILDREN (node).splice (children);
2440 return node;
2441 }
2442
2443 /* Dump a single SLP tree NODE. */
2444
2445 static void
2446 vect_print_slp_tree (dump_flags_t dump_kind, dump_location_t loc,
2447 slp_tree node)
2448 {
2449 unsigned i, j;
2450 slp_tree child;
2451 stmt_vec_info stmt_info;
2452 tree op;
2453
2454 dump_metadata_t metadata (dump_kind, loc.get_impl_location ());
2455 dump_user_location_t user_loc = loc.get_user_location ();
2456 dump_printf_loc (metadata, user_loc, "node%s %p (max_nunits=%u, refcnt=%u)\n",
2457 SLP_TREE_DEF_TYPE (node) == vect_external_def
2458 ? " (external)"
2459 : (SLP_TREE_DEF_TYPE (node) == vect_constant_def
2460 ? " (constant)"
2461 : ""), node,
2462 estimated_poly_value (node->max_nunits),
2463 SLP_TREE_REF_COUNT (node));
2464 if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
2465 {
2466 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
2467 dump_printf_loc (metadata, user_loc, "op: VEC_PERM_EXPR\n");
2468 else
2469 dump_printf_loc (metadata, user_loc, "op template: %G",
2470 SLP_TREE_REPRESENTATIVE (node)->stmt);
2471 }
2472 if (SLP_TREE_SCALAR_STMTS (node).exists ())
2473 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2474 dump_printf_loc (metadata, user_loc, "\tstmt %u %G", i, stmt_info->stmt);
2475 else
2476 {
2477 dump_printf_loc (metadata, user_loc, "\t{ ");
2478 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
2479 dump_printf (metadata, "%T%s ", op,
2480 i < SLP_TREE_SCALAR_OPS (node).length () - 1 ? "," : "");
2481 dump_printf (metadata, "}\n");
2482 }
2483 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2484 {
2485 dump_printf_loc (metadata, user_loc, "\tload permutation {");
2486 FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (node), i, j)
2487 dump_printf (dump_kind, " %u", j);
2488 dump_printf (dump_kind, " }\n");
2489 }
2490 if (SLP_TREE_LANE_PERMUTATION (node).exists ())
2491 {
2492 dump_printf_loc (metadata, user_loc, "\tlane permutation {");
2493 for (i = 0; i < SLP_TREE_LANE_PERMUTATION (node).length (); ++i)
2494 dump_printf (dump_kind, " %u[%u]",
2495 SLP_TREE_LANE_PERMUTATION (node)[i].first,
2496 SLP_TREE_LANE_PERMUTATION (node)[i].second);
2497 dump_printf (dump_kind, " }\n");
2498 }
2499 if (SLP_TREE_CHILDREN (node).is_empty ())
2500 return;
2501 dump_printf_loc (metadata, user_loc, "\tchildren");
2502 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2503 dump_printf (dump_kind, " %p", (void *)child);
2504 dump_printf (dump_kind, "\n");
2505 }
2506
2507 DEBUG_FUNCTION void
2508 debug (slp_tree node)
2509 {
2510 debug_dump_context ctx;
2511 vect_print_slp_tree (MSG_NOTE,
2512 dump_location_t::from_location_t (UNKNOWN_LOCATION),
2513 node);
2514 }
2515
2516 /* Recursive helper for the dot producer below. */
2517
2518 static void
2519 dot_slp_tree (FILE *f, slp_tree node, hash_set<slp_tree> &visited)
2520 {
2521 if (visited.add (node))
2522 return;
2523
2524 fprintf (f, "\"%p\" [label=\"", (void *)node);
2525 vect_print_slp_tree (MSG_NOTE,
2526 dump_location_t::from_location_t (UNKNOWN_LOCATION),
2527 node);
2528 fprintf (f, "\"];\n");
2529
2530
2531 for (slp_tree child : SLP_TREE_CHILDREN (node))
2532 fprintf (f, "\"%p\" -> \"%p\";", (void *)node, (void *)child);
2533
2534 for (slp_tree child : SLP_TREE_CHILDREN (node))
2535 dot_slp_tree (f, child, visited);
2536 }
2537
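/* Produce a dot graph for the SLP tree rooted at NODE and write it to
   FNAME, e.g. from a debugger; the output can be rendered with
   dot -Tpdf.  */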
2538 DEBUG_FUNCTION void
2539 dot_slp_tree (const char *fname, slp_tree node)
2540 {
2541 FILE *f = fopen (fname, "w");
2542 fprintf (f, "digraph {\n");
2543 fflush (f);
2544 {
2545 debug_dump_context ctx (f);
2546 hash_set<slp_tree> visited;
2547 dot_slp_tree (f, node, visited);
2548 }
2549 fflush (f);
2550 fprintf (f, "}\n");
2551 fclose (f);
2552 }
2553
2554 /* Dump a slp tree NODE using flags specified in DUMP_KIND. */
2555
2556 static void
2557 vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
2558 slp_tree node, hash_set<slp_tree> &visited)
2559 {
2560 unsigned i;
2561 slp_tree child;
2562
2563 if (visited.add (node))
2564 return;
2565
2566 vect_print_slp_tree (dump_kind, loc, node);
2567
2568 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2569 if (child)
2570 vect_print_slp_graph (dump_kind, loc, child, visited);
2571 }
2572
2573 static void
2574 vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
2575 slp_tree entry)
2576 {
2577 hash_set<slp_tree> visited;
2578 vect_print_slp_graph (dump_kind, loc, entry, visited);
2579 }
2580
2581 /* Mark the tree rooted at NODE with PURE_SLP. */
2582
2583 static void
2584 vect_mark_slp_stmts (slp_tree node, hash_set<slp_tree> &visited)
2585 {
2586 int i;
2587 stmt_vec_info stmt_info;
2588 slp_tree child;
2589
2590 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2591 return;
2592
2593 if (visited.add (node))
2594 return;
2595
2596 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2597 STMT_SLP_TYPE (stmt_info) = pure_slp;
2598
2599 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2600 if (child)
2601 vect_mark_slp_stmts (child, visited);
2602 }
2603
2604 static void
2605 vect_mark_slp_stmts (slp_tree node)
2606 {
2607 hash_set<slp_tree> visited;
2608 vect_mark_slp_stmts (node, visited);
2609 }
2610
2611 /* Mark the statements of the tree rooted at NODE as relevant (vect_used). */
2612
2613 static void
2614 vect_mark_slp_stmts_relevant (slp_tree node, hash_set<slp_tree> &visited)
2615 {
2616 int i;
2617 stmt_vec_info stmt_info;
2618 slp_tree child;
2619
2620 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2621 return;
2622
2623 if (visited.add (node))
2624 return;
2625
2626 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2627 {
2628 gcc_assert (!STMT_VINFO_RELEVANT (stmt_info)
2629 || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope);
2630 STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope;
2631 }
2632
2633 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2634 if (child)
2635 vect_mark_slp_stmts_relevant (child, visited);
2636 }
2637
2638 static void
2639 vect_mark_slp_stmts_relevant (slp_tree node)
2640 {
2641 hash_set<slp_tree> visited;
2642 vect_mark_slp_stmts_relevant (node, visited);
2643 }
2644
2645
2646 /* Gather loads in the SLP graph rooted at NODE and populate the LOADS array. */
2647
2648 static void
2649 vect_gather_slp_loads (vec<slp_tree> &loads, slp_tree node,
2650 hash_set<slp_tree> &visited)
2651 {
2652 if (!node || visited.add (node))
2653 return;
2654
2655 if (SLP_TREE_CHILDREN (node).length () == 0)
2656 {
2657 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2658 return;
2659 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2660 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2661 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
2662 loads.safe_push (node);
2663 }
2664 else
2665 {
2666 unsigned i;
2667 slp_tree child;
2668 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2669 vect_gather_slp_loads (loads, child, visited);
2670 }
2671 }
2672
2673
2674 /* Find the last scalar stmt in NODE. */
2675
2676 stmt_vec_info
2677 vect_find_last_scalar_stmt_in_slp (slp_tree node)
2678 {
2679 stmt_vec_info last = NULL;
2680 stmt_vec_info stmt_vinfo;
2681
2682 for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
2683 {
2684 stmt_vinfo = vect_orig_stmt (stmt_vinfo);
2685 last = last ? get_later_stmt (stmt_vinfo, last) : stmt_vinfo;
2686 }
2687
2688 return last;
2689 }
2690
2691 /* Find the first stmt in NODE. */
2692
2693 stmt_vec_info
2694 vect_find_first_scalar_stmt_in_slp (slp_tree node)
2695 {
2696 stmt_vec_info first = NULL;
2697 stmt_vec_info stmt_vinfo;
2698
2699 for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
2700 {
2701 stmt_vinfo = vect_orig_stmt (stmt_vinfo);
2702 if (!first
2703 || get_later_stmt (stmt_vinfo, first) == first)
2704 first = stmt_vinfo;
2705 }
2706
2707 return first;
2708 }
2709
2710 /* Splits a group of stores, currently beginning at FIRST_VINFO, into
2711 two groups: one (still beginning at FIRST_VINFO) of size GROUP1_SIZE
2712 (also containing the first GROUP1_SIZE stmts, since stores are
2713 consecutive), the second containing the remainder.
2714 Return the first stmt in the second group. */
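/* For example splitting a group of six stores with GROUP1_SIZE == 4
   leaves a first group of four whose gap additionally skips the two
   remaining stmts, and returns a second group of two whose gap skips
   the first four stmts plus the original gap.  */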
2715
2716 static stmt_vec_info
2717 vect_split_slp_store_group (stmt_vec_info first_vinfo, unsigned group1_size)
2718 {
2719 gcc_assert (DR_GROUP_FIRST_ELEMENT (first_vinfo) == first_vinfo);
2720 gcc_assert (group1_size > 0);
2721 int group2_size = DR_GROUP_SIZE (first_vinfo) - group1_size;
2722 gcc_assert (group2_size > 0);
2723 DR_GROUP_SIZE (first_vinfo) = group1_size;
2724
2725 stmt_vec_info stmt_info = first_vinfo;
2726 for (unsigned i = group1_size; i > 1; i--)
2727 {
2728 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info);
2729 gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
2730 }
2731 /* STMT is now the last element of the first group. */
2732 stmt_vec_info group2 = DR_GROUP_NEXT_ELEMENT (stmt_info);
2733 DR_GROUP_NEXT_ELEMENT (stmt_info) = 0;
2734
2735 DR_GROUP_SIZE (group2) = group2_size;
2736 for (stmt_info = group2; stmt_info;
2737 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info))
2738 {
2739 DR_GROUP_FIRST_ELEMENT (stmt_info) = group2;
2740 gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
2741 }
2742
2743 /* For the second group, the DR_GROUP_GAP is that before the original group,
2744 plus skipping over the first vector. */
2745 DR_GROUP_GAP (group2) = DR_GROUP_GAP (first_vinfo) + group1_size;
2746
2747 /* DR_GROUP_GAP of the first group now has to skip over the second group too. */
2748 DR_GROUP_GAP (first_vinfo) += group2_size;
2749
2750 if (dump_enabled_p ())
2751 dump_printf_loc (MSG_NOTE, vect_location, "Split group into %d and %d\n",
2752 group1_size, group2_size);
2753
2754 return group2;
2755 }
2756
2757 /* Calculate the unrolling factor for an SLP instance with GROUP_SIZE
2758 statements and a vector of NUNITS elements. */
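/* For example a group of three stores with four-element vectors gives
   exact_div (lcm (4, 3), 3) == 4, i.e. four-fold unrolling.  */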
2759
2760 static poly_uint64
2761 calculate_unrolling_factor (poly_uint64 nunits, unsigned int group_size)
2762 {
2763 return exact_div (common_multiple (nunits, group_size), group_size);
2764 }
2765
2766 /* Helper that checks to see if a node is a load node. */
2767
2768 static inline bool
2769 vect_is_slp_load_node (slp_tree root)
2770 {
2771 return SLP_TREE_DEF_TYPE (root) == vect_internal_def
2772 && STMT_VINFO_GROUPED_ACCESS (SLP_TREE_REPRESENTATIVE (root))
2773 && DR_IS_READ (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (root)));
2774 }
2775
2776
2777 /* Helper function of optimize_load_redistribution that performs the operation
2778 recursively. */
2779
2780 static slp_tree
2781 optimize_load_redistribution_1 (scalar_stmts_to_slp_tree_map_t *bst_map,
2782 vec_info *vinfo, unsigned int group_size,
2783 hash_map<slp_tree, slp_tree> *load_map,
2784 slp_tree root)
2785 {
2786 if (slp_tree *leader = load_map->get (root))
2787 return *leader;
2788
2789 slp_tree node;
2790 unsigned i;
2791
2792 /* For now, we don't know anything about externals so do not do anything. */
2793 if (!root || SLP_TREE_DEF_TYPE (root) != vect_internal_def)
2794 return NULL;
2795 else if (SLP_TREE_CODE (root) == VEC_PERM_EXPR)
2796 {
2797 /* First convert this node into a load node and add it to the leaves
2798 list and flatten the permute from a lane to a load one. If it's
2799 unneeded it will be elided later. */
2800 vec<stmt_vec_info> stmts;
2801 stmts.create (SLP_TREE_LANES (root));
2802 lane_permutation_t lane_perm = SLP_TREE_LANE_PERMUTATION (root);
2803 for (unsigned j = 0; j < lane_perm.length (); j++)
2804 {
2805 std::pair<unsigned, unsigned> perm = lane_perm[j];
2806 node = SLP_TREE_CHILDREN (root)[perm.first];
2807
2808 if (!vect_is_slp_load_node (node)
2809 || SLP_TREE_CHILDREN (node).exists ())
2810 {
2811 stmts.release ();
2812 goto next;
2813 }
2814
2815 stmts.quick_push (SLP_TREE_SCALAR_STMTS (node)[perm.second]);
2816 }
2817
2818 if (dump_enabled_p ())
2819 dump_printf_loc (MSG_NOTE, vect_location,
2820 "converting stmts on permute node %p\n", root);
2821
2822 bool *matches = XALLOCAVEC (bool, group_size);
2823 poly_uint64 max_nunits = 1;
2824 unsigned tree_size = 0, limit = 1;
2825 node = vect_build_slp_tree (vinfo, stmts, group_size, &max_nunits,
2826 matches, &limit, &tree_size, bst_map);
2827 if (!node)
2828 stmts.release ();
2829
2830 load_map->put (root, node);
2831 return node;
2832 }
2833
2834 next:
2835 load_map->put (root, NULL);
2836
2837 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
2838 {
2839 slp_tree value
2840 = optimize_load_redistribution_1 (bst_map, vinfo, group_size, load_map,
2841 node);
2842 if (value)
2843 {
2844 SLP_TREE_REF_COUNT (value)++;
2845 SLP_TREE_CHILDREN (root)[i] = value;
2846 /* ??? We know the original leafs of the replaced nodes will
2847 be referenced by bst_map, only the permutes created by
2848 pattern matching are not. */
2849 if (SLP_TREE_REF_COUNT (node) == 1)
2850 load_map->remove (node);
2851 vect_free_slp_tree (node);
2852 }
2853 }
2854
2855 return NULL;
2856 }
2857
2858 /* Temporary workaround for loads not being CSEd during SLP build. This
2859 function will traverse the SLP tree rooted in ROOT for INSTANCE and find
2860 VEC_PERM nodes that blend vectors from multiple nodes that all read from the
2861 same DR such that the final operation is equal to a permuted load. Such
2862 NODES are then directly converted into LOADS themselves. The nodes are
2863 CSEd using BST_MAP. */
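/* For example a VEC_PERM node selecting lanes { a[1], a[0] } from two
   single-lane load nodes of the same DR can be rebuilt as one two-lane
   load node with load permutation { 1, 0 }.  */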
2864
2865 static void
2866 optimize_load_redistribution (scalar_stmts_to_slp_tree_map_t *bst_map,
2867 vec_info *vinfo, unsigned int group_size,
2868 hash_map<slp_tree, slp_tree> *load_map,
2869 slp_tree root)
2870 {
2871 slp_tree node;
2872 unsigned i;
2873
2874 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
2875 {
2876 slp_tree value
2877 = optimize_load_redistribution_1 (bst_map, vinfo, group_size, load_map,
2878 node);
2879 if (value)
2880 {
2881 SLP_TREE_REF_COUNT (value)++;
2882 SLP_TREE_CHILDREN (root)[i] = value;
2883 /* ??? We know the original leafs of the replaced nodes will
2884 be referenced by bst_map, only the permutes created by
2885 pattern matching are not. */
2886 if (SLP_TREE_REF_COUNT (node) == 1)
2887 load_map->remove (node);
2888 vect_free_slp_tree (node);
2889 }
2890 }
2891 }
2892
2893 /* Helper function of vect_match_slp_patterns.
2894
2895 Attempts to match patterns against the slp tree rooted in REF_NODE using
2896 VINFO. Patterns are matched in post-order traversal.
2897
2898 If matching is successful the value in REF_NODE is updated and true is
2899 returned, otherwise REF_NODE is left unchanged and false is returned. */
2900
2901 static bool
2902 vect_match_slp_patterns_2 (slp_tree *ref_node, vec_info *vinfo,
2903 slp_tree_to_load_perm_map_t *perm_cache,
2904 hash_set<slp_tree> *visited)
2905 {
2906 unsigned i;
2907 slp_tree node = *ref_node;
2908 bool found_p = false;
2909 if (!node || visited->add (node))
2910 return false;
2911
2912 slp_tree child;
2913 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2914 found_p |= vect_match_slp_patterns_2 (&SLP_TREE_CHILDREN (node)[i],
2915 vinfo, perm_cache, visited);
2916
2917 for (unsigned x = 0; x < num__slp_patterns; x++)
2918 {
2919 vect_pattern *pattern = slp_patterns[x] (perm_cache, ref_node);
2920 if (pattern)
2921 {
2922 pattern->build (vinfo);
2923 delete pattern;
2924 found_p = true;
2925 }
2926 }
2927
2928 return found_p;
2929 }
2930
2931 /* Applies pattern matching to the given SLP tree rooted in REF_NODE using
2932 vec_info VINFO.
2933
2934 Returns true if any pattern matched; the tree rooted in the instance may
2935 be modified in place. Patterns are tried in order and multiple may match. */
2936
2937 static bool
2938 vect_match_slp_patterns (slp_instance instance, vec_info *vinfo,
2939 hash_set<slp_tree> *visited,
2940 slp_tree_to_load_perm_map_t *perm_cache)
2941 {
2942 DUMP_VECT_SCOPE ("vect_match_slp_patterns");
2943 slp_tree *ref_node = &SLP_INSTANCE_TREE (instance);
2944
2945 if (dump_enabled_p ())
2946 dump_printf_loc (MSG_NOTE, vect_location,
2947 "Analyzing SLP tree %p for patterns\n",
2948 SLP_INSTANCE_TREE (instance));
2949
2950 return vect_match_slp_patterns_2 (ref_node, vinfo, perm_cache, visited);
2951 }
2952
2953 /* STMT_INFO is a store group of size GROUP_SIZE that we are considering
2954 splitting into two, with the first split group having size NEW_GROUP_SIZE.
2955 Return true if we could use IFN_STORE_LANES instead and if that appears
2956 to be the better approach. */
2957
2958 static bool
2959 vect_slp_prefer_store_lanes_p (vec_info *vinfo, stmt_vec_info stmt_info,
2960 unsigned int group_size,
2961 unsigned int new_group_size)
2962 {
2963 tree scalar_type = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
2964 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
2965 if (!vectype)
2966 return false;
2967 /* Allow the split if one of the two new groups would operate on full
2968 vectors *within* rather than across one scalar loop iteration.
2969 This is purely a heuristic, but it should work well for group
2970 sizes of 3 and 4, where the possible splits are:
2971
2972 3->2+1: OK if the vector has exactly two elements
2973 4->2+2: Likewise
2974 4->3+1: Less clear-cut. */
2975 if (multiple_p (group_size - new_group_size, TYPE_VECTOR_SUBPARTS (vectype))
2976 || multiple_p (new_group_size, TYPE_VECTOR_SUBPARTS (vectype)))
2977 return false;
2978 return vect_store_lanes_supported (vectype, group_size, false);
2979 }
2980
2981 /* Analyze an SLP instance starting from a group of grouped stores. Call
2982 vect_build_slp_tree to build a tree of packed stmts if possible.
2983 Return FALSE if it's impossible to SLP any stmt in the loop. */
2984
2985 static bool
2986 vect_analyze_slp_instance (vec_info *vinfo,
2987 scalar_stmts_to_slp_tree_map_t *bst_map,
2988 stmt_vec_info stmt_info, slp_instance_kind kind,
2989 unsigned max_tree_size, unsigned *limit);
2990
2991 /* Analyze an SLP instance starting from SCALAR_STMTS which are a group
2992 of KIND. Return true if successful. */
2993
2994 static bool
2995 vect_build_slp_instance (vec_info *vinfo,
2996 slp_instance_kind kind,
2997 vec<stmt_vec_info> &scalar_stmts,
2998 vec<stmt_vec_info> &root_stmt_infos,
2999 unsigned max_tree_size, unsigned *limit,
3000 scalar_stmts_to_slp_tree_map_t *bst_map,
3001 /* ??? We need stmt_info for group splitting. */
3002 stmt_vec_info stmt_info_)
3003 {
3004 if (dump_enabled_p ())
3005 {
3006 dump_printf_loc (MSG_NOTE, vect_location,
3007 "Starting SLP discovery for\n");
3008 for (unsigned i = 0; i < scalar_stmts.length (); ++i)
3009 dump_printf_loc (MSG_NOTE, vect_location,
3010 " %G", scalar_stmts[i]->stmt);
3011 }
3012
3013 /* Build the tree for the SLP instance. */
3014 unsigned int group_size = scalar_stmts.length ();
3015 bool *matches = XALLOCAVEC (bool, group_size);
3016 poly_uint64 max_nunits = 1;
3017 unsigned tree_size = 0;
3018 unsigned i;
3019 slp_tree node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
3020 &max_nunits, matches, limit,
3021 &tree_size, bst_map);
3022 if (node != NULL)
3023 {
3024 /* Calculate the unrolling factor based on the smallest type. */
3025 poly_uint64 unrolling_factor
3026 = calculate_unrolling_factor (max_nunits, group_size);
3027
3028 if (maybe_ne (unrolling_factor, 1U)
3029 && is_a <bb_vec_info> (vinfo))
3030 {
3031 unsigned HOST_WIDE_INT const_max_nunits;
3032 if (!max_nunits.is_constant (&const_max_nunits)
3033 || const_max_nunits > group_size)
3034 {
3035 if (dump_enabled_p ())
3036 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3037 "Build SLP failed: store group "
3038 "size not a multiple of the vector size "
3039 "in basic block SLP\n");
3040 vect_free_slp_tree (node);
3041 return false;
3042 }
3043 /* Fatal mismatch. */
3044 if (dump_enabled_p ())
3045 dump_printf_loc (MSG_NOTE, vect_location,
3046 "SLP discovery succeeded but node needs "
3047 "splitting\n");
3048 memset (matches, true, group_size);
3049 matches[group_size / const_max_nunits * const_max_nunits] = false;
3050 vect_free_slp_tree (node);
3051 }
3052 else
3053 {
3054 /* Create a new SLP instance. */
3055 slp_instance new_instance = XNEW (class _slp_instance);
3056 SLP_INSTANCE_TREE (new_instance) = node;
3057 SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
3058 SLP_INSTANCE_LOADS (new_instance) = vNULL;
3059 SLP_INSTANCE_ROOT_STMTS (new_instance) = root_stmt_infos;
3060 SLP_INSTANCE_KIND (new_instance) = kind;
3061 new_instance->reduc_phis = NULL;
3062 new_instance->cost_vec = vNULL;
3063 new_instance->subgraph_entries = vNULL;
3064
3065 if (dump_enabled_p ())
3066 dump_printf_loc (MSG_NOTE, vect_location,
3067 "SLP size %u vs. limit %u.\n",
3068 tree_size, max_tree_size);
3069
3070 /* Fixup SLP reduction chains. */
3071 if (kind == slp_inst_kind_reduc_chain)
3072 {
3073 /* If this is a reduction chain with a conversion in front
3074 amend the SLP tree with a node for that. */
3075 gimple *scalar_def
3076 = vect_orig_stmt (scalar_stmts[group_size - 1])->stmt;
3077 if (STMT_VINFO_DEF_TYPE (scalar_stmts[0]) != vect_reduction_def)
3078 {
3079 /* Get at the conversion stmt - we know it's the single use
3080 of the last stmt of the reduction chain. */
3081 use_operand_p use_p;
3082 bool r = single_imm_use (gimple_assign_lhs (scalar_def),
3083 &use_p, &scalar_def);
3084 gcc_assert (r);
3085 stmt_vec_info next_info = vinfo->lookup_stmt (scalar_def);
3086 next_info = vect_stmt_to_vectorize (next_info);
3087 scalar_stmts = vNULL;
3088 scalar_stmts.create (group_size);
3089 for (unsigned i = 0; i < group_size; ++i)
3090 scalar_stmts.quick_push (next_info);
3091 slp_tree conv = vect_create_new_slp_node (scalar_stmts, 1);
3092 SLP_TREE_VECTYPE (conv) = STMT_VINFO_VECTYPE (next_info);
3093 SLP_TREE_CHILDREN (conv).quick_push (node);
3094 SLP_INSTANCE_TREE (new_instance) = conv;
3095 /* We also have to fake this conversion stmt as SLP reduction
3096 group so we don't have to mess with too much code
3097 elsewhere. */
3098 REDUC_GROUP_FIRST_ELEMENT (next_info) = next_info;
3099 REDUC_GROUP_NEXT_ELEMENT (next_info) = NULL;
3100 }
3101 /* Fill the backedge child of the PHI SLP node. The
3102 general matching code cannot find it because the
3103 scalar code does not reflect how we vectorize the
3104 reduction. */
3105 use_operand_p use_p;
3106 imm_use_iterator imm_iter;
3107 class loop *loop = LOOP_VINFO_LOOP (as_a <loop_vec_info> (vinfo));
3108 FOR_EACH_IMM_USE_FAST (use_p, imm_iter,
3109 gimple_get_lhs (scalar_def))
3110 /* There are exactly two non-debug uses, the reduction
3111 PHI and the loop-closed PHI node. */
3112 if (!is_gimple_debug (USE_STMT (use_p))
3113 && gimple_bb (USE_STMT (use_p)) == loop->header)
3114 {
3115 auto_vec<stmt_vec_info, 64> phis (group_size);
3116 stmt_vec_info phi_info
3117 = vinfo->lookup_stmt (USE_STMT (use_p));
3118 for (unsigned i = 0; i < group_size; ++i)
3119 phis.quick_push (phi_info);
3120 slp_tree *phi_node = bst_map->get (phis);
3121 unsigned dest_idx = loop_latch_edge (loop)->dest_idx;
3122 SLP_TREE_CHILDREN (*phi_node)[dest_idx]
3123 = SLP_INSTANCE_TREE (new_instance);
3124 SLP_INSTANCE_TREE (new_instance)->refcnt++;
3125 }
3126 }
3127
3128 vinfo->slp_instances.safe_push (new_instance);
3129
3130 /* ??? We've replaced the old SLP_INSTANCE_GROUP_SIZE with
3131 the number of scalar stmts in the root in a few places.
3132 Verify that assumption holds. */
3133 gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
3134 .length () == group_size);
3135
3136 if (dump_enabled_p ())
3137 {
3138 dump_printf_loc (MSG_NOTE, vect_location,
3139 "Final SLP tree for instance %p:\n", new_instance);
3140 vect_print_slp_graph (MSG_NOTE, vect_location,
3141 SLP_INSTANCE_TREE (new_instance));
3142 }
3143
3144 return true;
3145 }
3146 }
3147 else
3148 {
3149 /* Failed to SLP. */
3150 /* Free the allocated memory. */
3151 scalar_stmts.release ();
3152 }
3153
3154 stmt_vec_info stmt_info = stmt_info_;
3155 /* Try to break the group up into pieces. */
3156 if (kind == slp_inst_kind_store)
3157 {
3158 /* ??? We could delay all the actual splitting of store-groups
3159 until after SLP discovery of the original group completed.
3160 Then we can recurse to vect_build_slp_instance directly. */
3161 for (i = 0; i < group_size; i++)
3162 if (!matches[i])
3163 break;
3164
3165 /* For basic block SLP, try to break the group up into multiples of
3166 a vector size. */
3167 if (is_a <bb_vec_info> (vinfo)
3168 && (i > 1 && i < group_size))
3169 {
3170 tree scalar_type
3171 = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
3172 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
3173 1 << floor_log2 (i));
3174 unsigned HOST_WIDE_INT const_nunits;
3175 if (vectype
3176 && TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits))
3177 {
3178 /* Split into two groups at the first vector boundary. */
3179 gcc_assert ((const_nunits & (const_nunits - 1)) == 0);
3180 unsigned group1_size = i & ~(const_nunits - 1);
3181
3182 if (dump_enabled_p ())
3183 dump_printf_loc (MSG_NOTE, vect_location,
3184 "Splitting SLP group at stmt %u\n", i);
3185 stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
3186 group1_size);
3187 bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
3188 kind, max_tree_size,
3189 limit);
3190 /* Split the rest at the failure point and possibly
3191 re-analyze the remaining matching part if it has
3192 at least two lanes. */
3193 if (group1_size < i
3194 && (i + 1 < group_size
3195 || i - group1_size > 1))
3196 {
3197 stmt_vec_info rest2 = rest;
3198 rest = vect_split_slp_store_group (rest, i - group1_size);
3199 if (i - group1_size > 1)
3200 res |= vect_analyze_slp_instance (vinfo, bst_map, rest2,
3201 kind, max_tree_size,
3202 limit);
3203 }
3204 /* Re-analyze the non-matching tail if it has at least
3205 two lanes. */
3206 if (i + 1 < group_size)
3207 res |= vect_analyze_slp_instance (vinfo, bst_map,
3208 rest, kind, max_tree_size,
3209 limit);
3210 return res;
3211 }
3212 }
3213
3214 /* For loop vectorization split into arbitrary pieces of size > 1. */
3215 if (is_a <loop_vec_info> (vinfo)
3216 && (i > 1 && i < group_size)
3217 && !vect_slp_prefer_store_lanes_p (vinfo, stmt_info, group_size, i))
3218 {
3219 unsigned group1_size = i;
3220
3221 if (dump_enabled_p ())
3222 dump_printf_loc (MSG_NOTE, vect_location,
3223 "Splitting SLP group at stmt %u\n", i);
3224
3225 stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
3226 group1_size);
3227 /* Loop vectorization cannot handle gaps in stores, make sure
3228 the split group appears as strided. */
3229 STMT_VINFO_STRIDED_P (rest) = 1;
3230 DR_GROUP_GAP (rest) = 0;
3231 STMT_VINFO_STRIDED_P (stmt_info) = 1;
3232 DR_GROUP_GAP (stmt_info) = 0;
3233
3234 bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
3235 kind, max_tree_size, limit);
3236 if (i + 1 < group_size)
3237 res |= vect_analyze_slp_instance (vinfo, bst_map,
3238 rest, kind, max_tree_size, limit);
3239
3240 return res;
3241 }
3242
3243 /* Even though the first vector did not all match, we might be able to SLP
3244 (some) of the remainder. FORNOW ignore this possibility. */
3245 }
3246
3247 /* Failed to SLP. */
3248 if (dump_enabled_p ())
3249 dump_printf_loc (MSG_NOTE, vect_location, "SLP discovery failed\n");
3250 return false;
3251 }
3252
3253
3254 /* Analyze an SLP instance starting from a group of grouped stores. Call
3255 vect_build_slp_tree to build a tree of packed stmts if possible.
3256 Return FALSE if it's impossible to SLP any stmt in the loop. */
3257
3258 static bool
3259 vect_analyze_slp_instance (vec_info *vinfo,
3260 scalar_stmts_to_slp_tree_map_t *bst_map,
3261 stmt_vec_info stmt_info,
3262 slp_instance_kind kind,
3263 unsigned max_tree_size, unsigned *limit)
3264 {
3265 unsigned int i;
3266 vec<stmt_vec_info> scalar_stmts;
3267
3268 if (is_a <bb_vec_info> (vinfo))
3269 vect_location = stmt_info->stmt;
3270
3271 stmt_vec_info next_info = stmt_info;
3272 if (kind == slp_inst_kind_store)
3273 {
3274 /* Collect the stores and store them in scalar_stmts. */
3275 scalar_stmts.create (DR_GROUP_SIZE (stmt_info));
3276 while (next_info)
3277 {
3278 scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info));
3279 next_info = DR_GROUP_NEXT_ELEMENT (next_info);
3280 }
3281 }
3282 else if (kind == slp_inst_kind_reduc_chain)
3283 {
3284 /* Collect the reduction stmts and store them in scalar_stmts. */
3285 scalar_stmts.create (REDUC_GROUP_SIZE (stmt_info));
3286 while (next_info)
3287 {
3288 scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info));
3289 next_info = REDUC_GROUP_NEXT_ELEMENT (next_info);
3290 }
3291 /* Mark the first element of the reduction chain as reduction to properly
3292 transform the node. In the reduction analysis phase only the last
3293 element of the chain is marked as reduction. */
3294 STMT_VINFO_DEF_TYPE (stmt_info)
3295 = STMT_VINFO_DEF_TYPE (scalar_stmts.last ());
3296 STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))
3297 = STMT_VINFO_REDUC_DEF (vect_orig_stmt (scalar_stmts.last ()));
3298 }
3299 else if (kind == slp_inst_kind_ctor)
3300 {
3301 tree rhs = gimple_assign_rhs1 (stmt_info->stmt);
3302 tree val;
3303 scalar_stmts.create (CONSTRUCTOR_NELTS (rhs));
3304 FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), i, val)
3305 {
3306 stmt_vec_info def_info = vinfo->lookup_def (val);
3307 def_info = vect_stmt_to_vectorize (def_info);
3308 scalar_stmts.quick_push (def_info);
3309 }
3310 if (dump_enabled_p ())
3311 dump_printf_loc (MSG_NOTE, vect_location,
3312 "Analyzing vectorizable constructor: %G\n",
3313 stmt_info->stmt);
3314 }
3315 else if (kind == slp_inst_kind_reduc_group)
3316 {
3317 /* Collect reduction statements. */
3318 const vec<stmt_vec_info> &reductions
3319 = as_a <loop_vec_info> (vinfo)->reductions;
3320 scalar_stmts.create (reductions.length ());
3321 for (i = 0; reductions.iterate (i, &next_info); i++)
3322 if (STMT_VINFO_RELEVANT_P (next_info)
3323 || STMT_VINFO_LIVE_P (next_info))
3324 scalar_stmts.quick_push (next_info);
3325 /* If less than two were relevant/live there's nothing to SLP. */
3326 if (scalar_stmts.length () < 2)
3327 return false;
3328 }
3329 else
3330 gcc_unreachable ();
3331
3332 vec<stmt_vec_info> roots = vNULL;
3333 if (kind == slp_inst_kind_ctor)
3334 {
3335 roots.create (1);
3336 roots.quick_push (stmt_info);
3337 }
3338 /* Build the tree for the SLP instance. */
3339 bool res = vect_build_slp_instance (vinfo, kind, scalar_stmts,
3340 roots,
3341 max_tree_size, limit, bst_map,
3342 kind == slp_inst_kind_store
3343 ? stmt_info : NULL);
3344 if (!res)
3345 roots.release ();
3346
3347 /* ??? If this is slp_inst_kind_store and the above succeeded here's
3348 where we should do store group splitting. */
3349
3350 return res;
3351 }
3352
3353 /* Check if there are stmts in the loop that can be vectorized using SLP.
3354 Build SLP trees of packed scalar stmts if SLP is possible. */
3355
3356 opt_result
3357 vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
3358 {
3359 unsigned int i;
3360 stmt_vec_info first_element;
3361 slp_instance instance;
3362
3363 DUMP_VECT_SCOPE ("vect_analyze_slp");
3364
3365 unsigned limit = max_tree_size;
3366
3367 scalar_stmts_to_slp_tree_map_t *bst_map
3368 = new scalar_stmts_to_slp_tree_map_t ();
3369
3370 /* Find SLP sequences starting from groups of grouped stores. */
3371 FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
3372 vect_analyze_slp_instance (vinfo, bst_map, first_element,
3373 STMT_VINFO_GROUPED_ACCESS (first_element)
3374 ? slp_inst_kind_store : slp_inst_kind_ctor,
3375 max_tree_size, &limit);
3376
3377 if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
3378 {
3379 for (unsigned i = 0; i < bb_vinfo->roots.length (); ++i)
3380 {
3381 vect_location = bb_vinfo->roots[i].roots[0]->stmt;
3382 if (vect_build_slp_instance (bb_vinfo, bb_vinfo->roots[i].kind,
3383 bb_vinfo->roots[i].stmts,
3384 bb_vinfo->roots[i].roots,
3385 max_tree_size, &limit, bst_map, NULL))
3386 {
3387 bb_vinfo->roots[i].stmts = vNULL;
3388 bb_vinfo->roots[i].roots = vNULL;
3389 }
3390 }
3391 }
3392
3393 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3394 {
3395 /* Find SLP sequences starting from reduction chains. */
3396 FOR_EACH_VEC_ELT (loop_vinfo->reduction_chains, i, first_element)
3397 if (! STMT_VINFO_RELEVANT_P (first_element)
3398 && ! STMT_VINFO_LIVE_P (first_element))
3399 ;
3400 else if (! vect_analyze_slp_instance (vinfo, bst_map, first_element,
3401 slp_inst_kind_reduc_chain,
3402 max_tree_size, &limit))
3403 {
3404 /* Dissolve reduction chain group. */
3405 stmt_vec_info vinfo = first_element;
3406 stmt_vec_info last = NULL;
3407 while (vinfo)
3408 {
3409 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
3410 REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
3411 REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
3412 last = vinfo;
3413 vinfo = next;
3414 }
3415 STMT_VINFO_DEF_TYPE (first_element) = vect_internal_def;
3416 /* It can be still vectorized as part of an SLP reduction. */
3417 loop_vinfo->reductions.safe_push (last);
3418 }
3419
3420 /* Find SLP sequences starting from groups of reductions. */
3421 if (loop_vinfo->reductions.length () > 1)
3422 vect_analyze_slp_instance (vinfo, bst_map, loop_vinfo->reductions[0],
3423 slp_inst_kind_reduc_group, max_tree_size,
3424 &limit);
3425 }
3426
3427 hash_set<slp_tree> visited_patterns;
3428 slp_tree_to_load_perm_map_t perm_cache;
3429
3430 /* See if any patterns can be found in the SLP tree. */
3431 bool pattern_found = false;
3432 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
3433 pattern_found |= vect_match_slp_patterns (instance, vinfo,
3434 &visited_patterns, &perm_cache);
3435
3436 /* If any were found optimize permutations of loads. */
3437 if (pattern_found)
3438 {
3439 hash_map<slp_tree, slp_tree> load_map;
3440 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
3441 {
3442 slp_tree root = SLP_INSTANCE_TREE (instance);
3443 optimize_load_redistribution (bst_map, vinfo, SLP_TREE_LANES (root),
3444 &load_map, root);
3445 }
3446 }
3447
3448
3449
3450 /* The map keeps a reference on SLP nodes built, release that. */
3451 for (scalar_stmts_to_slp_tree_map_t::iterator it = bst_map->begin ();
3452 it != bst_map->end (); ++it)
3453 if ((*it).second)
3454 vect_free_slp_tree ((*it).second);
3455 delete bst_map;
3456
3457 if (pattern_found && dump_enabled_p ())
3458 {
3459 dump_printf_loc (MSG_NOTE, vect_location,
3460 "Pattern matched SLP tree\n");
3461 hash_set<slp_tree> visited;
3462 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
3463 vect_print_slp_graph (MSG_NOTE, vect_location,
3464 SLP_INSTANCE_TREE (instance), visited);
3465 }
3466
3467 return opt_result::success ();
3468 }
3469
3470 struct slpg_vertex
3471 {
3472 slpg_vertex (slp_tree node_)
3473 : node (node_), perm_in (-1), perm_out (-1) {}
3474
3475 int get_perm_materialized () const
3476 { return perm_in != perm_out ? perm_in : 0; }
3477
3478 slp_tree node;
3479 /* The common permutation on the incoming lanes (towards SLP children). */
3480 int perm_in;
3481 /* The permutation on the outgoing lanes (towards SLP parents). When
3482 the node is a materialization point for a permute this differs
3483 from perm_in (and is then usually zero). Materialization happens
3484 on the input side. */
3485 int perm_out;
3486 };
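/* Illustrative example (assumed values, not from the sources): if a load
   leaf feeds its users with the permutation { 2, 3, 0, 1 } and propagation
   decides to materialize that permute at this node, perm_in refers to
   { 2, 3, 0, 1 } while perm_out becomes 0 (unpermuted), so parents see
   ordered lanes; get_perm_materialized then returns perm_in.  */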
3487
3488 /* Fill the vertices and leafs vector with all nodes in the SLP graph. */
3489
3490 static void
3491 vect_slp_build_vertices (hash_set<slp_tree> &visited, slp_tree node,
3492 vec<slpg_vertex> &vertices, vec<int> &leafs)
3493 {
3494 unsigned i;
3495 slp_tree child;
3496
3497 if (visited.add (node))
3498 return;
3499
3500 node->vertex = vertices.length ();
3501 vertices.safe_push (slpg_vertex (node));
3502
3503 bool leaf = true;
3504 bool force_leaf = false;
3505 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3506 if (child)
3507 {
3508 leaf = false;
3509 vect_slp_build_vertices (visited, child, vertices, leafs);
3510 }
3511 else
3512 force_leaf = true;
3513 /* Since SLP discovery works along use-def edges all cycles have an
3514 entry - but there's the exception of cycles where we do not handle
3515 the entry explicitly (but with a NULL SLP node), like some reductions
3516 and inductions. Force those SLP PHIs to act as leafs to make them
3517 backwards reachable. */
3518 if (leaf || force_leaf)
3519 leafs.safe_push (node->vertex);
3520 }
3521
3522 /* Fill the vertices and leafs vector with all nodes in the SLP graph. */
3523
3524 static void
3525 vect_slp_build_vertices (vec_info *info, vec<slpg_vertex> &vertices,
3526 vec<int> &leafs)
3527 {
3528 hash_set<slp_tree> visited;
3529 unsigned i;
3530 slp_instance instance;
3531 FOR_EACH_VEC_ELT (info->slp_instances, i, instance)
3532 vect_slp_build_vertices (visited, SLP_INSTANCE_TREE (instance), vertices,
3533 leafs);
3534 }
3535
3536 /* Apply (reverse) bijective PERM to VEC. */
3537
3538 template <class T>
3539 static void
3540 vect_slp_permute (vec<unsigned> perm,
3541 vec<T> &vec, bool reverse)
3542 {
3543 auto_vec<T, 64> saved;
3544 saved.create (vec.length ());
3545 for (unsigned i = 0; i < vec.length (); ++i)
3546 saved.quick_push (vec[i]);
3547
3548 if (reverse)
3549 {
3550 for (unsigned i = 0; i < vec.length (); ++i)
3551 vec[perm[i]] = saved[i];
3552 for (unsigned i = 0; i < vec.length (); ++i)
3553 gcc_assert (vec[perm[i]] == saved[i]);
3554 }
3555 else
3556 {
3557 for (unsigned i = 0; i < vec.length (); ++i)
3558 vec[i] = saved[perm[i]];
3559 for (unsigned i = 0; i < vec.length (); ++i)
3560 gcc_assert (vec[i] == saved[perm[i]]);
3561 }
3562 }
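/* As an illustration (assumed values): with perm = { 1, 2, 0 } and
   vec = { a, b, c }, the forward application computes vec[i] = saved[perm[i]],
   yielding { b, c, a }, while the reverse application computes
   vec[perm[i]] = saved[i], yielding { c, a, b }, which undoes the
   forward permute.  */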
3563
3564 /* Return whether permutations PERM_A and PERM_B as recorded in the
3565 PERMS vector are equal. */
3566
3567 static bool
3568 vect_slp_perms_eq (const vec<vec<unsigned> > &perms,
3569 int perm_a, int perm_b)
3570 {
3571 return (perm_a == perm_b
3572 || (perm_a != -1 && perm_b != -1
3573 && perms[perm_a].length () == perms[perm_b].length ()
3574 && memcmp (&perms[perm_a][0], &perms[perm_b][0],
3575 sizeof (unsigned) * perms[perm_a].length ()) == 0));
3576 }
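/* For example (illustrative): two distinct indices into PERMS that both
   record { 1, 0 } compare equal; -1 (the optimistic "any"/unvisited value)
   compares equal only to itself, and index 0 (no permute) only to another
   index 0.  */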
3577
3578 /* Optimize the SLP graph of VINFO. */
3579
3580 void
3581 vect_optimize_slp (vec_info *vinfo)
3582 {
3583 if (vinfo->slp_instances.is_empty ())
3584 return;
3585
3586 slp_tree node;
3587 unsigned i;
3588 auto_vec<slpg_vertex> vertices;
3589 auto_vec<int> leafs;
3590 vect_slp_build_vertices (vinfo, vertices, leafs);
3591
3592 struct graph *slpg = new_graph (vertices.length ());
3593 for (slpg_vertex &v : vertices)
3594 for (slp_tree child : SLP_TREE_CHILDREN (v.node))
3595 if (child)
3596 add_edge (slpg, v.node->vertex, child->vertex);
3597
3598 /* Compute (reverse) postorder on the inverted graph. */
3599 auto_vec<int> ipo;
3600 graphds_dfs (slpg, &leafs[0], leafs.length (), &ipo, false, NULL, NULL);
3601
3602 auto_vec<vec<unsigned> > perms;
3603 perms.safe_push (vNULL); /* zero is no permute */
3604
3605 /* Produce initial permutations. */
3606 for (i = 0; i < leafs.length (); ++i)
3607 {
3608 int idx = leafs[i];
3609 slp_tree node = vertices[idx].node;
3610
3611 /* Handle externals and constants optimistically throughout the
3612 iteration. But treat existing vectors as fixed since we
3613 do not handle permuting them below. */
3614 if ((SLP_TREE_DEF_TYPE (node) == vect_external_def
3615 && !SLP_TREE_VEC_DEFS (node).exists ())
3616 || SLP_TREE_DEF_TYPE (node) == vect_constant_def)
3617 continue;
3618
3619 /* Leafs do not change across iterations. Note leafs also double
3620 as entries to the reverse graph. */
3621 if (!slpg->vertices[idx].succ)
3622 {
3623 vertices[idx].perm_in = 0;
3624 vertices[idx].perm_out = 0;
3625 }
3626
3627 /* Loads are the only thing generating permutes. */
3628 if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
3629 continue;
3630
3631 /* If splitting out a SLP_TREE_LANE_PERMUTATION can make the
3632 node unpermuted, record this permute. */
3633 stmt_vec_info dr_stmt = SLP_TREE_REPRESENTATIVE (node);
3634 if (!STMT_VINFO_GROUPED_ACCESS (dr_stmt))
3635 continue;
3636 dr_stmt = DR_GROUP_FIRST_ELEMENT (dr_stmt);
3637 unsigned imin = DR_GROUP_SIZE (dr_stmt) + 1, imax = 0;
3638 bool any_permute = false;
3639 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
3640 {
3641 unsigned idx = SLP_TREE_LOAD_PERMUTATION (node)[j];
3642 imin = MIN (imin, idx);
3643 imax = MAX (imax, idx);
3644 if (idx - SLP_TREE_LOAD_PERMUTATION (node)[0] != j)
3645 any_permute = true;
3646 }
3647 /* If there's no permute no need to split one out. */
3648 if (!any_permute)
3649 continue;
3650 /* If the span doesn't match we'd disrupt VF computation; avoid
3651 that for now. */
3652 if (imax - imin + 1 != SLP_TREE_LANES (node))
3653 continue;
3654
3655 /* For now only handle true permutes, like
3656 vect_attempt_slp_rearrange_stmts did. This allows us to be lazy
3657 when permuting constants and invariants, keeping the permute
3658 bijective. */
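/* As an illustrative example: a load permutation { 5, 7, 6, 4 } on a
   four-lane node has imin == 4 and imax == 7, covers every element of
   that span exactly once, and is therefore split out as the bijective
   permute { 1, 3, 2, 0 } recorded below.  */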
3659 auto_sbitmap load_index (SLP_TREE_LANES (node));
3660 bitmap_clear (load_index);
3661 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
3662 bitmap_set_bit (load_index, SLP_TREE_LOAD_PERMUTATION (node)[j] - imin);
3663 unsigned j;
3664 for (j = 0; j < SLP_TREE_LANES (node); ++j)
3665 if (!bitmap_bit_p (load_index, j))
3666 break;
3667 if (j != SLP_TREE_LANES (node))
3668 continue;
3669
3670 vec<unsigned> perm = vNULL;
3671 perm.safe_grow (SLP_TREE_LANES (node), true);
3672 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
3673 perm[j] = SLP_TREE_LOAD_PERMUTATION (node)[j] - imin;
3674 perms.safe_push (perm);
3675 vertices[idx].perm_in = perms.length () - 1;
3676 vertices[idx].perm_out = perms.length () - 1;
3677 }
3678
3679 /* In addition to the above we have to mark the outgoing permutes facing
3680 non-reduction graph entries, which are not otherwise represented, as
3681 needing to be materialized. */
3682 for (slp_instance instance : vinfo->slp_instances)
3683 if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_ctor)
3684 {
3685 /* Just setting perm_out isn't enough for the propagation to
3686 pick this up. */
3687 vertices[SLP_INSTANCE_TREE (instance)->vertex].perm_in = 0;
3688 vertices[SLP_INSTANCE_TREE (instance)->vertex].perm_out = 0;
3689 }
3690
3691 /* Propagate permutes along the graph and compute materialization points. */
3692 bool changed;
3693 bool do_materialization = false;
3694 unsigned iteration = 0;
3695 do
3696 {
3697 changed = false;
3698 ++iteration;
3699
3700 if (dump_enabled_p ())
3701 dump_printf_loc (MSG_NOTE, vect_location,
3702 "SLP optimize iteration %d\n", iteration);
3703
3704 for (i = vertices.length (); i > 0 ; --i)
3705 {
3706 int idx = ipo[i-1];
3707 slp_tree node = vertices[idx].node;
3708
3709 /* Handle externals and constants optimistically throughout the
3710 iteration. */
3711 if (SLP_TREE_DEF_TYPE (node) == vect_external_def
3712 || SLP_TREE_DEF_TYPE (node) == vect_constant_def)
3713 continue;
3714
3715 /* We still eventually have failed backedge SLP nodes in the
3716 graph; those are only cancelled when analyzing operations.
3717 Simply treat them as transparent ops, propagating permutes
3718 through them. */
3719 if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
3720 {
3721 /* We do not handle stores with a permutation, so all
3722 incoming permutes must have been materialized. */
3723 stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node);
3724 if (STMT_VINFO_DATA_REF (rep)
3725 && DR_IS_WRITE (STMT_VINFO_DATA_REF (rep)))
3726 {
3727 /* ??? We're forcing materialization in place
3728 of the child here, we'd need special handling
3729 in materialization to leave perm_in -1 here. */
3730 vertices[idx].perm_in = 0;
3731 vertices[idx].perm_out = 0;
3732 }
3733 /* We cannot move a permute across an operation that does not
3734 operate independently on each lane. Note this is an explicit
3735 negative list since that's much shorter than the respective
3736 positive one, but it's critical to keep it up to date. */
3737 if (is_gimple_call (STMT_VINFO_STMT (rep)))
3738 switch (gimple_call_combined_fn (STMT_VINFO_STMT (rep)))
3739 {
3740 case CFN_COMPLEX_ADD_ROT90:
3741 case CFN_COMPLEX_ADD_ROT270:
3742 case CFN_COMPLEX_MUL:
3743 case CFN_COMPLEX_MUL_CONJ:
3744 case CFN_VEC_ADDSUB:
3745 case CFN_VEC_FMADDSUB:
3746 case CFN_VEC_FMSUBADD:
3747 vertices[idx].perm_in = 0;
3748 vertices[idx].perm_out = 0;
3749 default:;
3750 }
3751 }
3752
3753 if (!slpg->vertices[idx].succ)
3754 /* Pick up pre-computed leaf values. */
3755 ;
3756 else
3757 {
3758 bool any_succ_perm_out_m1 = false;
3759 int perm_in = vertices[idx].perm_in;
3760 for (graph_edge *succ = slpg->vertices[idx].succ;
3761 succ; succ = succ->succ_next)
3762 {
3763 int succ_idx = succ->dest;
3764 int succ_perm = vertices[succ_idx].perm_out;
3765 /* Handle unvisited (and constant) nodes optimistically. */
3766 /* ??? But for constants once we want to handle
3767 non-bijective permutes we have to verify the permute,
3768 when unifying lanes, will not unify different constants.
3769 For example see gcc.dg/vect/bb-slp-14.c for a case
3770 that would break. */
3771 if (succ_perm == -1)
3772 {
3773 /* When we handled a non-leaf optimistically, note
3774 that so we can adjust its outgoing permute below. */
3775 slp_tree succ_node = vertices[succ_idx].node;
3776 if (SLP_TREE_DEF_TYPE (succ_node) != vect_external_def
3777 && SLP_TREE_DEF_TYPE (succ_node) != vect_constant_def)
3778 any_succ_perm_out_m1 = true;
3779 continue;
3780 }
3781 if (perm_in == -1)
3782 perm_in = succ_perm;
3783 else if (succ_perm == 0
3784 || !vect_slp_perms_eq (perms, perm_in, succ_perm))
3785 {
3786 perm_in = 0;
3787 break;
3788 }
3789 }
3790
3791 /* Adjust any incoming permutes we treated optimistically. */
3792 if (perm_in != -1 && any_succ_perm_out_m1)
3793 {
3794 for (graph_edge *succ = slpg->vertices[idx].succ;
3795 succ; succ = succ->succ_next)
3796 {
3797 slp_tree succ_node = vertices[succ->dest].node;
3798 if (vertices[succ->dest].perm_out == -1
3799 && SLP_TREE_DEF_TYPE (succ_node) != vect_external_def
3800 && SLP_TREE_DEF_TYPE (succ_node) != vect_constant_def)
3801 {
3802 vertices[succ->dest].perm_out = perm_in;
3803 /* And ensure this propagates. */
3804 if (vertices[succ->dest].perm_in == -1)
3805 vertices[succ->dest].perm_in = perm_in;
3806 }
3807 }
3808 changed = true;
3809 }
3810
3811 if (!vect_slp_perms_eq (perms, perm_in,
3812 vertices[idx].perm_in))
3813 {
3814 /* Make sure we eventually converge. */
3815 gcc_checking_assert (vertices[idx].perm_in == -1
3816 || perm_in == 0);
3817 vertices[idx].perm_in = perm_in;
3818
3819 /* While we can handle VEC_PERM nodes as transparent
3820 pass-throughs they can also be a cheap materialization
3821 point. In addition they can act as the source
3822 of an arbitrary permutation.
3823 The following ensures that former materialization
3824 points that now have zero incoming permutes no
3825 longer appear as such and that former "any" permutes
3826 get pass-through. We keep VEC_PERM nodes optimistic
3827 as "any" outgoing permute though. */
3828 if (vertices[idx].perm_out != 0
3829 && SLP_TREE_CODE (node) != VEC_PERM_EXPR)
3830 vertices[idx].perm_out = perm_in;
3831 changed = true;
3832 }
3833 }
3834
3835 /* Elide pruning at materialization points in the first
3836 iteration phase. */
3837 if (!do_materialization)
3838 continue;
3839
3840 int perm = vertices[idx].perm_out;
3841 if (perm == 0 || perm == -1)
3842 continue;
3843
3844 /* Decide on permute materialization. Look whether there's
3845 a use (pred) edge that is permuted differently than us.
3846 In that case mark ourselves so the permutation is applied. */
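/* For example (illustrative): if our perm_out is the permute { 1, 0 }
   but one parent consumes its operands with perm_in 0 (unpermuted),
   not all predecessors agree, so perm_out is reset to 0 below and the
   permute ends up materialized at this node.  */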
3847 bool all_preds_permuted = slpg->vertices[idx].pred != NULL;
3848 if (all_preds_permuted)
3849 for (graph_edge *pred = slpg->vertices[idx].pred;
3850 pred; pred = pred->pred_next)
3851 {
3852 int pred_perm = vertices[pred->src].perm_in;
3853 gcc_checking_assert (pred_perm != -1);
3854 if (!vect_slp_perms_eq (perms, perm, pred_perm))
3855 {
3856 all_preds_permuted = false;
3857 break;
3858 }
3859 }
3860 if (!all_preds_permuted)
3861 {
3862 vertices[idx].perm_out = 0;
3863 changed = true;
3864 }
3865 }
3866
3867 /* If the initial propagation converged, switch on materialization
3868 and re-propagate. */
3869 if (!changed && !do_materialization)
3870 {
3871 do_materialization = true;
3872 changed = true;
3873 }
3874 }
3875 while (changed);
3876 statistics_histogram_event (cfun, "SLP optimize perm iterations", iteration);
3877
3878 /* Materialize. */
3879 for (i = 0; i < vertices.length (); ++i)
3880 {
3881 int perm_in = vertices[i].perm_in;
3882 slp_tree node = vertices[i].node;
3883
3884 /* First permute invariant/external original successors; we handle
3885 those optimistically during propagation and duplicate them if
3886 they are used with different permutations. */
3887 unsigned j;
3888 slp_tree child;
3889 if (perm_in > 0)
3890 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
3891 {
3892 if (!child
3893 || (SLP_TREE_DEF_TYPE (child) != vect_constant_def
3894 && SLP_TREE_DEF_TYPE (child) != vect_external_def))
3895 continue;
3896
3897 /* If the vector is uniform there's nothing to do. */
3898 if (vect_slp_tree_uniform_p (child))
3899 continue;
3900
3901 /* We can end up sharing some externals via two_operator
3902 handling. Be prepared to unshare those. */
3903 if (child->refcnt != 1)
3904 {
3905 gcc_assert (slpg->vertices[child->vertex].pred->pred_next);
3906 SLP_TREE_CHILDREN (node)[j] = child
3907 = vect_create_new_slp_node
3908 (SLP_TREE_SCALAR_OPS (child).copy ());
3909 }
3910 vect_slp_permute (perms[perm_in],
3911 SLP_TREE_SCALAR_OPS (child), true);
3912 }
3913
3914 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
3915 {
3916 /* Apply the common permutes to the input vectors. */
3917 if (perm_in > 0)
3918 {
3919 /* If the node is already a permute node we can apply
3920 the permutation to the lane selection, effectively
3921 materializing it on the incoming vectors. */
3922 if (dump_enabled_p ())
3923 dump_printf_loc (MSG_NOTE, vect_location,
3924 "simplifying permute node %p\n",
3925 node);
3926 for (unsigned k = 0;
3927 k < SLP_TREE_LANE_PERMUTATION (node).length (); ++k)
3928 SLP_TREE_LANE_PERMUTATION (node)[k].second
3929 = perms[perm_in][SLP_TREE_LANE_PERMUTATION (node)[k].second];
3930 }
3931 /* Apply the anticipated output permute to the permute and
3932 stmt vectors. */
3933 int perm_out = vertices[i].perm_out;
3934 if (perm_out > 0)
3935 {
3936 vect_slp_permute (perms[perm_out],
3937 SLP_TREE_SCALAR_STMTS (node), true);
3938 vect_slp_permute (perms[perm_out],
3939 SLP_TREE_LANE_PERMUTATION (node), true);
3940 }
3941 }
3942 else if (vertices[i].get_perm_materialized () != 0)
3943 {
3944 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
3945 /* For loads simply drop the permutation, the load permutation
3946 already performs the desired permutation. */
3947 ;
3948 else if (SLP_TREE_LANE_PERMUTATION (node).exists ())
3949 gcc_unreachable ();
3950 else
3951 {
3952 if (dump_enabled_p ())
3953 dump_printf_loc (MSG_NOTE, vect_location,
3954 "inserting permute node in place of %p\n",
3955 node);
3956
3957 /* Make a copy of NODE and in-place change it to a
3958 VEC_PERM node to permute the lanes of the copy. */
3959 slp_tree copy = new _slp_tree;
3960 SLP_TREE_CHILDREN (copy) = SLP_TREE_CHILDREN (node);
3961 SLP_TREE_CHILDREN (node) = vNULL;
3962 SLP_TREE_SCALAR_STMTS (copy)
3963 = SLP_TREE_SCALAR_STMTS (node).copy ();
3964 vect_slp_permute (perms[perm_in],
3965 SLP_TREE_SCALAR_STMTS (copy), true);
3966 gcc_assert (!SLP_TREE_SCALAR_OPS (node).exists ());
3967 SLP_TREE_REPRESENTATIVE (copy) = SLP_TREE_REPRESENTATIVE (node);
3968 gcc_assert (!SLP_TREE_LOAD_PERMUTATION (node).exists ());
3969 SLP_TREE_LANE_PERMUTATION (copy)
3970 = SLP_TREE_LANE_PERMUTATION (node);
3971 SLP_TREE_LANE_PERMUTATION (node) = vNULL;
3972 SLP_TREE_VECTYPE (copy) = SLP_TREE_VECTYPE (node);
3973 copy->refcnt = 1;
3974 copy->max_nunits = node->max_nunits;
3975 SLP_TREE_DEF_TYPE (copy) = SLP_TREE_DEF_TYPE (node);
3976 SLP_TREE_LANES (copy) = SLP_TREE_LANES (node);
3977 SLP_TREE_CODE (copy) = SLP_TREE_CODE (node);
3978
3979 /* Now turn NODE into a VEC_PERM. */
3980 SLP_TREE_CHILDREN (node).safe_push (copy);
3981 SLP_TREE_LANE_PERMUTATION (node).create (SLP_TREE_LANES (node));
3982 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
3983 SLP_TREE_LANE_PERMUTATION (node)
3984 .quick_push (std::make_pair (0, perms[perm_in][j]));
3985 SLP_TREE_CODE (node) = VEC_PERM_EXPR;
3986 }
3987 }
3988 else if (perm_in > 0) /* perm_in == perm_out */
3989 {
3990 /* Apply the reverse permutation to our stmts. */
3991 vect_slp_permute (perms[perm_in],
3992 SLP_TREE_SCALAR_STMTS (node), true);
3993 /* And to the lane/load permutation, which we can simply
3994 make regular by design. */
3995 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
3996 {
3997 gcc_assert (!SLP_TREE_LANE_PERMUTATION (node).exists ());
3998 /* ??? When we handle non-bijective permutes the idea
3999 is that we can force the load-permutation to be
4000 { min, min + 1, min + 2, ... max }. But then the
4001 scalar defs might no longer match the lane content
4002 which means wrong-code with live lane vectorization.
4003 So we possibly have to have NULL entries for those. */
4004 vect_slp_permute (perms[perm_in],
4005 SLP_TREE_LOAD_PERMUTATION (node), true);
4006 }
4007 else if (SLP_TREE_LANE_PERMUTATION (node).exists ())
4008 gcc_unreachable ();
4009 }
4010 }
4011
4012 /* Elide any permutations at BB reduction roots. */
4013 if (is_a <bb_vec_info> (vinfo))
4014 {
4015 for (slp_instance instance : vinfo->slp_instances)
4016 {
4017 if (SLP_INSTANCE_KIND (instance) != slp_inst_kind_bb_reduc)
4018 continue;
4019 slp_tree old = SLP_INSTANCE_TREE (instance);
4020 if (SLP_TREE_CODE (old) == VEC_PERM_EXPR
4021 && SLP_TREE_CHILDREN (old).length () == 1)
4022 {
4023 slp_tree child = SLP_TREE_CHILDREN (old)[0];
4024 if (SLP_TREE_DEF_TYPE (child) == vect_external_def)
4025 {
4026 /* Preserve the special VEC_PERM we use to shield existing
4027 vector defs from the rest. But make it a no-op. */
4028 unsigned i = 0;
4029 for (std::pair<unsigned, unsigned> &p
4030 : SLP_TREE_LANE_PERMUTATION (old))
4031 p.second = i++;
4032 }
4033 else
4034 {
4035 SLP_INSTANCE_TREE (instance) = child;
4036 SLP_TREE_REF_COUNT (child)++;
4037 vect_free_slp_tree (old);
4038 }
4039 }
4040 else if (SLP_TREE_LOAD_PERMUTATION (old).exists ()
4041 && SLP_TREE_REF_COUNT (old) == 1
4042 && vertices[old->vertex].get_perm_materialized () != 0)
4043 {
4044 /* ??? For loads the situation is more complex since
4045 we can't modify the permute in place in case the
4046 node is used multiple times. In fact for loads this
4047 should be somehow handled in the propagation engine. */
4048 /* Apply the reverse permutation to our stmts. */
4049 int perm = vertices[old->vertex].get_perm_materialized ();
4050 vect_slp_permute (perms[perm],
4051 SLP_TREE_SCALAR_STMTS (old), true);
4052 vect_slp_permute (perms[perm],
4053 SLP_TREE_LOAD_PERMUTATION (old), true);
4054 }
4055 }
4056 }
4057
4058 /* Free the perms vector used for propagation. */
4059 while (!perms.is_empty ())
4060 perms.pop ().release ();
4061 free_graph (slpg);
4062
4063
4064 /* Now elide load permutations that are not necessary. */
4065 for (i = 0; i < leafs.length (); ++i)
4066 {
4067 node = vertices[leafs[i]].node;
4068 if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
4069 continue;
4070
4071 /* In basic block vectorization we allow any subchain of an interleaving
4072 chain.
4073 FORNOW: not in loop SLP because of realignment complications. */
4074 if (is_a <bb_vec_info> (vinfo))
4075 {
4076 bool subchain_p = true;
4077 stmt_vec_info next_load_info = NULL;
4078 stmt_vec_info load_info;
4079 unsigned j;
4080 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
4081 {
4082 if (j != 0
4083 && (next_load_info != load_info
4084 || DR_GROUP_GAP (load_info) != 1))
4085 {
4086 subchain_p = false;
4087 break;
4088 }
4089 next_load_info = DR_GROUP_NEXT_ELEMENT (load_info);
4090 }
4091 if (subchain_p)
4092 {
4093 SLP_TREE_LOAD_PERMUTATION (node).release ();
4094 continue;
4095 }
4096 }
4097 else
4098 {
4099 stmt_vec_info load_info;
4100 bool this_load_permuted = false;
4101 unsigned j;
4102 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
4103 if (SLP_TREE_LOAD_PERMUTATION (node)[j] != j)
4104 {
4105 this_load_permuted = true;
4106 break;
4107 }
4108 stmt_vec_info first_stmt_info
4109 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
4110 if (!this_load_permuted
4111 /* The load requires permutation when unrolling exposes
4112 a gap either because the group is larger than the SLP
4113 group-size or because there is a gap between the groups. */
4114 && (known_eq (LOOP_VINFO_VECT_FACTOR
4115 (as_a <loop_vec_info> (vinfo)), 1U)
4116 || ((SLP_TREE_LANES (node) == DR_GROUP_SIZE (first_stmt_info))
4117 && DR_GROUP_GAP (first_stmt_info) == 0)))
4118 {
4119 SLP_TREE_LOAD_PERMUTATION (node).release ();
4120 continue;
4121 }
4122 }
4123 }
4124 }
4125
4126 /* Gather loads reachable from the individual SLP graph entries. */
4127
4128 void
4129 vect_gather_slp_loads (vec_info *vinfo)
4130 {
4131 unsigned i;
4132 slp_instance instance;
4133 FOR_EACH_VEC_ELT (vinfo->slp_instances, i, instance)
4134 {
4135 hash_set<slp_tree> visited;
4136 vect_gather_slp_loads (SLP_INSTANCE_LOADS (instance),
4137 SLP_INSTANCE_TREE (instance), visited);
4138 }
4139 }
4140
4141
4142 /* For each possible SLP instance decide whether to SLP it and calculate the overall
4143 unrolling factor needed to SLP the loop. Return TRUE if decided to SLP at
4144 least one instance. */
4145
4146 bool
4147 vect_make_slp_decision (loop_vec_info loop_vinfo)
4148 {
4149 unsigned int i;
4150 poly_uint64 unrolling_factor = 1;
4151 const vec<slp_instance> &slp_instances
4152 = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
4153 slp_instance instance;
4154 int decided_to_slp = 0;
4155
4156 DUMP_VECT_SCOPE ("vect_make_slp_decision");
4157
4158 FOR_EACH_VEC_ELT (slp_instances, i, instance)
4159 {
4160 /* FORNOW: SLP if you can. */
4161 /* All unroll factors have the form:
4162
4163 GET_MODE_SIZE (vinfo->vector_mode) * X
4164
4165 for some rational X, so they must have a common multiple. */
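/* Illustrative example: instances with unrolling factors 2 and 4 can be
   covered by an overall factor of 4, and factors 2 and 3 by 6; the call
   below accumulates such a common multiple via force_common_multiple.  */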
4166 unrolling_factor
4167 = force_common_multiple (unrolling_factor,
4168 SLP_INSTANCE_UNROLLING_FACTOR (instance));
4169
4170 /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts. Later we
4171 call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
4172 loop-based vectorization. Such stmts will be marked as HYBRID. */
4173 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
4174 decided_to_slp++;
4175 }
4176
4177 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = unrolling_factor;
4178
4179 if (decided_to_slp && dump_enabled_p ())
4180 {
4181 dump_printf_loc (MSG_NOTE, vect_location,
4182 "Decided to SLP %d instances. Unrolling factor ",
4183 decided_to_slp);
4184 dump_dec (MSG_NOTE, unrolling_factor);
4185 dump_printf (MSG_NOTE, "\n");
4186 }
4187
4188 return (decided_to_slp > 0);
4189 }
4190
4191 /* Private data for vect_detect_hybrid_slp. */
4192 struct vdhs_data
4193 {
4194 loop_vec_info loop_vinfo;
4195 vec<stmt_vec_info> *worklist;
4196 };
4197
4198 /* Walker for walk_gimple_op. */
4199
4200 static tree
4201 vect_detect_hybrid_slp (tree *tp, int *, void *data)
4202 {
4203 walk_stmt_info *wi = (walk_stmt_info *)data;
4204 vdhs_data *dat = (vdhs_data *)wi->info;
4205
4206 if (wi->is_lhs)
4207 return NULL_TREE;
4208
4209 stmt_vec_info def_stmt_info = dat->loop_vinfo->lookup_def (*tp);
4210 if (!def_stmt_info)
4211 return NULL_TREE;
4212 def_stmt_info = vect_stmt_to_vectorize (def_stmt_info);
4213 if (PURE_SLP_STMT (def_stmt_info))
4214 {
4215 if (dump_enabled_p ())
4216 dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: %G",
4217 def_stmt_info->stmt);
4218 STMT_SLP_TYPE (def_stmt_info) = hybrid;
4219 dat->worklist->safe_push (def_stmt_info);
4220 }
4221
4222 return NULL_TREE;
4223 }
4224
4225 /* Check whether STMT_INFO is consumed by SLP indirectly and mark it pure_slp
4226 if so, otherwise push it to WORKLIST. */
4227
4228 static void
4229 maybe_push_to_hybrid_worklist (vec_info *vinfo,
4230 vec<stmt_vec_info> &worklist,
4231 stmt_vec_info stmt_info)
4232 {
4233 if (dump_enabled_p ())
4234 dump_printf_loc (MSG_NOTE, vect_location,
4235 "Processing hybrid candidate : %G", stmt_info->stmt);
4236 stmt_vec_info orig_info = vect_orig_stmt (stmt_info);
4237 imm_use_iterator iter2;
4238 ssa_op_iter iter1;
4239 use_operand_p use_p;
4240 def_operand_p def_p;
4241 bool any_def = false;
4242 FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_info->stmt, iter1, SSA_OP_DEF)
4243 {
4244 any_def = true;
4245 FOR_EACH_IMM_USE_FAST (use_p, iter2, DEF_FROM_PTR (def_p))
4246 {
4247 if (is_gimple_debug (USE_STMT (use_p)))
4248 continue;
4249 stmt_vec_info use_info = vinfo->lookup_stmt (USE_STMT (use_p));
4250 /* An out-of-loop use means this is a loop_vect sink. */
4251 if (!use_info)
4252 {
4253 if (dump_enabled_p ())
4254 dump_printf_loc (MSG_NOTE, vect_location,
4255 "Found loop_vect sink: %G", stmt_info->stmt);
4256 worklist.safe_push (stmt_info);
4257 return;
4258 }
4259 else if (!STMT_SLP_TYPE (vect_stmt_to_vectorize (use_info)))
4260 {
4261 if (dump_enabled_p ())
4262 dump_printf_loc (MSG_NOTE, vect_location,
4263 "Found loop_vect use: %G", use_info->stmt);
4264 worklist.safe_push (stmt_info);
4265 return;
4266 }
4267 }
4268 }
4269 /* No def means this is a loop_vect sink. */
4270 if (!any_def)
4271 {
4272 if (dump_enabled_p ())
4273 dump_printf_loc (MSG_NOTE, vect_location,
4274 "Found loop_vect sink: %G", stmt_info->stmt);
4275 worklist.safe_push (stmt_info);
4276 return;
4277 }
4278 if (dump_enabled_p ())
4279 dump_printf_loc (MSG_NOTE, vect_location,
4280 "Marked SLP consumed stmt pure: %G", stmt_info->stmt);
4281 STMT_SLP_TYPE (stmt_info) = pure_slp;
4282 }
4283
4284 /* Find stmts that must be both vectorized and SLPed. */
4285
4286 void
4287 vect_detect_hybrid_slp (loop_vec_info loop_vinfo)
4288 {
4289 DUMP_VECT_SCOPE ("vect_detect_hybrid_slp");
4290
4291 /* All stmts participating in SLP are marked pure_slp, all other
4292 stmts are loop_vect.
4293 First collect all loop_vect stmts into a worklist.
4294 SLP patterns cause not all original scalar stmts to appear in
4295 SLP_TREE_SCALAR_STMTS and thus not all of them are marked pure_slp.
4296 Rectify this here and do a backward walk over the IL only considering
4297 stmts as loop_vect when they are used by a loop_vect stmt and otherwise
4298 mark them as pure_slp. */
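/* Illustrative example: a scalar stmt whose result is consumed only by
   pure_slp stmts is marked pure_slp here even though it does not itself
   appear in any SLP_TREE_SCALAR_STMTS, while a stmt with an out-of-loop
   use or a loop_vect use stays loop_vect and seeds the worklist driving
   the hybrid marking below.  */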
4299 auto_vec<stmt_vec_info> worklist;
4300 for (int i = LOOP_VINFO_LOOP (loop_vinfo)->num_nodes - 1; i >= 0; --i)
4301 {
4302 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
4303 for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
4304 gsi_next (&gsi))
4305 {
4306 gphi *phi = gsi.phi ();
4307 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (phi);
4308 if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
4309 maybe_push_to_hybrid_worklist (loop_vinfo,
4310 worklist, stmt_info);
4311 }
4312 for (gimple_stmt_iterator gsi = gsi_last_bb (bb); !gsi_end_p (gsi);
4313 gsi_prev (&gsi))
4314 {
4315 gimple *stmt = gsi_stmt (gsi);
4316 if (is_gimple_debug (stmt))
4317 continue;
4318 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
4319 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
4320 {
4321 for (gimple_stmt_iterator gsi2
4322 = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info));
4323 !gsi_end_p (gsi2); gsi_next (&gsi2))
4324 {
4325 stmt_vec_info patt_info
4326 = loop_vinfo->lookup_stmt (gsi_stmt (gsi2));
4327 if (!STMT_SLP_TYPE (patt_info)
4328 && STMT_VINFO_RELEVANT (patt_info))
4329 maybe_push_to_hybrid_worklist (loop_vinfo,
4330 worklist, patt_info);
4331 }
4332 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
4333 }
4334 if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
4335 maybe_push_to_hybrid_worklist (loop_vinfo,
4336 worklist, stmt_info);
4337 }
4338 }
4339
4340 /* Now we have a worklist of non-SLP stmts; follow use->def chains and
4341 mark any SLP vectorized stmt as hybrid.
4342 ??? We're visiting def stmts N times (once for each non-SLP and
4343 once for each hybrid-SLP use). */
4344 walk_stmt_info wi;
4345 vdhs_data dat;
4346 dat.worklist = &worklist;
4347 dat.loop_vinfo = loop_vinfo;
4348 memset (&wi, 0, sizeof (wi));
4349 wi.info = (void *)&dat;
4350 while (!worklist.is_empty ())
4351 {
4352 stmt_vec_info stmt_info = worklist.pop ();
4353 /* Since SSA operands are not set up for pattern stmts we need
4354 to use walk_gimple_op. */
4355 wi.is_lhs = 0;
4356 walk_gimple_op (stmt_info->stmt, vect_detect_hybrid_slp, &wi);
4357 }
4358 }
4359
4360
4361 /* Initialize a bb_vec_info struct for the statements in BBS basic blocks. */
4362
4363 _bb_vec_info::_bb_vec_info (vec<basic_block> _bbs, vec_info_shared *shared)
4364 : vec_info (vec_info::bb, shared),
4365 bbs (_bbs),
4366 roots (vNULL)
4367 {
4368 for (unsigned i = 0; i < bbs.length (); ++i)
4369 {
4370 if (i != 0)
4371 for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
4372 gsi_next (&si))
4373 {
4374 gphi *phi = si.phi ();
4375 gimple_set_uid (phi, 0);
4376 add_stmt (phi);
4377 }
4378 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
4379 !gsi_end_p (gsi); gsi_next (&gsi))
4380 {
4381 gimple *stmt = gsi_stmt (gsi);
4382 gimple_set_uid (stmt, 0);
4383 if (is_gimple_debug (stmt))
4384 continue;
4385 add_stmt (stmt);
4386 }
4387 }
4388 }
4389
4390
4391 /* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
4392 stmts in the basic block. */
4393
4394 _bb_vec_info::~_bb_vec_info ()
4395 {
4396 /* Reset region marker. */
4397 for (unsigned i = 0; i < bbs.length (); ++i)
4398 {
4399 if (i != 0)
4400 for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
4401 gsi_next (&si))
4402 {
4403 gphi *phi = si.phi ();
4404 gimple_set_uid (phi, -1);
4405 }
4406 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
4407 !gsi_end_p (gsi); gsi_next (&gsi))
4408 {
4409 gimple *stmt = gsi_stmt (gsi);
4410 gimple_set_uid (stmt, -1);
4411 }
4412 }
4413
4414 for (unsigned i = 0; i < roots.length (); ++i)
4415 {
4416 roots[i].stmts.release ();
4417 roots[i].roots.release ();
4418 }
4419 roots.release ();
4420 }
4421
4422 /* Subroutine of vect_slp_analyze_node_operations. Handle the root of NODE,
4423 given that child nodes have already been processed, and that
4424 their def types currently match their SLP node's def type. */
4425
4426 static bool
4427 vect_slp_analyze_node_operations_1 (vec_info *vinfo, slp_tree node,
4428 slp_instance node_instance,
4429 stmt_vector_for_cost *cost_vec)
4430 {
4431 stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
4432
4433 /* Calculate the number of vector statements to be created for the
4434 scalar stmts in this node. For SLP reductions it is equal to the
4435 number of vector statements in the children (which has already been
4436 calculated by the recursive call). Otherwise it is the number of
4437 scalar elements in one scalar iteration (DR_GROUP_SIZE) multiplied by
4438 VF divided by the number of elements in a vector. */
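/* As a worked example (illustrative numbers): with a group size of 4, a
   vectorization factor of 4 and a vector type holding 8 elements,
   vect_get_num_vectors (4 * 4, vectype) yields 16 / 8 = 2 vector
   statements for this node.  */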
4439 if (!STMT_VINFO_DATA_REF (stmt_info)
4440 && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
4441 {
4442 for (unsigned i = 0; i < SLP_TREE_CHILDREN (node).length (); ++i)
4443 if (SLP_TREE_DEF_TYPE (SLP_TREE_CHILDREN (node)[i]) == vect_internal_def)
4444 {
4445 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
4446 = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_CHILDREN (node)[i]);
4447 break;
4448 }
4449 }
4450 else
4451 {
4452 poly_uint64 vf;
4453 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
4454 vf = loop_vinfo->vectorization_factor;
4455 else
4456 vf = 1;
4457 unsigned int group_size = SLP_TREE_LANES (node);
4458 tree vectype = SLP_TREE_VECTYPE (node);
4459 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
4460 = vect_get_num_vectors (vf * group_size, vectype);
4461 }
4462
4463 /* Handle purely internal nodes. */
4464 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
4465 return vectorizable_slp_permutation (vinfo, NULL, node, cost_vec);
4466
4467 gcc_assert (STMT_SLP_TYPE (stmt_info) != loop_vect);
4468
4469 bool dummy;
4470 return vect_analyze_stmt (vinfo, stmt_info, &dummy,
4471 node, node_instance, cost_vec);
4472 }
4473
4474 /* Try to build NODE from scalars, returning true on success.
4475 NODE_INSTANCE is the SLP instance that contains NODE. */
4476
4477 static bool
4478 vect_slp_convert_to_external (vec_info *vinfo, slp_tree node,
4479 slp_instance node_instance)
4480 {
4481 stmt_vec_info stmt_info;
4482 unsigned int i;
4483
4484 if (!is_a <bb_vec_info> (vinfo)
4485 || node == SLP_INSTANCE_TREE (node_instance)
4486 || !SLP_TREE_SCALAR_STMTS (node).exists ()
4487 || vect_contains_pattern_stmt_p (SLP_TREE_SCALAR_STMTS (node)))
4488 return false;
4489
4490 if (dump_enabled_p ())
4491 dump_printf_loc (MSG_NOTE, vect_location,
4492 "Building vector operands of %p from scalars instead\n", node);
4493
4494 /* Don't remove and free the child nodes here, since they could be
4495 referenced by other structures. The analysis and scheduling phases
4496 (need to) ignore child nodes of anything that isn't vect_internal_def. */
4497 unsigned int group_size = SLP_TREE_LANES (node);
4498 SLP_TREE_DEF_TYPE (node) = vect_external_def;
4499 SLP_TREE_SCALAR_OPS (node).safe_grow (group_size, true);
4500 SLP_TREE_LOAD_PERMUTATION (node).release ();
4501 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
4502 {
4503 tree lhs = gimple_get_lhs (vect_orig_stmt (stmt_info)->stmt);
4504 SLP_TREE_SCALAR_OPS (node)[i] = lhs;
4505 }
4506 return true;
4507 }
4508
4509 /* Compute the prologue cost for invariant or constant operands represented
4510 by NODE. */
4511
4512 static void
4513 vect_prologue_cost_for_slp (slp_tree node,
4514 stmt_vector_for_cost *cost_vec)
4515 {
4516 /* There's a special case of an existing vector, that costs nothing. */
4517 if (SLP_TREE_SCALAR_OPS (node).length () == 0
4518 && !SLP_TREE_VEC_DEFS (node).is_empty ())
4519 return;
4520 /* Without looking at the actual initializer a vector of
4521 constants can be implemented as a load from the constant pool.
4522 When all elements are the same we can use a splat. */
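/* Illustrative costing: an external vector built from { x, x, x, x } is
   costed as a single scalar_to_vec (splat), { a, b, c, d } as a
   vec_construct, and a vector of constants as a vector_load from the
   constant pool; see the record_stmt_cost call below.  */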
4523 tree vectype = SLP_TREE_VECTYPE (node);
4524 unsigned group_size = SLP_TREE_SCALAR_OPS (node).length ();
4525 unsigned num_vects_to_check;
4526 unsigned HOST_WIDE_INT const_nunits;
4527 unsigned nelt_limit;
4528 if (TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits)
4529 && ! multiple_p (const_nunits, group_size))
4530 {
4531 num_vects_to_check = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
4532 nelt_limit = const_nunits;
4533 }
4534 else
4535 {
4536 /* If either the vector has variable length or the vectors
4537 are composed of repeated whole groups we only need to
4538 cost construction once. All vectors will be the same. */
4539 num_vects_to_check = 1;
4540 nelt_limit = group_size;
4541 }
4542 tree elt = NULL_TREE;
4543 unsigned nelt = 0;
4544 for (unsigned j = 0; j < num_vects_to_check * nelt_limit; ++j)
4545 {
4546 unsigned si = j % group_size;
4547 if (nelt == 0)
4548 elt = SLP_TREE_SCALAR_OPS (node)[si];
4549 /* ??? We're just tracking whether all operands of a single
4550 vector initializer are the same, ideally we'd check if
4551 we emitted the same one already. */
4552 else if (elt != SLP_TREE_SCALAR_OPS (node)[si])
4553 elt = NULL_TREE;
4554 nelt++;
4555 if (nelt == nelt_limit)
4556 {
4557 record_stmt_cost (cost_vec, 1,
4558 SLP_TREE_DEF_TYPE (node) == vect_external_def
4559 ? (elt ? scalar_to_vec : vec_construct)
4560 : vector_load,
4561 NULL, vectype, 0, vect_prologue);
4562 nelt = 0;
4563 }
4564 }
4565 }
4566
4567 /* Analyze statements contained in SLP tree NODE after recursively analyzing
4568 the subtree. NODE_INSTANCE contains NODE and VINFO contains INSTANCE.
4569
4570 Return true if the operations are supported. */
4571
4572 static bool
4573 vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node,
4574 slp_instance node_instance,
4575 hash_set<slp_tree> &visited_set,
4576 vec<slp_tree> &visited_vec,
4577 stmt_vector_for_cost *cost_vec)
4578 {
4579 int i, j;
4580 slp_tree child;
4581
4582 /* Assume we can code-generate all invariants. */
4583 if (!node
4584 || SLP_TREE_DEF_TYPE (node) == vect_constant_def
4585 || SLP_TREE_DEF_TYPE (node) == vect_external_def)
4586 return true;
4587
4588 if (SLP_TREE_DEF_TYPE (node) == vect_uninitialized_def)
4589 {
4590 if (dump_enabled_p ())
4591 dump_printf_loc (MSG_NOTE, vect_location,
4592 "Failed cyclic SLP reference in %p\n", node);
4593 return false;
4594 }
4595 gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_internal_def);
4596
4597 /* If we already analyzed the exact same set of scalar stmts we're done.
4598 We share the generated vector stmts for those. */
4599 if (visited_set.add (node))
4600 return true;
4601 visited_vec.safe_push (node);
4602
4603 bool res = true;
4604 unsigned visited_rec_start = visited_vec.length ();
4605 unsigned cost_vec_rec_start = cost_vec->length ();
4606 bool seen_non_constant_child = false;
4607 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4608 {
4609 res = vect_slp_analyze_node_operations (vinfo, child, node_instance,
4610 visited_set, visited_vec,
4611 cost_vec);
4612 if (!res)
4613 break;
4614 if (child && SLP_TREE_DEF_TYPE (child) != vect_constant_def)
4615 seen_non_constant_child = true;
4616 }
4617 /* We're having difficulties scheduling nodes with just constant
4618 operands and no scalar stmts since we then cannot compute a stmt
4619 insertion place. */
4620 if (!seen_non_constant_child && SLP_TREE_SCALAR_STMTS (node).is_empty ())
4621 {
4622 if (dump_enabled_p ())
4623 dump_printf_loc (MSG_NOTE, vect_location,
4624 "Cannot vectorize all-constant op node %p\n", node);
4625 res = false;
4626 }
4627
4628 if (res)
4629 res = vect_slp_analyze_node_operations_1 (vinfo, node, node_instance,
4630 cost_vec);
4631 /* If analysis failed we have to pop all recursive visited nodes
4632 plus ourselves. */
4633 if (!res)
4634 {
4635 while (visited_vec.length () >= visited_rec_start)
4636 visited_set.remove (visited_vec.pop ());
4637 cost_vec->truncate (cost_vec_rec_start);
4638 }
4639
4640 /* When the node can be vectorized, cost the invariant nodes it references.
4641 This is not done in DFS order to allow the referring node's
4642 vectorizable_* calls to nail down the invariant nodes' vector type
4643 and possibly unshare it if it needs a different vector type than
4644 other referrers. */
4645 if (res)
4646 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
4647 if (child
4648 && (SLP_TREE_DEF_TYPE (child) == vect_constant_def
4649 || SLP_TREE_DEF_TYPE (child) == vect_external_def)
4650 /* Perform usual caching, note code-generation still
4651 code-gens these nodes multiple times but we expect
4652 to CSE them later. */
4653 && !visited_set.add (child))
4654 {
4655 visited_vec.safe_push (child);
4656 /* ??? After auditing more code paths make a "default"
4657 and push the vector type from NODE to all children
4658 if it is not already set. */
4659 /* Compute the number of vectors to be generated. */
4660 tree vector_type = SLP_TREE_VECTYPE (child);
4661 if (!vector_type)
4662 {
4663 /* For shifts with a scalar argument we don't need
4664 to cost or code-generate anything.
4665 ??? Represent this more explicitly. */
4666 gcc_assert ((STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (node))
4667 == shift_vec_info_type)
4668 && j == 1);
4669 continue;
4670 }
4671 unsigned group_size = SLP_TREE_LANES (child);
4672 poly_uint64 vf = 1;
4673 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
4674 vf = loop_vinfo->vectorization_factor;
4675 SLP_TREE_NUMBER_OF_VEC_STMTS (child)
4676 = vect_get_num_vectors (vf * group_size, vector_type);
4677 /* And cost them. */
4678 vect_prologue_cost_for_slp (child, cost_vec);
4679 }
4680
4681 /* If this node or any of its children can't be vectorized, try pruning
4682 the tree here rather than felling the whole thing. */
4683 if (!res && vect_slp_convert_to_external (vinfo, node, node_instance))
4684 {
4685 /* We'll need to revisit this for invariant costing and number
4686 of vectorized stmt setting. */
4687 res = true;
4688 }
4689
4690 return res;
4691 }
4692
4693 /* Mark lanes of NODE that are live outside of the basic-block vectorized
4694 region and that can be vectorized using vectorizable_live_operation
4695 with STMT_VINFO_LIVE_P. Live operations that are not handled will cause
4696 the scalar code computing them to be retained. */
4697
4698 static void
4699 vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo, slp_tree node,
4700 slp_instance instance,
4701 stmt_vector_for_cost *cost_vec,
4702 hash_set<stmt_vec_info> &svisited,
4703 hash_set<slp_tree> &visited)
4704 {
4705 if (visited.add (node))
4706 return;
4707
4708 unsigned i;
4709 stmt_vec_info stmt_info;
4710 stmt_vec_info last_stmt = vect_find_last_scalar_stmt_in_slp (node);
4711 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
4712 {
4713 if (svisited.contains (stmt_info))
4714 continue;
4715 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
4716 if (STMT_VINFO_IN_PATTERN_P (orig_stmt_info)
4717 && STMT_VINFO_RELATED_STMT (orig_stmt_info) != stmt_info)
4718 /* Only the pattern root stmt computes the original scalar value. */
4719 continue;
4720 bool mark_visited = true;
4721 gimple *orig_stmt = orig_stmt_info->stmt;
4722 ssa_op_iter op_iter;
4723 def_operand_p def_p;
4724 FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_stmt, op_iter, SSA_OP_DEF)
4725 {
4726 imm_use_iterator use_iter;
4727 gimple *use_stmt;
4728 stmt_vec_info use_stmt_info;
4729 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
4730 if (!is_gimple_debug (use_stmt))
4731 {
4732 use_stmt_info = bb_vinfo->lookup_stmt (use_stmt);
4733 if (!use_stmt_info
4734 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
4735 {
4736 STMT_VINFO_LIVE_P (stmt_info) = true;
4737 if (vectorizable_live_operation (bb_vinfo, stmt_info,
4738 NULL, node, instance, i,
4739 false, cost_vec))
4740 /* ??? So we know we can vectorize the live stmt
4741 from one SLP node. If we cannot do so from all
4742 or none consistently we'd have to record which
4743 SLP node (and lane) we want to use for the live
4744 operation. So make sure we can code-generate
4745 from all nodes. */
4746 mark_visited = false;
4747 else
4748 STMT_VINFO_LIVE_P (stmt_info) = false;
4749 break;
4750 }
4751 }
4752 /* We have to verify whether we can insert the lane extract
4753 before all uses. The following is a conservative approximation.
4754 We cannot put this into vectorizable_live_operation because
4755 iterating over all use stmts from inside a FOR_EACH_IMM_USE_STMT
4756 doesn't work.
4757 Note that while the fact that we emit code for loads at the
4758 first load should make this a non-problem, leafs we construct
4759 from scalars are vectorized after the last scalar def.
4760 ??? If we'd actually compute the insert location during
4761 analysis we could use sth less conservative than the last
4762 scalar stmt in the node for the dominance check. */
4763 /* ??? What remains is "live" uses in vector CTORs in the same
4764 SLP graph which is where those uses can end up code-generated
4765 right after their definition instead of close to their original
4766 use. But that would restrict us to code-generate lane-extracts
4767 from the latest stmt in a node. So we compensate for this
4768 during code-generation, simply not replacing uses for those
4769 hopefully rare cases. */
4770 if (STMT_VINFO_LIVE_P (stmt_info))
4771 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
4772 if (!is_gimple_debug (use_stmt)
4773 && (!(use_stmt_info = bb_vinfo->lookup_stmt (use_stmt))
4774 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
4775 && !vect_stmt_dominates_stmt_p (last_stmt->stmt, use_stmt))
4776 {
4777 if (dump_enabled_p ())
4778 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4779 "Cannot determine insertion place for "
4780 "lane extract\n");
4781 STMT_VINFO_LIVE_P (stmt_info) = false;
4782 mark_visited = true;
4783 }
4784 }
4785 if (mark_visited)
4786 svisited.add (stmt_info);
4787 }
4788
4789 slp_tree child;
4790 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4791 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
4792 vect_bb_slp_mark_live_stmts (bb_vinfo, child, instance,
4793 cost_vec, svisited, visited);
4794 }
4795
4796 /* Determine whether we can vectorize the reduction epilogue for INSTANCE. */
4797
4798 static bool
4799 vectorizable_bb_reduc_epilogue (slp_instance instance,
4800 stmt_vector_for_cost *cost_vec)
4801 {
4802 gassign *stmt = as_a <gassign *> (instance->root_stmts[0]->stmt);
4803 enum tree_code reduc_code = gimple_assign_rhs_code (stmt);
4804 if (reduc_code == MINUS_EXPR)
4805 reduc_code = PLUS_EXPR;
4806 internal_fn reduc_fn;
4807 tree vectype = SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (instance));
4808 if (!reduction_fn_for_scalar_code (reduc_code, &reduc_fn)
4809 || reduc_fn == IFN_LAST
4810 || !direct_internal_fn_supported_p (reduc_fn, vectype, OPTIMIZE_FOR_BOTH)
4811 || !useless_type_conversion_p (TREE_TYPE (gimple_assign_lhs (stmt)),
4812 TREE_TYPE (vectype)))
4813 return false;
4814
4815 /* There's no way to cost a horizontal vector reduction via REDUC_FN so
4816 cost log2 vector operations plus shuffles and one extraction. */
4817 unsigned steps = floor_log2 (vect_nunits_for_cost (vectype));
4818 record_stmt_cost (cost_vec, steps, vector_stmt, instance->root_stmts[0],
4819 vectype, 0, vect_body);
4820 record_stmt_cost (cost_vec, steps, vec_perm, instance->root_stmts[0],
4821 vectype, 0, vect_body);
4822 record_stmt_cost (cost_vec, 1, vec_to_scalar, instance->root_stmts[0],
4823 vectype, 0, vect_body);
4824 return true;
4825 }
4826
4827 /* Prune from ROOTS all stmts that are computed as part of lanes of NODE
4828 and recurse to children. */
4829
4830 static void
4831 vect_slp_prune_covered_roots (slp_tree node, hash_set<stmt_vec_info> &roots,
4832 hash_set<slp_tree> &visited)
4833 {
4834 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def
4835 || visited.add (node))
4836 return;
4837
4838 stmt_vec_info stmt;
4839 unsigned i;
4840 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
4841 roots.remove (vect_orig_stmt (stmt));
4842
4843 slp_tree child;
4844 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4845 if (child)
4846 vect_slp_prune_covered_roots (child, roots, visited);
4847 }
4848
4849 /* Analyze statements in SLP instances of VINFO. Return true if the
4850 operations are supported. */
4851
4852 bool
4853 vect_slp_analyze_operations (vec_info *vinfo)
4854 {
4855 slp_instance instance;
4856 int i;
4857
4858 DUMP_VECT_SCOPE ("vect_slp_analyze_operations");
4859
4860 hash_set<slp_tree> visited;
4861 for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
4862 {
4863 auto_vec<slp_tree> visited_vec;
4864 stmt_vector_for_cost cost_vec;
4865 cost_vec.create (2);
4866 if (is_a <bb_vec_info> (vinfo))
4867 vect_location = instance->location ();
4868 if (!vect_slp_analyze_node_operations (vinfo,
4869 SLP_INSTANCE_TREE (instance),
4870 instance, visited, visited_vec,
4871 &cost_vec)
4872 /* CTOR instances require vectorized defs for the SLP tree root. */
4873 || (SLP_INSTANCE_KIND (instance) == slp_inst_kind_ctor
4874 && (SLP_TREE_DEF_TYPE (SLP_INSTANCE_TREE (instance))
4875 != vect_internal_def))
4876 /* Check we can vectorize the reduction. */
4877 || (SLP_INSTANCE_KIND (instance) == slp_inst_kind_bb_reduc
4878 && !vectorizable_bb_reduc_epilogue (instance, &cost_vec)))
4879 {
4880 slp_tree node = SLP_INSTANCE_TREE (instance);
4881 stmt_vec_info stmt_info;
4882 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
4883 stmt_info = SLP_INSTANCE_ROOT_STMTS (instance)[0];
4884 else
4885 stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
4886 if (dump_enabled_p ())
4887 dump_printf_loc (MSG_NOTE, vect_location,
4888 "removing SLP instance operations starting from: %G",
4889 stmt_info->stmt);
4890 vect_free_slp_instance (instance);
4891 vinfo->slp_instances.ordered_remove (i);
4892 cost_vec.release ();
4893 while (!visited_vec.is_empty ())
4894 visited.remove (visited_vec.pop ());
4895 }
4896 else
4897 {
4898 i++;
4899 if (loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (vinfo))
4900 {
4901 add_stmt_costs (loop_vinfo->vector_costs, &cost_vec);
4902 cost_vec.release ();
4903 }
4904 else
4905 /* For BB vectorization remember the SLP graph entry
4906 cost for later. */
4907 instance->cost_vec = cost_vec;
4908 }
4909 }
4910
4911 /* Now look for SLP instances with a root that are covered by other
4912 instances and remove them. */
4913 hash_set<stmt_vec_info> roots;
4914 for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
4915 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
4916 roots.add (SLP_INSTANCE_ROOT_STMTS (instance)[0]);
4917 if (!roots.is_empty ())
4918 {
4919 visited.empty ();
4920 for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
4921 vect_slp_prune_covered_roots (SLP_INSTANCE_TREE (instance), roots,
4922 visited);
4923 for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
4924 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ()
4925 && !roots.contains (SLP_INSTANCE_ROOT_STMTS (instance)[0]))
4926 {
4927 stmt_vec_info root = SLP_INSTANCE_ROOT_STMTS (instance)[0];
4928 if (dump_enabled_p ())
4929 dump_printf_loc (MSG_NOTE, vect_location,
4930 "removing SLP instance operations starting "
4931 "from: %G", root->stmt);
4932 vect_free_slp_instance (instance);
4933 vinfo->slp_instances.ordered_remove (i);
4934 }
4935 else
4936 ++i;
4937 }
4938
4939 /* Compute vectorizable live stmts. */
4940 if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
4941 {
4942 hash_set<stmt_vec_info> svisited;
4943 hash_set<slp_tree> visited;
4944 for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
4945 {
4946 vect_location = instance->location ();
4947 vect_bb_slp_mark_live_stmts (bb_vinfo, SLP_INSTANCE_TREE (instance),
4948 instance, &instance->cost_vec, svisited,
4949 visited);
4950 }
4951 }
4952
4953 return !vinfo->slp_instances.is_empty ();
4954 }
4955
4956 /* Get the SLP instance leader from INSTANCE_LEADER, transitively
4957 closing any chain of leaders along the way. */
4958
4959 static slp_instance
4960 get_ultimate_leader (slp_instance instance,
4961 hash_map<slp_instance, slp_instance> &instance_leader)
4962 {
4963 auto_vec<slp_instance *, 8> chain;
4964 slp_instance *tem;
4965 while (*(tem = instance_leader.get (instance)) != instance)
4966 {
4967 chain.safe_push (tem);
4968 instance = *tem;
4969 }
4970 while (!chain.is_empty ())
4971 *chain.pop () = instance;
4972 return instance;
4973 }
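/* Illustrative example of the above: if the leader map currently records
   A -> B and B -> C with C -> C, then get_ultimate_leader (A) returns C
   and additionally rewrites the entries visited along the chain so that
   both A and B point directly at C.  */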
4974
4975 /* Worker of vect_bb_partition_graph, recurse on NODE. */
4976
4977 static void
4978 vect_bb_partition_graph_r (bb_vec_info bb_vinfo,
4979 slp_instance instance, slp_tree node,
4980 hash_map<stmt_vec_info, slp_instance> &stmt_to_instance,
4981 hash_map<slp_instance, slp_instance> &instance_leader,
4982 hash_set<slp_tree> &visited)
4983 {
4984 stmt_vec_info stmt_info;
4985 unsigned i;
4986
4987 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
4988 {
4989 bool existed_p;
4990 slp_instance &stmt_instance
4991 = stmt_to_instance.get_or_insert (stmt_info, &existed_p);
4992 if (!existed_p)
4993 ;
4994 else if (stmt_instance != instance)
4995 {
4996 /* If we run into a previously marked stmt, make the current
4997 instance the leader of that stmt's ultimate leader. This keeps the
4998 leader chain acyclic and works even when the current instance
4999 connects two previously independent graph parts. */
5000 slp_instance stmt_leader
5001 = get_ultimate_leader (stmt_instance, instance_leader);
5002 if (stmt_leader != instance)
5003 instance_leader.put (stmt_leader, instance);
5004 }
5005 stmt_instance = instance;
5006 }
5007
5008 if (!SLP_TREE_SCALAR_STMTS (node).is_empty () && visited.add (node))
5009 return;
5010
5011 slp_tree child;
5012 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
5013 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
5014 vect_bb_partition_graph_r (bb_vinfo, instance, child, stmt_to_instance,
5015 instance_leader, visited);
5016 }
5017
5018 /* Partition the SLP graph into pieces that can be costed independently. */
5019
5020 static void
5021 vect_bb_partition_graph (bb_vec_info bb_vinfo)
5022 {
5023 DUMP_VECT_SCOPE ("vect_bb_partition_graph");
5024
5025 /* First walk the SLP graph assigning each involved scalar stmt a
5026 corresponding SLP graph entry and upon visiting a previously
5027 marked stmt, make the stmt's leader the current SLP graph entry. */
5028 hash_map<stmt_vec_info, slp_instance> stmt_to_instance;
5029 hash_map<slp_instance, slp_instance> instance_leader;
5030 hash_set<slp_tree> visited;
5031 slp_instance instance;
5032 for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
5033 {
5034 instance_leader.put (instance, instance);
5035 vect_bb_partition_graph_r (bb_vinfo,
5036 instance, SLP_INSTANCE_TREE (instance),
5037 stmt_to_instance, instance_leader,
5038 visited);
5039 }
5040
5041 /* Then collect entries to each independent subgraph. */
5042 for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
5043 {
5044 slp_instance leader = get_ultimate_leader (instance, instance_leader);
5045 leader->subgraph_entries.safe_push (instance);
5046 if (dump_enabled_p ()
5047 && leader != instance)
5048 dump_printf_loc (MSG_NOTE, vect_location,
5049 "instance %p is leader of %p\n",
5050 leader, instance);
5051 }
5052 }
5053
5054 /* Compute the set of scalar stmts participating in internal and external
5055 nodes. */
5056
5057 static void
5058 vect_slp_gather_vectorized_scalar_stmts (vec_info *vinfo, slp_tree node,
5059 hash_set<slp_tree> &visited,
5060 hash_set<stmt_vec_info> &vstmts,
5061 hash_set<stmt_vec_info> &estmts)
5062 {
5063 int i;
5064 stmt_vec_info stmt_info;
5065 slp_tree child;
5066
5067 if (visited.add (node))
5068 return;
5069
5070 if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
5071 {
5072 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
5073 vstmts.add (stmt_info);
5074
5075 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
5076 if (child)
5077 vect_slp_gather_vectorized_scalar_stmts (vinfo, child, visited,
5078 vstmts, estmts);
5079 }
5080 else
5081 for (tree def : SLP_TREE_SCALAR_OPS (node))
5082 {
5083 stmt_vec_info def_stmt = vinfo->lookup_def (def);
5084 if (def_stmt)
5085 estmts.add (def_stmt);
5086 }
5087 }
5088
5089
5090 /* Compute the scalar cost of the SLP node NODE and its children,
5091 recording it in COST_VEC. Do not account defs that are marked in LIFE and
5092 update LIFE according to uses of NODE. */
5093
5094 static void
5095 vect_bb_slp_scalar_cost (vec_info *vinfo,
5096 slp_tree node, vec<bool, va_heap> *life,
5097 stmt_vector_for_cost *cost_vec,
5098 hash_set<stmt_vec_info> &vectorized_scalar_stmts,
5099 hash_set<slp_tree> &visited)
5100 {
5101 unsigned i;
5102 stmt_vec_info stmt_info;
5103 slp_tree child;
5104
5105 if (visited.add (node))
5106 return;
5107
5108 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
5109 {
5110 ssa_op_iter op_iter;
5111 def_operand_p def_p;
5112
5113 if ((*life)[i])
5114 continue;
5115
5116 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
5117 gimple *orig_stmt = orig_stmt_info->stmt;
5118
5119 /* If there is a non-vectorized use of the defs then the scalar
5120 stmt is kept live, in which case we do not account it or any
5121 required defs in the SLP children in the scalar cost. This
5122 way we make the vectorization more costly when compared to
5123 the scalar cost. */
5124 if (!STMT_VINFO_LIVE_P (stmt_info))
5125 {
5126 FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_stmt, op_iter, SSA_OP_DEF)
5127 {
5128 imm_use_iterator use_iter;
5129 gimple *use_stmt;
5130 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
5131 if (!is_gimple_debug (use_stmt))
5132 {
5133 stmt_vec_info use_stmt_info = vinfo->lookup_stmt (use_stmt);
5134 if (!use_stmt_info
5135 || !vectorized_scalar_stmts.contains (use_stmt_info))
5136 {
5137 (*life)[i] = true;
5138 break;
5139 }
5140 }
5141 }
5142 if ((*life)[i])
5143 continue;
5144 }
5145
5146 /* Count scalar stmts only once. */
5147 if (gimple_visited_p (orig_stmt))
5148 continue;
5149 gimple_set_visited (orig_stmt, true);
5150
5151 vect_cost_for_stmt kind;
5152 if (STMT_VINFO_DATA_REF (orig_stmt_info))
5153 {
5154 if (DR_IS_READ (STMT_VINFO_DATA_REF (orig_stmt_info)))
5155 kind = scalar_load;
5156 else
5157 kind = scalar_store;
5158 }
5159 else if (vect_nop_conversion_p (orig_stmt_info))
5160 continue;
5161 /* For single-argument PHIs assume coalescing which means zero cost
5162 for the scalar and the vector PHIs. This avoids artificially
5163 favoring the vector path (but may pessimize it in some cases). */
5164 else if (is_a <gphi *> (orig_stmt_info->stmt)
5165 && gimple_phi_num_args
5166 (as_a <gphi *> (orig_stmt_info->stmt)) == 1)
5167 continue;
5168 else
5169 kind = scalar_stmt;
5170 record_stmt_cost (cost_vec, 1, kind, orig_stmt_info,
5171 SLP_TREE_VECTYPE (node), 0, vect_body);
5172 }
5173
5174 auto_vec<bool, 20> subtree_life;
5175 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
5176 {
5177 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
5178 {
5179 /* Do not directly pass LIFE to the recursive call, copy it to
5180 confine changes in the callee to the current child/subtree. */
5181 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
5182 {
5183 subtree_life.safe_grow_cleared (SLP_TREE_LANES (child), true);
5184 for (unsigned j = 0;
5185 j < SLP_TREE_LANE_PERMUTATION (node).length (); ++j)
5186 {
5187 auto perm = SLP_TREE_LANE_PERMUTATION (node)[j];
5188 if (perm.first == i)
5189 subtree_life[perm.second] = (*life)[j];
5190 }
5191 }
5192 else
5193 {
5194 gcc_assert (SLP_TREE_LANES (node) == SLP_TREE_LANES (child));
5195 subtree_life.safe_splice (*life);
5196 }
5197 vect_bb_slp_scalar_cost (vinfo, child, &subtree_life, cost_vec,
5198 vectorized_scalar_stmts, visited);
5199 subtree_life.truncate (0);
5200 }
5201 }
5202 }
5203
5204 /* Comparator for the loop-index sorted cost vectors. */
5205
5206 static int
5207 li_cost_vec_cmp (const void *a_, const void *b_)
5208 {
5209 auto *a = (const std::pair<unsigned, stmt_info_for_cost *> *)a_;
5210 auto *b = (const std::pair<unsigned, stmt_info_for_cost *> *)b_;
5211 if (a->first < b->first)
5212 return -1;
5213 else if (a->first == b->first)
5214 return 0;
5215 return 1;
5216 }
5217
5218 /* Check if vectorization of the basic block is profitable for the
5219 subgraph denoted by SLP_INSTANCES. */
5220
5221 static bool
5222 vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo,
5223 vec<slp_instance> slp_instances,
5224 loop_p orig_loop)
5225 {
5226 slp_instance instance;
5227 int i;
5228 unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
5229 unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;
5230
5231 if (dump_enabled_p ())
5232 {
5233 dump_printf_loc (MSG_NOTE, vect_location, "Costing subgraph: \n");
5234 hash_set<slp_tree> visited;
5235 FOR_EACH_VEC_ELT (slp_instances, i, instance)
5236 vect_print_slp_graph (MSG_NOTE, vect_location,
5237 SLP_INSTANCE_TREE (instance), visited);
5238 }
5239
5240 /* Compute the set of scalar stmts we know will go away 'locally' when
5241 vectorizing. This used to be tracked with just PURE_SLP_STMT but that's
5242 not accurate for nodes promoted extern late or for scalar stmts that
5243 are used both in extern defs and in vectorized defs. */
5244 hash_set<stmt_vec_info> vectorized_scalar_stmts;
5245 hash_set<stmt_vec_info> scalar_stmts_in_externs;
5246 hash_set<slp_tree> visited;
5247 FOR_EACH_VEC_ELT (slp_instances, i, instance)
5248 {
5249 vect_slp_gather_vectorized_scalar_stmts (bb_vinfo,
5250 SLP_INSTANCE_TREE (instance),
5251 visited,
5252 vectorized_scalar_stmts,
5253 scalar_stmts_in_externs);
5254 for (stmt_vec_info rstmt : SLP_INSTANCE_ROOT_STMTS (instance))
5255 vectorized_scalar_stmts.add (rstmt);
5256 }
5257 /* Scalar stmts used as defs in external nodes need to be preserved, so
5258 remove them from vectorized_scalar_stmts. */
5259 for (stmt_vec_info stmt : scalar_stmts_in_externs)
5260 vectorized_scalar_stmts.remove (stmt);
5261
5262 /* Calculate scalar cost and sum the cost for the vector stmts
5263 previously collected. */
5264 stmt_vector_for_cost scalar_costs = vNULL;
5265 stmt_vector_for_cost vector_costs = vNULL;
5266 visited.empty ();
5267 FOR_EACH_VEC_ELT (slp_instances, i, instance)
5268 {
5269 auto_vec<bool, 20> life;
5270 life.safe_grow_cleared (SLP_TREE_LANES (SLP_INSTANCE_TREE (instance)),
5271 true);
5272 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
5273 record_stmt_cost (&scalar_costs,
5274 SLP_INSTANCE_ROOT_STMTS (instance).length (),
5275 scalar_stmt,
5276 SLP_INSTANCE_ROOT_STMTS (instance)[0], 0, vect_body);
5277 vect_bb_slp_scalar_cost (bb_vinfo,
5278 SLP_INSTANCE_TREE (instance),
5279 &life, &scalar_costs, vectorized_scalar_stmts,
5280 visited);
5281 vector_costs.safe_splice (instance->cost_vec);
5282 instance->cost_vec.release ();
5283 }
5284
5285 if (dump_enabled_p ())
5286 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
5287
5288 /* When costing non-loop vectorization we need to consider each covered
5289 loop independently and make sure vectorization is profitable. For
5290 now we assume a loop may not be entered or may be executed an
5291 arbitrary number of iterations (??? static information can provide
5292 more precise info here), which means we can simply cost each
5293 containing loop's stmts separately. */
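/* An illustrative sketch of the per-loop costing below; the loop numbers
   are hypothetical. If the subgraph has scalar costs in loops 2 and 3
   but vector costs only in loop 2, the cost entries are grouped by loop
   number, loop 2 is profitable only if its vector cost does not exceed
   its scalar cost, the loop 3 scalar part is skipped assuming a zero
   vector cost there, and any leftover vector-only part makes the whole
   subgraph unprofitable. */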
5294
5295 /* First produce cost vectors sorted by loop index. */
5296 auto_vec<std::pair<unsigned, stmt_info_for_cost *> >
5297 li_scalar_costs (scalar_costs.length ());
5298 auto_vec<std::pair<unsigned, stmt_info_for_cost *> >
5299 li_vector_costs (vector_costs.length ());
5300 stmt_info_for_cost *cost;
5301 FOR_EACH_VEC_ELT (scalar_costs, i, cost)
5302 {
5303 unsigned l = gimple_bb (cost->stmt_info->stmt)->loop_father->num;
5304 li_scalar_costs.quick_push (std::make_pair (l, cost));
5305 }
5306 /* Use the loop of the first scalar cost entry as a fallback in case the
5307 first vector_costs entry does not have a stmt_info associated with it. */
5308 unsigned l = li_scalar_costs[0].first;
5309 FOR_EACH_VEC_ELT (vector_costs, i, cost)
5310 {
5311 /* We inherit the loop from the previous COST; invariant, external and
5312 extract costs immediately follow the cost for the related stmt. */
5313 if (cost->stmt_info)
5314 l = gimple_bb (cost->stmt_info->stmt)->loop_father->num;
5315 li_vector_costs.quick_push (std::make_pair (l, cost));
5316 }
5317 li_scalar_costs.qsort (li_cost_vec_cmp);
5318 li_vector_costs.qsort (li_cost_vec_cmp);
5319
5320 /* Now cost the portions individually. */
5321 unsigned vi = 0;
5322 unsigned si = 0;
5323 bool profitable = true;
5324 while (si < li_scalar_costs.length ()
5325 && vi < li_vector_costs.length ())
5326 {
5327 unsigned sl = li_scalar_costs[si].first;
5328 unsigned vl = li_vector_costs[vi].first;
5329 if (sl != vl)
5330 {
5331 if (dump_enabled_p ())
5332 dump_printf_loc (MSG_NOTE, vect_location,
5333 "Scalar %d and vector %d loop part do not "
5334 "match up, skipping scalar part\n", sl, vl);
5335 /* Skip the scalar part, assuming zero cost on the vector side. */
5336 do
5337 {
5338 si++;
5339 }
5340 while (si < li_scalar_costs.length ()
5341 && li_scalar_costs[si].first == sl);
5342 continue;
5343 }
5344
5345 class vector_costs *scalar_target_cost_data = init_cost (bb_vinfo, true);
5346 do
5347 {
5348 add_stmt_cost (scalar_target_cost_data, li_scalar_costs[si].second);
5349 si++;
5350 }
5351 while (si < li_scalar_costs.length ()
5352 && li_scalar_costs[si].first == sl);
5353 unsigned dummy;
5354 finish_cost (scalar_target_cost_data, nullptr,
5355 &dummy, &scalar_cost, &dummy);
5356 delete scalar_target_cost_data;
5357
5358 /* Complete the target-specific vector cost calculation. */
5359 class vector_costs *vect_target_cost_data = init_cost (bb_vinfo, false);
5360 do
5361 {
5362 add_stmt_cost (vect_target_cost_data, li_vector_costs[vi].second);
5363 vi++;
5364 }
5365 while (vi < li_vector_costs.length ()
5366 && li_vector_costs[vi].first == vl);
5367 finish_cost (vect_target_cost_data, scalar_target_cost_data,
5368 &vec_prologue_cost, &vec_inside_cost, &vec_epilogue_cost);
5369 delete vect_target_cost_data;
5370
5371 vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;
5372
5373 if (dump_enabled_p ())
5374 {
5375 dump_printf_loc (MSG_NOTE, vect_location,
5376 "Cost model analysis for part in loop %d:\n", sl);
5377 dump_printf (MSG_NOTE, " Vector cost: %d\n",
5378 vec_inside_cost + vec_outside_cost);
5379 dump_printf (MSG_NOTE, " Scalar cost: %d\n", scalar_cost);
5380 }
5381
5382 /* Vectorization is profitable if its cost is no more than the cost of
5383 the scalar version. Note that we err on the vector side for equal cost
5384 because the cost estimate is otherwise quite pessimistic (constant uses
5385 are free on the scalar side but cost a load on the vector side, for
5386 example). */
5387 if (vec_outside_cost + vec_inside_cost > scalar_cost)
5388 {
5389 profitable = false;
5390 break;
5391 }
5392 }
5393 if (profitable && vi < li_vector_costs.length ())
5394 {
5395 if (dump_enabled_p ())
5396 dump_printf_loc (MSG_NOTE, vect_location,
5397 "Excess vector cost for part in loop %d:\n",
5398 li_vector_costs[vi].first);
5399 profitable = false;
5400 }
5401
5402 /* Unset the visited flag. This is delayed when ORIG_LOOP is set and the
5403 subgraph is profitable, to later find remaining if-converted code. */
5404 if (!orig_loop || !profitable)
5405 FOR_EACH_VEC_ELT (scalar_costs, i, cost)
5406 gimple_set_visited (cost->stmt_info->stmt, false);
5407
5408 scalar_costs.release ();
5409 vector_costs.release ();
5410
5411 return profitable;
5412 }
5413
5414 /* qsort comparator for lane defs. */
5415
5416 static int
5417 vld_cmp (const void *a_, const void *b_)
5418 {
5419 auto *a = (const std::pair<unsigned, tree> *)a_;
5420 auto *b = (const std::pair<unsigned, tree> *)b_;
5421 return a->first - b->first;
5422 }
5423
5424 /* Return true if USE_STMT is a vector lane insert into VEC, or into any
5425 vector if VEC is NULL_TREE, and set *THIS_LANE to the inserted lane. */
5426
5427 static bool
5428 vect_slp_is_lane_insert (gimple *use_stmt, tree vec, unsigned *this_lane)
5429 {
5430 gassign *use_ass = dyn_cast <gassign *> (use_stmt);
5431 if (!use_ass
5432 || gimple_assign_rhs_code (use_ass) != BIT_INSERT_EXPR
5433 || (vec
5434 ? gimple_assign_rhs1 (use_ass) != vec
5435 : ((vec = gimple_assign_rhs1 (use_ass)), false))
5436 || !useless_type_conversion_p (TREE_TYPE (TREE_TYPE (vec)),
5437 TREE_TYPE (gimple_assign_rhs2 (use_ass)))
5438 || !constant_multiple_p
5439 (tree_to_poly_uint64 (gimple_assign_rhs3 (use_ass)),
5440 tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (TREE_TYPE (vec)))),
5441 this_lane))
5442 return false;
5443 return true;
5444 }
5445
5446 /* Find any vectorizable constructors and add them to the grouped_stores
5447 array. Also record lane-insert chains and reduction chains as roots. */
5448
5449 static void
5450 vect_slp_check_for_constructors (bb_vec_info bb_vinfo)
5451 {
5452 for (unsigned i = 0; i < bb_vinfo->bbs.length (); ++i)
5453 for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[i]);
5454 !gsi_end_p (gsi); gsi_next (&gsi))
5455 {
5456 gassign *assign = dyn_cast<gassign *> (gsi_stmt (gsi));
5457 if (!assign)
5458 continue;
5459
5460 tree rhs = gimple_assign_rhs1 (assign);
5461 enum tree_code code = gimple_assign_rhs_code (assign);
5462 use_operand_p use_p;
5463 gimple *use_stmt;
5464 if (code == CONSTRUCTOR)
5465 {
5466 if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
5467 || maybe_ne (TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)),
5468 CONSTRUCTOR_NELTS (rhs))
5469 || VECTOR_TYPE_P (TREE_TYPE (CONSTRUCTOR_ELT (rhs, 0)->value))
5470 || uniform_vector_p (rhs))
5471 continue;
5472
5473 unsigned j;
5474 tree val;
5475 FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), j, val)
5476 if (TREE_CODE (val) != SSA_NAME
5477 || !bb_vinfo->lookup_def (val))
5478 break;
5479 if (j != CONSTRUCTOR_NELTS (rhs))
5480 continue;
5481
5482 stmt_vec_info stmt_info = bb_vinfo->lookup_stmt (assign);
5483 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt_info);
5484 }
5485 else if (code == BIT_INSERT_EXPR
5486 && VECTOR_TYPE_P (TREE_TYPE (rhs))
5487 && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).is_constant ()
5488 && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).to_constant () > 1
5489 && integer_zerop (gimple_assign_rhs3 (assign))
5490 && useless_type_conversion_p
5491 (TREE_TYPE (TREE_TYPE (rhs)),
5492 TREE_TYPE (gimple_assign_rhs2 (assign)))
5493 && bb_vinfo->lookup_def (gimple_assign_rhs2 (assign)))
5494 {
5495 /* We start matching at the insert to lane zero, but since the
5496 inserts need not be ordered we have to search both
5497 the def and the use chains. */
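/* A hypothetical GIMPLE shape of such a chain (SSA names made up):
     v_1 = BIT_INSERT_EXPR <v_0, a_1, 0>;
     v_2 = BIT_INSERT_EXPR <v_1, b_2, 32>;
     v_3 = BIT_INSERT_EXPR <v_2, c_3, 64>;
     v_4 = BIT_INSERT_EXPR <v_3, d_4, 96>;
   We match whichever insert writes lane zero (shown first here, though it
   may appear anywhere in the chain); the remaining lanes are collected by
   walking the use chain from its lhs and, if needed, the def chain from
   its rhs1. */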
5498 tree vectype = TREE_TYPE (rhs);
5499 unsigned nlanes = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5500 auto_vec<std::pair<unsigned, tree> > lane_defs (nlanes);
5501 auto_sbitmap lanes (nlanes);
5502 bitmap_clear (lanes);
5503 bitmap_set_bit (lanes, 0);
5504 tree def = gimple_assign_lhs (assign);
5505 lane_defs.quick_push
5506 (std::make_pair (0, gimple_assign_rhs2 (assign)));
5507 unsigned lanes_found = 1;
5508 /* Start with the use chains, the last stmt will be the root. */
5509 stmt_vec_info last = bb_vinfo->lookup_stmt (assign);
5510 vec<stmt_vec_info> roots = vNULL;
5511 roots.safe_push (last);
5512 do
5513 {
5514 use_operand_p use_p;
5515 gimple *use_stmt;
5516 if (!single_imm_use (def, &use_p, &use_stmt))
5517 break;
5518 unsigned this_lane;
5519 if (!bb_vinfo->lookup_stmt (use_stmt)
5520 || !vect_slp_is_lane_insert (use_stmt, def, &this_lane)
5521 || !bb_vinfo->lookup_def (gimple_assign_rhs2 (use_stmt)))
5522 break;
5523 if (bitmap_bit_p (lanes, this_lane))
5524 break;
5525 lanes_found++;
5526 bitmap_set_bit (lanes, this_lane);
5527 gassign *use_ass = as_a <gassign *> (use_stmt);
5528 lane_defs.quick_push (std::make_pair
5529 (this_lane, gimple_assign_rhs2 (use_ass)));
5530 last = bb_vinfo->lookup_stmt (use_ass);
5531 roots.safe_push (last);
5532 def = gimple_assign_lhs (use_ass);
5533 }
5534 while (lanes_found < nlanes);
5535 if (roots.length () > 1)
5536 std::swap(roots[0], roots[roots.length () - 1]);
5537 if (lanes_found < nlanes)
5538 {
5539 /* Now search the def chain. */
5540 def = gimple_assign_rhs1 (assign);
5541 do
5542 {
5543 if (TREE_CODE (def) != SSA_NAME
5544 || !has_single_use (def))
5545 break;
5546 gimple *def_stmt = SSA_NAME_DEF_STMT (def);
5547 unsigned this_lane;
5548 if (!bb_vinfo->lookup_stmt (def_stmt)
5549 || !vect_slp_is_lane_insert (def_stmt,
5550 NULL_TREE, &this_lane)
5551 || !bb_vinfo->lookup_def (gimple_assign_rhs2 (def_stmt)))
5552 break;
5553 if (bitmap_bit_p (lanes, this_lane))
5554 break;
5555 lanes_found++;
5556 bitmap_set_bit (lanes, this_lane);
5557 lane_defs.quick_push (std::make_pair
5558 (this_lane,
5559 gimple_assign_rhs2 (def_stmt)));
5560 roots.safe_push (bb_vinfo->lookup_stmt (def_stmt));
5561 def = gimple_assign_rhs1 (def_stmt);
5562 }
5563 while (lanes_found < nlanes);
5564 }
5565 if (lanes_found == nlanes)
5566 {
5567 /* Sort lane_defs by lane index and register the root. */
5568 lane_defs.qsort (vld_cmp);
5569 vec<stmt_vec_info> stmts;
5570 stmts.create (nlanes);
5571 for (unsigned i = 0; i < nlanes; ++i)
5572 stmts.quick_push (bb_vinfo->lookup_def (lane_defs[i].second));
5573 bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_ctor,
5574 stmts, roots));
5575 }
5576 else
5577 roots.release ();
5578 }
5579 else if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
5580 && (associative_tree_code (code) || code == MINUS_EXPR)
5581 /* ??? The flag_associative_math and TYPE_OVERFLOW_WRAPS
5582 checks pessimize a two-element reduction. PR54400.
5583 ??? In-order reduction could be handled if we only
5584 traverse one operand chain in vect_slp_linearize_chain. */
5585 && ((FLOAT_TYPE_P (TREE_TYPE (rhs)) && flag_associative_math)
5586 || (INTEGRAL_TYPE_P (TREE_TYPE (rhs))
5587 && TYPE_OVERFLOW_WRAPS (TREE_TYPE (rhs))))
5588 /* Ops with constants at the tail can be stripped here. */
5589 && TREE_CODE (rhs) == SSA_NAME
5590 && TREE_CODE (gimple_assign_rhs2 (assign)) == SSA_NAME
5591 /* Should be the chain end. */
5592 && (!single_imm_use (gimple_assign_lhs (assign),
5593 &use_p, &use_stmt)
5594 || !is_gimple_assign (use_stmt)
5595 || (gimple_assign_rhs_code (use_stmt) != code
5596 && ((code != PLUS_EXPR && code != MINUS_EXPR)
5597 || (gimple_assign_rhs_code (use_stmt)
5598 != (code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR))))))
5599 {
5600 /* We start the match at the end of a possible association
5601 chain. */
5602 auto_vec<chain_op_t> chain;
5603 auto_vec<std::pair<tree_code, gimple *> > worklist;
5604 auto_vec<gimple *> chain_stmts;
5605 gimple *code_stmt = NULL, *alt_code_stmt = NULL;
5606 if (code == MINUS_EXPR)
5607 code = PLUS_EXPR;
5608 internal_fn reduc_fn;
5609 if (!reduction_fn_for_scalar_code (code, &reduc_fn)
5610 || reduc_fn == IFN_LAST)
5611 continue;
5612 vect_slp_linearize_chain (bb_vinfo, worklist, chain, code, assign,
5613 /* ??? */
5614 code_stmt, alt_code_stmt, &chain_stmts);
5615 if (chain.length () > 1)
5616 {
5617 /* Sort the chain according to def_type and operation. */
5618 chain.sort (dt_sort_cmp, bb_vinfo);
5619 /* ??? Now we'd want to strip externals and constants
5620 but record those to be handled in the epilogue. */
5621 /* ??? For now do not allow mixing ops or externs/constants. */
5622 bool invalid = false;
5623 for (unsigned i = 0; i < chain.length (); ++i)
5624 if (chain[i].dt != vect_internal_def
5625 || chain[i].code != code)
5626 invalid = true;
5627 if (!invalid)
5628 {
5629 vec<stmt_vec_info> stmts;
5630 stmts.create (chain.length ());
5631 for (unsigned i = 0; i < chain.length (); ++i)
5632 stmts.quick_push (bb_vinfo->lookup_def (chain[i].op));
5633 vec<stmt_vec_info> roots;
5634 roots.create (chain_stmts.length ());
5635 for (unsigned i = 0; i < chain_stmts.length (); ++i)
5636 roots.quick_push (bb_vinfo->lookup_stmt (chain_stmts[i]));
5637 bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_bb_reduc,
5638 stmts, roots));
5639 }
5640 }
5641 }
5642 }
5643 }
5644
5645 /* Walk the grouped store chains and replace entries with their
5646 pattern variant if any. */
5647
5648 static void
5649 vect_fixup_store_groups_with_patterns (vec_info *vinfo)
5650 {
5651 stmt_vec_info first_element;
5652 unsigned i;
5653
5654 FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
5655 {
5656 /* We also have CTORs in this array. */
5657 if (!STMT_VINFO_GROUPED_ACCESS (first_element))
5658 continue;
5659 if (STMT_VINFO_IN_PATTERN_P (first_element))
5660 {
5661 stmt_vec_info orig = first_element;
5662 first_element = STMT_VINFO_RELATED_STMT (first_element);
5663 DR_GROUP_FIRST_ELEMENT (first_element) = first_element;
5664 DR_GROUP_SIZE (first_element) = DR_GROUP_SIZE (orig);
5665 DR_GROUP_GAP (first_element) = DR_GROUP_GAP (orig);
5666 DR_GROUP_NEXT_ELEMENT (first_element) = DR_GROUP_NEXT_ELEMENT (orig);
5667 vinfo->grouped_stores[i] = first_element;
5668 }
5669 stmt_vec_info prev = first_element;
5670 while (DR_GROUP_NEXT_ELEMENT (prev))
5671 {
5672 stmt_vec_info elt = DR_GROUP_NEXT_ELEMENT (prev);
5673 if (STMT_VINFO_IN_PATTERN_P (elt))
5674 {
5675 stmt_vec_info orig = elt;
5676 elt = STMT_VINFO_RELATED_STMT (elt);
5677 DR_GROUP_NEXT_ELEMENT (prev) = elt;
5678 DR_GROUP_GAP (elt) = DR_GROUP_GAP (orig);
5679 DR_GROUP_NEXT_ELEMENT (elt) = DR_GROUP_NEXT_ELEMENT (orig);
5680 }
5681 DR_GROUP_FIRST_ELEMENT (elt) = first_element;
5682 prev = elt;
5683 }
5684 }
5685 }
5686
5687 /* Check if the region described by BB_VINFO can be vectorized, returning
5688 true if so. When returning false, set FATAL to true if the same failure
5689 would prevent vectorization at other vector sizes, false if it is still
5690 worth trying other sizes. N_STMTS is the number of statements in the
5691 region. */
5692
5693 static bool
5694 vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal,
5695 vec<int> *dataref_groups)
5696 {
5697 DUMP_VECT_SCOPE ("vect_slp_analyze_bb");
5698
5699 slp_instance instance;
5700 int i;
5701 poly_uint64 min_vf = 2;
5702
5703 /* The first group of checks is independent of the vector size. */
5704 fatal = true;
5705
5706 /* Analyze the data references. */
5707
5708 if (!vect_analyze_data_refs (bb_vinfo, &min_vf, NULL))
5709 {
5710 if (dump_enabled_p ())
5711 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5712 "not vectorized: unhandled data-ref in basic "
5713 "block.\n");
5714 return false;
5715 }
5716
5717 if (!vect_analyze_data_ref_accesses (bb_vinfo, dataref_groups))
5718 {
5719 if (dump_enabled_p ())
5720 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5721 "not vectorized: unhandled data access in "
5722 "basic block.\n");
5723 return false;
5724 }
5725
5726 vect_slp_check_for_constructors (bb_vinfo);
5727
5728 /* If there are no grouped stores and no constructors in the region
5729 there is no need to continue with pattern recog as vect_analyze_slp
5730 will fail anyway. */
5731 if (bb_vinfo->grouped_stores.is_empty ()
5732 && bb_vinfo->roots.is_empty ())
5733 {
5734 if (dump_enabled_p ())
5735 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5736 "not vectorized: no grouped stores in "
5737 "basic block.\n");
5738 return false;
5739 }
5740
5741 /* The rest of the analysis below depends on the vector size in some way. */
5742 fatal = false;
5743
5744 vect_pattern_recog (bb_vinfo);
5745
5746 /* Update store groups from pattern processing. */
5747 vect_fixup_store_groups_with_patterns (bb_vinfo);
5748
5749 /* Check the SLP opportunities in the basic block, analyze and build SLP
5750 trees. */
5751 if (!vect_analyze_slp (bb_vinfo, n_stmts))
5752 {
5753 if (dump_enabled_p ())
5754 {
5755 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5756 "Failed to SLP the basic block.\n");
5757 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5758 "not vectorized: failed to find SLP opportunities "
5759 "in basic block.\n");
5760 }
5761 return false;
5762 }
5763
5764 /* Optimize permutations. */
5765 vect_optimize_slp (bb_vinfo);
5766
5767 /* Gather the loads reachable from the SLP graph entries. */
5768 vect_gather_slp_loads (bb_vinfo);
5769
5770 vect_record_base_alignments (bb_vinfo);
5771
5772 /* Analyze and verify the alignment of data references and the
5773 dependence in the SLP instances. */
5774 for (i = 0; BB_VINFO_SLP_INSTANCES (bb_vinfo).iterate (i, &instance); )
5775 {
5776 vect_location = instance->location ();
5777 if (! vect_slp_analyze_instance_alignment (bb_vinfo, instance)
5778 || ! vect_slp_analyze_instance_dependence (bb_vinfo, instance))
5779 {
5780 slp_tree node = SLP_INSTANCE_TREE (instance);
5781 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
5782 if (dump_enabled_p ())
5783 dump_printf_loc (MSG_NOTE, vect_location,
5784 "removing SLP instance operations starting from: %G",
5785 stmt_info->stmt);
5786 vect_free_slp_instance (instance);
5787 BB_VINFO_SLP_INSTANCES (bb_vinfo).ordered_remove (i);
5788 continue;
5789 }
5790
5791 /* Mark all the statements that we want to vectorize as pure SLP and
5792 relevant. */
5793 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
5794 vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));
5795 unsigned j;
5796 stmt_vec_info root;
5797 /* Likewise consider instance root stmts as vectorized. */
5798 FOR_EACH_VEC_ELT (SLP_INSTANCE_ROOT_STMTS (instance), j, root)
5799 STMT_SLP_TYPE (root) = pure_slp;
5800
5801 i++;
5802 }
5803 if (! BB_VINFO_SLP_INSTANCES (bb_vinfo).length ())
5804 return false;
5805
5806 if (!vect_slp_analyze_operations (bb_vinfo))
5807 {
5808 if (dump_enabled_p ())
5809 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5810 "not vectorized: bad operation in basic block.\n");
5811 return false;
5812 }
5813
5814 vect_bb_partition_graph (bb_vinfo);
5815
5816 return true;
5817 }
5818
5819 /* Subroutine of vect_slp_bb. Try to vectorize the statements for all
5820 basic blocks in BBS, returning true on success.
5821 The region has N_STMTS statements and has the datarefs given by DATAREFS. */
5822
5823 static bool
5824 vect_slp_region (vec<basic_block> bbs, vec<data_reference_p> datarefs,
5825 vec<int> *dataref_groups, unsigned int n_stmts,
5826 loop_p orig_loop)
5827 {
5828 bb_vec_info bb_vinfo;
5829 auto_vector_modes vector_modes;
5830
5831 /* Autodetect first vector size we try. */
5832 machine_mode next_vector_mode = VOIDmode;
5833 targetm.vectorize.autovectorize_vector_modes (&vector_modes, false);
5834 unsigned int mode_i = 0;
5835
5836 vec_info_shared shared;
5837
5838 machine_mode autodetected_vector_mode = VOIDmode;
5839 while (1)
5840 {
5841 bool vectorized = false;
5842 bool fatal = false;
5843 bb_vinfo = new _bb_vec_info (bbs, &shared);
5844
5845 bool first_time_p = shared.datarefs.is_empty ();
5846 BB_VINFO_DATAREFS (bb_vinfo) = datarefs;
5847 if (first_time_p)
5848 bb_vinfo->shared->save_datarefs ();
5849 else
5850 bb_vinfo->shared->check_datarefs ();
5851 bb_vinfo->vector_mode = next_vector_mode;
5852
5853 if (vect_slp_analyze_bb_1 (bb_vinfo, n_stmts, fatal, dataref_groups))
5854 {
5855 if (dump_enabled_p ())
5856 {
5857 dump_printf_loc (MSG_NOTE, vect_location,
5858 "***** Analysis succeeded with vector mode"
5859 " %s\n", GET_MODE_NAME (bb_vinfo->vector_mode));
5860 dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB part\n");
5861 }
5862
5863 bb_vinfo->shared->check_datarefs ();
5864
5865 auto_vec<slp_instance> profitable_subgraphs;
5866 for (slp_instance instance : BB_VINFO_SLP_INSTANCES (bb_vinfo))
5867 {
5868 if (instance->subgraph_entries.is_empty ())
5869 continue;
5870
5871 vect_location = instance->location ();
5872 if (!unlimited_cost_model (NULL)
5873 && !vect_bb_vectorization_profitable_p
5874 (bb_vinfo, instance->subgraph_entries, orig_loop))
5875 {
5876 if (dump_enabled_p ())
5877 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5878 "not vectorized: vectorization is not "
5879 "profitable.\n");
5880 continue;
5881 }
5882
5883 if (!dbg_cnt (vect_slp))
5884 continue;
5885
5886 profitable_subgraphs.safe_push (instance);
5887 }
5888
5889 /* When we're vectorizing an if-converted loop body with the
5890 very-cheap cost model make sure we vectorized all if-converted
5891 code. */
5892 if (!profitable_subgraphs.is_empty ()
5893 && orig_loop)
5894 {
5895 gcc_assert (bb_vinfo->bbs.length () == 1);
5896 for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[0]);
5897 !gsi_end_p (gsi); gsi_next (&gsi))
5898 {
5899 /* The costing above left us with DCEable vectorized scalar
5900 stmts having the visited flag set on profitable
5901 subgraphs. Do the delayed clearing of the flag here. */
5902 if (gimple_visited_p (gsi_stmt (gsi)))
5903 {
5904 gimple_set_visited (gsi_stmt (gsi), false);
5905 continue;
5906 }
5907 if (flag_vect_cost_model != VECT_COST_MODEL_VERY_CHEAP)
5908 continue;
5909
5910 if (gassign *ass = dyn_cast <gassign *> (gsi_stmt (gsi)))
5911 if (gimple_assign_rhs_code (ass) == COND_EXPR)
5912 {
5913 if (!profitable_subgraphs.is_empty ()
5914 && dump_enabled_p ())
5915 dump_printf_loc (MSG_NOTE, vect_location,
5916 "not profitable because of "
5917 "unprofitable if-converted scalar "
5918 "code\n");
5919 profitable_subgraphs.truncate (0);
5920 }
5921 }
5922 }
5923
5924 /* Finally schedule the profitable subgraphs. */
5925 for (slp_instance instance : profitable_subgraphs)
5926 {
5927 if (!vectorized && dump_enabled_p ())
5928 dump_printf_loc (MSG_NOTE, vect_location,
5929 "Basic block will be vectorized "
5930 "using SLP\n");
5931 vectorized = true;
5932
5933 vect_schedule_slp (bb_vinfo, instance->subgraph_entries);
5934
5935 unsigned HOST_WIDE_INT bytes;
5936 if (dump_enabled_p ())
5937 {
5938 if (GET_MODE_SIZE
5939 (bb_vinfo->vector_mode).is_constant (&bytes))
5940 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
5941 "basic block part vectorized using %wu "
5942 "byte vectors\n", bytes);
5943 else
5944 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
5945 "basic block part vectorized using "
5946 "variable length vectors\n");
5947 }
5948 }
5949 }
5950 else
5951 {
5952 if (dump_enabled_p ())
5953 dump_printf_loc (MSG_NOTE, vect_location,
5954 "***** Analysis failed with vector mode %s\n",
5955 GET_MODE_NAME (bb_vinfo->vector_mode));
5956 }
5957
5958 if (mode_i == 0)
5959 autodetected_vector_mode = bb_vinfo->vector_mode;
5960
5961 if (!fatal)
5962 while (mode_i < vector_modes.length ()
5963 && vect_chooses_same_modes_p (bb_vinfo, vector_modes[mode_i]))
5964 {
5965 if (dump_enabled_p ())
5966 dump_printf_loc (MSG_NOTE, vect_location,
5967 "***** The result for vector mode %s would"
5968 " be the same\n",
5969 GET_MODE_NAME (vector_modes[mode_i]));
5970 mode_i += 1;
5971 }
5972
5973 delete bb_vinfo;
5974
5975 if (mode_i < vector_modes.length ()
5976 && VECTOR_MODE_P (autodetected_vector_mode)
5977 && (related_vector_mode (vector_modes[mode_i],
5978 GET_MODE_INNER (autodetected_vector_mode))
5979 == autodetected_vector_mode)
5980 && (related_vector_mode (autodetected_vector_mode,
5981 GET_MODE_INNER (vector_modes[mode_i]))
5982 == vector_modes[mode_i]))
5983 {
5984 if (dump_enabled_p ())
5985 dump_printf_loc (MSG_NOTE, vect_location,
5986 "***** Skipping vector mode %s, which would"
5987 " repeat the analysis for %s\n",
5988 GET_MODE_NAME (vector_modes[mode_i]),
5989 GET_MODE_NAME (autodetected_vector_mode));
5990 mode_i += 1;
5991 }
5992
5993 if (vectorized
5994 || mode_i == vector_modes.length ()
5995 || autodetected_vector_mode == VOIDmode
5996 /* If vect_slp_analyze_bb_1 signaled that analysis for all
5997 vector sizes will fail do not bother iterating. */
5998 || fatal)
5999 return vectorized;
6000
6001 /* Try the next biggest vector size. */
6002 next_vector_mode = vector_modes[mode_i++];
6003 if (dump_enabled_p ())
6004 dump_printf_loc (MSG_NOTE, vect_location,
6005 "***** Re-trying analysis with vector mode %s\n",
6006 GET_MODE_NAME (next_vector_mode));
6007 }
6008 }
6009
6010
6011 /* Main entry for the BB vectorizer. Analyze and transform BBS, returns
6012 true if anything in the basic-block was vectorized. */
6013
6014 static bool
6015 vect_slp_bbs (const vec<basic_block> &bbs, loop_p orig_loop)
6016 {
6017 vec<data_reference_p> datarefs = vNULL;
6018 auto_vec<int> dataref_groups;
6019 int insns = 0;
6020 int current_group = 0;
6021
6022 for (unsigned i = 0; i < bbs.length (); i++)
6023 {
6024 basic_block bb = bbs[i];
6025 for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (gsi);
6026 gsi_next (&gsi))
6027 {
6028 gimple *stmt = gsi_stmt (gsi);
6029 if (is_gimple_debug (stmt))
6030 continue;
6031
6032 insns++;
6033
6034 if (gimple_location (stmt) != UNKNOWN_LOCATION)
6035 vect_location = stmt;
6036
6037 if (!vect_find_stmt_data_reference (NULL, stmt, &datarefs,
6038 &dataref_groups, current_group))
6039 ++current_group;
6040 }
6041 /* New BBs always start a new DR group. */
6042 ++current_group;
6043 }
6044
6045 return vect_slp_region (bbs, datarefs, &dataref_groups, insns, orig_loop);
6046 }
6047
6048 /* Special entry for the BB vectorizer. Analyze and transform a single
6049 if-converted BB with ORIG_LOOP's body being the not-if-converted
6050 representation. Returns true if anything in the basic-block was
6051 vectorized. */
6052
6053 bool
6054 vect_slp_if_converted_bb (basic_block bb, loop_p orig_loop)
6055 {
6056 auto_vec<basic_block> bbs;
6057 bbs.safe_push (bb);
6058 return vect_slp_bbs (bbs, orig_loop);
6059 }
6060
6061 /* Main entry for the BB vectorizer. Analyze and transform BB, returns
6062 true if anything in the basic-block was vectorized. */
6063
6064 bool
6065 vect_slp_function (function *fun)
6066 {
6067 bool r = false;
6068 int *rpo = XNEWVEC (int, n_basic_blocks_for_fn (fun));
6069 unsigned n = pre_and_rev_post_order_compute_fn (fun, NULL, rpo, false);
6070
6071 /* For the moment split the function into pieces to avoid making
6072 the iteration on the vector mode moot. Split at points we know
6073 we do not handle well, which are CFG merges (SLP discovery doesn't
6074 handle non-loop-header PHIs) and loop exits. Since pattern
6075 recog requires reverse iteration to visit uses before defs,
6076 simply chop the RPO into pieces. */
6077 auto_vec<basic_block> bbs;
6078 for (unsigned i = 0; i < n; i++)
6079 {
6080 basic_block bb = BASIC_BLOCK_FOR_FN (fun, rpo[i]);
6081 bool split = false;
6082
6083 /* Split when a BB is not dominated by the first block. */
6084 if (!bbs.is_empty ()
6085 && !dominated_by_p (CDI_DOMINATORS, bb, bbs[0]))
6086 {
6087 if (dump_enabled_p ())
6088 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6089 "splitting region at dominance boundary bb%d\n",
6090 bb->index);
6091 split = true;
6092 }
6093 /* Split when the loop determined by the first block
6094 is exited. This is because we eventually insert
6095 invariants at region begin. */
6096 else if (!bbs.is_empty ()
6097 && bbs[0]->loop_father != bb->loop_father
6098 && !flow_loop_nested_p (bbs[0]->loop_father, bb->loop_father))
6099 {
6100 if (dump_enabled_p ())
6101 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6102 "splitting region at loop %d exit at bb%d\n",
6103 bbs[0]->loop_father->num, bb->index);
6104 split = true;
6105 }
6106
6107 if (split && !bbs.is_empty ())
6108 {
6109 r |= vect_slp_bbs (bbs, NULL);
6110 bbs.truncate (0);
6111 bbs.quick_push (bb);
6112 }
6113 else
6114 bbs.safe_push (bb);
6115
6116 /* When the stmt ending this block defines a value, inserting a
6117 vector containing its definition after it would require inserting
6118 on edges. Avoid this for now. */
6119 if (gimple *last = last_stmt (bb))
6120 if (gimple_get_lhs (last)
6121 && is_ctrl_altering_stmt (last))
6122 {
6123 if (dump_enabled_p ())
6124 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6125 "splitting region at control altering "
6126 "definition %G", last);
6127 r |= vect_slp_bbs (bbs, NULL);
6128 bbs.truncate (0);
6129 }
6130 }
6131
6132 if (!bbs.is_empty ())
6133 r |= vect_slp_bbs (bbs, NULL);
6134
6135 free (rpo);
6136
6137 return r;
6138 }
6139
6140 /* Build a variable-length vector in which the elements in ELTS are repeated
6141 to fill NRESULTS vectors of type VECTOR_TYPE. Store the vectors in
6142 RESULTS and add any new instructions to SEQ.
6143
6144 The approach we use is:
6145
6146 (1) Find a vector mode VM with integer elements of mode IM.
6147
6148 (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
6149 ELTS' has mode IM. This involves creating NELTS' VIEW_CONVERT_EXPRs
6150 from small vectors to IM.
6151
6152 (3) Duplicate each ELTS'[I] into a vector of mode VM.
6153
6154 (4) Use a tree of interleaving VEC_PERM_EXPRs to create VMs with the
6155 correct byte contents.
6156
6157 (5) Use VIEW_CONVERT_EXPR to cast the final VMs to the required type.
6158
6159 We try to find the largest IM for which this sequence works, in order
6160 to cut down on the number of interleaves. */
6161
6162 void
6163 duplicate_and_interleave (vec_info *vinfo, gimple_seq *seq, tree vector_type,
6164 const vec<tree> &elts, unsigned int nresults,
6165 vec<tree> &results)
6166 {
6167 unsigned int nelts = elts.length ();
6168 tree element_type = TREE_TYPE (vector_type);
6169
6170 /* (1) Find a vector mode VM with integer elements of mode IM. */
6171 unsigned int nvectors = 1;
6172 tree new_vector_type;
6173 tree permutes[2];
6174 if (!can_duplicate_and_interleave_p (vinfo, nelts, element_type,
6175 &nvectors, &new_vector_type,
6176 permutes))
6177 gcc_unreachable ();
6178
6179 /* Get a vector type that holds ELTS[0:NELTS/NELTS']. */
6180 unsigned int partial_nelts = nelts / nvectors;
6181 tree partial_vector_type = build_vector_type (element_type, partial_nelts);
6182
6183 tree_vector_builder partial_elts;
6184 auto_vec<tree, 32> pieces (nvectors * 2);
6185 pieces.quick_grow_cleared (nvectors * 2);
6186 for (unsigned int i = 0; i < nvectors; ++i)
6187 {
6188 /* (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
6189 ELTS' has mode IM. */
6190 partial_elts.new_vector (partial_vector_type, partial_nelts, 1);
6191 for (unsigned int j = 0; j < partial_nelts; ++j)
6192 partial_elts.quick_push (elts[i * partial_nelts + j]);
6193 tree t = gimple_build_vector (seq, &partial_elts);
6194 t = gimple_build (seq, VIEW_CONVERT_EXPR,
6195 TREE_TYPE (new_vector_type), t);
6196
6197 /* (3) Duplicate each ELTS'[I] into a vector of mode VM. */
6198 pieces[i] = gimple_build_vector_from_val (seq, new_vector_type, t);
6199 }
6200
6201 /* (4) Use a tree of VEC_PERM_EXPRs to create a single VM with the
6202 correct byte contents.
6203
6204 Conceptually, we need to repeat the following operation log2(nvectors)
6205 times, where hi_start = nvectors / 2:
6206
6207 out[i * 2] = VEC_PERM_EXPR (in[i], in[i + hi_start], lo_permute);
6208 out[i * 2 + 1] = VEC_PERM_EXPR (in[i], in[i + hi_start], hi_permute);
6209
6210 However, if each input repeats every N elements and the VF is
6211 a multiple of N * 2, the HI result is the same as the LO result.
6212 This will be true for the first N1 iterations of the outer loop,
6213 followed by N2 iterations for which both the LO and HI results
6214 are needed. I.e.:
6215
6216 N1 + N2 = log2(nvectors)
6217
6218 Each "N1 iteration" doubles the number of redundant vectors and the
6219 effect of the process as a whole is to have a sequence of nvectors/2**N1
6220 vectors that repeats 2**N1 times. Rather than generate these redundant
6221 vectors, we halve the number of vectors for each N1 iteration. */
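/* An illustrative walk-through with made-up element names: for
   nvectors == 4 and splatted pieces P0 = {A,A,...}, P1 = {B,B,...},
   P2 = {C,C,...}, P3 = {D,D,...}, the first level only generates the
   LO interleaves {A,C,A,C,...} and {B,D,B,D,...} when the vector length
   is a multiple of 2 (the HI results would be identical since each
   input repeats every element), and the second level interleaves those
   into {A,B,C,D,A,B,C,D,...}. */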
6222 unsigned int in_start = 0;
6223 unsigned int out_start = nvectors;
6224 unsigned int new_nvectors = nvectors;
6225 for (unsigned int in_repeat = 1; in_repeat < nvectors; in_repeat *= 2)
6226 {
6227 unsigned int hi_start = new_nvectors / 2;
6228 unsigned int out_i = 0;
6229 for (unsigned int in_i = 0; in_i < new_nvectors; ++in_i)
6230 {
6231 if ((in_i & 1) != 0
6232 && multiple_p (TYPE_VECTOR_SUBPARTS (new_vector_type),
6233 2 * in_repeat))
6234 continue;
6235
6236 tree output = make_ssa_name (new_vector_type);
6237 tree input1 = pieces[in_start + (in_i / 2)];
6238 tree input2 = pieces[in_start + (in_i / 2) + hi_start];
6239 gassign *stmt = gimple_build_assign (output, VEC_PERM_EXPR,
6240 input1, input2,
6241 permutes[in_i & 1]);
6242 gimple_seq_add_stmt (seq, stmt);
6243 pieces[out_start + out_i] = output;
6244 out_i += 1;
6245 }
6246 std::swap (in_start, out_start);
6247 new_nvectors = out_i;
6248 }
6249
6250 /* (5) Use VIEW_CONVERT_EXPR to cast the final VM to the required type. */
6251 results.reserve (nresults);
6252 for (unsigned int i = 0; i < nresults; ++i)
6253 if (i < new_nvectors)
6254 results.quick_push (gimple_build (seq, VIEW_CONVERT_EXPR, vector_type,
6255 pieces[in_start + i]));
6256 else
6257 results.quick_push (results[i - new_nvectors]);
6258 }
6259
6260
6261 /* For constant and loop invariant defs in OP_NODE this function creates
6262 vector defs that will be used in the vectorized stmts and stores them
6263 to SLP_TREE_VEC_DEFS of OP_NODE. */
6264
6265 static void
6266 vect_create_constant_vectors (vec_info *vinfo, slp_tree op_node)
6267 {
6268 unsigned HOST_WIDE_INT nunits;
6269 tree vec_cst;
6270 unsigned j, number_of_places_left_in_vector;
6271 tree vector_type;
6272 tree vop;
6273 int group_size = op_node->ops.length ();
6274 unsigned int vec_num, i;
6275 unsigned number_of_copies = 1;
6276 bool constant_p;
6277 gimple_seq ctor_seq = NULL;
6278 auto_vec<tree, 16> permute_results;
6279
6280 /* We always want SLP_TREE_VECTYPE (op_node) here correctly set. */
6281 vector_type = SLP_TREE_VECTYPE (op_node);
6282
6283 unsigned int number_of_vectors = SLP_TREE_NUMBER_OF_VEC_STMTS (op_node);
6284 SLP_TREE_VEC_DEFS (op_node).create (number_of_vectors);
6285 auto_vec<tree> voprnds (number_of_vectors);
6286
6287 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
6288 created vectors. It is greater than 1 if unrolling is performed.
6289
6290 For example, we have two scalar operands, s1 and s2 (e.g., group of
6291 strided accesses of size two), while NUNITS is four (i.e., four scalars
6292 of this type can be packed in a vector). The output vector will contain
6293 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
6294 will be 2).
6295
6296 If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
6297 containing the operands.
6298
6299 For example, NUNITS is four as before, and the group size is 8
6300 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
6301 {s5, s6, s7, s8}. */
6302
6303 /* When using duplicate_and_interleave, we just need one element for
6304 each scalar statement. */
6305 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
6306 nunits = group_size;
6307
6308 number_of_copies = nunits * number_of_vectors / group_size;
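/* Continuing the examples above: with NUNITS == 4, GROUP_SIZE == 2 and a
   single vector to create this gives 4 * 1 / 2 == 2 copies ({s1, s2, s1, s2}),
   while GROUP_SIZE == 8 with two vectors gives 4 * 2 / 8 == 1 copy of each
   scalar. */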
6309
6310 number_of_places_left_in_vector = nunits;
6311 constant_p = true;
6312 tree_vector_builder elts (vector_type, nunits, 1);
6313 elts.quick_grow (nunits);
6314 stmt_vec_info insert_after = NULL;
6315 for (j = 0; j < number_of_copies; j++)
6316 {
6317 tree op;
6318 for (i = group_size - 1; op_node->ops.iterate (i, &op); i--)
6319 {
6320 /* Create 'vect_ = {op0,op1,...,opn}'. */
6321 number_of_places_left_in_vector--;
6322 tree orig_op = op;
6323 if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op)))
6324 {
6325 if (CONSTANT_CLASS_P (op))
6326 {
6327 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
6328 {
6329 /* Can't use VIEW_CONVERT_EXPR for booleans because
6330 of possibly different sizes of scalar value and
6331 vector element. */
6332 if (integer_zerop (op))
6333 op = build_int_cst (TREE_TYPE (vector_type), 0);
6334 else if (integer_onep (op))
6335 op = build_all_ones_cst (TREE_TYPE (vector_type));
6336 else
6337 gcc_unreachable ();
6338 }
6339 else
6340 op = fold_unary (VIEW_CONVERT_EXPR,
6341 TREE_TYPE (vector_type), op);
6342 gcc_assert (op && CONSTANT_CLASS_P (op));
6343 }
6344 else
6345 {
6346 tree new_temp = make_ssa_name (TREE_TYPE (vector_type));
6347 gimple *init_stmt;
6348 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
6349 {
6350 tree true_val
6351 = build_all_ones_cst (TREE_TYPE (vector_type));
6352 tree false_val
6353 = build_zero_cst (TREE_TYPE (vector_type));
6354 gcc_assert (INTEGRAL_TYPE_P (TREE_TYPE (op)));
6355 init_stmt = gimple_build_assign (new_temp, COND_EXPR,
6356 op, true_val,
6357 false_val);
6358 }
6359 else
6360 {
6361 op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type),
6362 op);
6363 init_stmt
6364 = gimple_build_assign (new_temp, VIEW_CONVERT_EXPR,
6365 op);
6366 }
6367 gimple_seq_add_stmt (&ctor_seq, init_stmt);
6368 op = new_temp;
6369 }
6370 }
6371 elts[number_of_places_left_in_vector] = op;
6372 if (!CONSTANT_CLASS_P (op))
6373 constant_p = false;
6374 /* For BB vectorization we have to compute an insert location
6375 when a def is inside the analyzed region since we cannot
6376 simply insert at the BB start in this case. */
6377 stmt_vec_info opdef;
6378 if (TREE_CODE (orig_op) == SSA_NAME
6379 && !SSA_NAME_IS_DEFAULT_DEF (orig_op)
6380 && is_a <bb_vec_info> (vinfo)
6381 && (opdef = vinfo->lookup_def (orig_op)))
6382 {
6383 if (!insert_after)
6384 insert_after = opdef;
6385 else
6386 insert_after = get_later_stmt (insert_after, opdef);
6387 }
6388
6389 if (number_of_places_left_in_vector == 0)
6390 {
6391 if (constant_p
6392 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
6393 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
6394 vec_cst = gimple_build_vector (&ctor_seq, &elts);
6395 else
6396 {
6397 if (permute_results.is_empty ())
6398 duplicate_and_interleave (vinfo, &ctor_seq, vector_type,
6399 elts, number_of_vectors,
6400 permute_results);
6401 vec_cst = permute_results[number_of_vectors - j - 1];
6402 }
6403 if (!gimple_seq_empty_p (ctor_seq))
6404 {
6405 if (insert_after)
6406 {
6407 gimple_stmt_iterator gsi;
6408 if (gimple_code (insert_after->stmt) == GIMPLE_PHI)
6409 {
6410 gsi = gsi_after_labels (gimple_bb (insert_after->stmt));
6411 gsi_insert_seq_before (&gsi, ctor_seq,
6412 GSI_CONTINUE_LINKING);
6413 }
6414 else if (!stmt_ends_bb_p (insert_after->stmt))
6415 {
6416 gsi = gsi_for_stmt (insert_after->stmt);
6417 gsi_insert_seq_after (&gsi, ctor_seq,
6418 GSI_CONTINUE_LINKING);
6419 }
6420 else
6421 {
6422 /* When we want to insert after a def where the
6423 defining stmt throws then insert on the fallthru
6424 edge. */
6425 edge e = find_fallthru_edge
6426 (gimple_bb (insert_after->stmt)->succs);
6427 basic_block new_bb
6428 = gsi_insert_seq_on_edge_immediate (e, ctor_seq);
6429 gcc_assert (!new_bb);
6430 }
6431 }
6432 else
6433 vinfo->insert_seq_on_entry (NULL, ctor_seq);
6434 ctor_seq = NULL;
6435 }
6436 voprnds.quick_push (vec_cst);
6437 insert_after = NULL;
6438 number_of_places_left_in_vector = nunits;
6439 constant_p = true;
6440 elts.new_vector (vector_type, nunits, 1);
6441 elts.quick_grow (nunits);
6442 }
6443 }
6444 }
6445
6446 /* Since the vectors are created in reverse order, push them back in
6447 reverse to restore the original order. */
6448 vec_num = voprnds.length ();
6449 for (j = vec_num; j != 0; j--)
6450 {
6451 vop = voprnds[j - 1];
6452 SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
6453 }
6454
6455 /* In case the VF is greater than the unrolling factor needed for the SLP
6456 group of stmts, the NUMBER_OF_VECTORS to be created is greater than
6457 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
6458 to replicate the vectors. */
6459 while (number_of_vectors > SLP_TREE_VEC_DEFS (op_node).length ())
6460 for (i = 0; SLP_TREE_VEC_DEFS (op_node).iterate (i, &vop) && i < vec_num;
6461 i++)
6462 SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
6463 }
6464
6465 /* Get the Ith vectorized definition from SLP_NODE. */
6466
6467 tree
6468 vect_get_slp_vect_def (slp_tree slp_node, unsigned i)
6469 {
6470 if (SLP_TREE_VEC_STMTS (slp_node).exists ())
6471 return gimple_get_lhs (SLP_TREE_VEC_STMTS (slp_node)[i]);
6472 else
6473 return SLP_TREE_VEC_DEFS (slp_node)[i];
6474 }
6475
6476 /* Get the vectorized definitions of SLP_NODE in *VEC_DEFS. */
6477
6478 void
6479 vect_get_slp_defs (slp_tree slp_node, vec<tree> *vec_defs)
6480 {
6481 vec_defs->create (SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node));
6482 if (SLP_TREE_DEF_TYPE (slp_node) == vect_internal_def)
6483 {
6484 unsigned j;
6485 gimple *vec_def_stmt;
6486 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (slp_node), j, vec_def_stmt)
6487 vec_defs->quick_push (gimple_get_lhs (vec_def_stmt));
6488 }
6489 else
6490 vec_defs->splice (SLP_TREE_VEC_DEFS (slp_node));
6491 }
6492
6493 /* Get N vectorized definitions for SLP_NODE. */
6494
6495 void
6496 vect_get_slp_defs (vec_info *,
6497 slp_tree slp_node, vec<vec<tree> > *vec_oprnds, unsigned n)
6498 {
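/* N == -1U requests the defs of all children of SLP_NODE. */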
6499 if (n == -1U)
6500 n = SLP_TREE_CHILDREN (slp_node).length ();
6501
6502 for (unsigned i = 0; i < n; ++i)
6503 {
6504 slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
6505 vec<tree> vec_defs = vNULL;
6506 vect_get_slp_defs (child, &vec_defs);
6507 vec_oprnds->quick_push (vec_defs);
6508 }
6509 }
6510
6511 /* Generate vector permute statements from a list of loads in DR_CHAIN.
6512 If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
6513 permute statements for the SLP node NODE. Store the number of vector
6514 permute instructions in *N_PERMS and the number of vector load
6515 instructions in *N_LOADS. If DCE_CHAIN is true, remove all definitions
6516 that were not needed. */
6517
6518 bool
6519 vect_transform_slp_perm_load (vec_info *vinfo,
6520 slp_tree node, const vec<tree> &dr_chain,
6521 gimple_stmt_iterator *gsi, poly_uint64 vf,
6522 bool analyze_only, unsigned *n_perms,
6523 unsigned int *n_loads, bool dce_chain)
6524 {
6525 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
6526 int vec_index = 0;
6527 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6528 unsigned int group_size = SLP_TREE_SCALAR_STMTS (node).length ();
6529 unsigned int mask_element;
6530 machine_mode mode;
6531
6532 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
6533 return false;
6534
6535 stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
6536
6537 mode = TYPE_MODE (vectype);
6538 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
6539
6540 /* Initialize the vect stmts of NODE to properly insert the generated
6541 stmts later. */
6542 if (! analyze_only)
6543 for (unsigned i = SLP_TREE_VEC_STMTS (node).length ();
6544 i < SLP_TREE_NUMBER_OF_VEC_STMTS (node); i++)
6545 SLP_TREE_VEC_STMTS (node).quick_push (NULL);
6546
6547 /* Generate permutation masks for every NODE. Number of masks for each NODE
6548 is equal to GROUP_SIZE.
6549 E.g., we have a group of three nodes with three loads from the same
6550 location in each node, and the vector size is 4. I.e., we have an
6551 a0b0c0a1b1c1... sequence and we need to create the following vectors:
6552 for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
6553 for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
6554 ...
6555
6556 The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
6557 The last mask is illegal since we assume two operands for permute
6558 operation, and the mask element values can't be outside that range.
6559 Hence, the last mask must be converted into {2,5,5,5}.
6560 For the first two permutations we need the first and the second input
6561 vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
6562 we need the second and the third vectors: {b1,c1,a2,b2} and
6563 {c2,a3,b3,c3}. */
6564
6565 int vect_stmts_counter = 0;
6566 unsigned int index = 0;
6567 int first_vec_index = -1;
6568 int second_vec_index = -1;
6569 bool noop_p = true;
6570 *n_perms = 0;
6571
6572 vec_perm_builder mask;
6573 unsigned int nelts_to_build;
6574 unsigned int nvectors_per_build;
6575 unsigned int in_nlanes;
6576 bool repeating_p = (group_size == DR_GROUP_SIZE (stmt_info)
6577 && multiple_p (nunits, group_size));
6578 if (repeating_p)
6579 {
6580 /* A single vector contains a whole number of copies of the node, so:
6581 (a) all permutes can use the same mask; and
6582 (b) the permutes only need a single vector input. */
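/* E.g. (illustrative): for GROUP_SIZE == 2 and a load permutation of
   { 1, 0 } the loop below builds the stepped encoding
   { 1, 0, 3, 2, 5, 4 }, i.e. two patterns with three elements each,
   which expands to swapping each pair of lanes in the vector. */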
6583 mask.new_vector (nunits, group_size, 3);
6584 nelts_to_build = mask.encoded_nelts ();
6585 nvectors_per_build = SLP_TREE_VEC_STMTS (node).length ();
6586 in_nlanes = DR_GROUP_SIZE (stmt_info) * 3;
6587 }
6588 else
6589 {
6590 /* We need to construct a separate mask for each vector statement. */
6591 unsigned HOST_WIDE_INT const_nunits, const_vf;
6592 if (!nunits.is_constant (&const_nunits)
6593 || !vf.is_constant (&const_vf))
6594 return false;
6595 mask.new_vector (const_nunits, const_nunits, 1);
6596 nelts_to_build = const_vf * group_size;
6597 nvectors_per_build = 1;
6598 in_nlanes = const_vf * DR_GROUP_SIZE (stmt_info);
6599 }
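/* USED_IN_LANES records which lanes of the input group are referenced
   at all (used to compute *N_LOADS below); USED_DEFS records which
   DR_CHAIN entries feed a generated permute so that, with DCE_CHAIN,
   the unused definitions can be removed at the end. */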
6600 auto_sbitmap used_in_lanes (in_nlanes);
6601 bitmap_clear (used_in_lanes);
6602 auto_bitmap used_defs;
6603
6604 unsigned int count = mask.encoded_nelts ();
6605 mask.quick_grow (count);
6606 vec_perm_indices indices;
6607
6608 for (unsigned int j = 0; j < nelts_to_build; j++)
6609 {
6610 unsigned int iter_num = j / group_size;
6611 unsigned int stmt_num = j % group_size;
6612 unsigned int i = (iter_num * DR_GROUP_SIZE (stmt_info)
6613 + SLP_TREE_LOAD_PERMUTATION (node)[stmt_num]);
6614 bitmap_set_bit (used_in_lanes, i);
6615 if (repeating_p)
6616 {
6617 first_vec_index = 0;
6618 mask_element = i;
6619 }
6620 else
6621 {
6622 /* Enforced before the loop when !repeating_p. */
6623 unsigned int const_nunits = nunits.to_constant ();
6624 vec_index = i / const_nunits;
6625 mask_element = i % const_nunits;
6626 if (vec_index == first_vec_index
6627 || first_vec_index == -1)
6628 {
6629 first_vec_index = vec_index;
6630 }
6631 else if (vec_index == second_vec_index
6632 || second_vec_index == -1)
6633 {
6634 second_vec_index = vec_index;
6635 mask_element += const_nunits;
6636 }
6637 else
6638 {
6639 if (dump_enabled_p ())
6640 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6641 "permutation requires at "
6642 "least three vectors %G",
6643 stmt_info->stmt);
6644 gcc_assert (analyze_only);
6645 return false;
6646 }
6647
6648 gcc_assert (mask_element < 2 * const_nunits);
6649 }
6650
6651 if (mask_element != index)
6652 noop_p = false;
6653 mask[index++] = mask_element;
6654
6655 if (index == count && !noop_p)
6656 {
6657 indices.new_vector (mask, second_vec_index == -1 ? 1 : 2, nunits);
6658 if (!can_vec_perm_const_p (mode, indices))
6659 {
6660 if (dump_enabled_p ())
6661 {
6662 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
6663 vect_location,
6664 "unsupported vect permute { ");
6665 for (i = 0; i < count; ++i)
6666 {
6667 dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
6668 dump_printf (MSG_MISSED_OPTIMIZATION, " ");
6669 }
6670 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
6671 }
6672 gcc_assert (analyze_only);
6673 return false;
6674 }
6675
6676 ++*n_perms;
6677 }
6678
6679 if (index == count)
6680 {
6681 if (!analyze_only)
6682 {
6683 tree mask_vec = NULL_TREE;
6684
6685 if (! noop_p)
6686 mask_vec = vect_gen_perm_mask_checked (vectype, indices);
6687
6688 if (second_vec_index == -1)
6689 second_vec_index = first_vec_index;
6690
6691 for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
6692 {
6693 /* Generate the permute statement if necessary. */
6694 tree first_vec = dr_chain[first_vec_index + ri];
6695 tree second_vec = dr_chain[second_vec_index + ri];
6696 gimple *perm_stmt;
6697 if (! noop_p)
6698 {
6699 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
6700 tree perm_dest
6701 = vect_create_destination_var (gimple_assign_lhs (stmt),
6702 vectype);
6703 perm_dest = make_ssa_name (perm_dest);
6704 perm_stmt
6705 = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
6706 first_vec, second_vec,
6707 mask_vec);
6708 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt,
6709 gsi);
6710 if (dce_chain)
6711 {
6712 bitmap_set_bit (used_defs, first_vec_index + ri);
6713 bitmap_set_bit (used_defs, second_vec_index + ri);
6714 }
6715 }
6716 else
6717 {
6718 /* If mask was NULL_TREE generate the requested
6719 identity transform. */
6720 perm_stmt = SSA_NAME_DEF_STMT (first_vec);
6721 if (dce_chain)
6722 bitmap_set_bit (used_defs, first_vec_index + ri);
6723 }
6724
6725 /* Store the vector statement in NODE. */
6726 SLP_TREE_VEC_STMTS (node)[vect_stmts_counter++] = perm_stmt;
6727 }
6728 }
6729
6730 index = 0;
6731 first_vec_index = -1;
6732 second_vec_index = -1;
6733 noop_p = true;
6734 }
6735 }
6736
6737 if (n_loads)
6738 {
6739 if (repeating_p)
6740 *n_loads = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
6741 else
6742 {
6743 /* Enforced above when !repeating_p. */
6744 unsigned int const_nunits = nunits.to_constant ();
6745 *n_loads = 0;
6746 bool load_seen = false;
6747 for (unsigned i = 0; i < in_nlanes; ++i)
6748 {
6749 if (i % const_nunits == 0)
6750 {
6751 if (load_seen)
6752 *n_loads += 1;
6753 load_seen = false;
6754 }
6755 if (bitmap_bit_p (used_in_lanes, i))
6756 load_seen = true;
6757 }
6758 if (load_seen)
6759 *n_loads += 1;
6760 }
6761 }
6762
6763 if (dce_chain)
6764 for (unsigned i = 0; i < dr_chain.length (); ++i)
6765 if (!bitmap_bit_p (used_defs, i))
6766 {
6767 gimple *stmt = SSA_NAME_DEF_STMT (dr_chain[i]);
6768 gimple_stmt_iterator rgsi = gsi_for_stmt (stmt);
6769 gsi_remove (&rgsi, true);
6770 release_defs (stmt);
6771 }
6772
6773 return true;
6774 }
6775
6776 /* Produce the next vector result for SLP permutation NODE by adding a vector
6777 statement at GSI. If MASK_VEC is nonnull, add:
6778
6779 <new SSA name> = VEC_PERM_EXPR <FIRST_DEF, SECOND_DEF, MASK_VEC>
6780
6781 otherwise add:
6782
6783 <new SSA name> = FIRST_DEF. */
6784
6785 static void
6786 vect_add_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
6787 slp_tree node, tree first_def, tree second_def,
6788 tree mask_vec)
6789 {
6790 tree vectype = SLP_TREE_VECTYPE (node);
6791
6792 /* ??? We SLP match existing vector element extracts but
6793 allow punning, which we need to re-instantiate at uses
6794 but have no good way of representing explicitly. */
6795 if (!types_compatible_p (TREE_TYPE (first_def), vectype))
6796 {
6797 gassign *conv_stmt
6798 = gimple_build_assign (make_ssa_name (vectype),
6799 build1 (VIEW_CONVERT_EXPR, vectype, first_def));
6800 vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi);
6801 first_def = gimple_assign_lhs (conv_stmt);
6802 }
6803 gassign *perm_stmt;
6804 tree perm_dest = make_ssa_name (vectype);
6805 if (mask_vec)
6806 {
6807 if (!types_compatible_p (TREE_TYPE (second_def), vectype))
6808 {
6809 gassign *conv_stmt
6810 = gimple_build_assign (make_ssa_name (vectype),
6811 build1 (VIEW_CONVERT_EXPR,
6812 vectype, second_def));
6813 vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi);
6814 second_def = gimple_assign_lhs (conv_stmt);
6815 }
6816 perm_stmt = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
6817 first_def, second_def,
6818 mask_vec);
6819 }
6820 else
6821 /* We need a copy here in case the def was external. */
6822 perm_stmt = gimple_build_assign (perm_dest, first_def);
6823 vect_finish_stmt_generation (vinfo, NULL, perm_stmt, gsi);
6824 /* Store the vector statement in NODE. */
6825 SLP_TREE_VEC_STMTS (node).quick_push (perm_stmt);
6826 }
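/* Editor's note (illustration, not part of the original source): with
   hypothetical V4SI vectors and a non-identity MASK_VEC, the function above
   emits GIMPLE of the form

     vect_perm_1 = VEC_PERM_EXPR <first_def_2, second_def_3, { 0, 5, 2, 7 }>;

   while with a NULL_TREE MASK_VEC it emits a plain copy

     vect_perm_1 = first_def_2;

   The SSA names and mask values here are hypothetical placeholders.  */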
6827
6828 /* Vectorize the SLP permutations in NODE as specified
6829 in SLP_TREE_LANE_PERMUTATION which is a vector of pairs of SLP
6830 child number and lane number.
6831 Interleaving of two two-lane two-child SLP subtrees (not supported):
6832 [ { 0, 0 }, { 1, 0 }, { 0, 1 }, { 1, 1 } ]
6833 A blend of two four-lane two-child SLP subtrees:
6834 [ { 0, 0 }, { 1, 1 }, { 0, 2 }, { 1, 3 } ]
6835 Highpart of a four-lane one-child SLP subtree (not supported):
6836 [ { 0, 2 }, { 0, 3 } ]
6837 Currently only a subset of these is supported by the code generation below. */
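/* Editor's worked example (hypothetical V4SI children, not part of the
   original source): for the supported blend
     [ { 0, 0 }, { 1, 1 }, { 0, 2 }, { 1, 3 } ]
   each child contributes one vector and the permute mask built below
   indexes their concatenation, giving { 0, 5, 2, 7 }; that mask is then
   validated with can_vec_perm_const_p and emitted as a VEC_PERM_EXPR.  */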
6838
6839 static bool
6840 vectorizable_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
6841 slp_tree node, stmt_vector_for_cost *cost_vec)
6842 {
6843 tree vectype = SLP_TREE_VECTYPE (node);
6844
6845 /* ??? We currently only support the case of all input and output vector
6846 types being the same, while the SLP IL should really do a concat + select
6847 and thus accept arbitrary mismatches. */
6848 slp_tree child;
6849 unsigned i;
6850 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
6851 bool repeating_p = multiple_p (nunits, SLP_TREE_LANES (node));
6852 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6853 {
6854 if (!vect_maybe_update_slp_op_vectype (child, vectype)
6855 || !types_compatible_p (SLP_TREE_VECTYPE (child), vectype))
6856 {
6857 if (dump_enabled_p ())
6858 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6859 "Unsupported lane permutation\n");
6860 return false;
6861 }
6862 if (SLP_TREE_LANES (child) != SLP_TREE_LANES (node))
6863 repeating_p = false;
6864 }
6865
6866 vec<std::pair<unsigned, unsigned> > &perm = SLP_TREE_LANE_PERMUTATION (node);
6867 gcc_assert (perm.length () == SLP_TREE_LANES (node));
6868 if (dump_enabled_p ())
6869 {
6870 dump_printf_loc (MSG_NOTE, vect_location,
6871 "vectorizing permutation");
6872 for (unsigned i = 0; i < perm.length (); ++i)
6873 dump_printf (MSG_NOTE, " op%u[%u]", perm[i].first, perm[i].second);
6874 if (repeating_p)
6875 dump_printf (MSG_NOTE, " (repeat %d)\n", SLP_TREE_LANES (node));
6876 dump_printf (MSG_NOTE, "\n");
6877 }
6878
6879 /* REPEATING_P is true if every output vector is guaranteed to use the
6880 same permute vector. We can handle that case for both variable-length
6881 and constant-length vectors, but we only handle other cases for
6882 constant-length vectors.
6883
6884 Set:
6885
6886 - NPATTERNS and NELTS_PER_PATTERN to the encoding of the permute
6887 mask vector that we want to build.
6888
6889 - NCOPIES to the number of copies of PERM that we need in order
6890 to build the necessary permute mask vectors.
6891
6892 - NOUTPUTS_PER_MASK to the number of output vectors we want to create
6893 for each permute mask vector. This is only relevant when GSI is
6894 nonnull. */
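/* Editor's worked example (hypothetical numbers, not part of the original
   source): with V4SI vectors a 2-lane node has REPEATING_P set and gets
   NPATTERNS = 2, NELTS_PER_PATTERN = NCOPIES = 3 and NOUTPUTS_PER_MASK
   equal to SLP_TREE_NUMBER_OF_VEC_STMTS; a 3-lane node does not, so it
   falls back to NPATTERNS = 4 (the number of vector elements),
   NELTS_PER_PATTERN = 1 and, in a loop with vectorization factor 4,
   NCOPIES = 4 with NOUTPUTS_PER_MASK = 1.  */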
6895 uint64_t npatterns;
6896 unsigned nelts_per_pattern;
6897 uint64_t ncopies;
6898 unsigned noutputs_per_mask;
6899 if (repeating_p)
6900 {
6901 /* We need a single permute mask vector that has the form:
6902
6903 { X1, ..., Xn, X1 + n, ..., Xn + n, X1 + 2n, ..., Xn + 2n, ... }
6904
6905 In other words, the original n-element permute in PERM is
6906 "unrolled" to fill a full vector. The stepped vector encoding
6907 that we use for permutes requires 3n elements. */
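/* Editor's illustration (hypothetical values): for a 2-lane permute
   selecting lanes { 1, 0 } with V4SI vectors the encoded mask is
   { 1, 0, 3, 2, 5, 4 } (n = 2, so 3n = 6 encoded elements), which expands
   to the full permute mask { 1, 0, 3, 2 } for every output vector.  */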
6908 npatterns = SLP_TREE_LANES (node);
6909 nelts_per_pattern = ncopies = 3;
6910 noutputs_per_mask = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
6911 }
6912 else
6913 {
6914 /* Calculate every element of every permute mask vector explicitly,
6915 instead of relying on the pattern described above. */
6916 if (!nunits.is_constant (&npatterns))
6917 return false;
6918 nelts_per_pattern = ncopies = 1;
6919 if (loop_vec_info linfo = dyn_cast <loop_vec_info> (vinfo))
6920 if (!LOOP_VINFO_VECT_FACTOR (linfo).is_constant (&ncopies))
6921 return false;
6922 noutputs_per_mask = 1;
6923 }
6924 unsigned olanes = ncopies * SLP_TREE_LANES (node);
6925 gcc_assert (repeating_p || multiple_p (olanes, nunits));
6926
6927 /* Compute the { { SLP operand, vector index }, lane } permutation sequence
6928 from the { SLP operand, scalar lane } permutation recorded in the
6929 SLP node, as an intermediate step. This part should already work
6930 with SLP children with an arbitrary number of lanes. */
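/* Editor's note (illustration): in the non-repeating case a scalar lane L
   of SLP operand P maps to vector index L / vnunits and vector lane
   L % vnunits of that operand, so with hypothetical V4SI children scalar
   lane 6 of operand 0 is shown as vops0[1][2] in the dump below.  */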
6931 auto_vec<std::pair<std::pair<unsigned, unsigned>, unsigned> > vperm;
6932 auto_vec<unsigned> active_lane;
6933 vperm.create (olanes);
6934 active_lane.safe_grow_cleared (SLP_TREE_CHILDREN (node).length (), true);
6935 for (unsigned i = 0; i < ncopies; ++i)
6936 {
6937 for (unsigned pi = 0; pi < perm.length (); ++pi)
6938 {
6939 std::pair<unsigned, unsigned> p = perm[pi];
6940 tree vtype = SLP_TREE_VECTYPE (SLP_TREE_CHILDREN (node)[p.first]);
6941 if (repeating_p)
6942 vperm.quick_push ({{p.first, 0}, p.second + active_lane[p.first]});
6943 else
6944 {
6945 /* We checked above that the vectors are constant-length. */
6946 unsigned vnunits = TYPE_VECTOR_SUBPARTS (vtype).to_constant ();
6947 unsigned vi = (active_lane[p.first] + p.second) / vnunits;
6948 unsigned vl = (active_lane[p.first] + p.second) % vnunits;
6949 vperm.quick_push ({{p.first, vi}, vl});
6950 }
6951 }
6952 /* Advance to the next group. */
6953 for (unsigned j = 0; j < SLP_TREE_CHILDREN (node).length (); ++j)
6954 active_lane[j] += SLP_TREE_LANES (SLP_TREE_CHILDREN (node)[j]);
6955 }
6956
6957 if (dump_enabled_p ())
6958 {
6959 dump_printf_loc (MSG_NOTE, vect_location, "as");
6960 for (unsigned i = 0; i < vperm.length (); ++i)
6961 {
6962 if (i != 0
6963 && (repeating_p
6964 ? multiple_p (i, npatterns)
6965 : multiple_p (i, TYPE_VECTOR_SUBPARTS (vectype))))
6966 dump_printf (MSG_NOTE, ",");
6967 dump_printf (MSG_NOTE, " vops%u[%u][%u]",
6968 vperm[i].first.first, vperm[i].first.second,
6969 vperm[i].second);
6970 }
6971 dump_printf (MSG_NOTE, "\n");
6972 }
6973
6974 /* We can only handle two-vector permutes; everything else should
6975 be lowered on the SLP level. The following is closely inspired
6976 by vect_transform_slp_perm_load and is supposed to eventually
6977 replace it.
6978 ??? As an intermediate step, do code-gen in the SLP tree
6979 representation somehow? */
6980 std::pair<unsigned, unsigned> first_vec = std::make_pair (-1U, -1U);
6981 std::pair<unsigned, unsigned> second_vec = std::make_pair (-1U, -1U);
6982 unsigned int index = 0;
6983 poly_uint64 mask_element;
6984 vec_perm_builder mask;
6985 mask.new_vector (nunits, npatterns, nelts_per_pattern);
6986 unsigned int count = mask.encoded_nelts ();
6987 mask.quick_grow (count);
6988 vec_perm_indices indices;
6989 unsigned nperms = 0;
6990 for (unsigned i = 0; i < vperm.length (); ++i)
6991 {
6992 mask_element = vperm[i].second;
6993 if (first_vec.first == -1U
6994 || first_vec == vperm[i].first)
6995 first_vec = vperm[i].first;
6996 else if (second_vec.first == -1U
6997 || second_vec == vperm[i].first)
6998 {
6999 second_vec = vperm[i].first;
7000 mask_element += nunits;
7001 }
7002 else
7003 {
7004 if (dump_enabled_p ())
7005 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7006 "permutation requires at "
7007 "least three vectors\n");
7008 gcc_assert (!gsi);
7009 return false;
7010 }
7011
7012 mask[index++] = mask_element;
7013
7014 if (index == count)
7015 {
7016 indices.new_vector (mask, second_vec.first == -1U ? 1 : 2, nunits);
7017 bool identity_p = indices.series_p (0, 1, 0, 1);
7018 if (!identity_p
7019 && !can_vec_perm_const_p (TYPE_MODE (vectype), indices))
7020 {
7021 if (dump_enabled_p ())
7022 {
7023 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
7024 vect_location,
7025 "unsupported vect permute { ");
7026 for (i = 0; i < count; ++i)
7027 {
7028 dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
7029 dump_printf (MSG_MISSED_OPTIMIZATION, " ");
7030 }
7031 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
7032 }
7033 gcc_assert (!gsi);
7034 return false;
7035 }
7036
7037 if (!identity_p)
7038 nperms++;
7039 if (gsi)
7040 {
7041 if (second_vec.first == -1U)
7042 second_vec = first_vec;
7043
7044 slp_tree
7045 first_node = SLP_TREE_CHILDREN (node)[first_vec.first],
7046 second_node = SLP_TREE_CHILDREN (node)[second_vec.first];
7047
7048 tree mask_vec = NULL_TREE;
7049 if (!identity_p)
7050 mask_vec = vect_gen_perm_mask_checked (vectype, indices);
7051
7052 for (unsigned int vi = 0; vi < noutputs_per_mask; ++vi)
7053 {
7054 tree first_def
7055 = vect_get_slp_vect_def (first_node,
7056 first_vec.second + vi);
7057 tree second_def
7058 = vect_get_slp_vect_def (second_node,
7059 second_vec.second + vi);
7060 vect_add_slp_permutation (vinfo, gsi, node, first_def,
7061 second_def, mask_vec);
7062 }
7063 }
7064
7065 index = 0;
7066 first_vec = std::make_pair (-1U, -1U);
7067 second_vec = std::make_pair (-1U, -1U);
7068 }
7069 }
7070
7071 if (!gsi)
7072 record_stmt_cost (cost_vec, nperms, vec_perm, NULL, vectype, 0, vect_body);
7073
7074 return true;
7075 }
7076
7077 /* Vectorize SLP NODE. */
7078
7079 static void
7080 vect_schedule_slp_node (vec_info *vinfo,
7081 slp_tree node, slp_instance instance)
7082 {
7083 gimple_stmt_iterator si;
7084 int i;
7085 slp_tree child;
7086
7087 /* For existing vectors there's nothing to do. */
7088 if (SLP_TREE_VEC_DEFS (node).exists ())
7089 return;
7090
7091 gcc_assert (SLP_TREE_VEC_STMTS (node).is_empty ());
7092
7093 /* Vectorize externals and constants. */
7094 if (SLP_TREE_DEF_TYPE (node) == vect_constant_def
7095 || SLP_TREE_DEF_TYPE (node) == vect_external_def)
7096 {
7097 /* ??? vectorizable_shift can end up using a scalar operand which is
7098 currently denoted as !SLP_TREE_VECTYPE. No need to vectorize the
7099 node in this case. */
7100 if (!SLP_TREE_VECTYPE (node))
7101 return;
7102
7103 vect_create_constant_vectors (vinfo, node);
7104 return;
7105 }
7106
7107 stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
7108
7109 gcc_assert (SLP_TREE_NUMBER_OF_VEC_STMTS (node) != 0);
7110 SLP_TREE_VEC_STMTS (node).create (SLP_TREE_NUMBER_OF_VEC_STMTS (node));
7111
7112 if (dump_enabled_p ())
7113 dump_printf_loc (MSG_NOTE, vect_location,
7114 "------>vectorizing SLP node starting from: %G",
7115 stmt_info->stmt);
7116
7117 if (STMT_VINFO_DATA_REF (stmt_info)
7118 && SLP_TREE_CODE (node) != VEC_PERM_EXPR)
7119 {
7120 /* Vectorized loads go before the first scalar load to make the
7121 result available early; vectorized stores go before the last scalar
7122 stmt, which is where all uses are ready. */
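/* Editor's note (hypothetical example): for a store group a[0] ... a[3]
   the vector store is inserted right before the scalar a[3] = ... stmt,
   whereas a vector load of a[0] ... a[3] is inserted right before the
   scalar load of a[0].  */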
7123 stmt_vec_info last_stmt_info = NULL;
7124 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
7125 last_stmt_info = vect_find_first_scalar_stmt_in_slp (node);
7126 else /* DR_IS_WRITE */
7127 last_stmt_info = vect_find_last_scalar_stmt_in_slp (node);
7128 si = gsi_for_stmt (last_stmt_info->stmt);
7129 }
7130 else if ((STMT_VINFO_TYPE (stmt_info) == cycle_phi_info_type
7131 || STMT_VINFO_TYPE (stmt_info) == induc_vec_info_type
7132 || STMT_VINFO_TYPE (stmt_info) == phi_info_type)
7133 && SLP_TREE_CODE (node) != VEC_PERM_EXPR)
7134 {
7135 /* For PHI node vectorization we do not use the insertion iterator. */
7136 si = gsi_none ();
7137 }
7138 else
7139 {
7140 /* Emit other stmts after the children's vectorized defs, which is
7141 the earliest possible insertion point. */
7142 gimple *last_stmt = NULL;
7143 bool seen_vector_def = false;
7144 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
7145 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
7146 {
7147 /* For fold-left reductions we are retaining the scalar
7148 reduction PHI but we still have SLP_TREE_NUMBER_OF_VEC_STMTS
7149 set, so the representation isn't perfect. Resort to the
7150 last scalar def here. */
7151 if (SLP_TREE_VEC_STMTS (child).is_empty ())
7152 {
7153 gcc_assert (STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (child))
7154 == cycle_phi_info_type);
7155 gphi *phi = as_a <gphi *>
7156 (vect_find_last_scalar_stmt_in_slp (child)->stmt);
7157 if (!last_stmt
7158 || vect_stmt_dominates_stmt_p (last_stmt, phi))
7159 last_stmt = phi;
7160 }
7161 /* We are emitting all vectorized stmts of a child in the same place,
7162 so the last one pushed is also the last one emitted.
7163 ??? Unless we have a load permutation applied and it decides
7164 to re-use an earlier generated load. */
7165 unsigned j;
7166 gimple *vstmt;
7167 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (child), j, vstmt)
7168 if (!last_stmt
7169 || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
7170 last_stmt = vstmt;
7171 }
7172 else if (!SLP_TREE_VECTYPE (child))
7173 {
7174 /* For externals used unvectorized (no vector type) look at all their scalar defs. */
7175 unsigned j;
7176 tree def;
7177 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (child), j, def)
7178 if (TREE_CODE (def) == SSA_NAME
7179 && !SSA_NAME_IS_DEFAULT_DEF (def))
7180 {
7181 gimple *stmt = SSA_NAME_DEF_STMT (def);
7182 if (!last_stmt
7183 || vect_stmt_dominates_stmt_p (last_stmt, stmt))
7184 last_stmt = stmt;
7185 }
7186 }
7187 else
7188 {
7189 /* For externals we have to look at all defs since their
7190 insertion place is decided per vector. But beware
7191 of pre-existing vectors where we need to make sure
7192 we do not insert before the region boundary. */
7193 if (SLP_TREE_SCALAR_OPS (child).is_empty ()
7194 && !vinfo->lookup_def (SLP_TREE_VEC_DEFS (child)[0]))
7195 seen_vector_def = true;
7196 else
7197 {
7198 unsigned j;
7199 tree vdef;
7200 FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (child), j, vdef)
7201 if (TREE_CODE (vdef) == SSA_NAME
7202 && !SSA_NAME_IS_DEFAULT_DEF (vdef))
7203 {
7204 gimple *vstmt = SSA_NAME_DEF_STMT (vdef);
7205 if (!last_stmt
7206 || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
7207 last_stmt = vstmt;
7208 }
7209 }
7210 }
7211 /* This can happen when all children are pre-existing vectors or
7212 constants. */
7213 if (!last_stmt)
7214 last_stmt = vect_find_first_scalar_stmt_in_slp (node)->stmt;
7215 if (!last_stmt)
7216 {
7217 gcc_assert (seen_vector_def);
7218 si = gsi_after_labels (as_a <bb_vec_info> (vinfo)->bbs[0]);
7219 }
7220 else if (is_a <bb_vec_info> (vinfo)
7221 && gimple_bb (last_stmt) != gimple_bb (stmt_info->stmt)
7222 && gimple_could_trap_p (stmt_info->stmt))
7223 {
7224 /* We've constrained possibly trapping operations to all come
7225 from the same basic-block; even if vectorized defs would allow
7226 earlier scheduling, still force the vectorized stmts into the
7227 original block. This is only necessary for BB vectorization since
7228 for loop vectorization all operations are in a single BB and scalar
7229 stmt based placement doesn't play well with epilogue vectorization. */
7230 gcc_assert (dominated_by_p (CDI_DOMINATORS,
7231 gimple_bb (stmt_info->stmt),
7232 gimple_bb (last_stmt)));
7233 si = gsi_after_labels (gimple_bb (stmt_info->stmt));
7234 }
7235 else if (is_a <gphi *> (last_stmt))
7236 si = gsi_after_labels (gimple_bb (last_stmt));
7237 else
7238 {
7239 si = gsi_for_stmt (last_stmt);
7240 gsi_next (&si);
7241 }
7242 }
7243
7244 bool done_p = false;
7245
7246 /* Handle purely internal nodes. */
7247 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
7248 {
7249 /* ??? The transform kind is stored in STMT_VINFO_TYPE which might
7250 be shared with different SLP nodes (but usually it's the same
7251 operation, apart from the case where the stmt is only there to denote
7252 the actual scalar lane defs ...). So do not call vect_transform_stmt
7253 but open-code it here (partly). */
7254 bool done = vectorizable_slp_permutation (vinfo, &si, node, NULL);
7255 gcc_assert (done);
7256 done_p = true;
7257 }
7258 if (!done_p)
7259 vect_transform_stmt (vinfo, stmt_info, &si, node, instance);
7260 }
7261
7262 /* Replace scalar calls from SLP node NODE with settings of their lhs to zero.
7263 For loop vectorization this is done in vectorizable_call, but for SLP
7264 it needs to be deferred until the end of vect_schedule_slp, because multiple
7265 SLP instances may refer to the same scalar stmt. */
7266
7267 static void
7268 vect_remove_slp_scalar_calls (vec_info *vinfo,
7269 slp_tree node, hash_set<slp_tree> &visited)
7270 {
7271 gimple *new_stmt;
7272 gimple_stmt_iterator gsi;
7273 int i;
7274 slp_tree child;
7275 tree lhs;
7276 stmt_vec_info stmt_info;
7277
7278 if (!node || SLP_TREE_DEF_TYPE (node) != vect_internal_def)
7279 return;
7280
7281 if (visited.add (node))
7282 return;
7283
7284 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
7285 vect_remove_slp_scalar_calls (vinfo, child, visited);
7286
7287 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
7288 {
7289 gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt);
7290 if (!stmt || gimple_bb (stmt) == NULL)
7291 continue;
7292 if (is_pattern_stmt_p (stmt_info)
7293 || !PURE_SLP_STMT (stmt_info))
7294 continue;
7295 lhs = gimple_call_lhs (stmt);
7296 new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
7297 gsi = gsi_for_stmt (stmt);
7298 vinfo->replace_stmt (&gsi, stmt_info, new_stmt);
7299 SSA_NAME_DEF_STMT (gimple_assign_lhs (new_stmt)) = new_stmt;
7300 }
7301 }
7302
7303 static void
7304 vect_remove_slp_scalar_calls (vec_info *vinfo, slp_tree node)
7305 {
7306 hash_set<slp_tree> visited;
7307 vect_remove_slp_scalar_calls (vinfo, node, visited);
7308 }
7309
7310 /* Vectorize the instance root. */
7311
7312 void
7313 vectorize_slp_instance_root_stmt (slp_tree node, slp_instance instance)
7314 {
7315 gassign *rstmt = NULL;
7316
7317 if (instance->kind == slp_inst_kind_ctor)
7318 {
7319 if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) == 1)
7320 {
7321 gimple *child_stmt = SLP_TREE_VEC_STMTS (node)[0];
7322 tree vect_lhs = gimple_get_lhs (child_stmt);
7323 tree root_lhs = gimple_get_lhs (instance->root_stmts[0]->stmt);
7324 if (!useless_type_conversion_p (TREE_TYPE (root_lhs),
7325 TREE_TYPE (vect_lhs)))
7326 vect_lhs = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (root_lhs),
7327 vect_lhs);
7328 rstmt = gimple_build_assign (root_lhs, vect_lhs);
7329 }
7330 else if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) > 1)
7331 {
7332 int nelts = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
7333 gimple *child_stmt;
7334 int j;
7335 vec<constructor_elt, va_gc> *v;
7336 vec_alloc (v, nelts);
7337
7338 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (node), j, child_stmt)
7339 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE,
7340 gimple_get_lhs (child_stmt));
7341 tree lhs = gimple_get_lhs (instance->root_stmts[0]->stmt);
7342 tree rtype
7343 = TREE_TYPE (gimple_assign_rhs1 (instance->root_stmts[0]->stmt));
7344 tree r_constructor = build_constructor (rtype, v);
7345 rstmt = gimple_build_assign (lhs, r_constructor);
7346 }
7347 }
7348 else if (instance->kind == slp_inst_kind_bb_reduc)
7349 {
7350 /* Largely inspired by reduction chain epilogue handling in
7351 vect_create_epilog_for_reduction. */
7352 vec<tree> vec_defs = vNULL;
7353 vect_get_slp_defs (node, &vec_defs);
7354 enum tree_code reduc_code
7355 = gimple_assign_rhs_code (instance->root_stmts[0]->stmt);
7356 /* ??? We actually have to reflect signs somewhere. */
7357 if (reduc_code == MINUS_EXPR)
7358 reduc_code = PLUS_EXPR;
7359 gimple_seq epilogue = NULL;
7360 /* We may end up with more than one vector result, reduce them
7361 to one vector. */
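/* Editor's note (illustration with hypothetical defs): with two V4SI
   vector defs v0 and v1 and a PLUS_EXPR reduction this builds
     t = v0 + v1;
     scalar = .REDUC_PLUS (t);
   where .REDUC_PLUS is the internal function chosen by
   reduction_fn_for_scalar_code below.  */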
7362 tree vec_def = vec_defs[0];
7363 for (unsigned i = 1; i < vec_defs.length (); ++i)
7364 vec_def = gimple_build (&epilogue, reduc_code, TREE_TYPE (vec_def),
7365 vec_def, vec_defs[i]);
7366 vec_defs.release ();
7367 /* ??? Support other schemes than direct internal fn. */
7368 internal_fn reduc_fn;
7369 if (!reduction_fn_for_scalar_code (reduc_code, &reduc_fn)
7370 || reduc_fn == IFN_LAST)
7371 gcc_unreachable ();
7372 tree scalar_def = gimple_build (&epilogue, as_combined_fn (reduc_fn),
7373 TREE_TYPE (TREE_TYPE (vec_def)), vec_def);
7374
7375 gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmts[0]->stmt);
7376 gsi_insert_seq_before (&rgsi, epilogue, GSI_SAME_STMT);
7377 gimple_assign_set_rhs_from_tree (&rgsi, scalar_def);
7378 update_stmt (gsi_stmt (rgsi));
7379 return;
7380 }
7381 else
7382 gcc_unreachable ();
7383
7384 gcc_assert (rstmt);
7385
7386 gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmts[0]->stmt);
7387 gsi_replace (&rgsi, rstmt, true);
7388 }
7389
7390 struct slp_scc_info
7391 {
7392 bool on_stack;
7393 int dfs;
7394 int lowlink;
7395 };
7396
7397 /* Schedule the SLP INSTANCE doing a DFS walk and collecting SCCs. */
7398
7399 static void
7400 vect_schedule_scc (vec_info *vinfo, slp_tree node, slp_instance instance,
7401 hash_map<slp_tree, slp_scc_info> &scc_info,
7402 int &maxdfs, vec<slp_tree> &stack)
7403 {
7404 bool existed_p;
7405 slp_scc_info *info = &scc_info.get_or_insert (node, &existed_p);
7406 gcc_assert (!existed_p);
7407 info->dfs = maxdfs;
7408 info->lowlink = maxdfs;
7409 maxdfs++;
7410
7411 /* Leaf. */
7412 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
7413 {
7414 info->on_stack = false;
7415 vect_schedule_slp_node (vinfo, node, instance);
7416 return;
7417 }
7418
7419 info->on_stack = true;
7420 stack.safe_push (node);
7421
7422 unsigned i;
7423 slp_tree child;
7424 /* DFS recurse. */
7425 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
7426 {
7427 if (!child)
7428 continue;
7429 slp_scc_info *child_info = scc_info.get (child);
7430 if (!child_info)
7431 {
7432 vect_schedule_scc (vinfo, child, instance, scc_info, maxdfs, stack);
7433 /* Recursion might have re-allocated the node. */
7434 info = scc_info.get (node);
7435 child_info = scc_info.get (child);
7436 info->lowlink = MIN (info->lowlink, child_info->lowlink);
7437 }
7438 else if (child_info->on_stack)
7439 info->lowlink = MIN (info->lowlink, child_info->dfs);
7440 }
7441 if (info->lowlink != info->dfs)
7442 return;
7443
7444 auto_vec<slp_tree, 4> phis_to_fixup;
7445
7446 /* Singleton. */
7447 if (stack.last () == node)
7448 {
7449 stack.pop ();
7450 info->on_stack = false;
7451 vect_schedule_slp_node (vinfo, node, instance);
7452 if (SLP_TREE_CODE (node) != VEC_PERM_EXPR
7453 && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (node)->stmt))
7454 phis_to_fixup.quick_push (node);
7455 }
7456 else
7457 {
7458 /* SCC. */
7459 int last_idx = stack.length () - 1;
7460 while (stack[last_idx] != node)
7461 last_idx--;
7462 /* We can break the cycle at PHIs that have at least one
7463 code-generated child. Then we could re-start the DFS walk until
7464 all nodes in the SCC are covered (we might have new entries
7465 for only back-reachable nodes). But it's simpler to just
7466 iterate and schedule those that are ready. */
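/* Editor's note (illustration): for a reduction the SCC typically consists
   of the reduction PHI and the vectorized arithmetic feeding its backedge.
   The PHI becomes ready first (its initial-value child is already
   generated), the arithmetic follows once the PHI is off the stack, and
   the still-missing backedge PHI argument is filled in by the fixup loop
   over the collected PHIs below.  */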
7467 unsigned todo = stack.length () - last_idx;
7468 do
7469 {
7470 for (int idx = stack.length () - 1; idx >= last_idx; --idx)
7471 {
7472 slp_tree entry = stack[idx];
7473 if (!entry)
7474 continue;
7475 bool phi = (SLP_TREE_CODE (entry) != VEC_PERM_EXPR
7476 && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (entry)->stmt));
7477 bool ready = !phi;
7478 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (entry), i, child)
7479 if (!child)
7480 {
7481 gcc_assert (phi);
7482 ready = true;
7483 break;
7484 }
7485 else if (scc_info.get (child)->on_stack)
7486 {
7487 if (!phi)
7488 {
7489 ready = false;
7490 break;
7491 }
7492 }
7493 else
7494 {
7495 if (phi)
7496 {
7497 ready = true;
7498 break;
7499 }
7500 }
7501 if (ready)
7502 {
7503 vect_schedule_slp_node (vinfo, entry, instance);
7504 scc_info.get (entry)->on_stack = false;
7505 stack[idx] = NULL;
7506 todo--;
7507 if (phi)
7508 phis_to_fixup.safe_push (entry);
7509 }
7510 }
7511 }
7512 while (todo != 0);
7513
7514 /* Pop the SCC. */
7515 stack.truncate (last_idx);
7516 }
7517
7518 /* Now fixup the backedge def of the vectorized PHIs in this SCC. */
7519 slp_tree phi_node;
7520 FOR_EACH_VEC_ELT (phis_to_fixup, i, phi_node)
7521 {
7522 gphi *phi = as_a <gphi *> (SLP_TREE_REPRESENTATIVE (phi_node)->stmt);
7523 edge_iterator ei;
7524 edge e;
7525 FOR_EACH_EDGE (e, ei, gimple_bb (phi)->preds)
7526 {
7527 unsigned dest_idx = e->dest_idx;
7528 child = SLP_TREE_CHILDREN (phi_node)[dest_idx];
7529 if (!child || SLP_TREE_DEF_TYPE (child) != vect_internal_def)
7530 continue;
7531 /* Simply fill all args. */
7532 for (unsigned i = 0; i < SLP_TREE_VEC_STMTS (phi_node).length (); ++i)
7533 add_phi_arg (as_a <gphi *> (SLP_TREE_VEC_STMTS (phi_node)[i]),
7534 vect_get_slp_vect_def (child, i),
7535 e, gimple_phi_arg_location (phi, dest_idx));
7536 }
7537 }
7538 }
7539
7540 /* Generate vector code for SLP_INSTANCES in the loop/basic block. */
7541
7542 void
7543 vect_schedule_slp (vec_info *vinfo, const vec<slp_instance> &slp_instances)
7544 {
7545 slp_instance instance;
7546 unsigned int i;
7547
7548 hash_map<slp_tree, slp_scc_info> scc_info;
7549 int maxdfs = 0;
7550 FOR_EACH_VEC_ELT (slp_instances, i, instance)
7551 {
7552 slp_tree node = SLP_INSTANCE_TREE (instance);
7553 if (dump_enabled_p ())
7554 {
7555 dump_printf_loc (MSG_NOTE, vect_location,
7556 "Vectorizing SLP tree:\n");
7557 /* ??? Dump all? */
7558 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
7559 dump_printf_loc (MSG_NOTE, vect_location, "Root stmt: %G",
7560 SLP_INSTANCE_ROOT_STMTS (instance)[0]->stmt);
7561 vect_print_slp_graph (MSG_NOTE, vect_location,
7562 SLP_INSTANCE_TREE (instance));
7563 }
7564 /* Schedule the tree of INSTANCE, scheduling SCCs in a way that
7565 makes a PHI the node breaking the cycle. */
7566 auto_vec<slp_tree> stack;
7567 if (!scc_info.get (node))
7568 vect_schedule_scc (vinfo, node, instance, scc_info, maxdfs, stack);
7569
7570 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
7571 vectorize_slp_instance_root_stmt (node, instance);
7572
7573 if (dump_enabled_p ())
7574 dump_printf_loc (MSG_NOTE, vect_location,
7575 "vectorizing stmts using SLP.\n");
7576 }
7577
7578 FOR_EACH_VEC_ELT (slp_instances, i, instance)
7579 {
7580 slp_tree root = SLP_INSTANCE_TREE (instance);
7581 stmt_vec_info store_info;
7582 unsigned int j;
7583
7584 /* Remove scalar call stmts. Do not do this for basic-block
7585 vectorization as not all uses may be vectorized.
7586 ??? Why should this be necessary? DCE should be able to
7587 remove the stmts itself.
7588 ??? For BB vectorization we can as well remove scalar
7589 stmts starting from the SLP tree root if they have no
7590 uses. */
7591 if (is_a <loop_vec_info> (vinfo))
7592 vect_remove_slp_scalar_calls (vinfo, root);
7593
7594 /* Remove the vectorized stores' original scalar stmts. */
7595 for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store_info); j++)
7596 {
7597 if (!STMT_VINFO_DATA_REF (store_info)
7598 || !DR_IS_WRITE (STMT_VINFO_DATA_REF (store_info)))
7599 break;
7600
7601 store_info = vect_orig_stmt (store_info);
7602 /* Free the attached stmt_vec_info and remove the stmt. */
7603 vinfo->remove_stmt (store_info);
7604
7605 /* Invalidate SLP_TREE_REPRESENTATIVE in case we released it
7606 to not crash in vect_free_slp_tree later. */
7607 if (SLP_TREE_REPRESENTATIVE (root) == store_info)
7608 SLP_TREE_REPRESENTATIVE (root) = NULL;
7609 }
7610 }
7611 }