/* SLP - Basic Block Vectorization
   Copyright (C) 2007-2021 Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com>
   and Ira Rosen <irar@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */
#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "tree-pass.h"
#include "optabs-tree.h"
#include "insn-config.h"
#include "recog.h"		/* FIXME: for insn_data */
#include "fold-const.h"
#include "stor-layout.h"
#include "gimple-iterator.h"
#include "tree-vectorizer.h"
#include "langhooks.h"
#include "gimple-walk.h"
#include "tree-vector-builder.h"
#include "vec-perm-indices.h"
#include "gimple-fold.h"
#include "internal-fn.h"
#include "dump-context.h"
#include "alloc-pool.h"
static bool vectorizable_slp_permutation (vec_info *, gimple_stmt_iterator *,
					  slp_tree, stmt_vector_for_cost *);

static object_allocator<_slp_tree> *slp_tree_pool;
static slp_tree slp_first_node;
/* Initialize the SLP node pool.  */

void
vect_slp_init (void)
{
  slp_tree_pool = new object_allocator<_slp_tree> ("SLP nodes");
}

/* Tear down the SLP node pool, releasing any remaining nodes.  */

void
vect_slp_fini (void)
{
  while (slp_first_node)
    delete slp_first_node;
}
void *
_slp_tree::operator new (size_t n)
{
  gcc_assert (n == sizeof (_slp_tree));
  return slp_tree_pool->allocate_raw ();
}

void
_slp_tree::operator delete (void *node, size_t n)
{
  gcc_assert (n == sizeof (_slp_tree));
  slp_tree_pool->remove_raw (node);
}
/* Initialize a SLP node.  */

_slp_tree::_slp_tree ()
{
  this->prev_node = NULL;
  if (slp_first_node)
    slp_first_node->prev_node = this;
  this->next_node = slp_first_node;
  slp_first_node = this;
  SLP_TREE_SCALAR_STMTS (this) = vNULL;
  SLP_TREE_SCALAR_OPS (this) = vNULL;
  SLP_TREE_VEC_STMTS (this) = vNULL;
  SLP_TREE_VEC_DEFS (this) = vNULL;
  SLP_TREE_NUMBER_OF_VEC_STMTS (this) = 0;
  SLP_TREE_CHILDREN (this) = vNULL;
  SLP_TREE_LOAD_PERMUTATION (this) = vNULL;
  SLP_TREE_LANE_PERMUTATION (this) = vNULL;
  SLP_TREE_DEF_TYPE (this) = vect_uninitialized_def;
  SLP_TREE_CODE (this) = ERROR_MARK;
  SLP_TREE_VECTYPE (this) = NULL_TREE;
  SLP_TREE_REPRESENTATIVE (this) = NULL;
  SLP_TREE_REF_COUNT (this) = 1;
  this->max_nunits = 1;
}
/* Tear down a SLP node.  */

_slp_tree::~_slp_tree ()
{
  if (this->prev_node)
    this->prev_node->next_node = this->next_node;
  else
    slp_first_node = this->next_node;
  if (this->next_node)
    this->next_node->prev_node = this->prev_node;
  SLP_TREE_CHILDREN (this).release ();
  SLP_TREE_SCALAR_STMTS (this).release ();
  SLP_TREE_SCALAR_OPS (this).release ();
  SLP_TREE_VEC_STMTS (this).release ();
  SLP_TREE_VEC_DEFS (this).release ();
  SLP_TREE_LOAD_PERMUTATION (this).release ();
  SLP_TREE_LANE_PERMUTATION (this).release ();
}
/* Recursively free the memory allocated for the SLP tree rooted at NODE.  */

void
vect_free_slp_tree (slp_tree node)
{
  int i;
  slp_tree child;

  if (--SLP_TREE_REF_COUNT (node) != 0)
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    if (child)
      vect_free_slp_tree (child);

  /* If the node defines any SLP only patterns then those patterns are no
     longer valid and should be removed.  */
  stmt_vec_info rep_stmt_info = SLP_TREE_REPRESENTATIVE (node);
  if (rep_stmt_info && STMT_VINFO_SLP_VECT_ONLY_PATTERN (rep_stmt_info))
    {
      stmt_vec_info stmt_info = vect_orig_stmt (rep_stmt_info);
      STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
      STMT_SLP_TYPE (stmt_info) = STMT_SLP_TYPE (rep_stmt_info);
    }

  delete node;
}
/* Return a location suitable for dumps related to the SLP instance.  */

dump_user_location_t
_slp_instance::location () const
{
  if (root_stmt)
    return root_stmt->stmt;
  else
    return SLP_TREE_SCALAR_STMTS (root)[0]->stmt;
}
/* Free the memory allocated for the SLP instance.  */

void
vect_free_slp_instance (slp_instance instance)
{
  vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
  SLP_INSTANCE_LOADS (instance).release ();
  instance->subgraph_entries.release ();
  instance->cost_vec.release ();
  free (instance);
}
/* Create an SLP node for SCALAR_STMTS.  */

static slp_tree
vect_create_new_slp_node (unsigned nops, tree_code code)
{
  slp_tree node = new _slp_tree;
  SLP_TREE_SCALAR_STMTS (node) = vNULL;
  SLP_TREE_CHILDREN (node).create (nops);
  SLP_TREE_DEF_TYPE (node) = vect_internal_def;
  SLP_TREE_CODE (node) = code;
  return node;
}
/* Create an SLP node for SCALAR_STMTS.  */

static slp_tree
vect_create_new_slp_node (slp_tree node,
			  vec<stmt_vec_info> scalar_stmts, unsigned nops)
{
  SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
  SLP_TREE_CHILDREN (node).create (nops);
  SLP_TREE_DEF_TYPE (node) = vect_internal_def;
  SLP_TREE_REPRESENTATIVE (node) = scalar_stmts[0];
  SLP_TREE_LANES (node) = scalar_stmts.length ();
  return node;
}
/* Create an SLP node for SCALAR_STMTS.  */

static slp_tree
vect_create_new_slp_node (vec<stmt_vec_info> scalar_stmts, unsigned nops)
{
  return vect_create_new_slp_node (new _slp_tree, scalar_stmts, nops);
}
/* Create an SLP node for OPS.  */

static slp_tree
vect_create_new_slp_node (slp_tree node, vec<tree> ops)
{
  SLP_TREE_SCALAR_OPS (node) = ops;
  SLP_TREE_DEF_TYPE (node) = vect_external_def;
  SLP_TREE_LANES (node) = ops.length ();
  return node;
}
/* Create an SLP node for OPS.  */

static slp_tree
vect_create_new_slp_node (vec<tree> ops)
{
  return vect_create_new_slp_node (new _slp_tree, ops);
}
/* This structure is used in creation of an SLP tree.  Each instance
   corresponds to the same operand in a group of scalar stmts in an SLP
   node.  */
typedef struct _slp_oprnd_info
{
  /* Def-stmts for the operands.  */
  vec<stmt_vec_info> def_stmts;
  /* Operands.  */
  vec<tree> ops;
  /* Information about the first statement, its vector def-type, type, the
     operand itself in case it's constant, and an indication if it's a pattern
     stmt.  */
  tree first_op_type;
  enum vect_def_type first_dt;
  bool any_pattern;
} slp_oprnd_info;
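
/* Illustration (comment added, not part of the original sources):
   for the two-statement group
       a0 = b0 + c0;
       a1 = b1 + c1;
   two slp_oprnd_info entries are created; entry 0 collects the defs
   and ops of { b0, b1 } and entry 1 those of { c0, c1 }, i.e. one
   entry per operand position across the whole group.  */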
/* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
   operand.  */
static vec<slp_oprnd_info>
vect_create_oprnd_info (int nops, int group_size)
{
  int i;
  slp_oprnd_info oprnd_info;
  vec<slp_oprnd_info> oprnds_info;

  oprnds_info.create (nops);
  for (i = 0; i < nops; i++)
    {
      oprnd_info = XNEW (struct _slp_oprnd_info);
      oprnd_info->def_stmts.create (group_size);
      oprnd_info->ops.create (group_size);
      oprnd_info->first_dt = vect_uninitialized_def;
      oprnd_info->first_op_type = NULL_TREE;
      oprnd_info->any_pattern = false;
      oprnds_info.quick_push (oprnd_info);
    }

  return oprnds_info;
}
/* Free operands info.  */

static void
vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
{
  int i;
  slp_oprnd_info oprnd_info;

  FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
    {
      oprnd_info->def_stmts.release ();
      oprnd_info->ops.release ();
      XDELETE (oprnd_info);
    }

  oprnds_info.release ();
}
/* Return true if STMTS contains a pattern statement.  */

static bool
vect_contains_pattern_stmt_p (vec<stmt_vec_info> stmts)
{
  stmt_vec_info stmt_info;
  unsigned int i;
  FOR_EACH_VEC_ELT (stmts, i, stmt_info)
    if (is_pattern_stmt_p (stmt_info))
      return true;
  return false;
}
/* Return true when all lanes in the external or constant NODE have
   the same value.  */

static bool
vect_slp_tree_uniform_p (slp_tree node)
{
  gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_constant_def
	      || SLP_TREE_DEF_TYPE (node) == vect_external_def);

  /* Pre-existing vectors.  */
  if (SLP_TREE_SCALAR_OPS (node).is_empty ())
    return false;

  unsigned i;
  tree op, first = NULL_TREE;
  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
    if (!first)
      first = op;
    else if (!operand_equal_p (first, op, 0))
      return false;

  return true;
}
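
/* Illustration (comment added, not part of the original sources): an
   external node with scalar ops { x_1, x_1, x_1, x_1 } is uniform and
   can be code-generated as a single splat, while { x_1, y_2, x_1, y_2 }
   is not and needs a full vector constructor.  */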
/* Find the place of the data-ref in STMT_INFO in the interleaving chain
   that starts from FIRST_STMT_INFO.  Return -1 if the data-ref is not a part
   of the chain.  */

int
vect_get_place_in_interleaving_chain (stmt_vec_info stmt_info,
				      stmt_vec_info first_stmt_info)
{
  stmt_vec_info next_stmt_info = first_stmt_info;
  int result = 0;

  if (first_stmt_info != DR_GROUP_FIRST_ELEMENT (stmt_info))
    return -1;

  do
    {
      if (next_stmt_info == stmt_info)
	return result;
      next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
      if (next_stmt_info)
	result += DR_GROUP_GAP (next_stmt_info);
    }
  while (next_stmt_info);

  return -1;
}
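
/* Worked example (comment added, not part of the original sources, and
   assuming DR_GROUP_GAP of an element counts the distance from the
   previous group member): for a chain accessing a[0], a[1], a[3] the
   gaps are 1 and 2, so the places returned for the three stmts are
   0, 1 and 3 respectively.  */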
/* Check whether it is possible to load COUNT elements of type ELT_TYPE
   using the method implemented by duplicate_and_interleave.  Return true
   if so, returning the number of intermediate vectors in *NVECTORS_OUT
   (if nonnull) and the type of each intermediate vector in *VECTOR_TYPE_OUT
   (if nonnull).  */

bool
can_duplicate_and_interleave_p (vec_info *vinfo, unsigned int count,
				tree elt_type, unsigned int *nvectors_out,
				tree *vector_type_out,
				tree *permutes)
{
  tree base_vector_type = get_vectype_for_scalar_type (vinfo, elt_type, count);
  if (!base_vector_type || !VECTOR_MODE_P (TYPE_MODE (base_vector_type)))
    return false;

  machine_mode base_vector_mode = TYPE_MODE (base_vector_type);
  poly_int64 elt_bytes = count * GET_MODE_UNIT_SIZE (base_vector_mode);
  unsigned int nvectors = 1;
  for (;;)
    {
      scalar_int_mode int_mode;
      poly_int64 elt_bits = elt_bytes * BITS_PER_UNIT;
      if (int_mode_for_size (elt_bits, 1).exists (&int_mode))
	{
	  /* Get the natural vector type for this SLP group size.  */
	  tree int_type = build_nonstandard_integer_type
	    (GET_MODE_BITSIZE (int_mode), 1);
	  tree vector_type
	    = get_vectype_for_scalar_type (vinfo, int_type, count);
	  if (vector_type
	      && VECTOR_MODE_P (TYPE_MODE (vector_type))
	      && known_eq (GET_MODE_SIZE (TYPE_MODE (vector_type)),
			   GET_MODE_SIZE (base_vector_mode)))
	    {
	      /* Try fusing consecutive sequences of COUNT / NVECTORS elements
		 together into elements of type INT_TYPE and using the result
		 to build NVECTORS vectors.  */
	      poly_uint64 nelts = GET_MODE_NUNITS (TYPE_MODE (vector_type));
	      vec_perm_builder sel1 (nelts, 2, 3);
	      vec_perm_builder sel2 (nelts, 2, 3);
	      poly_int64 half_nelts = exact_div (nelts, 2);
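	      /* Illustration (comment added, not part of the original
		 sources): for nelts == 4 the selectors built below are
		   sel1 = { 0, 4, 1, 5 }   (interleave low halves)
		   sel2 = { 2, 6, 3, 7 }   (interleave high halves)
		 i.e. the classic interleave-lo/hi permutes of two input
		 vectors, encoded with 2 patterns of 3 elements each so
		 that they also scale to variable-length vectors.  */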
	      for (unsigned int i = 0; i < 3; ++i)
		{
		  sel1.quick_push (i);
		  sel1.quick_push (i + nelts);
		  sel2.quick_push (half_nelts + i);
		  sel2.quick_push (half_nelts + i + nelts);
		}
	      vec_perm_indices indices1 (sel1, 2, nelts);
	      vec_perm_indices indices2 (sel2, 2, nelts);
	      if (can_vec_perm_const_p (TYPE_MODE (vector_type), indices1)
		  && can_vec_perm_const_p (TYPE_MODE (vector_type), indices2))
		{
		  if (nvectors_out)
		    *nvectors_out = nvectors;
		  if (vector_type_out)
		    *vector_type_out = vector_type;
		  if (permutes)
		    {
		      permutes[0] = vect_gen_perm_mask_checked (vector_type,
								indices1);
		      permutes[1] = vect_gen_perm_mask_checked (vector_type,
								indices2);
		    }
		  return true;
		}
	    }
	}
      if (!multiple_p (elt_bytes, 2, &elt_bytes))
	return false;
      nvectors *= 2;
    }
}
/* Return true if DTA and DTB match.  */

static bool
vect_def_types_match (enum vect_def_type dta, enum vect_def_type dtb)
{
  return (dta == dtb
	  || ((dta == vect_external_def || dta == vect_constant_def)
	      && (dtb == vect_external_def || dtb == vect_constant_def)));
}
/* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
   they are of a valid type and that they match the defs of the first stmt of
   the SLP group (stored in OPRNDS_INFO).  This function tries to match stmts
   by swapping operands of STMTS[STMT_NUM] when possible.  Non-zero *SWAP
   indicates swap is required for cond_expr stmts.  Specifically, *SWAP
   is 1 if STMT is cond and operands of comparison need to be swapped;
   *SWAP is 2 if STMT is cond and code of comparison needs to be inverted.
   If there is any operand swap in this function, *SWAP is set to non-zero
   value.
   If there was a fatal error return -1; if the error could be corrected by
   swapping operands of the parent node of this one, return 1; if everything
   is ok return 0.  */

static int
vect_get_and_check_slp_defs (vec_info *vinfo, unsigned char swap,
			     bool *skip_args,
			     vec<stmt_vec_info> stmts, unsigned stmt_num,
			     vec<slp_oprnd_info> *oprnds_info)
{
  stmt_vec_info stmt_info = stmts[stmt_num];
  tree oprnd;
  unsigned int i, number_of_oprnds;
  enum vect_def_type dt = vect_uninitialized_def;
  slp_oprnd_info oprnd_info;
  int first_op_idx = 1;
  unsigned int commutative_op = -1U;
  bool first_op_cond = false;
  bool first = stmt_num == 0;

  if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
    {
      number_of_oprnds = gimple_call_num_args (stmt);
      first_op_idx = 3;
      if (gimple_call_internal_p (stmt))
	{
	  internal_fn ifn = gimple_call_internal_fn (stmt);
	  commutative_op = first_commutative_argument (ifn);

	  /* Masked load, only look at mask.  */
	  if (ifn == IFN_MASK_LOAD)
	    {
	      number_of_oprnds = 1;
	      /* Mask operand index.  */
	      first_op_idx = 5;
	    }
	}
    }
  else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
    {
      enum tree_code code = gimple_assign_rhs_code (stmt);
      number_of_oprnds = gimple_num_ops (stmt) - 1;
      /* Swap can only be done for cond_expr if asked to, otherwise we
	 could result in different comparison code to the first stmt.  */
      if (code == COND_EXPR
	  && COMPARISON_CLASS_P (gimple_assign_rhs1 (stmt)))
	{
	  first_op_cond = true;
	  number_of_oprnds++;
	}
      else
	commutative_op = commutative_tree_code (code) ? 0U : -1U;
    }
  else if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
    number_of_oprnds = gimple_phi_num_args (stmt);
  else
    return -1;

  bool swapped = (swap != 0);
  bool backedge = false;
  gcc_assert (!swapped || first_op_cond);
  enum vect_def_type *dts = XALLOCAVEC (enum vect_def_type, number_of_oprnds);
  for (i = 0; i < number_of_oprnds; i++)
    {
      if (first_op_cond)
	{
	  /* Map indicating how operands of cond_expr should be swapped.  */
	  int maps[3][4] = {{0, 1, 2, 3}, {1, 0, 2, 3}, {0, 1, 3, 2}};
	  int *map = maps[swap];
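	  /* Illustration (comment added, not part of the original
	     sources): with the operands of a COND_EXPR numbered
	     { cmp-op0, cmp-op1, then-arm, else-arm }, swap == 1
	     exchanges the two comparison operands and swap == 2
	     exchanges the two arms, matching the *SWAP encoding above
	     (1 = swapped comparison, 2 = inverted comparison).  */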
	  if (i < 2)
	    oprnd = TREE_OPERAND (gimple_op (stmt_info->stmt,
					     first_op_idx), map[i]);
	  else
	    oprnd = gimple_op (stmt_info->stmt, map[i]);
	}
      else if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
	{
	  oprnd = gimple_phi_arg_def (stmt, i);
	  backedge = dominated_by_p (CDI_DOMINATORS,
				     gimple_phi_arg_edge (stmt, i)->src,
				     gimple_bb (stmt_info->stmt));
	}
      else
	oprnd = gimple_op (stmt_info->stmt, first_op_idx + (swapped ? !i : i));
      if (TREE_CODE (oprnd) == VIEW_CONVERT_EXPR)
	oprnd = TREE_OPERAND (oprnd, 0);

      oprnd_info = (*oprnds_info)[i];

      stmt_vec_info def_stmt_info;
      if (!vect_is_simple_use (oprnd, vinfo, &dts[i], &def_stmt_info))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "Build SLP failed: can't analyze def for %T\n",
			     oprnd);
	  return -1;
	}

      if (skip_args[i])
	{
	  oprnd_info->def_stmts.quick_push (NULL);
	  oprnd_info->ops.quick_push (NULL_TREE);
	  oprnd_info->first_dt = vect_uninitialized_def;
	  continue;
	}

      oprnd_info->def_stmts.quick_push (def_stmt_info);
      oprnd_info->ops.quick_push (oprnd);

      if (def_stmt_info
	  && is_pattern_stmt_p (def_stmt_info))
	{
	  if (STMT_VINFO_RELATED_STMT (vect_orig_stmt (def_stmt_info))
	      != def_stmt_info)
	    oprnd_info->any_pattern = true;
	  else
	    /* If we promote this to external use the original stmt def.  */
	    oprnd_info->ops.last ()
	      = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
	}

      /* If there's an extern def on a backedge make sure we can
	 code-generate at the region start.
	 ??? This is another case that could be fixed by adjusting
	 how we split the function but at the moment we'd have conflicting
	 goals there.  */
      if (backedge
	  && dts[i] == vect_external_def
	  && is_a <bb_vec_info> (vinfo)
	  && TREE_CODE (oprnd) == SSA_NAME
	  && !SSA_NAME_IS_DEFAULT_DEF (oprnd)
	  && !dominated_by_p (CDI_DOMINATORS,
			      as_a <bb_vec_info> (vinfo)->bbs[0],
			      gimple_bb (SSA_NAME_DEF_STMT (oprnd))))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "Build SLP failed: extern def %T only defined "
			     "on backedge\n", oprnd);
	  return -1;
	}

      if (first)
	{
	  tree type = TREE_TYPE (oprnd);
	  dt = dts[i];
	  if ((dt == vect_constant_def
	       || dt == vect_external_def)
	      && !GET_MODE_SIZE (vinfo->vector_mode).is_constant ()
	      && (TREE_CODE (type) == BOOLEAN_TYPE
		  || !can_duplicate_and_interleave_p (vinfo, stmts.length (),
						      type)))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: invalid type of def "
				 "for variable-length SLP %T\n", oprnd);
	      return -1;
	    }

	  /* For the swapping logic below force vect_reduction_def
	     for the reduction op in a SLP reduction group.  */
	  if (!STMT_VINFO_DATA_REF (stmt_info)
	      && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
	      && (int)i == STMT_VINFO_REDUC_IDX (stmt_info)
	      && def_stmt_info)
	    dts[i] = dt = vect_reduction_def;

	  /* Check the types of the definition.  */
	  switch (dt)
	    {
	    case vect_external_def:
	    case vect_constant_def:
	    case vect_internal_def:
	    case vect_reduction_def:
	    case vect_induction_def:
	    case vect_nested_cycle:
	      break;

	    default:
	      /* FORNOW: Not supported.  */
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: illegal type of def %T\n",
				 oprnd);
	      return -1;
	    }

	  oprnd_info->first_dt = dt;
	  oprnd_info->first_op_type = type;
	}
    }

  /* Now match the operand definition types to that of the first stmt.  */
  for (i = 0; i < number_of_oprnds;)
    {
      if (skip_args[i])
	{
	  ++i;
	  continue;
	}

      oprnd_info = (*oprnds_info)[i];
      dt = dts[i];
      stmt_vec_info def_stmt_info = oprnd_info->def_stmts[stmt_num];
      oprnd = oprnd_info->ops[stmt_num];
      tree type = TREE_TYPE (oprnd);

      if (!types_compatible_p (oprnd_info->first_op_type, type))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "Build SLP failed: different operand types\n");
	  return 1;
	}

      /* Not first stmt of the group, check that the def-stmt/s match
	 the def-stmt/s of the first stmt.  Allow different definition
	 types for reduction chains: the first stmt must be a
	 vect_reduction_def (a phi node), and the rest
	 end in the reduction chain.  */
      if ((!vect_def_types_match (oprnd_info->first_dt, dt)
	   && !(oprnd_info->first_dt == vect_reduction_def
		&& !STMT_VINFO_DATA_REF (stmt_info)
		&& REDUC_GROUP_FIRST_ELEMENT (stmt_info)
		&& def_stmt_info
		&& !STMT_VINFO_DATA_REF (def_stmt_info)
		&& (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
		    == REDUC_GROUP_FIRST_ELEMENT (stmt_info))))
	  || (!STMT_VINFO_DATA_REF (stmt_info)
	      && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
	      && ((!def_stmt_info
		   || STMT_VINFO_DATA_REF (def_stmt_info)
		   || (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
		       != REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
		  != (oprnd_info->first_dt != vect_reduction_def))))
	{
	  /* Try swapping operands if we got a mismatch.  For BB
	     vectorization only in case it will clearly improve things.  */
	  if (i == commutative_op && !swapped
	      && (!is_a <bb_vec_info> (vinfo)
		  || (!vect_def_types_match ((*oprnds_info)[i+1]->first_dt,
					     dts[i+1])
		      && (vect_def_types_match (oprnd_info->first_dt, dts[i+1])
			  || vect_def_types_match
			       ((*oprnds_info)[i+1]->first_dt, dts[i])))))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_NOTE, vect_location,
				 "trying swapped operands\n");
	      std::swap (dts[i], dts[i+1]);
	      std::swap ((*oprnds_info)[i]->def_stmts[stmt_num],
			 (*oprnds_info)[i+1]->def_stmts[stmt_num]);
	      std::swap ((*oprnds_info)[i]->ops[stmt_num],
			 (*oprnds_info)[i+1]->ops[stmt_num]);
	      swapped = true;
	      continue;
	    }

	  if (is_a <bb_vec_info> (vinfo)
	      && !oprnd_info->any_pattern)
	    {
	      /* Now for commutative ops we should see whether we can
		 make the other operand matching.  */
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "treating operand as external\n");
	      oprnd_info->first_dt = dt = vect_external_def;
	    }
	  else
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: different types\n");
	      return 1;
	    }
	}

      /* Make sure to demote the overall operand to external.  */
      if (dt == vect_external_def)
	oprnd_info->first_dt = vect_external_def;
      /* For a SLP reduction chain we want to duplicate the reduction to
	 each of the chain members.  That gets us a sane SLP graph (still
	 the stmts are not 100% correct wrt the initial values).  */
      else if ((dt == vect_internal_def
		|| dt == vect_reduction_def)
	       && oprnd_info->first_dt == vect_reduction_def
	       && !STMT_VINFO_DATA_REF (stmt_info)
	       && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
	       && !STMT_VINFO_DATA_REF (def_stmt_info)
	       && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
		   == REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
	{
	  oprnd_info->def_stmts[stmt_num] = oprnd_info->def_stmts[0];
	  oprnd_info->ops[stmt_num] = oprnd_info->ops[0];
	}

      ++i;
    }

  /* Swap operands.  */
  if (swapped)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "swapped operands to match def types in %G",
			 stmt_info->stmt);
    }

  return 0;
}
/* Try to assign vector type VECTYPE to STMT_INFO for BB vectorization.
   Return true if we can, meaning that this choice doesn't conflict with
   existing SLP nodes that use STMT_INFO.  */

static bool
vect_update_shared_vectype (stmt_vec_info stmt_info, tree vectype)
{
  tree old_vectype = STMT_VINFO_VECTYPE (stmt_info);
  if (old_vectype)
    return useless_type_conversion_p (vectype, old_vectype);

  if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
    {
      /* We maintain the invariant that if any statement in the group is
	 used, all other members of the group have the same vector type.  */
      stmt_vec_info first_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
      stmt_vec_info member_info = first_info;
      for (; member_info; member_info = DR_GROUP_NEXT_ELEMENT (member_info))
	if (is_pattern_stmt_p (member_info)
	    && !useless_type_conversion_p (vectype,
					   STMT_VINFO_VECTYPE (member_info)))
	  break;
      if (!member_info)
	{
	  for (member_info = first_info; member_info;
	       member_info = DR_GROUP_NEXT_ELEMENT (member_info))
	    STMT_VINFO_VECTYPE (member_info) = vectype;
	  return true;
	}
    }
  else if (!is_pattern_stmt_p (stmt_info))
    {
      STMT_VINFO_VECTYPE (stmt_info) = vectype;
      return true;
    }

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
		       "Build SLP failed: incompatible vector"
		       " types for: %G", stmt_info->stmt);
      dump_printf_loc (MSG_NOTE, vect_location,
		       "    old vector type: %T\n", old_vectype);
      dump_printf_loc (MSG_NOTE, vect_location,
		       "    new vector type: %T\n", vectype);
    }
  return false;
}
/* Return true if call statements CALL1 and CALL2 are similar enough
   to be combined into the same SLP group.  */

static bool
compatible_calls_p (gcall *call1, gcall *call2)
{
  unsigned int nargs = gimple_call_num_args (call1);
  if (nargs != gimple_call_num_args (call2))
    return false;

  if (gimple_call_combined_fn (call1) != gimple_call_combined_fn (call2))
    return false;

  if (gimple_call_internal_p (call1))
    {
      if (!types_compatible_p (TREE_TYPE (gimple_call_lhs (call1)),
			       TREE_TYPE (gimple_call_lhs (call2))))
	return false;
      for (unsigned int i = 0; i < nargs; ++i)
	if (!types_compatible_p (TREE_TYPE (gimple_call_arg (call1, i)),
				 TREE_TYPE (gimple_call_arg (call2, i))))
	  return false;
    }
  else
    {
      if (!operand_equal_p (gimple_call_fn (call1),
			    gimple_call_fn (call2), 0))
	return false;

      if (gimple_call_fntype (call1) != gimple_call_fntype (call2))
	return false;
    }
  return true;
}
/* A subroutine of vect_build_slp_tree for checking VECTYPE, which is the
   caller's attempt to find the vector type in STMT_INFO with the narrowest
   element type.  Return true if VECTYPE is nonnull and if it is valid
   for STMT_INFO.  When returning true, update MAX_NUNITS to reflect the
   number of units in VECTYPE.  GROUP_SIZE and MAX_NUNITS are as for
   vect_build_slp_tree.  */

static bool
vect_record_max_nunits (vec_info *vinfo, stmt_vec_info stmt_info,
			unsigned int group_size,
			tree vectype, poly_uint64 *max_nunits)
{
  if (!vectype)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "Build SLP failed: unsupported data-type in %G\n",
			 stmt_info->stmt);
      /* Fatal mismatch.  */
      return false;
    }

  /* If populating the vector type requires unrolling then fail
     before adjusting *max_nunits for basic-block vectorization.  */
  if (is_a <bb_vec_info> (vinfo)
      && !multiple_p (group_size, TYPE_VECTOR_SUBPARTS (vectype)))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "Build SLP failed: unrolling required "
			 "in basic block SLP\n");
      /* Fatal mismatch.  */
      return false;
    }

  /* In case of multiple types we need to detect the smallest type.  */
  vect_update_max_nunits (max_nunits, vectype);
  return true;
}
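
/* Illustration (comment added, not part of the original sources): for
   a group of 4 stores and a V8QI vector type, multiple_p (4, 8) fails;
   filling one vector would need 8 scalars, i.e. unrolling, which
   basic-block SLP cannot do.  Loop SLP instead records the requirement
   via *MAX_NUNITS and unrolls the loop accordingly.  */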
/* Verify if the scalar stmts STMTS are isomorphic, require data
   permutation or are of unsupported types of operation.  Return
   true if they are, otherwise return false and indicate in *MATCHES
   which stmts are not isomorphic to the first one.  If MATCHES[0]
   is false then this indicates the comparison could not be
   carried out or the stmts will never be vectorized by SLP.

   Note COND_EXPR is possibly isomorphic to another one after swapping its
   operands.  Set SWAP[i] to 1 if stmt I is COND_EXPR and isomorphic to
   the first stmt by swapping the two operands of comparison; set SWAP[i]
   to 2 if stmt I is isomorphic to the first stmt by inverting the code
   of comparison.  Take A1 >= B1 ? X1 : Y1 as an example: it can be swapped
   to (B1 <= A1 ? X1 : Y1) or be inverted to (A1 < B1) ? Y1 : X1.  */
static bool
vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
		       vec<stmt_vec_info> stmts, unsigned int group_size,
		       poly_uint64 *max_nunits, bool *matches,
		       bool *two_operators, tree *node_vectype)
{
  unsigned int i;
  stmt_vec_info first_stmt_info = stmts[0];
  enum tree_code first_stmt_code = ERROR_MARK;
  enum tree_code alt_stmt_code = ERROR_MARK;
  enum tree_code rhs_code = ERROR_MARK;
  enum tree_code first_cond_code = ERROR_MARK;
  tree lhs;
  bool need_same_oprnds = false;
  tree vectype = NULL_TREE, first_op1 = NULL_TREE;
  optab optab;
  int icode;
  machine_mode optab_op2_mode;
  machine_mode vec_mode;
  stmt_vec_info first_load = NULL, prev_first_load = NULL;
  bool first_stmt_load_p = false, load_p = false;
  bool first_stmt_phi_p = false, phi_p = false;
  bool maybe_soft_fail = false;
  tree soft_fail_nunits_vectype = NULL_TREE;

  /* For every stmt in NODE find its def stmt/s.  */
  stmt_vec_info stmt_info;
  FOR_EACH_VEC_ELT (stmts, i, stmt_info)
    {
      gimple *stmt = stmt_info->stmt;
      swap[i] = 0;
      matches[i] = false;

      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for %G", stmt);

      /* Fail to vectorize statements marked as unvectorizable, statements
	 that may throw or that have volatile operands.  */
      if (!STMT_VINFO_VECTORIZABLE (stmt_info)
	  || stmt_can_throw_internal (cfun, stmt)
	  || gimple_has_volatile_ops (stmt))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "Build SLP failed: unvectorizable statement %G",
			     stmt);
	  /* ??? For BB vectorization we want to commutate operands in a way
	     to shuffle all unvectorizable defs into one operand and have
	     the other still vectorized.  The following doesn't reliably
	     work for this though but it's the easiest we can do here.  */
	  if (is_a <bb_vec_info> (vinfo) && i != 0)
	    continue;
	  /* Fatal mismatch.  */
	  matches[0] = false;
	  return false;
	}

      lhs = gimple_get_lhs (stmt);
      if (lhs == NULL_TREE)
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "Build SLP failed: not GIMPLE_ASSIGN nor "
			     "GIMPLE_CALL %G", stmt);
	  if (is_a <bb_vec_info> (vinfo) && i != 0)
	    continue;
	  /* Fatal mismatch.  */
	  matches[0] = false;
	  return false;
	}

      tree nunits_vectype;
      if (!vect_get_vector_types_for_stmt (vinfo, stmt_info, &vectype,
					   &nunits_vectype, group_size))
	{
	  if (is_a <bb_vec_info> (vinfo) && i != 0)
	    continue;
	  /* Fatal mismatch.  */
	  matches[0] = false;
	  return false;
	}
      /* Record nunits required but continue analysis, producing matches[]
	 as if nunits was not an issue.  This allows splitting of groups
	 to happen.  */
      if (nunits_vectype
	  && !vect_record_max_nunits (vinfo, stmt_info, group_size,
				      nunits_vectype, max_nunits))
	{
	  gcc_assert (is_a <bb_vec_info> (vinfo));
	  maybe_soft_fail = true;
	  soft_fail_nunits_vectype = nunits_vectype;
	}

      gcc_assert (vectype);

      gcall *call_stmt = dyn_cast <gcall *> (stmt);
      if (call_stmt)
	{
	  rhs_code = CALL_EXPR;

	  if (gimple_call_internal_p (stmt, IFN_MASK_LOAD))
	    load_p = true;
	  else if ((gimple_call_internal_p (call_stmt)
		    && (!vectorizable_internal_fn_p
			  (gimple_call_internal_fn (call_stmt))))
		   || gimple_call_tail_p (call_stmt)
		   || gimple_call_noreturn_p (call_stmt)
		   || !gimple_call_nothrow_p (call_stmt)
		   || gimple_call_chain (call_stmt))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: unsupported call type %G",
				 call_stmt);
	      if (is_a <bb_vec_info> (vinfo) && i != 0)
		continue;
	      /* Fatal mismatch.  */
	      matches[0] = false;
	      return false;
	    }
	}
      else if (gimple_code (stmt) == GIMPLE_PHI)
	{
	  rhs_code = ERROR_MARK;
	  phi_p = true;
	}
      else
	{
	  rhs_code = gimple_assign_rhs_code (stmt);
	  load_p = gimple_vuse (stmt);
	}

      /* Check the operation.  */
      if (i == 0)
	{
	  *node_vectype = vectype;
	  first_stmt_code = rhs_code;
	  first_stmt_load_p = load_p;
	  first_stmt_phi_p = phi_p;

	  /* Shift arguments should be equal in all the packed stmts for a
	     vector shift with scalar shift operand.  */
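	  /* For instance (comment added, not part of the original
	     sources), { x0 << 3, x1 << 3, x2 << 3, x3 << 3 } can use a
	     vector/scalar shift with the single amount 3, whereas
	     { x0 << 3, x1 << 5, ... } needs a vector/vector shift and
	     otherwise fails the need_same_oprnds check below.  */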
	  if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
	      || rhs_code == LROTATE_EXPR
	      || rhs_code == RROTATE_EXPR)
	    {
	      vec_mode = TYPE_MODE (vectype);

	      /* First see if we have a vector/vector shift.  */
	      optab = optab_for_tree_code (rhs_code, vectype, optab_vector);

	      if (!optab
		  || optab_handler (optab, vec_mode) == CODE_FOR_nothing)
		{
		  /* No vector/vector shift, try for a vector/scalar shift.  */
		  optab = optab_for_tree_code (rhs_code, vectype,
					       optab_scalar);

		  if (!optab)
		    {
		      if (dump_enabled_p ())
			dump_printf_loc (MSG_MISSED_OPTIMIZATION,
					 vect_location,
					 "Build SLP failed: no optab.\n");
		      if (is_a <bb_vec_info> (vinfo) && i != 0)
			continue;
		      /* Fatal mismatch.  */
		      matches[0] = false;
		      return false;
		    }
		  icode = (int) optab_handler (optab, vec_mode);
		  if (icode == CODE_FOR_nothing)
		    {
		      if (dump_enabled_p ())
			dump_printf_loc (MSG_MISSED_OPTIMIZATION,
					 vect_location,
					 "Build SLP failed: "
					 "op not supported by target.\n");
		      if (is_a <bb_vec_info> (vinfo) && i != 0)
			continue;
		      /* Fatal mismatch.  */
		      matches[0] = false;
		      return false;
		    }
		  optab_op2_mode = insn_data[icode].operand[2].mode;
		  if (!VECTOR_MODE_P (optab_op2_mode))
		    {
		      need_same_oprnds = true;
		      first_op1 = gimple_assign_rhs2 (stmt);
		    }
		}
	    }
	  else if (rhs_code == WIDEN_LSHIFT_EXPR)
	    {
	      need_same_oprnds = true;
	      first_op1 = gimple_assign_rhs2 (stmt);
	    }
	  else if (!load_p
		   && rhs_code == BIT_FIELD_REF)
	    {
	      tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt), 0);
	      if (!is_a <bb_vec_info> (vinfo)
		  || TREE_CODE (vec) != SSA_NAME
		  || !operand_equal_p (TYPE_SIZE (vectype),
				       TYPE_SIZE (TREE_TYPE (vec))))
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				     "Build SLP failed: "
				     "BIT_FIELD_REF not supported\n");
		  /* Fatal mismatch.  */
		  matches[0] = false;
		  return false;
		}
	    }
	  else if (call_stmt
		   && gimple_call_internal_p (call_stmt, IFN_DIV_POW2))
	    {
	      need_same_oprnds = true;
	      first_op1 = gimple_call_arg (call_stmt, 1);
	    }
	}
      else
	{
	  if (first_stmt_code != rhs_code
	      && alt_stmt_code == ERROR_MARK)
	    alt_stmt_code = rhs_code;
	  if ((first_stmt_code != rhs_code
	       && (first_stmt_code != IMAGPART_EXPR
		   || rhs_code != REALPART_EXPR)
	       && (first_stmt_code != REALPART_EXPR
		   || rhs_code != IMAGPART_EXPR)
	       /* Handle mismatches in plus/minus by computing both
		  and merging the results.  */
	       && !((first_stmt_code == PLUS_EXPR
		     || first_stmt_code == MINUS_EXPR)
		    && (alt_stmt_code == PLUS_EXPR
			|| alt_stmt_code == MINUS_EXPR)
		    && rhs_code == alt_stmt_code)
	       && !(STMT_VINFO_GROUPED_ACCESS (stmt_info)
		    && (first_stmt_code == ARRAY_REF
			|| first_stmt_code == BIT_FIELD_REF
			|| first_stmt_code == INDIRECT_REF
			|| first_stmt_code == COMPONENT_REF
			|| first_stmt_code == MEM_REF)))
	      || first_stmt_load_p != load_p
	      || first_stmt_phi_p != phi_p)
	    {
	      if (dump_enabled_p ())
		{
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				   "Build SLP failed: different operation "
				   "in stmt %G", stmt);
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				   "original stmt %G", first_stmt_info->stmt);
		}
	      /* Mismatch.  */
	      continue;
	    }

	  if (!load_p
	      && first_stmt_code == BIT_FIELD_REF
	      && (TREE_OPERAND (gimple_assign_rhs1 (first_stmt_info->stmt), 0)
		  != TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0)))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: different BIT_FIELD_REF "
				 "arguments in %G", stmt);
	      /* Mismatch.  */
	      continue;
	    }

	  if (!load_p && rhs_code == CALL_EXPR)
	    {
	      if (!compatible_calls_p (as_a <gcall *> (stmts[0]->stmt),
				       as_a <gcall *> (stmt)))
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				     "Build SLP failed: different calls in %G",
				     stmt);
		  /* Mismatch.  */
		  continue;
		}
	    }

	  if ((phi_p || gimple_could_trap_p (stmt_info->stmt))
	      && (gimple_bb (first_stmt_info->stmt)
		  != gimple_bb (stmt_info->stmt)))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: different BB for PHI "
				 "or possibly trapping operation in %G", stmt);
	      /* Mismatch.  */
	      continue;
	    }

	  if (need_same_oprnds)
	    {
	      tree other_op1 = (call_stmt
				? gimple_call_arg (call_stmt, 1)
				: gimple_assign_rhs2 (stmt));
	      if (!operand_equal_p (first_op1, other_op1, 0))
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				     "Build SLP failed: different shift "
				     "arguments in %G", stmt);
		  /* Mismatch.  */
		  continue;
		}
	    }

	  if (!types_compatible_p (vectype, *node_vectype))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: different vector type "
				 "in %G", stmt);
	      /* Mismatch.  */
	      continue;
	    }
	}

      /* Grouped store or load.  */
      if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
	{
	  if (REFERENCE_CLASS_P (lhs))
	    {
	      /* Store.  */
	      ;
	    }
	  else
	    {
	      /* Load.  */
	      first_load = DR_GROUP_FIRST_ELEMENT (stmt_info);
	      if (prev_first_load)
		{
		  /* Check that there are no loads from different interleaving
		     chains in the same node.  */
		  if (prev_first_load != first_load)
		    {
		      if (dump_enabled_p ())
			dump_printf_loc (MSG_MISSED_OPTIMIZATION,
					 vect_location,
					 "Build SLP failed: different "
					 "interleaving chains in one node %G",
					 stmt);
		      /* Mismatch.  */
		      continue;
		    }
		}
	      else
		prev_first_load = first_load;
	    }
	} /* Grouped access.  */
      else
	{
	  if (load_p)
	    {
	      /* Not grouped load.  */
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: not grouped load %G",
				 stmt);

	      /* FORNOW: Not grouped loads are not supported.  */
	      if (is_a <bb_vec_info> (vinfo) && i != 0)
		continue;
	      /* Fatal mismatch.  */
	      matches[0] = false;
	      return false;
	    }

	  /* Not memory operation.  */
	  if (!phi_p
	      && TREE_CODE_CLASS (rhs_code) != tcc_binary
	      && TREE_CODE_CLASS (rhs_code) != tcc_unary
	      && TREE_CODE_CLASS (rhs_code) != tcc_expression
	      && TREE_CODE_CLASS (rhs_code) != tcc_comparison
	      && rhs_code != VIEW_CONVERT_EXPR
	      && rhs_code != CALL_EXPR
	      && rhs_code != BIT_FIELD_REF)
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: operation unsupported %G",
				 stmt);
	      if (is_a <bb_vec_info> (vinfo) && i != 0)
		continue;
	      /* Fatal mismatch.  */
	      matches[0] = false;
	      return false;
	    }

	  if (rhs_code == COND_EXPR)
	    {
	      tree cond_expr = gimple_assign_rhs1 (stmt);
	      enum tree_code cond_code = TREE_CODE (cond_expr);
	      enum tree_code swap_code = ERROR_MARK;
	      enum tree_code invert_code = ERROR_MARK;

	      if (i == 0)
		first_cond_code = TREE_CODE (cond_expr);
	      else if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
		{
		  bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
		  swap_code = swap_tree_comparison (cond_code);
		  invert_code = invert_tree_comparison (cond_code,
							honor_nans);
		}

	      if (first_cond_code == cond_code)
		;
	      /* Isomorphic can be achieved by swapping.  */
	      else if (first_cond_code == swap_code)
		swap[i] = 1;
	      /* Isomorphic can be achieved by inverting.  */
	      else if (first_cond_code == invert_code)
		swap[i] = 2;
	      else
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				     "Build SLP failed: different"
				     " operation %G", stmt);
		  /* Mismatch.  */
		  continue;
		}
	    }
	}

      matches[i] = true;
    }

  for (i = 0; i < group_size; ++i)
    if (!matches[i])
      return false;

  /* If we allowed a two-operation SLP node verify the target can cope
     with the permute we are going to use.  */
  if (alt_stmt_code != ERROR_MARK
      && TREE_CODE_CLASS (alt_stmt_code) != tcc_reference)
    {
      *two_operators = true;
    }

  if (maybe_soft_fail)
    {
      unsigned HOST_WIDE_INT const_nunits;
      if (!TYPE_VECTOR_SUBPARTS
	     (soft_fail_nunits_vectype).is_constant (&const_nunits)
	  || const_nunits > group_size)
	matches[0] = false;
      else
	{
	  /* With constant vector elements simulate a mismatch at the
	     point we need to split.  */
	  unsigned tail = group_size & (const_nunits - 1);
	  memset (&matches[group_size - tail], 0, sizeof (bool) * tail);
	}
      return false;
    }

  return true;
}
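
/* Illustration (comment added, not part of the original sources): for
   group_size == 7 and const_nunits == 4 the tail is 7 & 3 == 3, so
   matches[] ends up as { 1, 1, 1, 1, 0, 0, 0 } and the caller splits
   the group after the first full vector of four lanes.  */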
/* Traits for the hash_set to record failed SLP builds for a stmt set.
   Note we never remove apart from at destruction time so we do not
   need a special value for deleted that differs from empty.  */
struct bst_traits
{
  typedef vec <stmt_vec_info> value_type;
  typedef vec <stmt_vec_info> compare_type;
  static inline hashval_t hash (value_type);
  static inline bool equal (value_type existing, value_type candidate);
  static inline bool is_empty (value_type x) { return !x.exists (); }
  static inline bool is_deleted (value_type x) { return !x.exists (); }
  static const bool empty_zero_p = true;
  static inline void mark_empty (value_type &x) { x.release (); }
  static inline void mark_deleted (value_type &x) { x.release (); }
  static inline void remove (value_type &x) { x.release (); }
};
inline hashval_t
bst_traits::hash (value_type x)
{
  inchash::hash h;
  for (unsigned i = 0; i < x.length (); ++i)
    h.add_int (gimple_uid (x[i]->stmt));
  return h.end ();
}
inline bool
bst_traits::equal (value_type existing, value_type candidate)
{
  if (existing.length () != candidate.length ())
    return false;
  for (unsigned i = 0; i < existing.length (); ++i)
    if (existing[i] != candidate[i])
      return false;
  return true;
}
typedef hash_map <vec <stmt_vec_info>, slp_tree,
		  simple_hashmap_traits <bst_traits, slp_tree> >
  scalar_stmts_to_slp_tree_map_t;

static slp_tree
vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
		       vec<stmt_vec_info> stmts, unsigned int group_size,
		       poly_uint64 *max_nunits,
		       bool *matches, unsigned *limit, unsigned *tree_size,
		       scalar_stmts_to_slp_tree_map_t *bst_map);
static slp_tree
vect_build_slp_tree (vec_info *vinfo,
		     vec<stmt_vec_info> stmts, unsigned int group_size,
		     poly_uint64 *max_nunits,
		     bool *matches, unsigned *limit, unsigned *tree_size,
		     scalar_stmts_to_slp_tree_map_t *bst_map)
{
  if (slp_tree *leader = bst_map->get (stmts))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location, "re-using %sSLP tree %p\n",
			 *leader ? "" : "failed ", *leader);
      if (*leader)
	{
	  SLP_TREE_REF_COUNT (*leader)++;
	  vect_update_max_nunits (max_nunits, (*leader)->max_nunits);
	}
      return *leader;
    }

  /* Seed the bst_map with a stub node to be filled by vect_build_slp_tree_2
     so we can pick up backedge destinations during discovery.  */
  slp_tree res = new _slp_tree;
  SLP_TREE_DEF_TYPE (res) = vect_internal_def;
  SLP_TREE_SCALAR_STMTS (res) = stmts;
  bst_map->put (stmts.copy (), res);

  if (*limit == 0)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "SLP discovery limit exceeded\n");
      bool existed_p = bst_map->put (stmts, NULL);
      gcc_assert (existed_p);
      /* Mark the node invalid so we can detect those when still in use
	 as backedge destinations.  */
      SLP_TREE_SCALAR_STMTS (res) = vNULL;
      SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def;
      vect_free_slp_tree (res);
      memset (matches, 0, sizeof (bool) * group_size);
      return NULL;
    }
  --*limit;

  poly_uint64 this_max_nunits = 1;
  slp_tree res_ = vect_build_slp_tree_2 (vinfo, res, stmts, group_size,
					 &this_max_nunits,
					 matches, limit, tree_size, bst_map);
  if (!res_)
    {
      bool existed_p = bst_map->put (stmts, NULL);
      gcc_assert (existed_p);
      /* Mark the node invalid so we can detect those when still in use
	 as backedge destinations.  */
      SLP_TREE_SCALAR_STMTS (res) = vNULL;
      SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def;
      vect_free_slp_tree (res);
    }
  else
    {
      gcc_assert (res_ == res);
      res->max_nunits = this_max_nunits;
      vect_update_max_nunits (max_nunits, this_max_nunits);
      /* Keep a reference for the bst_map use.  */
      SLP_TREE_REF_COUNT (res)++;
    }
  return res_;
}
/* Recursively build an SLP tree starting from NODE.
   Fail (and return NULL) if def-stmts are not isomorphic, require data
   permutation or are of unsupported types of operation.  Otherwise,
   return the built SLP tree node; MATCHES indicates which scalar stmts
   failed to match the first one.  */
static slp_tree
vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
		       vec<stmt_vec_info> stmts, unsigned int group_size,
		       poly_uint64 *max_nunits,
		       bool *matches, unsigned *limit, unsigned *tree_size,
		       scalar_stmts_to_slp_tree_map_t *bst_map)
{
  unsigned nops, i, this_tree_size = 0;
  poly_uint64 this_max_nunits = *max_nunits;

  matches[0] = false;

  stmt_vec_info stmt_info = stmts[0];
  if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
    nops = gimple_call_num_args (stmt);
  else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
    {
      nops = gimple_num_ops (stmt) - 1;
      if (gimple_assign_rhs_code (stmt) == COND_EXPR)
	nops++;
    }
  else if (gphi *phi = dyn_cast <gphi *> (stmt_info->stmt))
    nops = gimple_phi_num_args (phi);
  else
    return NULL;

  /* If the SLP node is a PHI (induction or reduction), terminate
     the recursion.  */
  bool *skip_args = XALLOCAVEC (bool, nops);
  memset (skip_args, 0, sizeof (bool) * nops);
  if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
    if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
      {
	tree scalar_type = TREE_TYPE (PHI_RESULT (stmt));
	tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
						    group_size);
	if (!vect_record_max_nunits (vinfo, stmt_info, group_size, vectype,
				     max_nunits))
	  return NULL;

	vect_def_type def_type = STMT_VINFO_DEF_TYPE (stmt_info);
	if (def_type == vect_induction_def)
	  {
	    /* Induction PHIs are not cycles but walk the initial
	       value.  Only for inner loops though, for outer loops
	       we need to pick up the value from the actual PHIs
	       to more easily support peeling and epilogue vectorization.  */
	    class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
	    if (!nested_in_vect_loop_p (loop, stmt_info))
	      skip_args[loop_preheader_edge (loop)->dest_idx] = true;
	    else
	      loop = loop->inner;
	    skip_args[loop_latch_edge (loop)->dest_idx] = true;
	  }
	else if (def_type == vect_reduction_def
		 || def_type == vect_double_reduction_def
		 || def_type == vect_nested_cycle)
	  {
	    /* Else def types have to match.  */
	    stmt_vec_info other_info;
	    bool all_same = true;
	    FOR_EACH_VEC_ELT (stmts, i, other_info)
	      {
		if (STMT_VINFO_DEF_TYPE (other_info) != def_type)
		  return NULL;
		if (other_info != stmt_info)
		  all_same = false;
	      }
	    class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
	    /* Reduction initial values are not explicitly represented.  */
	    if (!nested_in_vect_loop_p (loop, stmt_info))
	      skip_args[loop_preheader_edge (loop)->dest_idx] = true;
	    /* Reduction chain backedge defs are filled manually.
	       ??? Need a better way to identify a SLP reduction chain PHI.
	       Or a better overall way to SLP match those.  */
	    if (all_same && def_type == vect_reduction_def)
	      skip_args[loop_latch_edge (loop)->dest_idx] = true;
	  }
	else if (def_type != vect_internal_def)
	  return NULL;
      }

  bool two_operators = false;
  unsigned char *swap = XALLOCAVEC (unsigned char, group_size);
  tree vectype = NULL_TREE;
  if (!vect_build_slp_tree_1 (vinfo, swap, stmts, group_size,
			      &this_max_nunits, matches, &two_operators,
			      &vectype))
    return NULL;

  /* If the SLP node is a load, terminate the recursion unless masked.  */
  if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
      && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
    {
      if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
	{
	  /* Masked load.  */
	  gcc_assert (gimple_call_internal_p (stmt, IFN_MASK_LOAD));
	  nops = 1;
	}
      else
	{
	  *max_nunits = this_max_nunits;
	  (*tree_size)++;
	  node = vect_create_new_slp_node (node, stmts, 0);
	  SLP_TREE_VECTYPE (node) = vectype;
	  /* And compute the load permutation.  Whether it is actually
	     a permutation depends on the unrolling factor which is
	     decided later.  */
	  vec<unsigned> load_permutation;
	  int j;
	  stmt_vec_info load_info;
	  load_permutation.create (group_size);
	  stmt_vec_info first_stmt_info
	    = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
	  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
	    {
	      int load_place = vect_get_place_in_interleaving_chain
		  (load_info, first_stmt_info);
	      gcc_assert (load_place != -1);
	      load_permutation.safe_push (load_place);
	    }
	  SLP_TREE_LOAD_PERMUTATION (node) = load_permutation;
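	  /* Illustration (comment added, not part of the original
	     sources): if the node's loads are a[1], a[0], a[3], a[2]
	     and the interleaving chain starts at a[0], the recorded
	     load permutation is { 1, 0, 3, 2 }.  */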
	  return node;
	}
    }
  else if (gimple_assign_single_p (stmt_info->stmt)
	   && !gimple_vuse (stmt_info->stmt)
	   && gimple_assign_rhs_code (stmt_info->stmt) == BIT_FIELD_REF)
    {
      /* vect_build_slp_tree_1 determined all BIT_FIELD_REFs reference
	 the same SSA name vector of a compatible type to vectype.  */
      vec<std::pair<unsigned, unsigned> > lperm = vNULL;
      tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0);
      stmt_vec_info estmt_info;
      FOR_EACH_VEC_ELT (stmts, i, estmt_info)
	{
	  gassign *estmt = as_a <gassign *> (estmt_info->stmt);
	  tree bfref = gimple_assign_rhs1 (estmt);
	  HOST_WIDE_INT lane;
	  if (!known_eq (bit_field_size (bfref),
			 tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (vectype))))
	      || !constant_multiple_p (bit_field_offset (bfref),
				       bit_field_size (bfref), &lane))
	    {
	      lperm.release ();
	      matches[0] = false;
	      return NULL;
	    }
	  lperm.safe_push (std::make_pair (0, (unsigned)lane));
	}
      slp_tree vnode = vect_create_new_slp_node (vNULL);
      /* ??? We record vectype here but we hide eventually necessary
	 punning and instead rely on code generation to materialize
	 VIEW_CONVERT_EXPRs as necessary.  We instead should make
	 this explicit somehow.  */
      SLP_TREE_VECTYPE (vnode) = vectype;
      SLP_TREE_VEC_DEFS (vnode).safe_push (vec);
      /* We are always building a permutation node even if it is an identity
	 permute to shield the rest of the vectorizer from the odd node
	 representing an actual vector without any scalar ops.
	 ??? We could hide it completely with making the permute node
	 external?  */
      node = vect_create_new_slp_node (node, stmts, 1);
      SLP_TREE_CODE (node) = VEC_PERM_EXPR;
      SLP_TREE_LANE_PERMUTATION (node) = lperm;
      SLP_TREE_VECTYPE (node) = vectype;
      SLP_TREE_CHILDREN (node).quick_push (vnode);
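      /* Illustration (comment added, not part of the original sources):
	 for
	   _1 = BIT_FIELD_REF <v_2, 32, 0>;
	   _3 = BIT_FIELD_REF <v_2, 32, 32>;
	 each lane is offset / size, so lperm is { (0,0), (0,1) }, an
	 identity permute of the pre-existing vector v_2.  */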
      return node;
    }

  /* Get at the operands, verifying they are compatible.  */
  vec<slp_oprnd_info> oprnds_info = vect_create_oprnd_info (nops, group_size);
  slp_oprnd_info oprnd_info;
  FOR_EACH_VEC_ELT (stmts, i, stmt_info)
    {
      int res = vect_get_and_check_slp_defs (vinfo, swap[i], skip_args,
					     stmts, i, &oprnds_info);
      if (res != 0)
	matches[(res == -1) ? 0 : i] = false;
      if (!matches[0])
	break;
    }
  for (i = 0; i < group_size; ++i)
    if (!matches[i])
      {
	vect_free_oprnd_info (oprnds_info);
	return NULL;
      }
  auto_vec<slp_tree, 4> children;

  stmt_info = stmts[0];

  /* Create SLP_TREE nodes for the definition node/s.  */
  FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
    {
      slp_tree child;
      unsigned int j;

      /* We're skipping certain operands from processing, for example
	 outer loop reduction initial defs.  */
      if (skip_args[i])
	{
	  children.safe_push (NULL);
	  continue;
	}

      if (oprnd_info->first_dt == vect_uninitialized_def)
	{
	  /* COND_EXPR has one operand too many when the condition
	     is constant.  */
	  gcc_assert (i == 3 && nops == 4);
	  continue;
	}

      if (is_a <bb_vec_info> (vinfo)
	  && oprnd_info->first_dt == vect_internal_def
	  && !oprnd_info->any_pattern)
	{
	  /* For BB vectorization, if all defs are the same do not
	     bother to continue the build along the single-lane
	     graph but use a splat of the scalar value.  */
	  stmt_vec_info first_def = oprnd_info->def_stmts[0];
	  for (j = 1; j < group_size; ++j)
	    if (oprnd_info->def_stmts[j] != first_def)
	      break;
	  if (j == group_size
	      /* But avoid doing this for loads where we may be
		 able to CSE things, unless the stmt is not
		 vectorizable.  */
	      && (!STMT_VINFO_VECTORIZABLE (first_def)
		  || !gimple_vuse (first_def->stmt)))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_NOTE, vect_location,
				 "Using a splat of the uniform operand\n");
	      oprnd_info->first_dt = vect_external_def;
	    }
	}

      if (oprnd_info->first_dt == vect_external_def
	  || oprnd_info->first_dt == vect_constant_def)
	{
	  slp_tree invnode = vect_create_new_slp_node (oprnd_info->ops);
	  SLP_TREE_DEF_TYPE (invnode) = oprnd_info->first_dt;
	  oprnd_info->ops = vNULL;
	  children.safe_push (invnode);
	  continue;
	}

      if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
					group_size, &this_max_nunits,
					matches, limit,
					&this_tree_size, bst_map)) != NULL)
	{
	  oprnd_info->def_stmts = vNULL;
	  children.safe_push (child);
	  continue;
	}

      /* If the SLP build for operand zero failed and operand zero
	 and one can be commutated try that for the scalar stmts
	 that failed the match.  */
      if (i == 0
	  /* A first scalar stmt mismatch signals a fatal mismatch.  */
	  && matches[0]
	  /* ??? For COND_EXPRs we can swap the comparison operands
	     as well as the arms under some constraints.  */
	  && nops == 2
	  && oprnds_info[1]->first_dt == vect_internal_def
	  && is_gimple_assign (stmt_info->stmt)
	  /* Swapping operands for reductions breaks assumptions later on.  */
	  && STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
	  && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
	{
	  /* See whether we can swap the matching or the non-matching
	     stmt operands.  */
	  bool swap_not_matching = true;
	  do
	    {
	      for (j = 0; j < group_size; ++j)
		{
		  if (matches[j] != !swap_not_matching)
		    continue;
		  stmt_vec_info stmt_info = stmts[j];
		  /* Verify if we can swap operands of this stmt.  */
		  gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt);
		  if (!stmt
		      || !commutative_tree_code (gimple_assign_rhs_code (stmt)))
		    {
		      if (!swap_not_matching)
			goto fail;
		      swap_not_matching = false;
		      break;
		    }
		}
	    }
	  while (j != group_size);

	  /* Swap mismatched definition stmts.  */
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "Re-trying with swapped operands of stmts ");
	  for (j = 0; j < group_size; ++j)
	    if (matches[j] == !swap_not_matching)
	      {
		std::swap (oprnds_info[0]->def_stmts[j],
			   oprnds_info[1]->def_stmts[j]);
		std::swap (oprnds_info[0]->ops[j],
			   oprnds_info[1]->ops[j]);
		if (dump_enabled_p ())
		  dump_printf (MSG_NOTE, "%d ", j);
	      }
	  if (dump_enabled_p ())
	    dump_printf (MSG_NOTE, "\n");
	  /* After swapping some operands we lost track whether an
	     operand has any pattern defs so be conservative here.  */
	  if (oprnds_info[0]->any_pattern || oprnds_info[1]->any_pattern)
	    oprnds_info[0]->any_pattern = oprnds_info[1]->any_pattern = true;
	  /* And try again with scratch 'matches' ...  */
	  bool *tem = XALLOCAVEC (bool, group_size);
	  if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
					    group_size, &this_max_nunits,
					    tem, limit,
					    &this_tree_size, bst_map)) != NULL)
	    {
	      oprnd_info->def_stmts = vNULL;
	      children.safe_push (child);
	      continue;
	    }
	}
fail:

      /* If the SLP build failed and we analyze a basic-block
	 simply treat nodes we fail to build as externally defined
	 (and thus build vectors from the scalar defs).
	 The cost model will reject outright expensive cases.
	 ??? This doesn't treat cases where permutation ultimately
	 fails (or we don't try permutation below).  Ideally we'd
	 even compute a permutation that will end up with the maximum
	 SLP tree size...  */
      if (is_a <bb_vec_info> (vinfo)
	  /* ??? Rejecting patterns this way doesn't work.  We'd have to
	     do extra work to cancel the pattern so the uses see the
	     scalar version.  */
	  && !is_pattern_stmt_p (stmt_info)
	  && !oprnd_info->any_pattern)
	{
	  /* But if there's a leading vector sized set of matching stmts
	     fail here so we can split the group.  This matches the condition
	     vect_analyze_slp_instance uses.  */
	  /* ??? We might want to split here and combine the results to support
	     multiple vector sizes better.  */
	  for (j = 0; j < group_size; ++j)
	    if (!matches[j])
	      break;
	  if (!known_ge (j, TYPE_VECTOR_SUBPARTS (vectype)))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_NOTE, vect_location,
				 "Building vector operands from scalars\n");
	      this_tree_size++;
	      child = vect_create_new_slp_node (oprnd_info->ops);
	      children.safe_push (child);
	      oprnd_info->ops = vNULL;
	      continue;
	    }
	}

      gcc_assert (child == NULL);
      FOR_EACH_VEC_ELT (children, j, child)
	if (child)
	  vect_free_slp_tree (child);
      vect_free_oprnd_info (oprnds_info);
      return NULL;
    }

  vect_free_oprnd_info (oprnds_info);
  /* If all children of this node are built up from uniform scalars, or
     if building them requires more than one possibly expensive vector
     construction, then just throw that away so the node is built up
     from scalars instead.  The exception is the SLP node for the
     vector store.  */
  if (is_a <bb_vec_info> (vinfo)
      && !STMT_VINFO_GROUPED_ACCESS (stmt_info)
      /* ??? Rejecting patterns this way doesn't work.  We'd have to
	 do extra work to cancel the pattern so the uses see the
	 scalar version.  */
      && !is_pattern_stmt_p (stmt_info))
    {
      slp_tree child;
      unsigned j;
      bool all_uniform_p = true;
      unsigned n_vector_builds = 0;
      FOR_EACH_VEC_ELT (children, j, child)
	{
	  if (!child)
	    ;
	  else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
	    all_uniform_p = false;
	  else if (!vect_slp_tree_uniform_p (child))
	    {
	      all_uniform_p = false;
	      if (SLP_TREE_DEF_TYPE (child) == vect_external_def)
		n_vector_builds++;
	    }
	}
      if (all_uniform_p
	  || n_vector_builds > 1
	  || (n_vector_builds == children.length ()
	      && is_a <gphi *> (stmt_info->stmt)))
	{
	  /* Roll back.  */
	  matches[0] = false;
	  FOR_EACH_VEC_ELT (children, j, child)
	    if (child)
	      vect_free_slp_tree (child);

	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "Building parent vector operands from "
			     "scalars instead\n");
	  return NULL;
	}
    }

  *tree_size += this_tree_size + 1;
  *max_nunits = this_max_nunits;

  if (two_operators)
    {
      /* ??? We'd likely want to either cache in bst_map sth like
	 { a+b, NULL, a+b, NULL } and { NULL, a-b, NULL, a-b } or
	 the true { a+b, a+b, a+b, a+b } ... but there we don't have
	 explicit stmts to put in so the keying on 'stmts' doesn't
	 work (but we have the same issue with nodes that use 'ops').  */
      slp_tree one = new _slp_tree;
      slp_tree two = new _slp_tree;
      SLP_TREE_DEF_TYPE (one) = vect_internal_def;
      SLP_TREE_DEF_TYPE (two) = vect_internal_def;
      SLP_TREE_VECTYPE (one) = vectype;
      SLP_TREE_VECTYPE (two) = vectype;
      SLP_TREE_CHILDREN (one).safe_splice (children);
      SLP_TREE_CHILDREN (two).safe_splice (children);
      slp_tree child;
      FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (two), i, child)
	SLP_TREE_REF_COUNT (child)++;

      /* Here we record the original defs since this
	 node represents the final lane configuration.  */
      node = vect_create_new_slp_node (node, stmts, 2);
      SLP_TREE_VECTYPE (node) = vectype;
      SLP_TREE_CODE (node) = VEC_PERM_EXPR;
      SLP_TREE_CHILDREN (node).quick_push (one);
      SLP_TREE_CHILDREN (node).quick_push (two);
      gassign *stmt = as_a <gassign *> (stmts[0]->stmt);
      enum tree_code code0 = gimple_assign_rhs_code (stmt);
      enum tree_code ocode = ERROR_MARK;
      stmt_vec_info ostmt_info;
      unsigned j = 0;
      FOR_EACH_VEC_ELT (stmts, i, ostmt_info)
	{
	  gassign *ostmt = as_a <gassign *> (ostmt_info->stmt);
	  if (gimple_assign_rhs_code (ostmt) != code0)
	    {
	      SLP_TREE_LANE_PERMUTATION (node).safe_push
		(std::make_pair (1, i));
	      ocode = gimple_assign_rhs_code (ostmt);
	      j = i;
	    }
	  else
	    SLP_TREE_LANE_PERMUTATION (node).safe_push
	      (std::make_pair (0, i));
	}
      SLP_TREE_CODE (one) = code0;
      SLP_TREE_CODE (two) = ocode;
      SLP_TREE_LANES (one) = stmts.length ();
      SLP_TREE_LANES (two) = stmts.length ();
      SLP_TREE_REPRESENTATIVE (one) = stmts[0];
      SLP_TREE_REPRESENTATIVE (two) = stmts[j];
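      /* Illustration (comment added, not part of the original sources):
	 for the group { a0+b0, a1-b1, a2+b2, a3-b3 } node ONE computes
	 all lanes with PLUS_EXPR and node TWO with MINUS_EXPR; the lane
	 permutation { (0,0), (1,1), (0,2), (1,3) } blends the
	 alternating lanes, which many targets implement with a single
	 addsub instruction.  */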
      return node;
    }

  node = vect_create_new_slp_node (node, stmts, nops);
  SLP_TREE_VECTYPE (node) = vectype;
  SLP_TREE_CHILDREN (node).splice (children);
  return node;
}
/* Dump a single SLP tree NODE.  */

static void
vect_print_slp_tree (dump_flags_t dump_kind, dump_location_t loc,
		     slp_tree node)
{
  unsigned i, j;
  slp_tree child;
  stmt_vec_info stmt_info;
  tree op;

  dump_metadata_t metadata (dump_kind, loc.get_impl_location ());
  dump_user_location_t user_loc = loc.get_user_location ();
  dump_printf_loc (metadata, user_loc,
		   "node%s %p (max_nunits=%u, refcnt=%u)\n",
		   SLP_TREE_DEF_TYPE (node) == vect_external_def
		   ? " (external)"
		   : (SLP_TREE_DEF_TYPE (node) == vect_constant_def
		      ? " (constant)"
		      : ""), node,
		   estimated_poly_value (node->max_nunits),
		   SLP_TREE_REF_COUNT (node));
  if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
    {
      if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
	dump_printf_loc (metadata, user_loc, "op: VEC_PERM_EXPR\n");
      else
	dump_printf_loc (metadata, user_loc, "op template: %G",
			 SLP_TREE_REPRESENTATIVE (node)->stmt);
    }
  if (SLP_TREE_SCALAR_STMTS (node).exists ())
    FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
      dump_printf_loc (metadata, user_loc, "\tstmt %u %G", i, stmt_info->stmt);
  else
    {
      dump_printf_loc (metadata, user_loc, "\t{ ");
      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
	dump_printf (metadata, "%T%s ", op,
		     i < SLP_TREE_SCALAR_OPS (node).length () - 1 ? "," : "");
      dump_printf (metadata, "}\n");
    }
  if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
    {
      dump_printf_loc (metadata, user_loc, "\tload permutation {");
      FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (node), i, j)
	dump_printf (dump_kind, " %u", j);
      dump_printf (dump_kind, " }\n");
    }
  if (SLP_TREE_LANE_PERMUTATION (node).exists ())
    {
      dump_printf_loc (metadata, user_loc, "\tlane permutation {");
      for (i = 0; i < SLP_TREE_LANE_PERMUTATION (node).length (); ++i)
	dump_printf (dump_kind, " %u[%u]",
		     SLP_TREE_LANE_PERMUTATION (node)[i].first,
		     SLP_TREE_LANE_PERMUTATION (node)[i].second);
      dump_printf (dump_kind, " }\n");
    }
  if (SLP_TREE_CHILDREN (node).is_empty ())
    return;
  dump_printf_loc (metadata, user_loc, "\tchildren");
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    dump_printf (dump_kind, " %p", (void *)child);
  dump_printf (dump_kind, "\n");
}
2059 debug (slp_tree node
)
2061 debug_dump_context ctx
;
2062 vect_print_slp_tree (MSG_NOTE
,
2063 dump_location_t::from_location_t (UNKNOWN_LOCATION
),
/* Dump a slp tree NODE using flags specified in DUMP_KIND.  */

static void
vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
		      slp_tree node, hash_set<slp_tree> &visited)
{
  unsigned i;
  slp_tree child;

  if (visited.add (node))
    return;

  vect_print_slp_tree (dump_kind, loc, node);

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    if (child)
      vect_print_slp_graph (dump_kind, loc, child, visited);
}

static void
vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
		      slp_tree entry)
{
  hash_set<slp_tree> visited;
  vect_print_slp_graph (dump_kind, loc, entry, visited);
}
/* Mark the tree rooted at NODE with PURE_SLP.  */

static void
vect_mark_slp_stmts (slp_tree node, hash_set<slp_tree> &visited)
{
  int i;
  stmt_vec_info stmt_info;
  slp_tree child;

  if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
    return;

  if (visited.add (node))
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
    STMT_SLP_TYPE (stmt_info) = pure_slp;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    if (child)
      vect_mark_slp_stmts (child, visited);
}

static void
vect_mark_slp_stmts (slp_tree node)
{
  hash_set<slp_tree> visited;
  vect_mark_slp_stmts (node, visited);
}
/* Mark the statements of the tree rooted at NODE as relevant (vect_used).  */

static void
vect_mark_slp_stmts_relevant (slp_tree node, hash_set<slp_tree> &visited)
{
  int i;
  stmt_vec_info stmt_info;
  slp_tree child;

  if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
    return;

  if (visited.add (node))
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
    {
      gcc_assert (!STMT_VINFO_RELEVANT (stmt_info)
		  || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope);
      STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope;
    }

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    if (child)
      vect_mark_slp_stmts_relevant (child, visited);
}

static void
vect_mark_slp_stmts_relevant (slp_tree node)
{
  hash_set<slp_tree> visited;
  vect_mark_slp_stmts_relevant (node, visited);
}
/* Gather loads in the SLP graph NODE and populate the INST loads array.  */

static void
vect_gather_slp_loads (vec<slp_tree> &loads, slp_tree node,
		       hash_set<slp_tree> &visited)
{
  if (!node || visited.add (node))
    return;

  if (SLP_TREE_CHILDREN (node).length () == 0)
    {
      if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
	return;
      stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
      if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
	  && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
	loads.safe_push (node);
    }
  else
    {
      unsigned i;
      slp_tree child;
      FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
	vect_gather_slp_loads (loads, child, visited);
    }
}
/* Find the last store in SLP INSTANCE.  */

stmt_vec_info
vect_find_last_scalar_stmt_in_slp (slp_tree node)
{
  stmt_vec_info last = NULL;
  stmt_vec_info stmt_vinfo;

  for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
    {
      stmt_vinfo = vect_orig_stmt (stmt_vinfo);
      last = last ? get_later_stmt (stmt_vinfo, last) : stmt_vinfo;
    }

  return last;
}

/* Find the first stmt in NODE.  */

stmt_vec_info
vect_find_first_scalar_stmt_in_slp (slp_tree node)
{
  stmt_vec_info first = NULL;
  stmt_vec_info stmt_vinfo;

  for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
    {
      stmt_vinfo = vect_orig_stmt (stmt_vinfo);
      if (!first
	  || get_later_stmt (stmt_vinfo, first) == first)
	first = stmt_vinfo;
    }

  return first;
}
/* Splits a group of stores, currently beginning at FIRST_VINFO, into
   two groups: one (still beginning at FIRST_VINFO) of size GROUP1_SIZE
   (also containing the first GROUP1_SIZE stmts, since stores are
   consecutive), the second containing the remainder.
   Return the first stmt in the second group.  */

static stmt_vec_info
vect_split_slp_store_group (stmt_vec_info first_vinfo, unsigned group1_size)
{
  gcc_assert (DR_GROUP_FIRST_ELEMENT (first_vinfo) == first_vinfo);
  gcc_assert (group1_size > 0);
  int group2_size = DR_GROUP_SIZE (first_vinfo) - group1_size;
  gcc_assert (group2_size > 0);
  DR_GROUP_SIZE (first_vinfo) = group1_size;

  stmt_vec_info stmt_info = first_vinfo;
  for (unsigned i = group1_size; i > 1; i--)
    {
      stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info);
      gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
    }
  /* STMT is now the last element of the first group.  */
  stmt_vec_info group2 = DR_GROUP_NEXT_ELEMENT (stmt_info);
  DR_GROUP_NEXT_ELEMENT (stmt_info) = 0;

  DR_GROUP_SIZE (group2) = group2_size;
  for (stmt_info = group2; stmt_info;
       stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info))
    {
      DR_GROUP_FIRST_ELEMENT (stmt_info) = group2;
      gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
    }

  /* For the second group, the DR_GROUP_GAP is that before the original group,
     plus skipping over the first vector.  */
  DR_GROUP_GAP (group2) = DR_GROUP_GAP (first_vinfo) + group1_size;

  /* DR_GROUP_GAP of the first group now has to skip over the second group too.  */
  DR_GROUP_GAP (first_vinfo) += group2_size;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "Split group into %d and %d\n",
		     group1_size, group2_size);

  return group2;
}
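
/* Worked example (illustrative, not from the original source): splitting a
   group of 7 consecutive stores with GROUP1_SIZE == 4 leaves the first
   group as stmts 0..3 with DR_GROUP_SIZE 4 and makes stmts 4..6 a second
   group with DR_GROUP_SIZE 3.  With no leading gap on the original group,
   DR_GROUP_GAP of the second group becomes 0 + 4 and DR_GROUP_GAP of the
   first group is increased by 3, so each group skips over the other's
   elements when the access pattern repeats.  */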
/* Calculate the unrolling factor for an SLP instance with GROUP_SIZE
   statements and a vector of NUNITS elements.  */

static poly_uint64
calculate_unrolling_factor (poly_uint64 nunits, unsigned int group_size)
{
  return exact_div (common_multiple (nunits, group_size), group_size);
}
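
/* For example (illustrative): with V4SI vectors (NUNITS == 4) and a group
   of 6 stmts the common multiple of 4 and 6 is 12, so the SLP instance
   needs an unrolling factor of 12 / 6 == 2 to fill whole vectors.  */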
/* Helper that checks to see if a node is a load node.  */

static inline bool
vect_is_slp_load_node (slp_tree root)
{
  return SLP_TREE_DEF_TYPE (root) == vect_internal_def
	 && STMT_VINFO_GROUPED_ACCESS (SLP_TREE_REPRESENTATIVE (root))
	 && DR_IS_READ (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (root)));
}
/* Helper function of optimize_load_redistribution that performs the operation
   recursively.  */

static slp_tree
optimize_load_redistribution_1 (scalar_stmts_to_slp_tree_map_t *bst_map,
				vec_info *vinfo, unsigned int group_size,
				hash_map<slp_tree, slp_tree> *load_map,
				slp_tree root)
{
  if (slp_tree *leader = load_map->get (root))
    return *leader;

  slp_tree node;
  unsigned i;

  /* For now, we don't know anything about externals so do not do anything.  */
  if (!root || SLP_TREE_DEF_TYPE (root) != vect_internal_def)
    return NULL;
  else if (SLP_TREE_CODE (root) == VEC_PERM_EXPR)
    {
      /* First convert this node into a load node and add it to the leaves
	 list and flatten the permute from a lane to a load one.  If it's
	 unneeded it will be elided later.  */
      vec<stmt_vec_info> stmts;
      stmts.create (SLP_TREE_LANES (root));
      lane_permutation_t lane_perm = SLP_TREE_LANE_PERMUTATION (root);
      for (unsigned j = 0; j < lane_perm.length (); j++)
	{
	  std::pair<unsigned, unsigned> perm = lane_perm[j];
	  node = SLP_TREE_CHILDREN (root)[perm.first];

	  if (!vect_is_slp_load_node (node)
	      || SLP_TREE_CHILDREN (node).exists ())
	    {
	      stmts.release ();
	      goto next;
	    }

	  stmts.quick_push (SLP_TREE_SCALAR_STMTS (node)[perm.second]);
	}

      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "converting stmts on permute node %p\n", root);

      bool *matches = XALLOCAVEC (bool, group_size);
      poly_uint64 max_nunits = 1;
      unsigned tree_size = 0, limit = 1;
      node = vect_build_slp_tree (vinfo, stmts, group_size, &max_nunits,
				  matches, &limit, &tree_size, bst_map);
      if (!node)
	stmts.release ();

      load_map->put (root, node);
      return node;
    }

next:
  load_map->put (root, NULL);

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i, node)
    {
      slp_tree value
	= optimize_load_redistribution_1 (bst_map, vinfo, group_size, load_map,
					  node);
      if (value)
	{
	  SLP_TREE_REF_COUNT (value)++;
	  SLP_TREE_CHILDREN (root)[i] = value;
	  /* ??? We know the original leafs of the replaced nodes will
	     be referenced by bst_map, only the permutes created by
	     pattern matching are not.  */
	  if (SLP_TREE_REF_COUNT (node) == 1)
	    load_map->remove (node);
	  vect_free_slp_tree (node);
	}
    }

  return NULL;
}

/* Temporary workaround for loads not being CSEd during SLP build.  This
   function will traverse the SLP tree rooted in ROOT for INSTANCE and find
   VEC_PERM nodes that blend vectors from multiple nodes that all read from the
   same DR such that the final operation is equal to a permuted load.  Such
   NODES are then directly converted into LOADS themselves.  The nodes are
   CSEd using BST_MAP.  */

static void
optimize_load_redistribution (scalar_stmts_to_slp_tree_map_t *bst_map,
			      vec_info *vinfo, unsigned int group_size,
			      hash_map<slp_tree, slp_tree> *load_map,
			      slp_tree root)
{
  slp_tree node;
  unsigned i;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i, node)
    {
      slp_tree value
	= optimize_load_redistribution_1 (bst_map, vinfo, group_size, load_map,
					  node);
      if (value)
	{
	  SLP_TREE_REF_COUNT (value)++;
	  SLP_TREE_CHILDREN (root)[i] = value;
	  /* ??? We know the original leafs of the replaced nodes will
	     be referenced by bst_map, only the permutes created by
	     pattern matching are not.  */
	  if (SLP_TREE_REF_COUNT (node) == 1)
	    load_map->remove (node);
	  vect_free_slp_tree (node);
	}
    }
}
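
/* Illustrative sketch (not part of the original source): given two loads
   L1 = { a[0], a[1] } and L2 = { a[2], a[3] } from the same array and a
   VEC_PERM node selecting { L1[1], L2[0] }, the pass above rebuilds the
   node as a plain load group { a[1], a[2] }, turning the lane permute
   into a (possibly elidable) load permutation.  */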
/* Helper function of vect_match_slp_patterns.

   Attempts to match patterns against the slp tree rooted in REF_NODE using
   VINFO.  Patterns are matched in post-order traversal.

   If matching is successful the value in REF_NODE is updated and returned, if
   not then it is returned unchanged.  */

static bool
vect_match_slp_patterns_2 (slp_tree *ref_node, vec_info *vinfo,
			   slp_tree_to_load_perm_map_t *perm_cache,
			   slp_compat_nodes_map_t *compat_cache,
			   hash_set<slp_tree> *visited)
{
  unsigned i;
  slp_tree node = *ref_node;
  bool found_p = false;
  if (!node || visited->add (node))
    return false;

  slp_tree child;
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    found_p |= vect_match_slp_patterns_2 (&SLP_TREE_CHILDREN (node)[i],
					  vinfo, perm_cache, compat_cache,
					  visited);

  for (unsigned x = 0; x < num__slp_patterns; x++)
    {
      vect_pattern *pattern
	= slp_patterns[x] (perm_cache, compat_cache, ref_node);
      if (pattern)
	{
	  pattern->build (vinfo);
	  delete pattern;
	  found_p = true;
	}
    }

  return found_p;
}

/* Applies pattern matching to the given SLP tree rooted in REF_NODE using
   vec_info VINFO.

   The modified tree is returned.  Patterns are tried in order and multiple
   patterns may match.  */

static bool
vect_match_slp_patterns (slp_instance instance, vec_info *vinfo,
			 hash_set<slp_tree> *visited,
			 slp_tree_to_load_perm_map_t *perm_cache,
			 slp_compat_nodes_map_t *compat_cache)
{
  DUMP_VECT_SCOPE ("vect_match_slp_patterns");
  slp_tree *ref_node = &SLP_INSTANCE_TREE (instance);

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "Analyzing SLP tree %p for patterns\n",
		     SLP_INSTANCE_TREE (instance));

  return vect_match_slp_patterns_2 (ref_node, vinfo, perm_cache, compat_cache,
				    visited);
}
/* STMT_INFO is a store group of size GROUP_SIZE that we are considering
   splitting into two, with the first split group having size NEW_GROUP_SIZE.
   Return true if we could use IFN_STORE_LANES instead and if that appears
   to be the better approach.  */

static bool
vect_slp_prefer_store_lanes_p (vec_info *vinfo, stmt_vec_info stmt_info,
			       unsigned int group_size,
			       unsigned int new_group_size)
{
  tree scalar_type = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
  tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
  if (!vectype)
    return false;
  /* Allow the split if one of the two new groups would operate on full
     vectors *within* rather than across one scalar loop iteration.
     This is purely a heuristic, but it should work well for group
     sizes of 3 and 4, where the possible splits are:

       3->2+1:  OK if the vector has exactly two elements
       4->2+2:  Likewise
       4->3+1:  Less clear-cut.  */
  if (multiple_p (group_size - new_group_size, TYPE_VECTOR_SUBPARTS (vectype))
      || multiple_p (new_group_size, TYPE_VECTOR_SUBPARTS (vectype)))
    return false;
  return vect_store_lanes_supported (vectype, group_size, false);
}
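
/* For example (illustrative): with 2-element vectors, splitting a store
   group of 3 as 2+1 makes the first new group exactly one vector per
   scalar iteration, so multiple_p (2, 2) holds and the split is preferred
   (return false).  For a 4->3+1 split neither 3 nor 1 is a multiple of 2,
   so IFN_STORE_LANES support for the whole group of 4 decides instead.  */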
/* Analyze an SLP instance starting from a group of grouped stores.  Call
   vect_build_slp_tree to build a tree of packed stmts if possible.
   Return FALSE if it's impossible to SLP any stmt in the loop.  */

static bool
vect_analyze_slp_instance (vec_info *vinfo,
			   scalar_stmts_to_slp_tree_map_t *bst_map,
			   stmt_vec_info stmt_info, slp_instance_kind kind,
			   unsigned max_tree_size, unsigned *limit);

/* Analyze an SLP instance starting from SCALAR_STMTS which are a group
   of KIND.  Return true if successful.  */

static bool
vect_build_slp_instance (vec_info *vinfo,
			 slp_instance_kind kind,
			 vec<stmt_vec_info> &scalar_stmts,
			 stmt_vec_info root_stmt_info,
			 unsigned max_tree_size, unsigned *limit,
			 scalar_stmts_to_slp_tree_map_t *bst_map,
			 /* ??? We need stmt_info for group splitting.  */
			 stmt_vec_info stmt_info_)
{
  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location,
		       "Starting SLP discovery for\n");
      for (unsigned i = 0; i < scalar_stmts.length (); ++i)
	dump_printf_loc (MSG_NOTE, vect_location,
			 "  %G", scalar_stmts[i]->stmt);
    }

  /* Build the tree for the SLP instance.  */
  unsigned int group_size = scalar_stmts.length ();
  bool *matches = XALLOCAVEC (bool, group_size);
  poly_uint64 max_nunits = 1;
  unsigned tree_size = 0;
  unsigned i;

  slp_tree node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
				       &max_nunits, matches, limit,
				       &tree_size, bst_map);
  if (node != NULL)
    {
      /* Calculate the unrolling factor based on the smallest type.  */
      poly_uint64 unrolling_factor
	= calculate_unrolling_factor (max_nunits, group_size);

      if (maybe_ne (unrolling_factor, 1U)
	  && is_a <bb_vec_info> (vinfo))
	{
	  unsigned HOST_WIDE_INT const_max_nunits;
	  if (!max_nunits.is_constant (&const_max_nunits)
	      || const_max_nunits > group_size)
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: store group "
				 "size not a multiple of the vector size "
				 "in basic block SLP\n");
	      vect_free_slp_tree (node);
	      return false;
	    }
	  /* Fatal mismatch.  */
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "SLP discovery succeeded but node needs "
			     "splitting\n");
	  memset (matches, true, group_size);
	  matches[group_size / const_max_nunits * const_max_nunits] = false;
	  vect_free_slp_tree (node);
	}
      else
	{
	  /* Create a new SLP instance.  */
	  slp_instance new_instance = XNEW (class _slp_instance);
	  SLP_INSTANCE_TREE (new_instance) = node;
	  SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
	  SLP_INSTANCE_LOADS (new_instance) = vNULL;
	  SLP_INSTANCE_ROOT_STMT (new_instance) = root_stmt_info;
	  SLP_INSTANCE_KIND (new_instance) = kind;
	  new_instance->reduc_phis = NULL;
	  new_instance->cost_vec = vNULL;
	  new_instance->subgraph_entries = vNULL;

	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "SLP size %u vs. limit %u.\n",
			     tree_size, max_tree_size);

	  /* Fixup SLP reduction chains.  */
	  if (kind == slp_inst_kind_reduc_chain)
	    {
	      /* If this is a reduction chain with a conversion in front
		 amend the SLP tree with a node for that.  */
	      gimple *scalar_def
		= vect_orig_stmt (scalar_stmts[group_size - 1])->stmt;
	      if (STMT_VINFO_DEF_TYPE (scalar_stmts[0]) != vect_reduction_def)
		{
		  /* Get at the conversion stmt - we know it's the single use
		     of the last stmt of the reduction chain.  */
		  use_operand_p use_p;
		  bool r = single_imm_use (gimple_assign_lhs (scalar_def),
					   &use_p, &scalar_def);
		  gcc_assert (r);
		  stmt_vec_info next_info = vinfo->lookup_stmt (scalar_def);
		  next_info = vect_stmt_to_vectorize (next_info);
		  scalar_stmts = vNULL;
		  scalar_stmts.create (group_size);
		  for (unsigned i = 0; i < group_size; ++i)
		    scalar_stmts.quick_push (next_info);
		  slp_tree conv = vect_create_new_slp_node (scalar_stmts, 1);
		  SLP_TREE_VECTYPE (conv) = STMT_VINFO_VECTYPE (next_info);
		  SLP_TREE_CHILDREN (conv).quick_push (node);
		  SLP_INSTANCE_TREE (new_instance) = conv;
		  /* We also have to fake this conversion stmt as SLP reduction
		     group so we don't have to mess with too much code
		     elsewhere.  */
		  REDUC_GROUP_FIRST_ELEMENT (next_info) = next_info;
		  REDUC_GROUP_NEXT_ELEMENT (next_info) = NULL;
		}
	      /* Fill the backedge child of the PHI SLP node.  The
		 general matching code cannot find it because the
		 scalar code does not reflect how we vectorize the
		 reduction.  */
	      use_operand_p use_p;
	      imm_use_iterator imm_iter;
	      class loop *loop = LOOP_VINFO_LOOP (as_a <loop_vec_info> (vinfo));
	      FOR_EACH_IMM_USE_FAST (use_p, imm_iter,
				     gimple_get_lhs (scalar_def))
		/* There are exactly two non-debug uses, the reduction
		   PHI and the loop-closed PHI node.  */
		if (!is_gimple_debug (USE_STMT (use_p))
		    && gimple_bb (USE_STMT (use_p)) == loop->header)
		  {
		    auto_vec<stmt_vec_info, 64> phis (group_size);
		    stmt_vec_info phi_info
		      = vinfo->lookup_stmt (USE_STMT (use_p));
		    for (unsigned i = 0; i < group_size; ++i)
		      phis.quick_push (phi_info);
		    slp_tree *phi_node = bst_map->get (phis);
		    unsigned dest_idx = loop_latch_edge (loop)->dest_idx;
		    SLP_TREE_CHILDREN (*phi_node)[dest_idx]
		      = SLP_INSTANCE_TREE (new_instance);
		    SLP_INSTANCE_TREE (new_instance)->refcnt++;
		  }
	    }

	  vinfo->slp_instances.safe_push (new_instance);

	  /* ??? We've replaced the old SLP_INSTANCE_GROUP_SIZE with
	     the number of scalar stmts in the root in a few places.
	     Verify that assumption holds.  */
	  gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
			.length () == group_size);

	  if (dump_enabled_p ())
	    {
	      dump_printf_loc (MSG_NOTE, vect_location,
			       "Final SLP tree for instance %p:\n",
			       new_instance);
	      vect_print_slp_graph (MSG_NOTE, vect_location,
				    SLP_INSTANCE_TREE (new_instance));
	    }

	  return true;
	}
    }
  else
    {
      /* Failed to SLP.  */
      /* Free the allocated memory.  */
      scalar_stmts.release ();
    }

  stmt_vec_info stmt_info = stmt_info_;
  /* Try to break the group up into pieces.  */
  if (kind == slp_inst_kind_store)
    {
      /* ??? We could delay all the actual splitting of store-groups
	 until after SLP discovery of the original group completed.
	 Then we can recurse to vect_build_slp_instance directly.  */
      for (i = 0; i < group_size; i++)
	if (!matches[i])
	  break;

      /* For basic block SLP, try to break the group up into multiples of
	 the vector size.  */
      if (is_a <bb_vec_info> (vinfo)
	  && (i > 1 && i < group_size))
	{
	  tree scalar_type
	    = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
	  tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
						      1 << floor_log2 (i));
	  unsigned HOST_WIDE_INT const_nunits;
	  if (vectype
	      && TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits))
	    {
	      /* Split into two groups at the first vector boundary.  */
	      gcc_assert ((const_nunits & (const_nunits - 1)) == 0);
	      unsigned group1_size = i & ~(const_nunits - 1);

	      if (dump_enabled_p ())
		dump_printf_loc (MSG_NOTE, vect_location,
				 "Splitting SLP group at stmt %u\n", i);
	      stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
							       group1_size);
	      bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
						    kind, max_tree_size,
						    limit);
	      /* Split the rest at the failure point and possibly
		 re-analyze the remaining matching part if it has
		 at least two lanes.  */
	      if (group1_size < i
		  && (i + 1 < group_size
		      || i - group1_size > 1))
		{
		  stmt_vec_info rest2 = rest;
		  rest = vect_split_slp_store_group (rest, i - group1_size);
		  if (i - group1_size > 1)
		    res |= vect_analyze_slp_instance (vinfo, bst_map, rest2,
						      kind, max_tree_size,
						      limit);
		}
	      /* Re-analyze the non-matching tail if it has at least
		 two lanes.  */
	      if (i + 1 < group_size)
		res |= vect_analyze_slp_instance (vinfo, bst_map,
						  rest, kind, max_tree_size,
						  limit);
	      return res;
	    }
	}

      /* For loop vectorization split into arbitrary pieces of size > 1.  */
      if (is_a <loop_vec_info> (vinfo)
	  && (i > 1 && i < group_size)
	  && !vect_slp_prefer_store_lanes_p (vinfo, stmt_info, group_size, i))
	{
	  unsigned group1_size = i;

	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "Splitting SLP group at stmt %u\n", i);

	  stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
							   group1_size);
	  /* Loop vectorization cannot handle gaps in stores, make sure
	     the split group appears as strided.  */
	  STMT_VINFO_STRIDED_P (rest) = 1;
	  DR_GROUP_GAP (rest) = 0;
	  STMT_VINFO_STRIDED_P (stmt_info) = 1;
	  DR_GROUP_GAP (stmt_info) = 0;

	  bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
						kind, max_tree_size, limit);
	  if (i + 1 < group_size)
	    res |= vect_analyze_slp_instance (vinfo, bst_map,
					      rest, kind, max_tree_size, limit);

	  return res;
	}

      /* Even though the first vector did not all match, we might be able to SLP
	 (some) of the remainder.  FORNOW ignore this possibility.  */
    }

  /* Failed to SLP.  */
  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "SLP discovery failed\n");
  return false;
}
/* Analyze an SLP instance starting from a group of grouped stores.  Call
   vect_build_slp_tree to build a tree of packed stmts if possible.
   Return FALSE if it's impossible to SLP any stmt in the loop.  */

static bool
vect_analyze_slp_instance (vec_info *vinfo,
			   scalar_stmts_to_slp_tree_map_t *bst_map,
			   stmt_vec_info stmt_info,
			   slp_instance_kind kind,
			   unsigned max_tree_size, unsigned *limit)
{
  unsigned int i;
  vec<stmt_vec_info> scalar_stmts;

  if (is_a <bb_vec_info> (vinfo))
    vect_location = stmt_info->stmt;

  stmt_vec_info next_info = stmt_info;
  if (kind == slp_inst_kind_store)
    {
      /* Collect the stores and store them in scalar_stmts.  */
      scalar_stmts.create (DR_GROUP_SIZE (stmt_info));
      while (next_info)
	{
	  scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info));
	  next_info = DR_GROUP_NEXT_ELEMENT (next_info);
	}
    }
  else if (kind == slp_inst_kind_reduc_chain)
    {
      /* Collect the reduction stmts and store them in scalar_stmts.  */
      scalar_stmts.create (REDUC_GROUP_SIZE (stmt_info));
      while (next_info)
	{
	  scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info));
	  next_info = REDUC_GROUP_NEXT_ELEMENT (next_info);
	}
      /* Mark the first element of the reduction chain as reduction to properly
	 transform the node.  In the reduction analysis phase only the last
	 element of the chain is marked as reduction.  */
      STMT_VINFO_DEF_TYPE (stmt_info)
	= STMT_VINFO_DEF_TYPE (scalar_stmts.last ());
      STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))
	= STMT_VINFO_REDUC_DEF (vect_orig_stmt (scalar_stmts.last ()));
    }
  else if (kind == slp_inst_kind_ctor)
    {
      tree rhs = gimple_assign_rhs1 (stmt_info->stmt);
      tree val;
      scalar_stmts.create (CONSTRUCTOR_NELTS (rhs));
      FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), i, val)
	{
	  stmt_vec_info def_info = vinfo->lookup_def (val);
	  def_info = vect_stmt_to_vectorize (def_info);
	  scalar_stmts.quick_push (def_info);
	}
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "Analyzing vectorizable constructor: %G\n",
			 stmt_info->stmt);
    }
  else if (kind == slp_inst_kind_reduc_group)
    {
      /* Collect reduction statements.  */
      vec<stmt_vec_info> reductions = as_a <loop_vec_info> (vinfo)->reductions;
      scalar_stmts.create (reductions.length ());
      for (i = 0; reductions.iterate (i, &next_info); i++)
	if ((STMT_VINFO_RELEVANT_P (next_info)
	     || STMT_VINFO_LIVE_P (next_info))
	    /* ??? Make sure we didn't skip a conversion around a reduction
	       path.  In that case we'd have to reverse engineer that conversion
	       stmt following the chain using reduc_idx and from the PHI
	       using reduc_def.  */
	    && STMT_VINFO_DEF_TYPE (next_info) == vect_reduction_def)
	  scalar_stmts.quick_push (next_info);
      /* If less than two were relevant/live there's nothing to SLP.  */
      if (scalar_stmts.length () < 2)
	return false;
    }
  else
    gcc_unreachable ();

  /* Build the tree for the SLP instance.  */
  bool res = vect_build_slp_instance (vinfo, kind, scalar_stmts,
				      kind == slp_inst_kind_ctor
				      ? stmt_info : NULL,
				      max_tree_size, limit, bst_map,
				      kind == slp_inst_kind_store
				      ? stmt_info : NULL);

  /* ??? If this is slp_inst_kind_store and the above succeeded here's
     where we should do store group splitting.  */

  return res;
}
/* Check if there are stmts in the loop can be vectorized using SLP.  Build SLP
   trees of packed scalar stmts if SLP is possible.  */

opt_result
vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
{
  unsigned int i;
  stmt_vec_info first_element;
  slp_instance instance;

  DUMP_VECT_SCOPE ("vect_analyze_slp");

  unsigned limit = max_tree_size;

  scalar_stmts_to_slp_tree_map_t *bst_map
    = new scalar_stmts_to_slp_tree_map_t ();

  /* Find SLP sequences starting from groups of grouped stores.  */
  FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
    vect_analyze_slp_instance (vinfo, bst_map, first_element,
			       STMT_VINFO_GROUPED_ACCESS (first_element)
			       ? slp_inst_kind_store : slp_inst_kind_ctor,
			       max_tree_size, &limit);

  if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
    {
      for (unsigned i = 0; i < bb_vinfo->roots.length (); ++i)
	{
	  vect_location = bb_vinfo->roots[i].root->stmt;
	  if (vect_build_slp_instance (bb_vinfo, bb_vinfo->roots[i].kind,
				       bb_vinfo->roots[i].stmts,
				       bb_vinfo->roots[i].root,
				       max_tree_size, &limit, bst_map, NULL))
	    bb_vinfo->roots[i].stmts = vNULL;
	}
    }

  if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
    {
      /* Find SLP sequences starting from reduction chains.  */
      FOR_EACH_VEC_ELT (loop_vinfo->reduction_chains, i, first_element)
	if (! STMT_VINFO_RELEVANT_P (first_element)
	    && ! STMT_VINFO_LIVE_P (first_element))
	  ;
	else if (! vect_analyze_slp_instance (vinfo, bst_map, first_element,
					      slp_inst_kind_reduc_chain,
					      max_tree_size, &limit))
	  {
	    /* Dissolve reduction chain group.  */
	    stmt_vec_info vinfo = first_element;
	    stmt_vec_info last = NULL;
	    while (vinfo)
	      {
		stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
		REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
		REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
		last = vinfo;
		vinfo = next;
	      }
	    STMT_VINFO_DEF_TYPE (first_element) = vect_internal_def;
	    /* It can be still vectorized as part of an SLP reduction.  */
	    loop_vinfo->reductions.safe_push (last);
	  }

      /* Find SLP sequences starting from groups of reductions.  */
      if (loop_vinfo->reductions.length () > 1)
	vect_analyze_slp_instance (vinfo, bst_map, loop_vinfo->reductions[0],
				   slp_inst_kind_reduc_group, max_tree_size,
				   &limit);
    }

  hash_set<slp_tree> visited_patterns;
  slp_tree_to_load_perm_map_t perm_cache;
  slp_compat_nodes_map_t compat_cache;

  /* See if any patterns can be found in the SLP tree.  */
  bool pattern_found = false;
  FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
    pattern_found |= vect_match_slp_patterns (instance, vinfo,
					      &visited_patterns, &perm_cache,
					      &compat_cache);

  /* If any were found optimize permutations of loads.  */
  if (pattern_found)
    {
      hash_map<slp_tree, slp_tree> load_map;
      FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
	{
	  slp_tree root = SLP_INSTANCE_TREE (instance);
	  optimize_load_redistribution (bst_map, vinfo, SLP_TREE_LANES (root),
					&load_map, root);
	}
    }

  /* The map keeps a reference on SLP nodes built, release that.  */
  for (scalar_stmts_to_slp_tree_map_t::iterator it = bst_map->begin ();
       it != bst_map->end (); ++it)
    if ((*it).second)
      vect_free_slp_tree ((*it).second);
  delete bst_map;

  if (pattern_found && dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location,
		       "Pattern matched SLP tree\n");
      hash_set<slp_tree> visited;
      FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
	vect_print_slp_graph (MSG_NOTE, vect_location,
			      SLP_INSTANCE_TREE (instance), visited);
    }

  return opt_result::success ();
}
/* Fill the vertices and leafs vector with all nodes in the SLP graph.  */

static void
vect_slp_build_vertices (hash_set<slp_tree> &visited, slp_tree node,
			 vec<slp_tree> &vertices, vec<int> &leafs)
{
  unsigned i;
  slp_tree child;

  if (visited.add (node))
    return;

  node->vertex = vertices.length ();
  vertices.safe_push (node);

  bool leaf = true;
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    if (child)
      {
	leaf = false;
	vect_slp_build_vertices (visited, child, vertices, leafs);
      }
  if (leaf)
    leafs.safe_push (node->vertex);
}

/* Fill the vertices and leafs vector with all nodes in the SLP graph.  */

static void
vect_slp_build_vertices (vec_info *info, vec<slp_tree> &vertices,
			 vec<int> &leafs)
{
  hash_set<slp_tree> visited;
  unsigned i;
  slp_instance instance;
  FOR_EACH_VEC_ELT (info->slp_instances, i, instance)
    {
      unsigned n_v = vertices.length ();
      unsigned n_l = leafs.length ();
      vect_slp_build_vertices (visited, SLP_INSTANCE_TREE (instance), vertices,
			       leafs);
      /* If we added vertices but no entries to the reverse graph we've
	 added a cycle that is not backwards-reachable.  Push the entry
	 to mimic as leaf then.  */
      if (vertices.length () > n_v
	  && leafs.length () == n_l)
	leafs.safe_push (SLP_INSTANCE_TREE (instance)->vertex);
    }
}
/* Apply (reverse) bijective PERM to VEC.  */

template <class T>
static void
vect_slp_permute (vec<unsigned> perm,
		  vec<T> &vec, bool reverse)
{
  auto_vec<T, 64> saved;
  saved.create (vec.length ());
  for (unsigned i = 0; i < vec.length (); ++i)
    saved.quick_push (vec[i]);

  if (reverse)
    {
      for (unsigned i = 0; i < vec.length (); ++i)
	vec[perm[i]] = saved[i];
      for (unsigned i = 0; i < vec.length (); ++i)
	gcc_assert (vec[perm[i]] == saved[i]);
    }
  else
    {
      for (unsigned i = 0; i < vec.length (); ++i)
	vec[i] = saved[perm[i]];
      for (unsigned i = 0; i < vec.length (); ++i)
	gcc_assert (vec[i] == saved[perm[i]]);
    }
}
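
/* For example (illustrative): with PERM == { 2, 0, 1 } and VEC == { a, b, c },
   the forward direction yields { c, a, b } (vec[i] = saved[perm[i]]) while
   the reverse direction yields { b, c, a } (vec[perm[i]] = saved[i]).  */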
/* Return whether permutations PERM_A and PERM_B as recorded in the
   PERMS vector are equal.  */

static bool
vect_slp_perms_eq (const vec<vec<unsigned> > &perms,
		   int perm_a, int perm_b)
{
  return (perm_a == perm_b
	  || (perms[perm_a].length () == perms[perm_b].length ()
	      && memcmp (&perms[perm_a][0], &perms[perm_b][0],
			 sizeof (unsigned) * perms[perm_a].length ()) == 0));
}
/* Optimize the SLP graph of VINFO.  */

void
vect_optimize_slp (vec_info *vinfo)
{
  if (vinfo->slp_instances.is_empty ())
    return;

  slp_tree node;
  unsigned i;
  auto_vec<slp_tree> vertices;
  auto_vec<int> leafs;
  vect_slp_build_vertices (vinfo, vertices, leafs);

  struct graph *slpg = new_graph (vertices.length ());
  FOR_EACH_VEC_ELT (vertices, i, node)
    {
      unsigned j;
      slp_tree child;
      FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
	if (child)
	  add_edge (slpg, i, child->vertex);
    }

  /* Compute (reverse) postorder on the inverted graph.  */
  auto_vec<int> ipo;
  graphds_dfs (slpg, &leafs[0], leafs.length (), &ipo, false, NULL, NULL);

  auto_sbitmap n_visited (vertices.length ());
  auto_sbitmap n_materialize (vertices.length ());
  auto_vec<int> n_perm (vertices.length ());
  auto_vec<vec<unsigned> > perms;

  bitmap_clear (n_visited);
  bitmap_clear (n_materialize);
  n_perm.quick_grow_cleared (vertices.length ());
  perms.safe_push (vNULL); /* zero is no permute */

  /* Produce initial permutations.  */
  for (i = 0; i < leafs.length (); ++i)
    {
      int idx = leafs[i];
      slp_tree node = vertices[idx];

      /* Handle externals and constants optimistically throughout the
	 iteration.  */
      if (SLP_TREE_DEF_TYPE (node) == vect_external_def
	  || SLP_TREE_DEF_TYPE (node) == vect_constant_def)
	continue;

      /* Leafs do not change across iterations.  Note leafs also double
	 as entries to the reverse graph.  */
      if (!slpg->vertices[idx].succ)
	bitmap_set_bit (n_visited, idx);
      /* Loads are the only thing generating permutes.  */
      if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
	continue;

      /* If splitting out a SLP_TREE_LANE_PERMUTATION can make the
	 node unpermuted, record this permute.  */
      stmt_vec_info dr_stmt = SLP_TREE_REPRESENTATIVE (node);
      if (!STMT_VINFO_GROUPED_ACCESS (dr_stmt))
	continue;
      dr_stmt = DR_GROUP_FIRST_ELEMENT (dr_stmt);
      unsigned imin = DR_GROUP_SIZE (dr_stmt) + 1, imax = 0;
      bool any_permute = false;
      for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
	{
	  unsigned idx = SLP_TREE_LOAD_PERMUTATION (node)[j];
	  imin = MIN (imin, idx);
	  imax = MAX (imax, idx);
	  if (idx - SLP_TREE_LOAD_PERMUTATION (node)[0] != j)
	    any_permute = true;
	}
      /* If there's no permute no need to split one out.  */
      if (!any_permute)
	continue;
      /* If the span doesn't match we'd disrupt VF computation, avoid
	 that for now.  */
      if (imax - imin + 1 != SLP_TREE_LANES (node))
	continue;

      /* For now only handle true permutes, like
	 vect_attempt_slp_rearrange_stmts did.  This allows us to be lazy
	 when permuting constants and invariants keeping the permute
	 bijective.  */
      auto_sbitmap load_index (SLP_TREE_LANES (node));
      bitmap_clear (load_index);
      for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
	bitmap_set_bit (load_index, SLP_TREE_LOAD_PERMUTATION (node)[j] - imin);
      unsigned j;
      for (j = 0; j < SLP_TREE_LANES (node); ++j)
	if (!bitmap_bit_p (load_index, j))
	  break;
      if (j != SLP_TREE_LANES (node))
	continue;

      vec<unsigned> perm = vNULL;
      perm.safe_grow (SLP_TREE_LANES (node), true);
      for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
	perm[j] = SLP_TREE_LOAD_PERMUTATION (node)[j] - imin;
      perms.safe_push (perm);
      n_perm[idx] = perms.length () - 1;
    }

  /* In addition to the above we have to mark outgoing permutes facing
     non-reduction graph entries that are not represented as to be
     materialized.  */
  for (slp_instance instance : vinfo->slp_instances)
    if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_ctor)
      bitmap_set_bit (n_materialize, SLP_INSTANCE_TREE (instance)->vertex);

  /* Propagate permutes along the graph and compute materialization points.  */
  bool changed;
  unsigned iteration = 0;
  do
    {
      changed = false;
      ++iteration;

      for (i = vertices.length (); i > 0 ; --i)
	{
	  int idx = ipo[i-1];
	  slp_tree node = vertices[idx];
	  /* For leafs there's nothing to do - we've seeded permutes
	     on those above.  */
	  if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
	    continue;

	  bitmap_set_bit (n_visited, idx);

	  /* We do not handle stores with a permutation.  */
	  stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node);
	  if (STMT_VINFO_DATA_REF (rep)
	      && DR_IS_WRITE (STMT_VINFO_DATA_REF (rep)))
	    continue;
	  /* We cannot move a permute across an operation that is
	     not independent on lanes.  Note this is an explicit
	     negative list since that's much shorter than the respective
	     positive one but it's critical to keep maintaining it.  */
	  if (is_gimple_call (STMT_VINFO_STMT (rep)))
	    switch (gimple_call_combined_fn (STMT_VINFO_STMT (rep)))
	      {
	      case CFN_COMPLEX_ADD_ROT90:
	      case CFN_COMPLEX_ADD_ROT270:
	      case CFN_COMPLEX_MUL:
	      case CFN_COMPLEX_MUL_CONJ:
		continue;
	      default:;
	      }

	  int perm = -1;
	  for (graph_edge *succ = slpg->vertices[idx].succ;
	       succ; succ = succ->succ_next)
	    {
	      int succ_idx = succ->dest;
	      /* Handle unvisited nodes optimistically.  */
	      /* ??? But for constants once we want to handle non-bijective
		 permutes we have to verify the permute, when unifying lanes,
		 will not unify different constants.  For example see
		 gcc.dg/vect/bb-slp-14.c for a case that would break.  */
	      if (!bitmap_bit_p (n_visited, succ_idx))
		continue;
	      int succ_perm = n_perm[succ_idx];
	      /* Once we materialize succs permutation its output lanes
		 appear unpermuted to us.  */
	      if (bitmap_bit_p (n_materialize, succ_idx))
		succ_perm = 0;
	      if (perm == -1)
		perm = succ_perm;
	      else if (succ_perm == 0)
		{
		  perm = 0;
		  break;
		}
	      else if (!vect_slp_perms_eq (perms, perm, succ_perm))
		{
		  perm = 0;
		  break;
		}
	    }

	  if (perm == -1)
	    /* Pick up pre-computed leaf values.  */
	    perm = n_perm[idx];
	  else if (!vect_slp_perms_eq (perms, perm, n_perm[idx]))
	    {
	      if (iteration > 1)
		/* Make sure we eventually converge.  */
		gcc_checking_assert (perm == 0);
	      n_perm[idx] = perm;
	      if (perm == 0)
		bitmap_clear_bit (n_materialize, idx);
	      changed = true;
	    }

	  if (perm == 0)
	    continue;

	  /* Elide pruning at materialization points in the first
	     iteration so every node was visited once at least.  */
	  if (iteration == 1)
	    continue;

	  /* Decide on permute materialization.  Look whether there's
	     a use (pred) edge that is permuted differently than us.
	     In that case mark ourselves so the permutation is applied.
	     For VEC_PERM_EXPRs the permutation doesn't carry along
	     from children to parents so force materialization at the
	     point of the VEC_PERM_EXPR.  In principle VEC_PERM_EXPRs
	     are a source of an arbitrary permutation again, similar
	     to constants/externals - that's something we do not yet
	     optimally handle.  */
	  bool all_preds_permuted = (SLP_TREE_CODE (node) != VEC_PERM_EXPR
				     && slpg->vertices[idx].pred != NULL);
	  if (all_preds_permuted)
	    for (graph_edge *pred = slpg->vertices[idx].pred;
		 pred; pred = pred->pred_next)
	      {
		gcc_checking_assert (bitmap_bit_p (n_visited, pred->src));
		int pred_perm = n_perm[pred->src];
		if (!vect_slp_perms_eq (perms, perm, pred_perm))
		  {
		    all_preds_permuted = false;
		    break;
		  }
	      }
	  if (!all_preds_permuted)
	    {
	      if (!bitmap_bit_p (n_materialize, idx))
		changed = true;
	      bitmap_set_bit (n_materialize, idx);
	    }
	}
    }
  while (changed || iteration == 1);

  /* Materialize.  */
  for (i = 0; i < vertices.length (); ++i)
    {
      int perm = n_perm[i];
      if (perm == 0)
	continue;

      slp_tree node = vertices[i];

      /* First permute invariant/external original successors.  */
      unsigned j;
      slp_tree child;
      FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
	{
	  if (!child || SLP_TREE_DEF_TYPE (child) == vect_internal_def)
	    continue;

	  /* If the vector is uniform there's nothing to do.  */
	  if (vect_slp_tree_uniform_p (child))
	    continue;

	  /* We can end up sharing some externals via two_operator
	     handling.  Be prepared to unshare those.  */
	  if (child->refcnt != 1)
	    {
	      gcc_assert (slpg->vertices[child->vertex].pred->pred_next);
	      SLP_TREE_CHILDREN (node)[j] = child
		= vect_create_new_slp_node
		    (SLP_TREE_SCALAR_OPS (child).copy ());
	    }
	  vect_slp_permute (perms[perm],
			    SLP_TREE_SCALAR_OPS (child), true);
	}

      if (bitmap_bit_p (n_materialize, i))
	{
	  if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
	    /* For loads simply drop the permutation, the load permutation
	       already performs the desired permutation.  */
	    ;
	  else if (SLP_TREE_LANE_PERMUTATION (node).exists ())
	    {
	      /* If the node is already a permute node we can apply
		 the permutation to the lane selection, effectively
		 materializing it on the incoming vectors.  */
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_NOTE, vect_location,
				 "simplifying permute node %p\n",
				 node);

	      for (unsigned k = 0;
		   k < SLP_TREE_LANE_PERMUTATION (node).length (); ++k)
		SLP_TREE_LANE_PERMUTATION (node)[k].second
		  = perms[perm][SLP_TREE_LANE_PERMUTATION (node)[k].second];
	    }
	  else
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_NOTE, vect_location,
				 "inserting permute node in place of %p\n",
				 node);

	      /* Make a copy of NODE and in-place change it to a
		 VEC_PERM node to permute the lanes of the copy.  */
	      slp_tree copy = new _slp_tree;
	      SLP_TREE_CHILDREN (copy) = SLP_TREE_CHILDREN (node);
	      SLP_TREE_CHILDREN (node) = vNULL;
	      SLP_TREE_SCALAR_STMTS (copy)
		= SLP_TREE_SCALAR_STMTS (node).copy ();
	      vect_slp_permute (perms[perm],
				SLP_TREE_SCALAR_STMTS (copy), true);
	      gcc_assert (!SLP_TREE_SCALAR_OPS (node).exists ());
	      SLP_TREE_REPRESENTATIVE (copy) = SLP_TREE_REPRESENTATIVE (node);
	      gcc_assert (!SLP_TREE_LOAD_PERMUTATION (node).exists ());
	      SLP_TREE_LANE_PERMUTATION (copy)
		= SLP_TREE_LANE_PERMUTATION (node);
	      SLP_TREE_LANE_PERMUTATION (node) = vNULL;
	      SLP_TREE_VECTYPE (copy) = SLP_TREE_VECTYPE (node);
	      copy->refcnt = 1;
	      copy->max_nunits = node->max_nunits;
	      SLP_TREE_DEF_TYPE (copy) = SLP_TREE_DEF_TYPE (node);
	      SLP_TREE_LANES (copy) = SLP_TREE_LANES (node);
	      SLP_TREE_CODE (copy) = SLP_TREE_CODE (node);

	      /* Now turn NODE into a VEC_PERM.  */
	      SLP_TREE_CHILDREN (node).safe_push (copy);
	      SLP_TREE_LANE_PERMUTATION (node).create (SLP_TREE_LANES (node));
	      for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
		SLP_TREE_LANE_PERMUTATION (node)
		  .quick_push (std::make_pair (0, perms[perm][j]));
	      SLP_TREE_CODE (node) = VEC_PERM_EXPR;
	    }
	}
      else
	{
	  /* Apply the reverse permutation to our stmts.  */
	  vect_slp_permute (perms[perm],
			    SLP_TREE_SCALAR_STMTS (node), true);
	  /* And to the load permutation, which we can simply
	     make regular by design.  */
	  if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
	    {
	      /* ??? When we handle non-bijective permutes the idea
		 is that we can force the load-permutation to be
		 { min, min + 1, min + 2, ... max }.  But then the
		 scalar defs might no longer match the lane content
		 which means wrong-code with live lane vectorization.
		 So we possibly have to have NULL entries for those.  */
	      vect_slp_permute (perms[perm],
				SLP_TREE_LOAD_PERMUTATION (node), true);
	    }
	}
    }

  /* Free the perms vector used for propagation.  */
  while (!perms.is_empty ())
    perms.pop ().release ();
  free_graph (slpg);

  /* Now elide load permutations that are not necessary.  */
  for (i = 0; i < leafs.length (); ++i)
    {
      node = vertices[leafs[i]];
      if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
	continue;

      /* In basic block vectorization we allow any subchain of an interleaving
	 chain.
	 FORNOW: not in loop SLP because of realignment complications.  */
      if (is_a <bb_vec_info> (vinfo))
	{
	  bool subchain_p = true;
	  stmt_vec_info next_load_info = NULL;
	  stmt_vec_info load_info;
	  unsigned j;
	  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
	    {
	      if (j != 0
		  && (next_load_info != load_info
		      || DR_GROUP_GAP (load_info) != 1))
		{
		  subchain_p = false;
		  break;
		}
	      next_load_info = DR_GROUP_NEXT_ELEMENT (load_info);
	    }
	  if (subchain_p)
	    {
	      SLP_TREE_LOAD_PERMUTATION (node).release ();
	      continue;
	    }
	}
      else
	{
	  stmt_vec_info load_info;
	  bool this_load_permuted = false;
	  unsigned j;
	  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
	    if (SLP_TREE_LOAD_PERMUTATION (node)[j] != j)
	      {
		this_load_permuted = true;
		break;
	      }
	  stmt_vec_info first_stmt_info
	    = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
	  if (!this_load_permuted
	      /* The load requires permutation when unrolling exposes
		 a gap either because the group is larger than the SLP
		 group-size or because there is a gap between the groups.  */
	      && (known_eq (LOOP_VINFO_VECT_FACTOR
			      (as_a <loop_vec_info> (vinfo)), 1U)
		  || ((SLP_TREE_LANES (node) == DR_GROUP_SIZE (first_stmt_info))
		      && DR_GROUP_GAP (first_stmt_info) == 0)))
	    {
	      SLP_TREE_LOAD_PERMUTATION (node).release ();
	      continue;
	    }
	}
    }
}
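
/* Illustrative walk-through (not part of the original source): if a leaf
   load carries load permutation { 1, 0 } and feeds a chain of
   lane-independent operations ending in a store, the propagation above
   keeps the lanes swapped throughout the chain and materializes a single
   permute only where a differently-permuted use (here the store, which
   must stay unpermuted) is reached, instead of permuting after every
   intermediate node.  */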
/* Gather loads reachable from the individual SLP graph entries.  */

void
vect_gather_slp_loads (vec_info *vinfo)
{
  unsigned i;
  slp_instance instance;
  FOR_EACH_VEC_ELT (vinfo->slp_instances, i, instance)
    {
      hash_set<slp_tree> visited;
      vect_gather_slp_loads (SLP_INSTANCE_LOADS (instance),
			     SLP_INSTANCE_TREE (instance), visited);
    }
}
/* For each possible SLP instance decide whether to SLP it and calculate overall
   unrolling factor needed to SLP the loop.  Return TRUE if decided to SLP at
   least one instance.  */

bool
vect_make_slp_decision (loop_vec_info loop_vinfo)
{
  unsigned int i;
  poly_uint64 unrolling_factor = 1;
  vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
  slp_instance instance;
  int decided_to_slp = 0;

  DUMP_VECT_SCOPE ("vect_make_slp_decision");

  FOR_EACH_VEC_ELT (slp_instances, i, instance)
    {
      /* FORNOW: SLP if you can.  */
      /* All unroll factors have the form:

	   GET_MODE_SIZE (vinfo->vector_mode) * X

	 for some rational X, so they must have a common multiple.  */
      unrolling_factor
	= force_common_multiple (unrolling_factor,
				 SLP_INSTANCE_UNROLLING_FACTOR (instance));

      /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts.  Later we
	 call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
	 loop-based vectorization.  Such stmts will be marked as HYBRID.  */
      vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
      decided_to_slp++;
    }

  LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = unrolling_factor;

  if (decided_to_slp && dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location,
		       "Decided to SLP %d instances. Unrolling factor ",
		       decided_to_slp);
      dump_dec (MSG_NOTE, unrolling_factor);
      dump_printf (MSG_NOTE, "\n");
    }

  return (decided_to_slp > 0);
}
/* Private data for vect_detect_hybrid_slp.  */
struct vdhs_data
{
  loop_vec_info loop_vinfo;
  vec<stmt_vec_info> *worklist;
};

/* Walker for walk_gimple_op.  */

static tree
vect_detect_hybrid_slp (tree *tp, int *, void *data)
{
  walk_stmt_info *wi = (walk_stmt_info *)data;
  vdhs_data *dat = (vdhs_data *)wi->info;

  if (wi->is_lhs)
    return NULL_TREE;

  stmt_vec_info def_stmt_info = dat->loop_vinfo->lookup_def (*tp);
  if (!def_stmt_info)
    return NULL_TREE;
  def_stmt_info = vect_stmt_to_vectorize (def_stmt_info);
  if (PURE_SLP_STMT (def_stmt_info))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: %G",
			 def_stmt_info->stmt);
      STMT_SLP_TYPE (def_stmt_info) = hybrid;
      dat->worklist->safe_push (def_stmt_info);
    }

  return NULL_TREE;
}
/* Look if STMT_INFO is consumed by SLP indirectly and mark it pure_slp
   if so, otherwise pushing it to WORKLIST.  */

static void
maybe_push_to_hybrid_worklist (vec_info *vinfo,
			       vec<stmt_vec_info> &worklist,
			       stmt_vec_info stmt_info)
{
  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "Processing hybrid candidate : %G", stmt_info->stmt);
  stmt_vec_info orig_info = vect_orig_stmt (stmt_info);
  imm_use_iterator iter2;
  ssa_op_iter iter1;
  use_operand_p use_p;
  def_operand_p def_p;
  bool any_def = false;
  FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_info->stmt, iter1, SSA_OP_DEF)
    {
      any_def = true;
      FOR_EACH_IMM_USE_FAST (use_p, iter2, DEF_FROM_PTR (def_p))
	{
	  if (is_gimple_debug (USE_STMT (use_p)))
	    continue;
	  stmt_vec_info use_info = vinfo->lookup_stmt (USE_STMT (use_p));
	  /* An out-of loop use means this is a loop_vect sink.  */
	  if (!use_info)
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_NOTE, vect_location,
				 "Found loop_vect sink: %G", stmt_info->stmt);
	      worklist.safe_push (stmt_info);
	      return;
	    }
	  else if (!STMT_SLP_TYPE (vect_stmt_to_vectorize (use_info)))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_NOTE, vect_location,
				 "Found loop_vect use: %G", use_info->stmt);
	      worklist.safe_push (stmt_info);
	      return;
	    }
	}
    }
  /* No def means this is a loop_vect sink.  */
  if (!any_def)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "Found loop_vect sink: %G", stmt_info->stmt);
      worklist.safe_push (stmt_info);
      return;
    }
  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "Marked SLP consumed stmt pure: %G", stmt_info->stmt);
  STMT_SLP_TYPE (stmt_info) = pure_slp;
}
/* Find stmts that must be both vectorized and SLPed.  */

void
vect_detect_hybrid_slp (loop_vec_info loop_vinfo)
{
  DUMP_VECT_SCOPE ("vect_detect_hybrid_slp");

  /* All stmts participating in SLP are marked pure_slp, all other
     stmts are loop_vect.
     First collect all loop_vect stmts into a worklist.
     SLP patterns cause not all original scalar stmts to appear in
     SLP_TREE_SCALAR_STMTS and thus not all of them are marked pure_slp.
     Rectify this here and do a backward walk over the IL only considering
     stmts as loop_vect when they are used by a loop_vect stmt and otherwise
     mark them as pure_slp.  */
  auto_vec<stmt_vec_info> worklist;
  for (int i = LOOP_VINFO_LOOP (loop_vinfo)->num_nodes - 1; i >= 0; --i)
    {
      basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
      for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
	   gsi_next (&gsi))
	{
	  gphi *phi = gsi.phi ();
	  stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (phi);
	  if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
	    maybe_push_to_hybrid_worklist (loop_vinfo,
					   worklist, stmt_info);
	}
      for (gimple_stmt_iterator gsi = gsi_last_bb (bb); !gsi_end_p (gsi);
	   gsi_prev (&gsi))
	{
	  gimple *stmt = gsi_stmt (gsi);
	  if (is_gimple_debug (stmt))
	    continue;
	  stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
	  if (STMT_VINFO_IN_PATTERN_P (stmt_info))
	    {
	      for (gimple_stmt_iterator gsi2
		     = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info));
		   !gsi_end_p (gsi2); gsi_next (&gsi2))
		{
		  stmt_vec_info patt_info
		    = loop_vinfo->lookup_stmt (gsi_stmt (gsi2));
		  if (!STMT_SLP_TYPE (patt_info)
		      && STMT_VINFO_RELEVANT (patt_info))
		    maybe_push_to_hybrid_worklist (loop_vinfo,
						   worklist, patt_info);
		}
	      stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
	    }
	  if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
	    maybe_push_to_hybrid_worklist (loop_vinfo,
					   worklist, stmt_info);
	}
    }

  /* Now we have a worklist of non-SLP stmts, follow use->def chains and
     mark any SLP vectorized stmt as hybrid.
     ???  We're visiting def stmts N times (once for each non-SLP and
     once for each hybrid-SLP use).  */
  walk_stmt_info wi;
  vdhs_data dat;
  dat.worklist = &worklist;
  dat.loop_vinfo = loop_vinfo;
  memset (&wi, 0, sizeof (wi));
  wi.info = (void *)&dat;
  while (!worklist.is_empty ())
    {
      stmt_vec_info stmt_info = worklist.pop ();
      /* Since SSA operands are not set up for pattern stmts we need
	 to use walk_gimple_op.  */
      wi.is_lhs = 0;
      walk_gimple_op (stmt_info->stmt, vect_detect_hybrid_slp, &wi);
    }
}
/* Initialize a bb_vec_info struct for the statements in BBS basic blocks.  */

_bb_vec_info::_bb_vec_info (vec<basic_block> _bbs, vec_info_shared *shared)
  : vec_info (vec_info::bb, init_cost (NULL), shared), bbs (_bbs), roots (vNULL)
{
  for (unsigned i = 0; i < bbs.length (); ++i)
    {
      if (i != 0)
	for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
	     gsi_next (&si))
	  {
	    gphi *phi = si.phi ();
	    gimple_set_uid (phi, 0);
	    add_stmt (phi);
	  }
      for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
	   !gsi_end_p (gsi); gsi_next (&gsi))
	{
	  gimple *stmt = gsi_stmt (gsi);
	  gimple_set_uid (stmt, 0);
	  if (is_gimple_debug (stmt))
	    continue;
	  add_stmt (stmt);
	}
    }
}


/* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
   stmts in the basic block.  */

_bb_vec_info::~_bb_vec_info ()
{
  /* Reset region marker.  */
  for (unsigned i = 0; i < bbs.length (); ++i)
    {
      if (i != 0)
	for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
	     gsi_next (&si))
	  {
	    gphi *phi = si.phi ();
	    gimple_set_uid (phi, -1);
	  }
      for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
	   !gsi_end_p (gsi); gsi_next (&gsi))
	{
	  gimple *stmt = gsi_stmt (gsi);
	  gimple_set_uid (stmt, -1);
	}
    }

  for (unsigned i = 0; i < roots.length (); ++i)
    roots[i].stmts.release ();
  roots.release ();
}
/* Subroutine of vect_slp_analyze_node_operations.  Handle the root of NODE,
   given then that child nodes have already been processed, and that
   their def types currently match their SLP node's def type.  */

static bool
vect_slp_analyze_node_operations_1 (vec_info *vinfo, slp_tree node,
				    slp_instance node_instance,
				    stmt_vector_for_cost *cost_vec)
{
  stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);

  /* Calculate the number of vector statements to be created for the
     scalar stmts in this node.  For SLP reductions it is equal to the
     number of vector statements in the children (which has already been
     calculated by the recursive call).  Otherwise it is the number of
     scalar elements in one scalar iteration (DR_GROUP_SIZE) multiplied by
     VF divided by the number of elements in a vector.  */
  if (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
      && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
    {
      for (unsigned i = 0; i < SLP_TREE_CHILDREN (node).length (); ++i)
	if (SLP_TREE_DEF_TYPE (SLP_TREE_CHILDREN (node)[i]) == vect_internal_def)
	  {
	    SLP_TREE_NUMBER_OF_VEC_STMTS (node)
	      = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_CHILDREN (node)[i]);
	    break;
	  }
    }
  else
    {
      poly_uint64 vf;
      if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
	vf = loop_vinfo->vectorization_factor;
      else
	vf = 1;
      unsigned int group_size = SLP_TREE_LANES (node);
      tree vectype = SLP_TREE_VECTYPE (node);
      SLP_TREE_NUMBER_OF_VEC_STMTS (node)
	= vect_get_num_vectors (vf * group_size, vectype);
    }

  /* Handle purely internal nodes.  */
  if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
    return vectorizable_slp_permutation (vinfo, NULL, node, cost_vec);

  gcc_assert (STMT_SLP_TYPE (stmt_info) != loop_vect);
  if (is_a <bb_vec_info> (vinfo)
      && !vect_update_shared_vectype (stmt_info, SLP_TREE_VECTYPE (node)))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "desired vector type conflicts with earlier one "
			 "for %G", stmt_info->stmt);
      return false;
    }

  bool dummy;
  return vect_analyze_stmt (vinfo, stmt_info, &dummy,
			    node, node_instance, cost_vec);
}
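
/* For example (illustrative): with a vectorization factor of 4, a node of
   2 lanes and vector type V4SI, vect_get_num_vectors (4 * 2, V4SI) yields
   8 / 4 == 2 vector statements for the node.  */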
/* Try to build NODE from scalars, returning true on success.
   NODE_INSTANCE is the SLP instance that contains NODE.  */

static bool
vect_slp_convert_to_external (vec_info *vinfo, slp_tree node,
			      slp_instance node_instance)
{
  stmt_vec_info stmt_info;
  unsigned int i;

  if (!is_a <bb_vec_info> (vinfo)
      || node == SLP_INSTANCE_TREE (node_instance)
      || !SLP_TREE_SCALAR_STMTS (node).exists ()
      || vect_contains_pattern_stmt_p (SLP_TREE_SCALAR_STMTS (node)))
    return false;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "Building vector operands of %p from scalars instead\n",
		     node);

  /* Don't remove and free the child nodes here, since they could be
     referenced by other structures.  The analysis and scheduling phases
     (need to) ignore child nodes of anything that isn't vect_internal_def.  */
  unsigned int group_size = SLP_TREE_LANES (node);
  SLP_TREE_DEF_TYPE (node) = vect_external_def;
  SLP_TREE_SCALAR_OPS (node).safe_grow (group_size, true);
  SLP_TREE_LOAD_PERMUTATION (node).release ();
  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
    {
      tree lhs = gimple_get_lhs (vect_orig_stmt (stmt_info)->stmt);
      SLP_TREE_SCALAR_OPS (node)[i] = lhs;
    }

  return true;
}
/* Compute the prologue cost for invariant or constant operands represented
   by NODE.  */

static void
vect_prologue_cost_for_slp (slp_tree node,
                            stmt_vector_for_cost *cost_vec)
{
  /* There's a special case of an existing vector, that costs nothing.  */
  if (SLP_TREE_SCALAR_OPS (node).length () == 0
      && !SLP_TREE_VEC_DEFS (node).is_empty ())
    return;
  /* Without looking at the actual initializer a vector of
     constants can be implemented as load from the constant pool.
     When all elements are the same we can use a splat.  */
  tree vectype = SLP_TREE_VECTYPE (node);
  unsigned group_size = SLP_TREE_SCALAR_OPS (node).length ();
  unsigned num_vects_to_check;
  unsigned HOST_WIDE_INT const_nunits;
  unsigned nelt_limit;
  if (TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits)
      && ! multiple_p (const_nunits, group_size))
    {
      num_vects_to_check = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
      nelt_limit = const_nunits;
    }
  else
    {
      /* If either the vector has variable length or the vectors
         are composed of repeated whole groups we only need to
         cost construction once.  All vectors will be the same.  */
      num_vects_to_check = 1;
      nelt_limit = group_size;
    }
  tree elt = NULL_TREE;
  unsigned nelt = 0;
  for (unsigned j = 0; j < num_vects_to_check * nelt_limit; ++j)
    {
      unsigned si = j % group_size;
      if (nelt == 0)
        elt = SLP_TREE_SCALAR_OPS (node)[si];
      /* ??? We're just tracking whether all operands of a single
         vector initializer are the same, ideally we'd check if
         we emitted the same one already.  */
      else if (elt != SLP_TREE_SCALAR_OPS (node)[si])
        elt = NULL_TREE;
      nelt++;
      if (nelt == nelt_limit)
        {
          record_stmt_cost (cost_vec, 1,
                            SLP_TREE_DEF_TYPE (node) == vect_external_def
                            ? (elt ? scalar_to_vec : vec_construct)
                            : vector_load,
                            NULL, vectype, 0, vect_prologue);
          nelt = 0;
        }
    }
}
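
/* An illustrative example (hypothetical values): for VECTYPE V4SI
   (CONST_NUNITS 4) and a two-lane node, 4 is a multiple of the group
   size 2, so construction is costed once with NELT_LIMIT 2.  An
   external node with operands {x, x} keeps ELT non-null and costs one
   scalar_to_vec (a splat), operands {x, y} cost one vec_construct,
   and a constant node costs one vector_load from the constant pool.  */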
/* Analyze statements contained in SLP tree NODE after recursively analyzing
   the subtree.  NODE_INSTANCE contains NODE and VINFO contains INSTANCE.

   Return true if the operations are supported.  */

static bool
vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node,
                                  slp_instance node_instance,
                                  hash_set<slp_tree> &visited_set,
                                  vec<slp_tree> &visited_vec,
                                  stmt_vector_for_cost *cost_vec)
{
  int i, j;
  slp_tree child;

  /* Assume we can code-generate all invariants.  */
  if (!node
      || SLP_TREE_DEF_TYPE (node) == vect_constant_def
      || SLP_TREE_DEF_TYPE (node) == vect_external_def)
    return true;

  if (SLP_TREE_DEF_TYPE (node) == vect_uninitialized_def)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "Failed cyclic SLP reference in %p\n", node);
      return false;
    }
  gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_internal_def);

  /* If we already analyzed the exact same set of scalar stmts we're done.
     We share the generated vector stmts for those.  */
  if (visited_set.add (node))
    return true;
  visited_vec.safe_push (node);

  bool res = true;
  unsigned visited_rec_start = visited_vec.length ();
  unsigned cost_vec_rec_start = cost_vec->length ();
  bool seen_non_constant_child = false;
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    {
      res = vect_slp_analyze_node_operations (vinfo, child, node_instance,
                                              visited_set, visited_vec,
                                              cost_vec);
      if (!res)
        break;
      if (child && SLP_TREE_DEF_TYPE (child) != vect_constant_def)
        seen_non_constant_child = true;
    }
  /* We're having difficulties scheduling nodes with just constant
     operands and no scalar stmts since we then cannot compute a stmt
     insertion place.  */
  if (!seen_non_constant_child && SLP_TREE_SCALAR_STMTS (node).is_empty ())
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "Cannot vectorize all-constant op node %p\n", node);
      res = false;
    }

  if (res)
    res = vect_slp_analyze_node_operations_1 (vinfo, node, node_instance,
                                              cost_vec);
  /* If analysis failed we have to pop all recursive visited nodes
     plus ourselves.  */
  if (!res)
    {
      while (visited_vec.length () >= visited_rec_start)
        visited_set.remove (visited_vec.pop ());
      cost_vec->truncate (cost_vec_rec_start);
    }

  /* When the node can be vectorized cost invariant nodes it references.
     This is not done in DFS order to allow the referring node
     vectorizable_* calls to nail down the invariant nodes vector type
     and possibly unshare it if it needs a different vector type than
     the first use.  */
  if (res)
    FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
      if (child
          && (SLP_TREE_DEF_TYPE (child) == vect_constant_def
              || SLP_TREE_DEF_TYPE (child) == vect_external_def)
          /* Perform usual caching, note code-generation still
             code-gens these nodes multiple times but we expect
             to CSE them later.  */
          && !visited_set.add (child))
        {
          visited_vec.safe_push (child);
          /* ??? After auditing more code paths make a "default"
             and push the vector type from NODE to all children
             if it is not already set.  */
          /* Compute the number of vectors to be generated.  */
          tree vector_type = SLP_TREE_VECTYPE (child);
          if (!vector_type)
            {
              /* For shifts with a scalar argument we don't need
                 to cost or code-generate anything.
                 ??? Represent this more explicitly.  */
              gcc_assert ((STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (node))
                           == shift_vec_info_type)
                          && j == 1);
              continue;
            }
          unsigned group_size = SLP_TREE_LANES (child);
          poly_uint64 vf = 1;
          if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
            vf = loop_vinfo->vectorization_factor;
          SLP_TREE_NUMBER_OF_VEC_STMTS (child)
            = vect_get_num_vectors (vf * group_size, vector_type);
          /* And cost them.  */
          vect_prologue_cost_for_slp (child, cost_vec);
        }

  /* If this node or any of its children can't be vectorized, try pruning
     the tree here rather than felling the whole thing.  */
  if (!res && vect_slp_convert_to_external (vinfo, node, node_instance))
    {
      /* We'll need to revisit this for invariant costing and number
         of vectorized stmt setting.  */
      res = true;
    }

  return res;
}
/* Mark lanes of NODE that are live outside of the basic-block vectorized
   region and that can be vectorized using vectorizable_live_operation
   with STMT_VINFO_LIVE_P.  Live operations that are not handled will
   cause the scalar code computing them to be retained.  */

static void
vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo, slp_tree node,
                             slp_instance instance,
                             stmt_vector_for_cost *cost_vec,
                             hash_set<stmt_vec_info> &svisited,
                             hash_set<slp_tree> &visited)
{
  if (visited.add (node))
    return;

  unsigned i;
  stmt_vec_info stmt_info;
  stmt_vec_info last_stmt = vect_find_last_scalar_stmt_in_slp (node);
  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
    {
      if (svisited.contains (stmt_info))
        continue;
      stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
      if (STMT_VINFO_IN_PATTERN_P (orig_stmt_info)
          && STMT_VINFO_RELATED_STMT (orig_stmt_info) != stmt_info)
        /* Only the pattern root stmt computes the original scalar value.  */
        continue;
      bool mark_visited = true;
      gimple *orig_stmt = orig_stmt_info->stmt;
      ssa_op_iter op_iter;
      def_operand_p def_p;
      FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_stmt, op_iter, SSA_OP_DEF)
        {
          imm_use_iterator use_iter;
          gimple *use_stmt;
          stmt_vec_info use_stmt_info;
          FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
            if (!is_gimple_debug (use_stmt))
              {
                use_stmt_info = bb_vinfo->lookup_stmt (use_stmt);
                if (!use_stmt_info
                    || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
                  {
                    STMT_VINFO_LIVE_P (stmt_info) = true;
                    if (vectorizable_live_operation (bb_vinfo, stmt_info,
                                                     NULL, node, instance, i,
                                                     false, cost_vec))
                      /* ??? So we know we can vectorize the live stmt
                         from one SLP node.  If we cannot do so from all
                         or none consistently we'd have to record which
                         SLP node (and lane) we want to use for the live
                         operation.  So make sure we can code-generate
                         from all nodes.  */
                      mark_visited = false;
                    else
                      STMT_VINFO_LIVE_P (stmt_info) = false;
                    break;
                  }
              }
          /* We have to verify whether we can insert the lane extract
             before all uses.  The following is a conservative approximation.
             We cannot put this into vectorizable_live_operation because
             iterating over all use stmts from inside a FOR_EACH_IMM_USE_STMT
             doesn't work.
             Note that while the fact that we emit code for loads at the
             first load should make this a non-problem, leafs we construct
             from scalars are vectorized after the last scalar def.
             ??? If we'd actually compute the insert location during
             analysis we could use sth less conservative than the last
             scalar stmt in the node for the dominance check.  */
          /* ??? What remains is "live" uses in vector CTORs in the same
             SLP graph which is where those uses can end up code-generated
             right after their definition instead of close to their original
             use.  But that would restrict us to code-generate lane-extracts
             from the latest stmt in a node.  So we compensate for this
             during code-generation, simply not replacing uses for those
             hopefully rare cases.  */
          if (STMT_VINFO_LIVE_P (stmt_info))
            FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
              if (!is_gimple_debug (use_stmt)
                  && (!(use_stmt_info = bb_vinfo->lookup_stmt (use_stmt))
                      || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
                  && !vect_stmt_dominates_stmt_p (last_stmt->stmt, use_stmt))
                {
                  if (dump_enabled_p ())
                    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                     "Cannot determine insertion place for "
                                     "lane extract\n");
                  STMT_VINFO_LIVE_P (stmt_info) = false;
                  mark_visited = true;
                }
        }
      if (mark_visited)
        svisited.add (stmt_info);
    }

  slp_tree child;
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
      vect_bb_slp_mark_live_stmts (bb_vinfo, child, instance,
                                   cost_vec, svisited, visited);
}
/* Analyze statements in SLP instances of VINFO.  Return true if the
   operations are supported.  */

bool
vect_slp_analyze_operations (vec_info *vinfo)
{
  slp_instance instance;
  int i;

  DUMP_VECT_SCOPE ("vect_slp_analyze_operations");

  hash_set<slp_tree> visited;
  for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
    {
      auto_vec<slp_tree> visited_vec;
      stmt_vector_for_cost cost_vec;
      cost_vec.create (2);
      if (is_a <bb_vec_info> (vinfo))
        vect_location = instance->location ();
      if (!vect_slp_analyze_node_operations (vinfo,
                                             SLP_INSTANCE_TREE (instance),
                                             instance, visited, visited_vec,
                                             &cost_vec)
          /* Instances with a root stmt require vectorized defs for the
             SLP tree root.  */
          || (SLP_INSTANCE_ROOT_STMT (instance)
              && (SLP_TREE_DEF_TYPE (SLP_INSTANCE_TREE (instance))
                  != vect_internal_def
                  /* Make sure we vectorized with the expected type.  */
                  || !useless_type_conversion_p
                        (TREE_TYPE (TREE_TYPE (gimple_assign_rhs1
                                                 (instance->root_stmt->stmt))),
                         TREE_TYPE (SLP_TREE_VECTYPE
                                      (SLP_INSTANCE_TREE (instance)))))))
        {
          slp_tree node = SLP_INSTANCE_TREE (instance);
          stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "removing SLP instance operations starting "
                             "from: %G", stmt_info->stmt);
          vect_free_slp_instance (instance);
          vinfo->slp_instances.ordered_remove (i);
          cost_vec.release ();
          while (!visited_vec.is_empty ())
            visited.remove (visited_vec.pop ());
        }
      else
        {
          i++;

          /* For BB vectorization remember the SLP graph entry
             cost for later.  */
          if (is_a <bb_vec_info> (vinfo))
            instance->cost_vec = cost_vec;
          else
            {
              add_stmt_costs (vinfo, vinfo->target_cost_data, &cost_vec);
              cost_vec.release ();
            }
        }
    }

  /* Compute vectorizable live stmts.  */
  if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
    {
      hash_set<stmt_vec_info> svisited;
      hash_set<slp_tree> visited;
      for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
        {
          vect_location = instance->location ();
          vect_bb_slp_mark_live_stmts (bb_vinfo, SLP_INSTANCE_TREE (instance),
                                       instance, &instance->cost_vec,
                                       svisited, visited);
        }
    }

  return !vinfo->slp_instances.is_empty ();
}
/* Get the SLP instance leader from INSTANCE_LEADER thereby transitively
   closing the eventual chain.  */

static slp_instance
get_ultimate_leader (slp_instance instance,
                     hash_map<slp_instance, slp_instance> &instance_leader)
{
  auto_vec<slp_instance *, 8> chain;
  slp_instance *tem;
  while (*(tem = instance_leader.get (instance)) != instance)
    {
      chain.safe_push (tem);
      instance = *tem;
    }
  while (!chain.is_empty ())
    *chain.pop () = instance;
  return instance;
}
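
/* An illustrative example: with INSTANCE_LEADER containing A -> B,
   B -> C and C -> C, get_ultimate_leader (A, ...) walks A and B to
   reach C and then rewrites both visited entries to point directly at
   C, so later lookups terminate in one step.  This is in effect
   union-find path compression over SLP instances.  */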
/* Worker of vect_bb_partition_graph, recurse on NODE.  */

static void
vect_bb_partition_graph_r (bb_vec_info bb_vinfo,
                           slp_instance instance, slp_tree node,
                           hash_map<stmt_vec_info, slp_instance> &stmt_to_instance,
                           hash_map<slp_instance, slp_instance> &instance_leader,
                           hash_set<slp_tree> &visited)
{
  stmt_vec_info stmt_info;
  unsigned i;

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
    {
      bool existed_p;
      slp_instance &stmt_instance
        = stmt_to_instance.get_or_insert (stmt_info, &existed_p);
      if (!existed_p)
        ;
      else if (stmt_instance != instance)
        {
          /* If we're running into a previously marked stmt make us the
             leader of the current ultimate leader.  This keeps the
             leader chain acyclic and works even when the current instance
             connects two previously independent graph parts.  */
          slp_instance stmt_leader
            = get_ultimate_leader (stmt_instance, instance_leader);
          if (stmt_leader != instance)
            instance_leader.put (stmt_leader, instance);
        }
      stmt_instance = instance;
    }

  if (!SLP_TREE_SCALAR_STMTS (node).is_empty () && visited.add (node))
    return;

  slp_tree child;
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
      vect_bb_partition_graph_r (bb_vinfo, instance, child, stmt_to_instance,
                                 instance_leader, visited);
}
/* Partition the SLP graph into pieces that can be costed independently.  */

static void
vect_bb_partition_graph (bb_vec_info bb_vinfo)
{
  DUMP_VECT_SCOPE ("vect_bb_partition_graph");

  /* First walk the SLP graph assigning each involved scalar stmt a
     corresponding SLP graph entry and upon visiting a previously
     marked stmt, make the stmts leader the current SLP graph entry.  */
  hash_map<stmt_vec_info, slp_instance> stmt_to_instance;
  hash_map<slp_instance, slp_instance> instance_leader;
  hash_set<slp_tree> visited;
  slp_instance instance;
  for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
    {
      instance_leader.put (instance, instance);
      vect_bb_partition_graph_r (bb_vinfo,
                                 instance, SLP_INSTANCE_TREE (instance),
                                 stmt_to_instance, instance_leader,
                                 visited);
    }

  /* Then collect entries to each independent subgraph.  */
  for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
    {
      slp_instance leader = get_ultimate_leader (instance, instance_leader);
      leader->subgraph_entries.safe_push (instance);
      if (dump_enabled_p ()
          && leader != instance)
        dump_printf_loc (MSG_NOTE, vect_location,
                         "instance %p is leader of %p\n",
                         leader, instance);
    }
}
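
/* An illustrative example: if instance I1 covers scalar stmts
   {s1, s2} and instance I2 covers {s2, s3}, the walk for I2 finds s2
   already assigned to I1 and records I2 as leader of I1's ultimate
   leader.  Both instances then collect into the same subgraph_entries
   vector and are costed as one subgraph.  */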
/* Compute the scalar cost of the SLP node NODE and its children
   and return it.  Do not account defs that are marked in LIFE and
   update LIFE according to uses of NODE.  */

static void
vect_bb_slp_scalar_cost (vec_info *vinfo,
                         slp_tree node, vec<bool, va_heap> *life,
                         stmt_vector_for_cost *cost_vec,
                         hash_set<slp_tree> &visited)
{
  unsigned i;
  stmt_vec_info stmt_info;
  slp_tree child;

  if (visited.add (node))
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
    {
      ssa_op_iter op_iter;
      def_operand_p def_p;

      if ((*life)[i])
        continue;

      stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
      gimple *orig_stmt = orig_stmt_info->stmt;

      /* If there is a non-vectorized use of the defs then the scalar
         stmt is kept live in which case we do not account it or any
         required defs in the SLP children in the scalar cost.  This
         way we make the vectorization more costly when compared to
         the scalar cost.  */
      if (!STMT_VINFO_LIVE_P (stmt_info))
        {
          FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_stmt, op_iter, SSA_OP_DEF)
            {
              imm_use_iterator use_iter;
              gimple *use_stmt;
              FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
                if (!is_gimple_debug (use_stmt))
                  {
                    stmt_vec_info use_stmt_info = vinfo->lookup_stmt (use_stmt);
                    if (!use_stmt_info
                        || !PURE_SLP_STMT
                              (vect_stmt_to_vectorize (use_stmt_info)))
                      {
                        (*life)[i] = true;
                        break;
                      }
                  }
            }
          if ((*life)[i])
            continue;
        }

      /* Count scalar stmts only once.  */
      if (gimple_visited_p (orig_stmt))
        continue;
      gimple_set_visited (orig_stmt, true);

      vect_cost_for_stmt kind;
      if (STMT_VINFO_DATA_REF (orig_stmt_info))
        {
          if (DR_IS_READ (STMT_VINFO_DATA_REF (orig_stmt_info)))
            kind = scalar_load;
          else
            kind = scalar_store;
        }
      else if (vect_nop_conversion_p (orig_stmt_info))
        continue;
      /* For single-argument PHIs assume coalescing which means zero cost
         for the scalar and the vector PHIs.  This avoids artificially
         favoring the vector path (but may pessimize it in some cases).  */
      else if (is_a <gphi *> (orig_stmt_info->stmt)
               && gimple_phi_num_args
                    (as_a <gphi *> (orig_stmt_info->stmt)) == 1)
        continue;
      else
        kind = scalar_stmt;
      record_stmt_cost (cost_vec, 1, kind, orig_stmt_info,
                        SLP_TREE_VECTYPE (node), 0, vect_body);
    }

  auto_vec<bool, 20> subtree_life;
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    {
      if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
        {
          /* Do not directly pass LIFE to the recursive call, copy it to
             confine changes in the callee to the current child/subtree.  */
          if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
            {
              subtree_life.safe_grow_cleared (SLP_TREE_LANES (child), true);
              for (unsigned j = 0;
                   j < SLP_TREE_LANE_PERMUTATION (node).length (); ++j)
                {
                  auto perm = SLP_TREE_LANE_PERMUTATION (node)[j];
                  if (perm.first == i)
                    subtree_life[perm.second] = (*life)[j];
                }
            }
          else
            {
              gcc_assert (SLP_TREE_LANES (node) == SLP_TREE_LANES (child));
              subtree_life.safe_splice (*life);
            }
          vect_bb_slp_scalar_cost (vinfo, child, &subtree_life, cost_vec,
                                   visited);
          subtree_life.truncate (0);
        }
    }
}
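
/* An illustrative example: for a two-lane VEC_PERM node with
   SLP_TREE_LANE_PERMUTATION { { 1, 0 }, { 0, 1 } }, node lane 0 reads
   lane 0 of child 1 and node lane 1 reads lane 1 of child 0.  When
   recursing into child 0, only SUBTREE_LIFE[1] inherits (*LIFE)[1];
   the remaining child lanes stay not-live by default.  */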
/* Comparator for the loop-index sorted cost vectors.  */

static int
li_cost_vec_cmp (const void *a_, const void *b_)
{
  auto *a = (const std::pair<unsigned, stmt_info_for_cost *> *)a_;
  auto *b = (const std::pair<unsigned, stmt_info_for_cost *> *)b_;
  if (a->first < b->first)
    return -1;
  else if (a->first == b->first)
    return 0;
  return 1;
}
/* Check if vectorization of the basic block is profitable for the
   subgraph denoted by SLP_INSTANCES.  */

static bool
vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo,
                                    vec<slp_instance> slp_instances)
{
  slp_instance instance;
  int i;
  unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
  unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location, "Costing subgraph: \n");
      hash_set<slp_tree> visited;
      FOR_EACH_VEC_ELT (slp_instances, i, instance)
        vect_print_slp_graph (MSG_NOTE, vect_location,
                              SLP_INSTANCE_TREE (instance), visited);
    }

  /* Calculate scalar cost and sum the cost for the vector stmts
     previously collected.  */
  stmt_vector_for_cost scalar_costs = vNULL;
  stmt_vector_for_cost vector_costs = vNULL;
  hash_set<slp_tree> visited;
  FOR_EACH_VEC_ELT (slp_instances, i, instance)
    {
      auto_vec<bool, 20> life;
      life.safe_grow_cleared (SLP_TREE_LANES (SLP_INSTANCE_TREE (instance)),
                              true);
      if (SLP_INSTANCE_ROOT_STMT (instance))
        record_stmt_cost (&scalar_costs, 1, scalar_stmt,
                          SLP_INSTANCE_ROOT_STMT (instance), 0, vect_body);
      vect_bb_slp_scalar_cost (bb_vinfo,
                               SLP_INSTANCE_TREE (instance),
                               &life, &scalar_costs, visited);
      vector_costs.safe_splice (instance->cost_vec);
      instance->cost_vec.release ();
    }
  /* Unset visited flag.  */
  stmt_info_for_cost *cost;
  FOR_EACH_VEC_ELT (scalar_costs, i, cost)
    gimple_set_visited (cost->stmt_info->stmt, false);

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");

  /* When costing non-loop vectorization we need to consider each covered
     loop independently and make sure vectorization is profitable.  For
     now we assume a loop may be not entered or executed an arbitrary
     number of iterations (??? static information can provide more
     precise info here) which means we can simply cost each containing
     loops stmts separately.  */

  /* First produce cost vectors sorted by loop index.  */
  auto_vec<std::pair<unsigned, stmt_info_for_cost *> >
    li_scalar_costs (scalar_costs.length ());
  auto_vec<std::pair<unsigned, stmt_info_for_cost *> >
    li_vector_costs (vector_costs.length ());
  FOR_EACH_VEC_ELT (scalar_costs, i, cost)
    {
      unsigned l = gimple_bb (cost->stmt_info->stmt)->loop_father->num;
      li_scalar_costs.quick_push (std::make_pair (l, cost));
    }
  /* Use a random used loop as fallback in case the first vector_costs
     entry does not have a stmt_info associated with it.  */
  unsigned l = li_scalar_costs[0].first;
  FOR_EACH_VEC_ELT (vector_costs, i, cost)
    {
      /* We inherit from the previous COST, invariants, externals and
         extracts immediately follow the cost for the related stmt.  */
      if (cost->stmt_info)
        l = gimple_bb (cost->stmt_info->stmt)->loop_father->num;
      li_vector_costs.quick_push (std::make_pair (l, cost));
    }
  li_scalar_costs.qsort (li_cost_vec_cmp);
  li_vector_costs.qsort (li_cost_vec_cmp);

  /* Now cost the portions individually.  */
  unsigned vi = 0;
  unsigned si = 0;
  while (si < li_scalar_costs.length ()
         && vi < li_vector_costs.length ())
    {
      unsigned sl = li_scalar_costs[si].first;
      unsigned vl = li_vector_costs[vi].first;
      if (sl != vl)
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "Scalar %d and vector %d loop part do not "
                             "match up, skipping scalar part\n", sl, vl);
          /* Skip the scalar part, assuming zero cost on the vector side.  */
          do
            si++;
          while (si < li_scalar_costs.length ()
                 && li_scalar_costs[si].first == sl);
          continue;
        }

      void *scalar_target_cost_data = init_cost (NULL);
      do
        {
          add_stmt_cost (bb_vinfo, scalar_target_cost_data,
                         li_scalar_costs[si].second);
          si++;
        }
      while (si < li_scalar_costs.length ()
             && li_scalar_costs[si].first == sl);
      unsigned dummy;
      finish_cost (scalar_target_cost_data, &dummy, &scalar_cost, &dummy);
      destroy_cost_data (scalar_target_cost_data);

      /* Complete the target-specific vector cost calculation.  */
      void *vect_target_cost_data = init_cost (NULL);
      do
        {
          add_stmt_cost (bb_vinfo, vect_target_cost_data,
                         li_vector_costs[vi].second);
          vi++;
        }
      while (vi < li_vector_costs.length ()
             && li_vector_costs[vi].first == vl);
      finish_cost (vect_target_cost_data, &vec_prologue_cost,
                   &vec_inside_cost, &vec_epilogue_cost);
      destroy_cost_data (vect_target_cost_data);

      vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;

      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_NOTE, vect_location,
                           "Cost model analysis for part in loop %d:\n", sl);
          dump_printf (MSG_NOTE, "  Vector cost: %d\n",
                       vec_inside_cost + vec_outside_cost);
          dump_printf (MSG_NOTE, "  Scalar cost: %d\n", scalar_cost);
        }

      /* Vectorization is profitable if its cost is no more than the cost
         of the scalar version.  Note that we err on the vector side for
         equal cost because the cost estimate is otherwise quite pessimistic
         (constant uses are free on the scalar side but cost a load on the
         vector side for example).  */
      if (vec_outside_cost + vec_inside_cost > scalar_cost)
        {
          scalar_costs.release ();
          vector_costs.release ();
          return false;
        }
    }
  if (vi < li_vector_costs.length ())
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "Excess vector cost for part in loop %d:\n",
                         li_vector_costs[vi].first);
      scalar_costs.release ();
      vector_costs.release ();
      return false;
    }

  scalar_costs.release ();
  vector_costs.release ();
  return true;
}
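
/* An illustrative example (hypothetical costs): with SCALAR_COST 4,
   VEC_INSIDE_COST 2 and VEC_OUTSIDE_COST 2, the test 2 + 2 > 4 is
   false and the subgraph part is kept; with VEC_INSIDE_COST 3 it is
   rejected.  As noted above, equal cost goes to the vector side.  */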
/* qsort comparator for lane defs.  */

static int
vld_cmp (const void *a_, const void *b_)
{
  auto *a = (const std::pair<unsigned, tree> *)a_;
  auto *b = (const std::pair<unsigned, tree> *)b_;
  return a->first - b->first;
}
/* Return true if USE_STMT is a vector lane insert into VEC and set
   *THIS_LANE to the lane number that is set.  */

static bool
vect_slp_is_lane_insert (gimple *use_stmt, tree vec, unsigned *this_lane)
{
  gassign *use_ass = dyn_cast <gassign *> (use_stmt);
  if (!use_ass
      || gimple_assign_rhs_code (use_ass) != BIT_INSERT_EXPR
      || (vec
          ? gimple_assign_rhs1 (use_ass) != vec
          : ((vec = gimple_assign_rhs1 (use_ass)), false))
      || !useless_type_conversion_p (TREE_TYPE (TREE_TYPE (vec)),
                                     TREE_TYPE (gimple_assign_rhs2 (use_ass)))
      || !constant_multiple_p
            (tree_to_poly_uint64 (gimple_assign_rhs3 (use_ass)),
             tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (TREE_TYPE (vec)))),
             this_lane))
    return false;
  return true;
}
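
/* An illustrative example (hypothetical SSA names): with 32-bit vector
   elements the stmt
     x_2 = BIT_INSERT_EXPR <x_1, s_3, 64>;
   matches and sets *THIS_LANE to 2, since bit position 64 is exactly
   two element sizes from the start of the vector.  */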
/* Find any vectorizable constructors and add them to the grouped_store
   array.  */

static void
vect_slp_check_for_constructors (bb_vec_info bb_vinfo)
{
  for (unsigned i = 0; i < bb_vinfo->bbs.length (); ++i)
    for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[i]);
         !gsi_end_p (gsi); gsi_next (&gsi))
      {
        gassign *assign = dyn_cast<gassign *> (gsi_stmt (gsi));
        if (!assign)
          continue;

        tree rhs = gimple_assign_rhs1 (assign);
        if (gimple_assign_rhs_code (assign) == CONSTRUCTOR)
          {
            if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
                || maybe_ne (TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)),
                             CONSTRUCTOR_NELTS (rhs))
                || VECTOR_TYPE_P (TREE_TYPE (CONSTRUCTOR_ELT (rhs, 0)->value))
                || uniform_vector_p (rhs))
              continue;

            unsigned j;
            tree val;
            FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), j, val)
              if (TREE_CODE (val) != SSA_NAME
                  || !bb_vinfo->lookup_def (val))
                break;
            if (j != CONSTRUCTOR_NELTS (rhs))
              continue;

            stmt_vec_info stmt_info = bb_vinfo->lookup_stmt (assign);
            BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt_info);
          }
        else if (gimple_assign_rhs_code (assign) == BIT_INSERT_EXPR
                 && VECTOR_TYPE_P (TREE_TYPE (rhs))
                 && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).is_constant ()
                 && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).to_constant () > 1
                 && integer_zerop (gimple_assign_rhs3 (assign))
                 && useless_type_conversion_p
                      (TREE_TYPE (TREE_TYPE (rhs)),
                       TREE_TYPE (gimple_assign_rhs2 (assign)))
                 && bb_vinfo->lookup_def (gimple_assign_rhs2 (assign)))
          {
            /* We start to match on insert to lane zero but since the
               inserts need not be ordered we'd have to search both
               the def and the use chains.  */
            tree vectype = TREE_TYPE (rhs);
            unsigned nlanes = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
            auto_vec<std::pair<unsigned, tree> > lane_defs (nlanes);
            auto_sbitmap lanes (nlanes);
            bitmap_clear (lanes);
            bitmap_set_bit (lanes, 0);
            tree def = gimple_assign_lhs (assign);
            lane_defs.quick_push
                      (std::make_pair (0, gimple_assign_rhs2 (assign)));
            unsigned lanes_found = 1;
            /* Start with the use chains, the last stmt will be the root.  */
            stmt_vec_info last = bb_vinfo->lookup_stmt (assign);
            do
              {
                use_operand_p use_p;
                gimple *use_stmt;
                if (!single_imm_use (def, &use_p, &use_stmt))
                  break;
                unsigned this_lane;
                if (!bb_vinfo->lookup_stmt (use_stmt)
                    || !vect_slp_is_lane_insert (use_stmt, def, &this_lane)
                    || !bb_vinfo->lookup_def (gimple_assign_rhs2 (use_stmt)))
                  break;
                if (bitmap_bit_p (lanes, this_lane))
                  break;
                lanes_found++;
                bitmap_set_bit (lanes, this_lane);
                gassign *use_ass = as_a <gassign *> (use_stmt);
                lane_defs.quick_push (std::make_pair
                                        (this_lane,
                                         gimple_assign_rhs2 (use_ass)));
                last = bb_vinfo->lookup_stmt (use_ass);
                def = gimple_assign_lhs (use_ass);
              }
            while (lanes_found < nlanes);
            if (lanes_found < nlanes)
              {
                /* Now search the def chain.  */
                def = gimple_assign_rhs1 (assign);
                do
                  {
                    if (TREE_CODE (def) != SSA_NAME
                        || !has_single_use (def))
                      break;
                    gimple *def_stmt = SSA_NAME_DEF_STMT (def);
                    unsigned this_lane;
                    if (!bb_vinfo->lookup_stmt (def_stmt)
                        || !vect_slp_is_lane_insert (def_stmt,
                                                     NULL_TREE, &this_lane)
                        || !bb_vinfo->lookup_def (gimple_assign_rhs2 (def_stmt)))
                      break;
                    if (bitmap_bit_p (lanes, this_lane))
                      break;
                    lanes_found++;
                    bitmap_set_bit (lanes, this_lane);
                    lane_defs.quick_push (std::make_pair
                                            (this_lane,
                                             gimple_assign_rhs2 (def_stmt)));
                    def = gimple_assign_rhs1 (def_stmt);
                  }
                while (lanes_found < nlanes);
              }
            if (lanes_found == nlanes)
              {
                /* Sort lane_defs after the lane index and register the
                   root.  */
                lane_defs.qsort (vld_cmp);
                vec<stmt_vec_info> stmts;
                stmts.create (nlanes);
                for (unsigned i = 0; i < nlanes; ++i)
                  stmts.quick_push
                          (bb_vinfo->lookup_def (lane_defs[i].second));
                bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_ctor,
                                                     stmts, last));
              }
          }
      }
}
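
/* An illustrative example (hypothetical SSA names): for a V4SI vector
   the chain
     v_1 = BIT_INSERT_EXPR <v_0, a_1, 0>;
     v_2 = BIT_INSERT_EXPR <v_1, b_1, 32>;
     v_3 = BIT_INSERT_EXPR <v_2, c_1, 64>;
     v_4 = BIT_INSERT_EXPR <v_3, d_1, 96>;
   fills lanes 0 to 3 exactly once, so the defs of a_1, b_1, c_1 and
   d_1 become a four-lane SLP root with v_4's stmt as the root.  */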
/* Walk the grouped store chains and replace entries with their
   pattern variant if any.  */

static void
vect_fixup_store_groups_with_patterns (vec_info *vinfo)
{
  stmt_vec_info first_element;
  unsigned i;

  FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
    {
      /* We also have CTORs in this array.  */
      if (!STMT_VINFO_GROUPED_ACCESS (first_element))
        continue;
      if (STMT_VINFO_IN_PATTERN_P (first_element))
        {
          stmt_vec_info orig = first_element;
          first_element = STMT_VINFO_RELATED_STMT (first_element);
          DR_GROUP_FIRST_ELEMENT (first_element) = first_element;
          DR_GROUP_SIZE (first_element) = DR_GROUP_SIZE (orig);
          DR_GROUP_GAP (first_element) = DR_GROUP_GAP (orig);
          DR_GROUP_NEXT_ELEMENT (first_element) = DR_GROUP_NEXT_ELEMENT (orig);
          vinfo->grouped_stores[i] = first_element;
        }
      stmt_vec_info prev = first_element;
      while (DR_GROUP_NEXT_ELEMENT (prev))
        {
          stmt_vec_info elt = DR_GROUP_NEXT_ELEMENT (prev);
          if (STMT_VINFO_IN_PATTERN_P (elt))
            {
              stmt_vec_info orig = elt;
              elt = STMT_VINFO_RELATED_STMT (elt);
              DR_GROUP_NEXT_ELEMENT (prev) = elt;
              DR_GROUP_GAP (elt) = DR_GROUP_GAP (orig);
              DR_GROUP_NEXT_ELEMENT (elt) = DR_GROUP_NEXT_ELEMENT (orig);
            }
          DR_GROUP_FIRST_ELEMENT (elt) = first_element;
          prev = elt;
        }
    }
}
/* Check if the region described by BB_VINFO can be vectorized, returning
   true if so.  When returning false, set FATAL to true if the same failure
   would prevent vectorization at other vector sizes, false if it is still
   worth trying other sizes.  N_STMTS is the number of statements in the
   region.  */

static bool
vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal,
                       vec<int> *dataref_groups)
{
  DUMP_VECT_SCOPE ("vect_slp_analyze_bb");

  slp_instance instance;
  int i;
  poly_uint64 min_vf = 2;

  /* The first group of checks is independent of the vector size.  */
  fatal = true;

  /* Analyze the data references.  */

  if (!vect_analyze_data_refs (bb_vinfo, &min_vf, NULL))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "not vectorized: unhandled data-ref in basic "
                         "block.\n");
      return false;
    }

  if (!vect_analyze_data_ref_accesses (bb_vinfo, dataref_groups))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "not vectorized: unhandled data access in "
                         "basic block.\n");
      return false;
    }

  vect_slp_check_for_constructors (bb_vinfo);

  /* If there are no grouped stores and no constructors in the region
     there is no need to continue with pattern recog as vect_analyze_slp
     will fail anyway.  */
  if (bb_vinfo->grouped_stores.is_empty ()
      && bb_vinfo->roots.is_empty ())
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "not vectorized: no grouped stores in "
                         "basic block.\n");
      return false;
    }

  /* While the rest of the analysis below depends on it in some way.  */
  fatal = false;

  vect_pattern_recog (bb_vinfo);

  /* Update store groups from pattern processing.  */
  vect_fixup_store_groups_with_patterns (bb_vinfo);

  /* Check the SLP opportunities in the basic block, analyze and build SLP
     trees.  */
  if (!vect_analyze_slp (bb_vinfo, n_stmts))
    {
      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                           "Failed to SLP the basic block.\n");
          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                           "not vectorized: failed to find SLP opportunities "
                           "in basic block.\n");
        }
      return false;
    }

  /* Optimize permutations.  */
  vect_optimize_slp (bb_vinfo);

  /* Gather the loads reachable from the SLP graph entries.  */
  vect_gather_slp_loads (bb_vinfo);

  vect_record_base_alignments (bb_vinfo);

  /* Analyze and verify the alignment of data references and the
     dependence in the SLP instances.  */
  for (i = 0; BB_VINFO_SLP_INSTANCES (bb_vinfo).iterate (i, &instance); )
    {
      vect_location = instance->location ();
      if (! vect_slp_analyze_instance_alignment (bb_vinfo, instance)
          || ! vect_slp_analyze_instance_dependence (bb_vinfo, instance))
        {
          slp_tree node = SLP_INSTANCE_TREE (instance);
          stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "removing SLP instance operations starting "
                             "from: %G", stmt_info->stmt);
          vect_free_slp_instance (instance);
          BB_VINFO_SLP_INSTANCES (bb_vinfo).ordered_remove (i);
          continue;
        }

      /* Mark all the statements that we want to vectorize as pure SLP and
         relevant.  */
      vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
      vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));
      if (stmt_vec_info root = SLP_INSTANCE_ROOT_STMT (instance))
        {
          STMT_SLP_TYPE (root) = pure_slp;
          if (is_gimple_assign (root->stmt)
              && gimple_assign_rhs_code (root->stmt) == BIT_INSERT_EXPR)
            {
              /* ??? We should probably record the whole vector of
                 root stmts so we do not have to back-track here...  */
              for (unsigned n = SLP_TREE_LANES (SLP_INSTANCE_TREE (instance));
                   n != 1; --n)
                {
                  root = bb_vinfo->lookup_def (gimple_assign_rhs1 (root->stmt));
                  STMT_SLP_TYPE (root) = pure_slp;
                }
            }
        }

      i++;
    }
  if (! BB_VINFO_SLP_INSTANCES (bb_vinfo).length ())
    return false;

  if (!vect_slp_analyze_operations (bb_vinfo))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "not vectorized: bad operation in basic block.\n");
      return false;
    }

  vect_bb_partition_graph (bb_vinfo);

  return true;
}
/* Subroutine of vect_slp_bb.  Try to vectorize the statements for all
   basic blocks in BBS, returning true on success.
   The region has N_STMTS statements and has the datarefs given by DATAREFS.  */

static bool
vect_slp_region (vec<basic_block> bbs, vec<data_reference_p> datarefs,
                 vec<int> *dataref_groups, unsigned int n_stmts)
{
  bb_vec_info bb_vinfo;
  auto_vector_modes vector_modes;

  /* Autodetect first vector size we try.  */
  machine_mode next_vector_mode = VOIDmode;
  targetm.vectorize.autovectorize_vector_modes (&vector_modes, false);
  unsigned int mode_i = 0;

  vec_info_shared shared;

  machine_mode autodetected_vector_mode = VOIDmode;
  while (1)
    {
      bool vectorized = false;
      bool fatal = false;
      bb_vinfo = new _bb_vec_info (bbs, &shared);

      bool first_time_p = shared.datarefs.is_empty ();
      BB_VINFO_DATAREFS (bb_vinfo) = datarefs;
      if (first_time_p)
        bb_vinfo->shared->save_datarefs ();
      else
        bb_vinfo->shared->check_datarefs ();
      bb_vinfo->vector_mode = next_vector_mode;

      if (vect_slp_analyze_bb_1 (bb_vinfo, n_stmts, fatal, dataref_groups))
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_NOTE, vect_location,
                               "***** Analysis succeeded with vector mode"
                               " %s\n", GET_MODE_NAME (bb_vinfo->vector_mode));
              dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB part\n");
            }

          bb_vinfo->shared->check_datarefs ();

          unsigned i;
          slp_instance instance;
          FOR_EACH_VEC_ELT (BB_VINFO_SLP_INSTANCES (bb_vinfo), i, instance)
            {
              if (instance->subgraph_entries.is_empty ())
                continue;

              vect_location = instance->location ();
              if (!unlimited_cost_model (NULL)
                  && !vect_bb_vectorization_profitable_p
                        (bb_vinfo, instance->subgraph_entries))
                {
                  if (dump_enabled_p ())
                    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                     "not vectorized: vectorization is not "
                                     "profitable.\n");
                  continue;
                }

              if (!dbg_cnt (vect_slp))
                continue;

              if (!vectorized && dump_enabled_p ())
                dump_printf_loc (MSG_NOTE, vect_location,
                                 "Basic block will be vectorized "
                                 "using SLP\n");
              vectorized = true;

              vect_schedule_slp (bb_vinfo, instance->subgraph_entries);

              unsigned HOST_WIDE_INT bytes;
              if (dump_enabled_p ())
                {
                  if (GET_MODE_SIZE
                        (bb_vinfo->vector_mode).is_constant (&bytes))
                    dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
                                     "basic block part vectorized using %wu "
                                     "byte vectors\n", bytes);
                  else
                    dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
                                     "basic block part vectorized using "
                                     "variable length vectors\n");
                }
            }
        }
      else
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "***** Analysis failed with vector mode %s\n",
                             GET_MODE_NAME (bb_vinfo->vector_mode));
        }

      if (mode_i == 0)
        autodetected_vector_mode = bb_vinfo->vector_mode;

      /* Advance mode_i, skipping modes whose analysis result would be
         the same.  */
      while (mode_i < vector_modes.length ()
             && vect_chooses_same_modes_p (bb_vinfo, vector_modes[mode_i]))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "***** The result for vector mode %s would"
                             " be the same\n",
                             GET_MODE_NAME (vector_modes[mode_i]));
          mode_i += 1;
        }

      delete bb_vinfo;

      if (mode_i < vector_modes.length ()
          && VECTOR_MODE_P (autodetected_vector_mode)
          && (related_vector_mode (vector_modes[mode_i],
                                   GET_MODE_INNER (autodetected_vector_mode))
              == autodetected_vector_mode)
          && (related_vector_mode (autodetected_vector_mode,
                                   GET_MODE_INNER (vector_modes[mode_i]))
              == vector_modes[mode_i]))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "***** Skipping vector mode %s, which would"
                             " repeat the analysis for %s\n",
                             GET_MODE_NAME (vector_modes[mode_i]),
                             GET_MODE_NAME (autodetected_vector_mode));
          mode_i += 1;
        }

      if (vectorized
          || mode_i == vector_modes.length ()
          || autodetected_vector_mode == VOIDmode
          /* If vect_slp_analyze_bb_1 signaled that analysis for all
             vector sizes will fail do not bother iterating.  */
          || fatal)
        return vectorized;

      /* Try the next biggest vector size.  */
      next_vector_mode = vector_modes[mode_i++];
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "***** Re-trying analysis with vector mode %s\n",
                         GET_MODE_NAME (next_vector_mode));
    }
}
/* Main entry for the BB vectorizer.  Analyze and transform BBS, returns
   true if anything in the basic-block was vectorized.  */

static bool
vect_slp_bbs (vec<basic_block> bbs)
{
  vec<data_reference_p> datarefs = vNULL;
  auto_vec<int> dataref_groups;
  int insns = 0;
  int current_group = 0;

  for (unsigned i = 0; i < bbs.length (); i++)
    {
      basic_block bb = bbs[i];
      for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (gsi);
           gsi_next (&gsi))
        {
          gimple *stmt = gsi_stmt (gsi);
          if (is_gimple_debug (stmt))
            continue;

          insns++;

          if (gimple_location (stmt) != UNKNOWN_LOCATION)
            vect_location = stmt;

          if (!vect_find_stmt_data_reference (NULL, stmt, &datarefs,
                                              &dataref_groups, current_group))
            ++current_group;
        }
      /* New BBs always start a new DR group.  */
      ++current_group;
    }

  return vect_slp_region (bbs, datarefs, &dataref_groups, insns);
}
/* Main entry for the BB vectorizer.  Analyze and transform BB, returns
   true if anything in the basic-block was vectorized.  */

bool
vect_slp_bb (basic_block bb)
{
  auto_vec<basic_block> bbs;
  bbs.safe_push (bb);
  return vect_slp_bbs (bbs);
}
/* Main entry for the BB vectorizer.  Analyze and transform the given
   function, returning true if anything was vectorized.  */

bool
vect_slp_function (function *fun)
{
  bool r = false;
  int *rpo = XNEWVEC (int, n_basic_blocks_for_fn (fun));
  unsigned n = pre_and_rev_post_order_compute_fn (fun, NULL, rpo, false);

  /* For the moment split the function into pieces to avoid making
     the iteration on the vector mode moot.  Split at points we know
     to not handle well which is CFG merges (SLP discovery doesn't
     handle non-loop-header PHIs) and loop exits.  Since pattern
     recog requires reverse iteration to visit uses before defs
     simply chop RPO into pieces.  */
  auto_vec<basic_block> bbs;
  for (unsigned i = 0; i < n; i++)
    {
      basic_block bb = BASIC_BLOCK_FOR_FN (fun, rpo[i]);
      bool split = false;

      /* Split when a BB is not dominated by the first block.  */
      if (!bbs.is_empty ()
          && !dominated_by_p (CDI_DOMINATORS, bb, bbs[0]))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "splitting region at dominance boundary bb%d\n",
                             bb->index);
          split = true;
        }
      /* Split when the loop determined by the first block
         is exited.  This is because we eventually insert
         invariants at region begin.  */
      else if (!bbs.is_empty ()
               && bbs[0]->loop_father != bb->loop_father
               && !flow_loop_nested_p (bbs[0]->loop_father, bb->loop_father))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "splitting region at loop %d exit at bb%d\n",
                             bbs[0]->loop_father->num, bb->index);
          split = true;
        }

      if (split && !bbs.is_empty ())
        {
          r |= vect_slp_bbs (bbs);
          bbs.truncate (0);
          bbs.quick_push (bb);
        }
      else
        bbs.safe_push (bb);

      /* When we have a stmt ending this block and defining a
         value we have to insert on edges when inserting after it for
         a vector containing its definition.  Avoid this for now.  */
      if (gimple *last = last_stmt (bb))
        if (gimple_get_lhs (last)
            && is_ctrl_altering_stmt (last))
          {
            if (dump_enabled_p ())
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "splitting region at control altering "
                               "definition %G", last);
            r |= vect_slp_bbs (bbs);
            bbs.truncate (0);
          }
    }

  if (!bbs.is_empty ())
    r |= vect_slp_bbs (bbs);

  free (rpo);
  return r;
}
/* Build a variable-length vector in which the elements in ELTS are repeated
   to fill NRESULTS vectors of type VECTOR_TYPE.  Store the vectors in
   RESULTS and add any new instructions to SEQ.

   The approach we use is:

   (1) Find a vector mode VM with integer elements of mode IM.

   (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
       ELTS' has mode IM.  This involves creating NELTS' VIEW_CONVERT_EXPRs
       from small vectors to IM.

   (3) Duplicate each ELTS'[I] into a vector of mode VM.

   (4) Use a tree of interleaving VEC_PERM_EXPRs to create VMs with the
       correct byte contents.

   (5) Use VIEW_CONVERT_EXPR to cast the final VMs to the required type.

   We try to find the largest IM for which this sequence works, in order
   to cut down on the number of interleaves.  */

void
duplicate_and_interleave (vec_info *vinfo, gimple_seq *seq, tree vector_type,
                          vec<tree> elts, unsigned int nresults,
                          vec<tree> &results)
{
  unsigned int nelts = elts.length ();
  tree element_type = TREE_TYPE (vector_type);

  /* (1) Find a vector mode VM with integer elements of mode IM.  */
  unsigned int nvectors = 1;
  tree new_vector_type;
  tree permutes[2];
  if (!can_duplicate_and_interleave_p (vinfo, nelts, element_type,
                                       &nvectors, &new_vector_type,
                                       permutes))
    gcc_unreachable ();

  /* Get a vector type that holds ELTS[0:NELTS/NELTS'].  */
  unsigned int partial_nelts = nelts / nvectors;
  tree partial_vector_type = build_vector_type (element_type, partial_nelts);

  tree_vector_builder partial_elts;
  auto_vec<tree, 32> pieces (nvectors * 2);
  pieces.quick_grow_cleared (nvectors * 2);
  for (unsigned int i = 0; i < nvectors; ++i)
    {
      /* (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
             ELTS' has mode IM.  */
      partial_elts.new_vector (partial_vector_type, partial_nelts, 1);
      for (unsigned int j = 0; j < partial_nelts; ++j)
        partial_elts.quick_push (elts[i * partial_nelts + j]);
      tree t = gimple_build_vector (seq, &partial_elts);
      t = gimple_build (seq, VIEW_CONVERT_EXPR,
                        TREE_TYPE (new_vector_type), t);

      /* (3) Duplicate each ELTS'[I] into a vector of mode VM.  */
      pieces[i] = gimple_build_vector_from_val (seq, new_vector_type, t);
    }

  /* (4) Use a tree of VEC_PERM_EXPRs to create a single VM with the
         correct byte contents.

     Conceptually, we need to repeat the following operation log2(nvectors)
     times, where hi_start = nvectors / 2:

        out[i * 2] = VEC_PERM_EXPR (in[i], in[i + hi_start], lo_permute);
        out[i * 2 + 1] = VEC_PERM_EXPR (in[i], in[i + hi_start], hi_permute);

     However, if each input repeats every N elements and the VF is
     a multiple of N * 2, the HI result is the same as the LO result.
     This will be true for the first N1 iterations of the outer loop,
     followed by N2 iterations for which both the LO and HI results
     are the same.  I.e.:

        N1 + N2 = log2(nvectors)

     Each "N1 iteration" doubles the number of redundant vectors and the
     effect of the process as a whole is to have a sequence of nvectors/2**N1
     vectors that repeats 2**N1 times.  Rather than generate these redundant
     vectors, we halve the number of vectors for each N1 iteration.  */
  unsigned int in_start = 0;
  unsigned int out_start = nvectors;
  unsigned int new_nvectors = nvectors;
  for (unsigned int in_repeat = 1; in_repeat < nvectors; in_repeat *= 2)
    {
      unsigned int hi_start = new_nvectors / 2;
      unsigned int out_i = 0;
      for (unsigned int in_i = 0; in_i < new_nvectors; ++in_i)
        {
          if ((in_i & 1) != 0
              && multiple_p (TYPE_VECTOR_SUBPARTS (new_vector_type),
                             2 * in_repeat))
            continue;

          tree output = make_ssa_name (new_vector_type);
          tree input1 = pieces[in_start + (in_i / 2)];
          tree input2 = pieces[in_start + (in_i / 2) + hi_start];
          gassign *stmt = gimple_build_assign (output, VEC_PERM_EXPR,
                                               input1, input2,
                                               permutes[in_i & 1]);
          gimple_seq_add_stmt (seq, stmt);
          pieces[out_start + out_i] = output;
          out_i += 1;
        }
      std::swap (in_start, out_start);
      new_nvectors = out_i;
    }

  /* (5) Use VIEW_CONVERT_EXPR to cast the final VM to the required type.  */
  results.reserve (nresults);
  for (unsigned int i = 0; i < nresults; ++i)
    if (i < new_nvectors)
      results.quick_push (gimple_build (seq, VIEW_CONVERT_EXPR, vector_type,
                                        pieces[in_start + i]));
    else
      results.quick_push (results[i - new_nvectors]);
}
/* For constant and loop invariant defs in OP_NODE this function creates
   vector defs that will be used in the vectorized stmts and stores them
   to SLP_TREE_VEC_DEFS of OP_NODE.  */

static void
vect_create_constant_vectors (vec_info *vinfo, slp_tree op_node)
{
  unsigned HOST_WIDE_INT nunits;
  tree vec_cst;
  unsigned j, number_of_places_left_in_vector;
  tree vector_type;
  tree vop;
  int group_size = op_node->ops.length ();
  unsigned int vec_num, i;
  unsigned number_of_copies = 1;
  bool constant_p;
  gimple_seq ctor_seq = NULL;
  auto_vec<tree, 16> permute_results;

  /* We always want SLP_TREE_VECTYPE (op_node) here correctly set.  */
  vector_type = SLP_TREE_VECTYPE (op_node);

  unsigned int number_of_vectors = SLP_TREE_NUMBER_OF_VEC_STMTS (op_node);
  SLP_TREE_VEC_DEFS (op_node).create (number_of_vectors);
  auto_vec<tree> voprnds (number_of_vectors);

  /* NUMBER_OF_COPIES is the number of times we need to use the same values in
     created vectors.  It is greater than 1 if unrolling is performed.

     For example, we have two scalar operands, s1 and s2 (e.g., group of
     strided accesses of size two), while NUNITS is four (i.e., four scalars
     of this type can be packed in a vector).  The output vector will contain
     two copies of each scalar operand: {s1, s2, s1, s2}.  (NUMBER_OF_COPIES
     will be 2).

     If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
     containing the operands.

     For example, NUNITS is four as before, and the group size is 8
     (s1, s2, ..., s8).  We will create two vectors {s1, s2, s3, s4} and
     {s5, s6, s7, s8}.  */
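
  /* An illustrative calculation: with NUNITS 4, GROUP_SIZE 2 and
     NUMBER_OF_VECTORS 1 the formula below yields
     NUMBER_OF_COPIES = 4 * 1 / 2 = 2, giving {s1, s2, s1, s2} as
     above; with GROUP_SIZE 8 and NUMBER_OF_VECTORS 2 it yields
     4 * 2 / 8 = 1 copy spread over the two vectors {s1, s2, s3, s4}
     and {s5, s6, s7, s8}.  */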
  /* When using duplicate_and_interleave, we just need one element for
     each scalar statement.  */
  if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
    nunits = group_size;

  number_of_copies = nunits * number_of_vectors / group_size;

  number_of_places_left_in_vector = nunits;
  constant_p = true;
  tree_vector_builder elts (vector_type, nunits, 1);
  elts.quick_grow (nunits);
  stmt_vec_info insert_after = NULL;
  for (j = 0; j < number_of_copies; j++)
    {
      tree op;
      for (i = group_size - 1; op_node->ops.iterate (i, &op); i--)
        {
          /* Create 'vect_ = {op0,op1,...,opn}'.  */
          number_of_places_left_in_vector--;
          tree orig_op = op;
          if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op)))
            {
              if (CONSTANT_CLASS_P (op))
                {
                  if (VECTOR_BOOLEAN_TYPE_P (vector_type))
                    {
                      /* Can't use VIEW_CONVERT_EXPR for booleans because
                         of possibly different sizes of scalar value and
                         vector element.  */
                      if (integer_zerop (op))
                        op = build_int_cst (TREE_TYPE (vector_type), 0);
                      else if (integer_onep (op))
                        op = build_all_ones_cst (TREE_TYPE (vector_type));
                      else
                        gcc_unreachable ();
                    }
                  else
                    op = fold_unary (VIEW_CONVERT_EXPR,
                                     TREE_TYPE (vector_type), op);
                  gcc_assert (op && CONSTANT_CLASS_P (op));
                }
              else
                {
                  tree new_temp = make_ssa_name (TREE_TYPE (vector_type));
                  gimple *init_stmt;
                  if (VECTOR_BOOLEAN_TYPE_P (vector_type))
                    {
                      tree true_val
                        = build_all_ones_cst (TREE_TYPE (vector_type));
                      tree false_val
                        = build_zero_cst (TREE_TYPE (vector_type));
                      gcc_assert (INTEGRAL_TYPE_P (TREE_TYPE (op)));
                      init_stmt = gimple_build_assign (new_temp, COND_EXPR,
                                                       op, true_val,
                                                       false_val);
                    }
                  else
                    {
                      op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type),
                                   op);
                      init_stmt
                        = gimple_build_assign (new_temp, VIEW_CONVERT_EXPR,
                                               op);
                    }
                  gimple_seq_add_stmt (&ctor_seq, init_stmt);
                  op = new_temp;
                }
            }
          elts[number_of_places_left_in_vector] = op;
          if (!CONSTANT_CLASS_P (op))
            constant_p = false;
          /* For BB vectorization we have to compute an insert location
             when a def is inside the analyzed region since we cannot
             simply insert at the BB start in this case.  */
          stmt_vec_info opdef;
          if (TREE_CODE (orig_op) == SSA_NAME
              && !SSA_NAME_IS_DEFAULT_DEF (orig_op)
              && is_a <bb_vec_info> (vinfo)
              && (opdef = vinfo->lookup_def (orig_op)))
            {
              if (!insert_after)
                insert_after = opdef;
              else
                insert_after = get_later_stmt (insert_after, opdef);
            }

          if (number_of_places_left_in_vector == 0)
            {
              if (constant_p
                  ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
                  : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
                vec_cst = gimple_build_vector (&ctor_seq, &elts);
              else
                {
                  if (permute_results.is_empty ())
                    duplicate_and_interleave (vinfo, &ctor_seq, vector_type,
                                              elts, number_of_vectors,
                                              permute_results);
                  vec_cst = permute_results[number_of_vectors - j - 1];
                }
              if (!gimple_seq_empty_p (ctor_seq))
                {
                  if (insert_after)
                    {
                      gimple_stmt_iterator gsi;
                      if (gimple_code (insert_after->stmt) == GIMPLE_PHI)
                        {
                          gsi = gsi_after_labels
                                  (gimple_bb (insert_after->stmt));
                          gsi_insert_seq_before (&gsi, ctor_seq,
                                                 GSI_CONTINUE_LINKING);
                        }
                      else if (!stmt_ends_bb_p (insert_after->stmt))
                        {
                          gsi = gsi_for_stmt (insert_after->stmt);
                          gsi_insert_seq_after (&gsi, ctor_seq,
                                                GSI_CONTINUE_LINKING);
                        }
                      else
                        {
                          /* When we want to insert after a def where the
                             defining stmt throws then insert on the fallthru
                             edge.  */
                          edge e = find_fallthru_edge
                                     (gimple_bb (insert_after->stmt)->succs);
                          basic_block new_bb
                            = gsi_insert_seq_on_edge_immediate (e, ctor_seq);
                          gcc_assert (!new_bb);
                        }
                    }
                  else
                    vinfo->insert_seq_on_entry (NULL, ctor_seq);
                  ctor_seq = NULL;
                }
              voprnds.quick_push (vec_cst);
              insert_after = NULL;
              number_of_places_left_in_vector = nunits;
              constant_p = true;
              elts.new_vector (vector_type, nunits, 1);
              elts.quick_grow (nunits);
            }
        }
    }

  /* Since the vectors are created in the reverse order, we should invert
     them.  */
  vec_num = voprnds.length ();
  for (j = vec_num; j != 0; j--)
    {
      vop = voprnds[j - 1];
      SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
    }

  /* In case that VF is greater than the unrolling factor needed for the SLP
     group of stmts, NUMBER_OF_VECTORS to be created is greater than
     NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
     to replicate the vectors.  */
  while (number_of_vectors > SLP_TREE_VEC_DEFS (op_node).length ())
    for (i = 0; SLP_TREE_VEC_DEFS (op_node).iterate (i, &vop) && i < vec_num;
         i++)
      SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
}
/* Get the Ith vectorized definition from SLP_NODE.  */

tree
vect_get_slp_vect_def (slp_tree slp_node, unsigned i)
{
  if (SLP_TREE_VEC_STMTS (slp_node).exists ())
    return gimple_get_lhs (SLP_TREE_VEC_STMTS (slp_node)[i]);
  else
    return SLP_TREE_VEC_DEFS (slp_node)[i];
}
/* Get the vectorized definitions of SLP_NODE in *VEC_DEFS.  */

void
vect_get_slp_defs (slp_tree slp_node, vec<tree> *vec_defs)
{
  vec_defs->create (SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node));
  if (SLP_TREE_DEF_TYPE (slp_node) == vect_internal_def)
    {
      unsigned j;
      gimple *vec_def_stmt;
      FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (slp_node), j, vec_def_stmt)
        vec_defs->quick_push (gimple_get_lhs (vec_def_stmt));
    }
  else
    vec_defs->splice (SLP_TREE_VEC_DEFS (slp_node));
}
/* Get N vectorized definitions for SLP_NODE.  */

void
vect_get_slp_defs (vec_info *,
                   slp_tree slp_node, vec<vec<tree> > *vec_oprnds, unsigned n)
{
  if (n == -1U)
    n = SLP_TREE_CHILDREN (slp_node).length ();

  for (unsigned i = 0; i < n; ++i)
    {
      slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
      vec<tree> vec_defs = vNULL;
      vect_get_slp_defs (child, &vec_defs);
      vec_oprnds->quick_push (vec_defs);
    }
}
/* Generate vector permute statements from a list of loads in DR_CHAIN.
   If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
   permute statements for the SLP node NODE.  Store the number of vector
   permute instructions in *N_PERMS and the number of vector load
   instructions in *N_LOADS.  */

bool
vect_transform_slp_perm_load (vec_info *vinfo,
                              slp_tree node, vec<tree> dr_chain,
                              gimple_stmt_iterator *gsi, poly_uint64 vf,
                              bool analyze_only, unsigned *n_perms,
                              unsigned int *n_loads)
{
  stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
  int vec_index = 0;
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  unsigned int group_size = SLP_TREE_SCALAR_STMTS (node).length ();
  unsigned int mask_element;
  machine_mode mode;

  if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
    return false;

  stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);

  mode = TYPE_MODE (vectype);
  poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);

  /* Initialize the vect stmts of NODE to properly insert the generated
     stmts later.  */
  if (!analyze_only)
    for (unsigned i = SLP_TREE_VEC_STMTS (node).length ();
         i < SLP_TREE_NUMBER_OF_VEC_STMTS (node); i++)
      SLP_TREE_VEC_STMTS (node).quick_push (NULL);

  /* Generate permutation masks for every NODE.  Number of masks for each NODE
     is equal to GROUP_SIZE.
     E.g., we have a group of three nodes with three loads from the same
     location in each node, and the vector size is 4.  I.e., we have an
     a0b0c0a1b1c1... sequence and we need to create the following vectors:
     for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
     for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
     ...

     The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
     The last mask is illegal since we assume two operands for permute
     operation, and the mask element values can't be outside that range.
     Hence, the last mask must be converted into {2,5,5,5}.
     For the first two permutations we need the first and the second input
     vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
     we need the second and the third vectors: {b1,c1,a2,b2} and
     {c2,a3,b3,c3}.  */
5684 we need the second and the third vectors: {b1,c1,a2,b2} and
5687 int vect_stmts_counter
= 0;
5688 unsigned int index
= 0;
5689 int first_vec_index
= -1;
5690 int second_vec_index
= -1;
5694 vec_perm_builder mask
;
5695 unsigned int nelts_to_build
;
5696 unsigned int nvectors_per_build
;
5697 unsigned int in_nlanes
;
5698 bool repeating_p
= (group_size
== DR_GROUP_SIZE (stmt_info
)
5699 && multiple_p (nunits
, group_size
));
5702 /* A single vector contains a whole number of copies of the node, so:
5703 (a) all permutes can use the same mask; and
5704 (b) the permutes only need a single vector input. */
5705 mask
.new_vector (nunits
, group_size
, 3);
5706 nelts_to_build
= mask
.encoded_nelts ();
5707 nvectors_per_build
= SLP_TREE_VEC_STMTS (node
).length ();
5708 in_nlanes
= DR_GROUP_SIZE (stmt_info
) * 3;
5712 /* We need to construct a separate mask for each vector statement. */
5713 unsigned HOST_WIDE_INT const_nunits
, const_vf
;
5714 if (!nunits
.is_constant (&const_nunits
)
5715 || !vf
.is_constant (&const_vf
))
5717 mask
.new_vector (const_nunits
, const_nunits
, 1);
5718 nelts_to_build
= const_vf
* group_size
;
5719 nvectors_per_build
= 1;
5720 in_nlanes
= const_vf
* DR_GROUP_SIZE (stmt_info
);
5722 auto_sbitmap
used_in_lanes (in_nlanes
);
5723 bitmap_clear (used_in_lanes
);
5725 unsigned int count
= mask
.encoded_nelts ();
5726 mask
.quick_grow (count
);
5727 vec_perm_indices indices
;
5729 for (unsigned int j
= 0; j
< nelts_to_build
; j
++)
5731 unsigned int iter_num
= j
/ group_size
;
5732 unsigned int stmt_num
= j
% group_size
;
5733 unsigned int i
= (iter_num
* DR_GROUP_SIZE (stmt_info
)
5734 + SLP_TREE_LOAD_PERMUTATION (node
)[stmt_num
]);
5735 bitmap_set_bit (used_in_lanes
, i
);
5738 first_vec_index
= 0;
5743 /* Enforced before the loop when !repeating_p. */
5744 unsigned int const_nunits
= nunits
.to_constant ();
5745 vec_index
= i
/ const_nunits
;
5746 mask_element
= i
% const_nunits
;
5747 if (vec_index
== first_vec_index
5748 || first_vec_index
== -1)
5750 first_vec_index
= vec_index
;
5752 else if (vec_index
== second_vec_index
5753 || second_vec_index
== -1)
5755 second_vec_index
= vec_index
;
5756 mask_element
+= const_nunits
;
5760 if (dump_enabled_p ())
5761 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5762 "permutation requires at "
5763 "least three vectors %G",
5765 gcc_assert (analyze_only
);
5769 gcc_assert (mask_element
< 2 * const_nunits
);
5772 if (mask_element
!= index
)
5774 mask
[index
++] = mask_element
;
5776 if (index
== count
&& !noop_p
)
5778 indices
.new_vector (mask
, second_vec_index
== -1 ? 1 : 2, nunits
);
5779 if (!can_vec_perm_const_p (mode
, indices
))
5781 if (dump_enabled_p ())
5783 dump_printf_loc (MSG_MISSED_OPTIMIZATION
,
5785 "unsupported vect permute { ");
5786 for (i
= 0; i
< count
; ++i
)
5788 dump_dec (MSG_MISSED_OPTIMIZATION
, mask
[i
]);
5789 dump_printf (MSG_MISSED_OPTIMIZATION
, " ");
5791 dump_printf (MSG_MISSED_OPTIMIZATION
, "}\n");
5793 gcc_assert (analyze_only
);
5804 tree mask_vec
= NULL_TREE
;
5807 mask_vec
= vect_gen_perm_mask_checked (vectype
, indices
);
5809 if (second_vec_index
== -1)
5810 second_vec_index
= first_vec_index
;
	      for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
		{
		  /* Generate the permute statement if necessary.  */
		  tree first_vec = dr_chain[first_vec_index + ri];
		  tree second_vec = dr_chain[second_vec_index + ri];
		  gimple *perm_stmt;
		  if (! noop_p)
		    {
		      gassign *stmt = as_a <gassign *> (stmt_info->stmt);
		      tree perm_dest
			= vect_create_destination_var (gimple_assign_lhs (stmt),
						       vectype);
		      perm_dest = make_ssa_name (perm_dest);
		      perm_stmt
			= gimple_build_assign (perm_dest, VEC_PERM_EXPR,
					       first_vec, second_vec,
					       mask_vec);
		      vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt,
						   gsi);
		    }
		  else
		    /* If mask was NULL_TREE generate the requested
		       identity transform.  */
		    perm_stmt = SSA_NAME_DEF_STMT (first_vec);

		  /* Store the vector statement in NODE.  */
		  SLP_TREE_VEC_STMTS (node)[vect_stmts_counter++] = perm_stmt;
		}
	    }

	  index = 0;
	  first_vec_index = -1;
	  second_vec_index = -1;
	  noop_p = true;
	}
    }
  if (n_loads)
    {
      if (repeating_p)
	*n_loads = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
      else
	{
	  /* Enforced above when !repeating_p.  */
	  unsigned int const_nunits = nunits.to_constant ();
	  *n_loads = 0;
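	  /* Count the input vectors that contain at least one used lane;
	     vectors whose lanes are never referenced by the load
	     permutation do not have to be loaded at all.  */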
	  bool load_seen = false;
	  for (unsigned i = 0; i < in_nlanes; ++i)
	    {
	      if (i % const_nunits == 0)
		{
		  if (load_seen)
		    *n_loads += 1;
		  load_seen = false;
		}
	      if (bitmap_bit_p (used_in_lanes, i))
		load_seen = true;
	    }
	  if (load_seen)
	    *n_loads += 1;
	}
    }

  return true;
}
/* Produce the next vector result for SLP permutation NODE by adding a vector
   statement at GSI.  If MASK_VEC is nonnull, add:

      <new SSA name> = VEC_PERM_EXPR <FIRST_DEF, SECOND_DEF, MASK_VEC>

   otherwise add:

      <new SSA name> = FIRST_DEF.  */
static void
vect_add_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
			  slp_tree node, tree first_def, tree second_def,
			  tree mask_vec)
{
  tree vectype = SLP_TREE_VECTYPE (node);

  /* ??? We SLP match existing vector element extracts but
     allow punning which we need to re-instantiate at uses
     but have no good way of explicitly representing.  */
  if (!types_compatible_p (TREE_TYPE (first_def), vectype))
    {
      gassign *conv_stmt
	= gimple_build_assign (make_ssa_name (vectype),
			       build1 (VIEW_CONVERT_EXPR, vectype, first_def));
      vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi);
      first_def = gimple_assign_lhs (conv_stmt);
    }
  gassign *perm_stmt;
  tree perm_dest = make_ssa_name (vectype);
  if (mask_vec)
    {
      if (!types_compatible_p (TREE_TYPE (second_def), vectype))
	{
	  gassign *conv_stmt
	    = gimple_build_assign (make_ssa_name (vectype),
				   build1 (VIEW_CONVERT_EXPR,
					   vectype, second_def));
	  vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi);
	  second_def = gimple_assign_lhs (conv_stmt);
	}
      perm_stmt = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
				       first_def, second_def,
				       mask_vec);
    }
  else
    /* We need a copy here in case the def was external.  */
    perm_stmt = gimple_build_assign (perm_dest, first_def);
  vect_finish_stmt_generation (vinfo, NULL, perm_stmt, gsi);
  /* Store the vector statement in NODE.  */
  SLP_TREE_VEC_STMTS (node).quick_push (perm_stmt);
}
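/* For example (illustrative): for V4SI operands the above emits GIMPLE like
     vect_x = VEC_PERM_EXPR <vect_a, vect_b, { 0, 4, 1, 5 }>;
   or, when MASK_VEC is NULL_TREE, just the plain copy  vect_x = vect_a;  */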
/* Vectorize the SLP permutations in NODE as specified
   in SLP_TREE_LANE_PERMUTATION which is a vector of pairs of SLP
   child number and lane number.
   Interleaving of two two-lane two-child SLP subtrees (not supported):
     [ { 0, 0 }, { 1, 0 }, { 0, 1 }, { 1, 1 } ]
   A blend of two four-lane two-child SLP subtrees:
     [ { 0, 0 }, { 1, 1 }, { 0, 2 }, { 1, 3 } ]
   Highpart of a four-lane one-child SLP subtree (not supported):
     [ { 0, 2 }, { 0, 3 } ]
   Currently only a subset of this is supported by the code generation
   below.  */
static bool
vectorizable_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
			      slp_tree node, stmt_vector_for_cost *cost_vec)
{
  tree vectype = SLP_TREE_VECTYPE (node);

  /* ??? We currently only support all same vector input and output types
     while the SLP IL should really do a concat + select and thus accept
     arbitrary mismatches.  */
  slp_tree child;
  unsigned i;
  poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
  bool repeating_p = multiple_p (nunits, SLP_TREE_LANES (node));
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    {
      if (!vect_maybe_update_slp_op_vectype (child, vectype)
	  || !types_compatible_p (SLP_TREE_VECTYPE (child), vectype))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "Unsupported lane permutation\n");
	  return false;
	}
      if (SLP_TREE_LANES (child) != SLP_TREE_LANES (node))
	repeating_p = false;
    }
  vec<std::pair<unsigned, unsigned> > &perm = SLP_TREE_LANE_PERMUTATION (node);
  gcc_assert (perm.length () == SLP_TREE_LANES (node));
  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location,
		       "vectorizing permutation");
      for (unsigned i = 0; i < perm.length (); ++i)
	dump_printf (MSG_NOTE, " op%u[%u]", perm[i].first, perm[i].second);
      if (repeating_p)
	dump_printf (MSG_NOTE, " (repeat %d)\n", SLP_TREE_LANES (node));
      dump_printf (MSG_NOTE, "\n");
    }
  /* REPEATING_P is true if every output vector is guaranteed to use the
     same permute vector.  We can handle that case for both variable-length
     and constant-length vectors, but we only handle other cases for
     constant-length vectors.

     Set:

     - NPATTERNS and NELTS_PER_PATTERN to the encoding of the permute
       mask vector that we want to build.

     - NCOPIES to the number of copies of PERM that we need in order
       to build the necessary permute mask vectors.

     - NOUTPUTS_PER_MASK to the number of output vectors we want to create
       for each permute mask vector.  This is only relevant when GSI is
       nonnull.  */
  uint64_t npatterns;
  unsigned nelts_per_pattern;
  uint64_t ncopies;
  unsigned noutputs_per_mask;
  if (repeating_p)
    {
      /* We need a single permute mask vector that has the form:

	   { X1, ..., Xn, X1 + n, ..., Xn + n, X1 + 2n, ..., Xn + 2n, ... }

	 In other words, the original n-element permute in PERM is
	 "unrolled" to fill a full vector.  The stepped vector encoding
	 that we use for permutes requires 3n elements.  */
      npatterns = SLP_TREE_LANES (node);
      nelts_per_pattern = ncopies = 3;
      noutputs_per_mask = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
    }
  else
    {
      /* Calculate every element of every permute mask vector explicitly,
	 instead of relying on the pattern described above.  */
      if (!nunits.is_constant (&npatterns))
	return false;
      nelts_per_pattern = ncopies = 1;
      if (loop_vec_info linfo = dyn_cast <loop_vec_info> (vinfo))
	if (!LOOP_VINFO_VECT_FACTOR (linfo).is_constant (&ncopies))
	  return false;
      noutputs_per_mask = 1;
    }
  unsigned olanes = ncopies * SLP_TREE_LANES (node);
  gcc_assert (repeating_p || multiple_p (olanes, nunits));
  /* Compute the { { SLP operand, vector index}, lane } permutation sequence
     from the { SLP operand, scalar lane } permutation as recorded in the
     SLP node as intermediate step.  This part should already work
     with SLP children with arbitrary number of lanes.  */
  auto_vec<std::pair<std::pair<unsigned, unsigned>, unsigned> > vperm;
  auto_vec<unsigned> active_lane;
  vperm.create (olanes);
  active_lane.safe_grow_cleared (SLP_TREE_CHILDREN (node).length (), true);
  for (unsigned i = 0; i < ncopies; ++i)
    {
      for (unsigned pi = 0; pi < perm.length (); ++pi)
	{
	  std::pair<unsigned, unsigned> p = perm[pi];
	  tree vtype = SLP_TREE_VECTYPE (SLP_TREE_CHILDREN (node)[p.first]);
	  if (repeating_p)
	    vperm.quick_push ({{p.first, 0}, p.second + active_lane[p.first]});
	  else
	    {
	      /* We checked above that the vectors are constant-length.  */
	      unsigned vnunits = TYPE_VECTOR_SUBPARTS (vtype).to_constant ();
	      unsigned vi = (active_lane[p.first] + p.second) / vnunits;
	      unsigned vl = (active_lane[p.first] + p.second) % vnunits;
	      vperm.quick_push ({{p.first, vi}, vl});
	    }
	}
      /* Advance to the next group.  */
      for (unsigned j = 0; j < SLP_TREE_CHILDREN (node).length (); ++j)
	active_lane[j] += SLP_TREE_LANES (SLP_TREE_CHILDREN (node)[j]);
    }
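  /* For the two-child blend [ { 0, 0 }, { 1, 1 } ] of two-lane children
     the first copy pushes { { 0, 0 }, 0 } and { { 1, 0 }, 1 }: lane 0 of
     vector 0 of operand 0 followed by lane 1 of vector 0 of operand 1.  */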
  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location, "as");
      for (unsigned i = 0; i < vperm.length (); ++i)
	{
	  if (i != 0
	      && (repeating_p
		  ? multiple_p (i, npatterns)
		  : multiple_p (i, TYPE_VECTOR_SUBPARTS (vectype))))
	    dump_printf (MSG_NOTE, ",");
	  dump_printf (MSG_NOTE, " vops%u[%u][%u]",
		       vperm[i].first.first, vperm[i].first.second,
		       vperm[i].second);
	}
      dump_printf (MSG_NOTE, "\n");
    }
  /* We can only handle two-vector permutes, everything else should
     be lowered on the SLP level.  The following is closely inspired
     by vect_transform_slp_perm_load and is supposed to eventually
     replace it.
     ???  As intermediate step do code-gen in the SLP tree representation
     somehow?  */
  std::pair<unsigned, unsigned> first_vec = std::make_pair (-1U, -1U);
  std::pair<unsigned, unsigned> second_vec = std::make_pair (-1U, -1U);
  unsigned int index = 0;
  poly_uint64 mask_element;
  vec_perm_builder mask;
  mask.new_vector (nunits, npatterns, nelts_per_pattern);
  unsigned int count = mask.encoded_nelts ();
  mask.quick_grow (count);
  vec_perm_indices indices;
  unsigned nperms = 0;
  for (unsigned i = 0; i < vperm.length (); ++i)
    {
      mask_element = vperm[i].second;
      if (first_vec.first == -1U
	  || first_vec == vperm[i].first)
	first_vec = vperm[i].first;
      else if (second_vec.first == -1U
	       || second_vec == vperm[i].first)
	{
	  second_vec = vperm[i].first;
	  mask_element += nunits;
	}
      else
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "permutation requires at "
			     "least three vectors\n");
	  gcc_assert (!gsi);
	  return false;
	}
      mask[index++] = mask_element;

      if (index == count)
	{
	  indices.new_vector (mask, second_vec.first == -1U ? 1 : 2, nunits);
	  bool identity_p = indices.series_p (0, 1, 0, 1);
	  if (!identity_p
	      && !can_vec_perm_const_p (TYPE_MODE (vectype), indices))
	    {
	      if (dump_enabled_p ())
		{
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION,
				   vect_location,
				   "unsupported vect permute { ");
		  for (i = 0; i < count; ++i)
		    {
		      dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
		      dump_printf (MSG_MISSED_OPTIMIZATION, " ");
		    }
		  dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
		}
	      gcc_assert (!gsi);
	      return false;
	    }
	  if (!identity_p)
	    nperms++;
	  if (gsi)
	    {
	      if (second_vec.first == -1U)
		second_vec = first_vec;

	      slp_tree
		first_node = SLP_TREE_CHILDREN (node)[first_vec.first],
		second_node = SLP_TREE_CHILDREN (node)[second_vec.first];

	      tree mask_vec = NULL_TREE;
	      if (!identity_p)
		mask_vec = vect_gen_perm_mask_checked (vectype, indices);

	      for (unsigned int vi = 0; vi < noutputs_per_mask; ++vi)
		{
		  tree first_def
		    = vect_get_slp_vect_def (first_node,
					     first_vec.second + vi);
		  tree second_def
		    = vect_get_slp_vect_def (second_node,
					     second_vec.second + vi);
		  vect_add_slp_permutation (vinfo, gsi, node, first_def,
					    second_def, mask_vec);
		}
	    }

	  index = 0;
	  first_vec = std::make_pair (-1U, -1U);
	  second_vec = std::make_pair (-1U, -1U);
	}
    }

  if (!gsi)
    record_stmt_cost (cost_vec, nperms, vec_perm, NULL, vectype, 0, vect_body);

  return true;
}
/* Vectorize SLP NODE.  */

static void
vect_schedule_slp_node (vec_info *vinfo,
			slp_tree node, slp_instance instance)
{
  gimple_stmt_iterator si;
  int i;
  slp_tree child;

  /* For existing vectors there's nothing to do.  */
  if (SLP_TREE_VEC_DEFS (node).exists ())
    return;

  gcc_assert (SLP_TREE_VEC_STMTS (node).is_empty ());

  /* Vectorize externals and constants.  */
  if (SLP_TREE_DEF_TYPE (node) == vect_constant_def
      || SLP_TREE_DEF_TYPE (node) == vect_external_def)
    {
      /* ??? vectorizable_shift can end up using a scalar operand which is
	 currently denoted as !SLP_TREE_VECTYPE.  No need to vectorize the
	 node in this case.  */
      if (!SLP_TREE_VECTYPE (node))
	return;

      vect_create_constant_vectors (vinfo, node);
      return;
    }

  stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);

  gcc_assert (SLP_TREE_NUMBER_OF_VEC_STMTS (node) != 0);
  SLP_TREE_VEC_STMTS (node).create (SLP_TREE_NUMBER_OF_VEC_STMTS (node));
  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "------>vectorizing SLP node starting from: %G",
		     stmt_info->stmt);

  if (STMT_VINFO_DATA_REF (stmt_info)
      && SLP_TREE_CODE (node) != VEC_PERM_EXPR)
    {
      /* Vectorized loads go before the first scalar load to make it
	 ready early, vectorized stores go before the last scalar
	 stmt which is where all uses are ready.  */
      stmt_vec_info last_stmt_info = NULL;
      if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
	last_stmt_info = vect_find_first_scalar_stmt_in_slp (node);
      else /* DR_IS_WRITE */
	last_stmt_info = vect_find_last_scalar_stmt_in_slp (node);
      si = gsi_for_stmt (last_stmt_info->stmt);
    }
  else if ((STMT_VINFO_TYPE (stmt_info) == cycle_phi_info_type
	    || STMT_VINFO_TYPE (stmt_info) == induc_vec_info_type
	    || STMT_VINFO_TYPE (stmt_info) == phi_info_type)
	   && SLP_TREE_CODE (node) != VEC_PERM_EXPR)
    {
      /* For PHI node vectorization we do not use the insertion iterator.  */
      si = gsi_none ();
    }
  else
    {
      /* Emit other stmts after the children vectorized defs, which is
	 the earliest possible point.  */
      gimple *last_stmt = NULL;
      bool seen_vector_def = false;
      FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
	if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
	  {
	    /* For fold-left reductions we are retaining the scalar
	       reduction PHI but we still have SLP_TREE_NUM_VEC_STMTS
	       set so the representation isn't perfect.  Resort to the
	       last scalar def here.  */
	    if (SLP_TREE_VEC_STMTS (child).is_empty ())
	      {
		gcc_assert (STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (child))
			    == cycle_phi_info_type);
		gphi *phi = as_a <gphi *>
			      (vect_find_last_scalar_stmt_in_slp (child)->stmt);
		if (!last_stmt
		    || vect_stmt_dominates_stmt_p (last_stmt, phi))
		  last_stmt = phi;
	      }
	    /* We are emitting all vectorized stmts in the same place and
	       the last one is the last.
	       ???  Unless we have a load permutation applied and that
	       figures to re-use an earlier generated load.  */
	    unsigned j;
	    gimple *vstmt;
	    FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (child), j, vstmt)
	      if (!last_stmt
		  || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
		last_stmt = vstmt;
	  }
	else if (!SLP_TREE_VECTYPE (child))
	  {
	    /* For externals we use the unvectorized scalar defs.  */
	    unsigned j;
	    tree def;
	    FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (child), j, def)
	      if (TREE_CODE (def) == SSA_NAME
		  && !SSA_NAME_IS_DEFAULT_DEF (def))
		{
		  gimple *stmt = SSA_NAME_DEF_STMT (def);
		  if (!last_stmt
		      || vect_stmt_dominates_stmt_p (last_stmt, stmt))
		    last_stmt = stmt;
		}
	  }
	else
	  {
	    /* For externals we have to look at all defs since their
	       insertion place is decided per vector.  But beware
	       of pre-existing vectors where we need to make sure
	       we do not insert before the region boundary.  */
	    if (SLP_TREE_SCALAR_OPS (child).is_empty ()
		&& !vinfo->lookup_def (SLP_TREE_VEC_DEFS (child)[0]))
	      seen_vector_def = true;
	    else
	      {
		unsigned j;
		tree vdef;
		FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (child), j, vdef)
		  if (TREE_CODE (vdef) == SSA_NAME
		      && !SSA_NAME_IS_DEFAULT_DEF (vdef))
		    {
		      gimple *vstmt = SSA_NAME_DEF_STMT (vdef);
		      if (!last_stmt
			  || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
			last_stmt = vstmt;
		    }
	      }
	  }
      /* This can happen when all children are pre-existing vectors or
	 constants.  */
      if (!last_stmt)
	last_stmt = vect_find_first_scalar_stmt_in_slp (node)->stmt;
      if (!last_stmt)
	{
	  gcc_assert (seen_vector_def);
	  si = gsi_after_labels (as_a <bb_vec_info> (vinfo)->bbs[0]);
	}
      else if (is_ctrl_altering_stmt (last_stmt))
	{
	  /* We split regions to vectorize at control altering stmts
	     with a definition so this must be an external which
	     we can insert at the start of the region.  */
	  si = gsi_after_labels (as_a <bb_vec_info> (vinfo)->bbs[0]);
	}
      else if (is_a <bb_vec_info> (vinfo)
	       && gimple_bb (last_stmt) != gimple_bb (stmt_info->stmt)
	       && gimple_could_trap_p (stmt_info->stmt))
	{
	  /* We've constrained possibly trapping operations to all come
	     from the same basic-block; if vectorized defs would allow
	     earlier scheduling, still force vectorized stmts to the
	     original block.  This is only necessary for BB vectorization
	     since for loop vect all operations are in a single BB and
	     scalar stmt based placement doesn't play well with epilogue
	     vectorization.  */
	  gcc_assert (dominated_by_p (CDI_DOMINATORS,
				      gimple_bb (stmt_info->stmt),
				      gimple_bb (last_stmt)));
	  si = gsi_after_labels (gimple_bb (stmt_info->stmt));
	}
      else if (is_a <gphi *> (last_stmt))
	si = gsi_after_labels (gimple_bb (last_stmt));
      else
	{
	  si = gsi_for_stmt (last_stmt);
	  gsi_next (&si);
	}
    }
  bool done_p = false;

  /* Handle purely internal nodes.  */
  if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
    {
      /* ???  The transform kind is stored to STMT_VINFO_TYPE which might
	 be shared with different SLP nodes (but usually it's the same
	 operation apart from the case the stmt is only there for denoting
	 the actual scalar lane defs ...).  So do not call vect_transform_stmt
	 but open-code it here (partly).  */
      bool done = vectorizable_slp_permutation (vinfo, &si, node, NULL);
      gcc_assert (done);
      done_p = true;
    }
  if (!done_p)
    vect_transform_stmt (vinfo, stmt_info, &si, node, instance);
}
/* Replace scalar calls from SLP node NODE with setting of their lhs to zero.
   For loop vectorization this is done in vectorizable_call, but for SLP
   it needs to be deferred until the end of vect_schedule_slp, because
   multiple SLP instances may refer to the same scalar stmt.  */

static void
vect_remove_slp_scalar_calls (vec_info *vinfo,
			      slp_tree node, hash_set<slp_tree> &visited)
{
  gimple *new_stmt;
  gimple_stmt_iterator gsi;
  int i;
  slp_tree child;
  tree lhs;
  stmt_vec_info stmt_info;

  if (!node || SLP_TREE_DEF_TYPE (node) != vect_internal_def)
    return;

  if (visited.add (node))
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_remove_slp_scalar_calls (vinfo, child, visited);

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
    {
      gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt);
      if (!stmt || gimple_bb (stmt) == NULL)
	continue;
      if (is_pattern_stmt_p (stmt_info)
	  || !PURE_SLP_STMT (stmt_info))
	continue;
      lhs = gimple_call_lhs (stmt);
      new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
      gsi = gsi_for_stmt (stmt);
      vinfo->replace_stmt (&gsi, stmt_info, new_stmt);
      SSA_NAME_DEF_STMT (gimple_assign_lhs (new_stmt)) = new_stmt;
    }
}
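/* For example (illustrative): a pure-SLP scalar call  x_1 = foo (a_2)
   whose result is only used by vectorized stmts becomes  x_1 = 0  (a zero
   of the appropriate type), which DCE can subsequently remove.  */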
static void
vect_remove_slp_scalar_calls (vec_info *vinfo, slp_tree node)
{
  hash_set<slp_tree> visited;
  vect_remove_slp_scalar_calls (vinfo, node, visited);
}
/* Vectorize the instance root.  */

static void
vectorize_slp_instance_root_stmt (slp_tree node, slp_instance instance)
{
  gassign *rstmt = NULL;

  if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) == 1)
    {
      gimple *child_stmt;
      int j;

      FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (node), j, child_stmt)
	{
	  tree vect_lhs = gimple_get_lhs (child_stmt);
	  tree root_lhs = gimple_get_lhs (instance->root_stmt->stmt);
	  if (!useless_type_conversion_p (TREE_TYPE (root_lhs),
					  TREE_TYPE (vect_lhs)))
	    vect_lhs = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (root_lhs),
			       vect_lhs);
	  rstmt = gimple_build_assign (root_lhs, vect_lhs);
	  break;
	}
    }
  else if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) > 1)
    {
      int nelts = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
      gimple *child_stmt;
      int j;
      vec<constructor_elt, va_gc> *v;
      vec_alloc (v, nelts);

      FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (node), j, child_stmt)
	CONSTRUCTOR_APPEND_ELT (v,
				NULL_TREE,
				gimple_get_lhs (child_stmt));
      tree lhs = gimple_get_lhs (instance->root_stmt->stmt);
      tree rtype = TREE_TYPE (gimple_assign_rhs1 (instance->root_stmt->stmt));
      tree r_constructor = build_constructor (rtype, v);
      rstmt = gimple_build_assign (lhs, r_constructor);
    }

  gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmt->stmt);
  gsi_replace (&rgsi, rstmt, true);
}
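/* For example (illustrative): a BB-SLP root  x = {a, b, c, d}  vectorized
   with a single V4SI statement is replaced by  x = vect__1;  when several
   vector defs are produced they are combined into a CONSTRUCTOR of the
   root's type instead.  */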
/* Schedule the SLP INSTANCE doing a DFS walk and collecting SCCs.  */

static void
vect_schedule_scc (vec_info *vinfo, slp_tree node, slp_instance instance,
		   hash_map<slp_tree, slp_scc_info> &scc_info,
		   int &maxdfs, vec<slp_tree> &stack)
{
  bool existed_p;
  slp_scc_info *info = &scc_info.get_or_insert (node, &existed_p);
  gcc_assert (!existed_p);
  info->dfs = maxdfs;
  info->lowlink = maxdfs;
  maxdfs++;

  /* Leaf.  */
  if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
    {
      info->on_stack = false;
      vect_schedule_slp_node (vinfo, node, instance);
      return;
    }

  info->on_stack = true;
  stack.safe_push (node);

  unsigned i;
  slp_tree child;
  /* DFS recurse on the children.  */
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    if (child)
      {
	slp_scc_info *child_info = scc_info.get (child);
	if (!child_info)
	  {
	    vect_schedule_scc (vinfo, child, instance, scc_info, maxdfs,
			       stack);
	    /* Recursion might have re-allocated the node.  */
	    info = scc_info.get (node);
	    child_info = scc_info.get (child);
	    info->lowlink = MIN (info->lowlink, child_info->lowlink);
	  }
	else if (child_info->on_stack)
	  info->lowlink = MIN (info->lowlink, child_info->dfs);
      }
  if (info->lowlink != info->dfs)
    return;
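  /* At this point INFO->lowlink == INFO->dfs, i.e. NODE is the root of an
     SCC in the sense of Tarjan's algorithm: every node still on the stack
     at or above NODE belongs to this SCC and is scheduled below.  */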
  auto_vec<slp_tree, 4> phis_to_fixup;

  /* Singleton.  */
  if (stack.last () == node)
    {
      stack.pop ();
      info->on_stack = false;
      vect_schedule_slp_node (vinfo, node, instance);
      if (SLP_TREE_CODE (node) != VEC_PERM_EXPR
	  && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (node)->stmt))
	phis_to_fixup.quick_push (node);
    }
  else
    {
      /* SCC.  */
      int last_idx = stack.length () - 1;
      while (stack[last_idx] != node)
	last_idx--;
      /* We can break the cycle at PHIs who have at least one child
	 code generated.  Then we could re-start the DFS walk until
	 all nodes in the SCC are covered (we might have new entries
	 for only back-reachable nodes).  But it's simpler to just
	 iterate and schedule those that are ready.  */
      unsigned todo = stack.length () - last_idx;
      do
	{
	  for (int idx = stack.length () - 1; idx >= last_idx; --idx)
	    {
	      slp_tree entry = stack[idx];
	      if (!entry)
		continue;
	      bool phi = (SLP_TREE_CODE (entry) != VEC_PERM_EXPR
			  && is_a <gphi *>
			       (SLP_TREE_REPRESENTATIVE (entry)->stmt));
	      bool ready = !phi;
	      FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (entry), i, child)
		if (!child)
		  {
		    gcc_assert (!phi);
		  }
		else if (scc_info.get (child)->on_stack)
		  {
		    if (!phi)
		      {
			ready = false;
			break;
		      }
		  }
		else
		  {
		    if (phi)
		      {
			ready = true;
			break;
		      }
		  }
	      if (ready)
		{
		  vect_schedule_slp_node (vinfo, entry, instance);
		  scc_info.get (entry)->on_stack = false;
		  stack[idx] = NULL;
		  todo--;
		  if (phi)
		    phis_to_fixup.safe_push (entry);
		}
	    }
	}
      while (todo != 0);

      /* Pop the SCC.  */
      stack.truncate (last_idx);
    }
  /* Now fixup the backedge def of the vectorized PHIs in this SCC.  */
  slp_tree phi_node;
  FOR_EACH_VEC_ELT (phis_to_fixup, i, phi_node)
    {
      gphi *phi = as_a <gphi *> (SLP_TREE_REPRESENTATIVE (phi_node)->stmt);
      edge_iterator ei;
      edge e;
      FOR_EACH_EDGE (e, ei, gimple_bb (phi)->preds)
	{
	  unsigned dest_idx = e->dest_idx;
	  child = SLP_TREE_CHILDREN (phi_node)[dest_idx];
	  if (!child || SLP_TREE_DEF_TYPE (child) != vect_internal_def)
	    continue;
	  /* Simply fill all args.  */
	  for (unsigned i = 0; i < SLP_TREE_VEC_STMTS (phi_node).length ();
	       ++i)
	    add_phi_arg (as_a <gphi *> (SLP_TREE_VEC_STMTS (phi_node)[i]),
			 vect_get_slp_vect_def (child, i),
			 e, gimple_phi_arg_location (phi, dest_idx));
	}
    }
}
/* Generate vector code for SLP_INSTANCES in the loop/basic block.  */

void
vect_schedule_slp (vec_info *vinfo, vec<slp_instance> slp_instances)
{
  slp_instance instance;
  unsigned int i;

  hash_map<slp_tree, slp_scc_info> scc_info;
  int maxdfs = 0;
  FOR_EACH_VEC_ELT (slp_instances, i, instance)
    {
      slp_tree node = SLP_INSTANCE_TREE (instance);
      if (dump_enabled_p ())
	{
	  dump_printf_loc (MSG_NOTE, vect_location,
			   "Vectorizing SLP tree:\n");
	  if (SLP_INSTANCE_ROOT_STMT (instance))
	    dump_printf_loc (MSG_NOTE, vect_location, "Root stmt: %G",
			     SLP_INSTANCE_ROOT_STMT (instance)->stmt);
	  vect_print_slp_graph (MSG_NOTE, vect_location,
				SLP_INSTANCE_TREE (instance));
	}
      /* Schedule the tree of INSTANCE, scheduling SCCs in a way to
	 have a PHI be the node breaking the cycle.  */
      auto_vec<slp_tree> stack;
      if (!scc_info.get (node))
	vect_schedule_scc (vinfo, node, instance, scc_info, maxdfs, stack);

      if (SLP_INSTANCE_ROOT_STMT (instance))
	vectorize_slp_instance_root_stmt (node, instance);

      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "vectorizing stmts using SLP.\n");
    }
  FOR_EACH_VEC_ELT (slp_instances, i, instance)
    {
      slp_tree root = SLP_INSTANCE_TREE (instance);
      stmt_vec_info store_info;
      unsigned int j;

      /* Remove scalar call stmts.  Do not do this for basic-block
	 vectorization as not all uses may be vectorized.
	 ???  Why should this be necessary?  DCE should be able to
	 remove the stmts itself.
	 ???  For BB vectorization we can as well remove scalar
	 stmts starting from the SLP tree root if they have no
	 uses.  */
      if (is_a <loop_vec_info> (vinfo))
	vect_remove_slp_scalar_calls (vinfo, root);

      /* Remove the original scalar stmts of vectorized stores.  */
      for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store_info); j++)
	{
	  if (!STMT_VINFO_DATA_REF (store_info)
	      || !DR_IS_WRITE (STMT_VINFO_DATA_REF (store_info)))
	    break;

	  store_info = vect_orig_stmt (store_info);
	  /* Free the attached stmt_vec_info and remove the stmt.  */
	  vinfo->remove_stmt (store_info);

	  /* Invalidate SLP_TREE_REPRESENTATIVE in case we released it
	     to not crash in vect_free_slp_tree later.  */
	  if (SLP_TREE_REPRESENTATIVE (root) == store_info)
	    SLP_TREE_REPRESENTATIVE (root) = NULL;
	}
    }
}